In [None]:

# Mount Google Drive into the Colab runtime at /content/drive; the input and
# output folders configured later in this notebook live under that mount point.
# NOTE(review): google.colab is only importable inside Google Colab, so this
# cell is expected to fail in any other Jupyter environment.
from google.colab import drive
drive.mount('/content/drive')

In [None]:

import pandas as pd
import re
import os
from glob import glob

In [None]:

# Folder that is scanned for the per-report '*.csv' files.
input_folder_path = '/content/drive/MyDrive/SustainabilityReports/Firm_ID/Results/81_90'

# Folder that receives one merged '<company>.csv' per company.
output_folder_path = '/content/drive/MyDrive/SustainabilityReports/Firm_ID/Results/Merged'

In [None]:

# Every CSV directly inside the input folder. glob() returns paths in an
# order that depends on the file system, so sort them to make the row order
# of the merged output reproducible between runs.
csv_files = sorted(glob(os.path.join(input_folder_path, '*.csv')))

# Maps a company name to the list of per-report DataFrames collected for it.
# Re-creating the dict here resets anything gathered by an earlier run.
company_dataframes = {}


In [None]:
# Function to expand rows based on the 'Year' column having multiple values
def expand_rows(df, year_col):
    """Return a copy of ``df`` with one row per year listed in ``year_col``.

    A cell such as ``"2019, 2020"`` is split on commas and the row is repeated
    once per year, with every other column copied unchanged. Rows whose year
    is missing (NaN/None), or contains no non-blank token, are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to expand. It is not modified.
    year_col : str
        Name of the column holding a single year or a comma-separated list.

    Returns
    -------
    pandas.DataFrame
        Expanded frame. ``year_col`` holds whitespace-stripped strings and is
        placed as the last column. Rows produced from the same input row share
        that row's index label.
    """
    # Filter missing years BEFORE converting to text: once cast, NaN becomes
    # the literal string 'nan' and would no longer be detected as missing.
    expanded = df[df[year_col].notna()].copy()  # copy: leave the caller's frame alone

    # Turn each cell into a list of clean tokens, e.g. '2019, 2020,' -> ['2019', '2020'].
    expanded[year_col] = expanded[year_col].astype(str).map(
        lambda text: [token.strip() for token in text.split(',') if token.strip()]
    )

    # One row per list element; an empty list explodes to NaN, which is dropped.
    expanded = expanded.explode(year_col).dropna(subset=[year_col])

    # Keep the year column last in the output.
    other_cols = [col for col in expanded.columns if col != year_col]
    return expanded[other_cols + [year_col]]

In [None]:

# Load every report CSV, tag it with the company and publication year taken
# from its file name, expand multi-year rows, and group the frames by company.
# File names must look like '<company>_<IR|ESG>_EN_<yyyy>.csv'; anything else
# is reported and skipped.
# NOTE: frames are appended to company_dataframes, so re-running this cell
# without re-creating that dict first adds every file a second time.
report_name_pattern = re.compile(r'(.*?)_(IR|ESG)_EN_(\d{4})\.csv')

for file_path in csv_files:
    file_name = os.path.basename(file_path)

    # fullmatch: the whole file name has to fit the pattern, not just a prefix.
    match = report_name_pattern.fullmatch(file_name)
    if match is None:
        print(f"Filename doesn't match the expected pattern: {file_name}")
        continue

    company_name = match.group(1)
    year_from_filename = int(match.group(3))

    # Read 'Year' as text. With an inferred dtype a numeric column that
    # contains blanks becomes float, and the years would later be written as
    # '2019.0' for that file but '2019' for others.
    df = pd.read_csv(file_path, dtype={'Year': str})

    # Rows without a year fall back to the year from the file name.
    df['Year'] = df['Year'].fillna(str(year_from_filename))
    df['Company'] = company_name
    df['published_year'] = year_from_filename

    df = expand_rows(df, 'Year')

    company_dataframes.setdefault(company_name, []).append(df)


In [None]:
# Write one merged CSV per company into the output folder.
# to_csv does not create missing directories, so make sure the folder exists.
os.makedirs(output_folder_path, exist_ok=True)

for company_name, df_list in company_dataframes.items():
    # ignore_index gives the merged frame a fresh 0..n-1 index instead of the
    # per-file index labels.
    merged_df = pd.concat(df_list, ignore_index=True)
    output_file_path = os.path.join(output_folder_path, f'{company_name}.csv')
    merged_df.to_csv(output_file_path, index=False)
    print(f"Merged CSV file saved successfully: {output_file_path}")