<a href="https://colab.research.google.com/github/adityachavan09/ESG_Classifier/blob/main/OxML_2023_ESG_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets PyPDF2 accelerate pycryptodome

# Imports

In [None]:
import os
import shutil

from PyPDF2 import PdfReader

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

import plotly.express as px

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

# Prerequisites and Helper Functions

In [None]:
# Set to True when the dataset should be loaded from Google Drive.
DRIVE = True

# Drive folder holding the task data.
DRIVE_BASE_PATH = '/content/drive/MyDrive/OxML Task 1'

if DRIVE:
    # Kept inside the branch so the Colab module is only imported when DRIVE is set.
    from google.colab import drive

    drive.mount('/content/drive', force_remount=True)

    # Copy the Drive folder to local disk once; an existing copy is left untouched.
    local_copy_present = os.path.exists("/content/oxml")
    if not local_copy_present:
        shutil.copytree(DRIVE_BASE_PATH, '/content/oxml')

Mounted at /content/drive


In [None]:
# Root of the local dataset copy and the folder holding the report PDFs.
BASE_PATH = '/content/oxml'
REPORTS_PATH = os.path.join(BASE_PATH, 'data', 'reports')

def extract_text_from_pdf_with_path(pdf_path, page_number):
  """Extract the text of a single page of a PDF file.

  Args:
    pdf_path: Path of the PDF file to open.
    page_number: 1-based page number to extract.

  Returns:
    The text returned by the page's ``extract_text()``, or ``None`` when the
    page cannot be read (bad path, page out of range, parser error, ...).
    Failures are reported on stdout instead of being raised so that a bulk
    extraction over many pages keeps going.
  """
  try:
    # Reject non-positive page numbers explicitly: page_number - 1 would be a
    # negative index and silently return a page counted from the END of the file.
    if page_number < 1:
      raise ValueError(f'page_number must be >= 1, got {page_number}')
    reader = PdfReader(pdf_path)
    page = reader.pages[page_number - 1]
    return page.extract_text()
  except Exception as e:
    # Deliberate best-effort: log the failing page and signal it with None.
    print(f'Exception occurred for pdf = {pdf_path} and page = {page_number}')
    print(e)
    return None

In [None]:
# Try the extractor on a single page of one report and display the raw text.
mock_pdf_path = os.path.join(REPORTS_PATH, 'report_1827.pdf')
mock_page = 4
pdf_text = extract_text_from_pdf_with_path(
    pdf_path=mock_pdf_path,
    page_number=mock_page,
)
pdf_text

'Financial Highlights 2018 (IFRS)\n2018 2017 Change\nOperating Highlights (€ in millions)\nNet sales 121,915 21,218 3%\nGross profit 111,363 10,703 6%\nOther operating expenses 1, 29,172 8,766 5%\nEBITDA 12,882 2,511 15%\nOperating profit 12,368 2,070 14%\nNet income from continuing operations 1, 31,709 1,430 20%\nNet income attributable to shareholders 3, 41,702 1,173 45%\nKey RatiosGross margin\n 151.8% 50.4% 1.4pp\nOther operating expenses in % of net sales 1, 241.9% 41.3% 0.5pp\nOperating margin 110.8% 9.8% 1.1pp\nEffective tax rate 1, 328.1% 29.3% (1.2pp)\nNet income attributable to shareholders in % of net  sal\nes 3, 47.8% 5.5% 2.2pp\nAverage operating working capital in % of net  sal\nes 119.0% 20.4% (1.4pp)\nEquity ratio 540.8% 43.0% (2.2pp)\nNet borrowings/EBITDA 1(0.3) (0.2) n.a.\nFinancial leverage 5(15.0%) (8.0%) (7.0pp)\nReturn on equity 4, 526.7% 18.2% 8.5pp\nBalance Sheet and Cash Flow Data (€ in millions)Total assets\n 515,612 14,019 11%\nInventories 3,445 3,692 (7%)\n

# Data Loading

In [None]:
# CSV holding one row per labelled report page.
LABELS_PATH = os.path.join(BASE_PATH, 'data', 'labels.csv')

df = pd.read_csv(filepath_or_buffer=LABELS_PATH)

# Show a random selection of 20 rows.
df.sample(n=20)

Unnamed: 0,id,class
947,report_1132.pdf.27,governance
971,report_189.pdf.14,environmental
1462,report_422.pdf.15,environmental
377,report_711.pdf.69,environmental
57,report_1830.pdf.66,other
322,report_733.pdf.14,social
28,report_1825.pdf.38,other
1295,report_1132.pdf.23,social
1206,report_1241.pdf.133,environmental
1898,report_1828.pdf.79,other


In [None]:
# An id looks like "report_1132.pdf.27": the report file name followed by the
# page number. Split once on the LAST dot so the file name is kept intact no
# matter what it contains, and let pandas do the work column-wise.
id_parts = df['id'].str.rsplit('.', n=1, expand=True)
df['report_name'] = id_parts[0]
df['page_number'] = id_parts[1].astype(int)

df.sample(20)

Unnamed: 0,id,class,report_name,page_number
666,report_1611.pdf.20,social,report_1611.pdf,20
1269,report_1825.pdf.222,other,report_1825.pdf,222
951,report_1832.pdf.41,other,report_1832.pdf,41
1295,report_1132.pdf.23,social,report_1132.pdf,23
238,report_1827.pdf.212,other,report_1827.pdf,212
680,report_1824.pdf.27,other,report_1824.pdf,27
354,report_1825.pdf.226,other,report_1825.pdf,226
93,report_711.pdf.175,governance,report_711.pdf,175
813,report_1828.pdf.97,other,report_1828.pdf,97
917,report_1831.pdf.116,other,report_1831.pdf,116


In [None]:
# Cache of the extracted page text, so the slow PDF parsing only happens once.
PROCESSED_DATA = '/content/oxml_data.csv'

if os.path.exists(PROCESSED_DATA):
  df = pd.read_csv(PROCESSED_DATA)
else:
  df['text'] = [extract_text_from_pdf_with_path(os.path.join(REPORTS_PATH, pdf_name), page_number) for pdf_name, page_number in zip(df['report_name'], df['page_number'])]

  # Write to the same path that is checked above (not the current working
  # directory) and drop the index so reloads do not grow "Unnamed: 0" columns.
  df.to_csv(PROCESSED_DATA, index=False)

# Failed extractions are None and empty pages come back from the CSV as NaN;
# normalise both to an empty string so every 'text' value is a str.
df['text'] = df['text'].fillna('')

# Exploratory Data Analysis (EDA)

In [None]:
# Random preview of 10 rows, now including the extracted text.
df.sample(n=10)

Unnamed: 0.1,Unnamed: 0,id,class,report_name,page_number,text
1666,1666,report_279.pdf.16,environmental,report_279.pdf,16,Water\nWe recognize that water usage and sourc...
543,543,report_1825.pdf.355,other,report_1825.pdf,355,CONSOLIDATED FINANCIAL STATEMENTS \n5.6 NO...
73,73,report_1827.pdf.215,other,report_1827.pdf,215,Tax expenses\nTax expenses are split as follow...
1933,1933,report_1706.pdf.17,social,report_1706.pdf,17,CROWN CASTLE | 2019 CORPORATE SUSTAINABILITY...
1013,1013,report_1827.pdf.235,other,report_1827.pdf,235,(systems) as they have considered necessary to...
1701,1701,report_1181.pdf.38,social,report_1181.pdf,38,\n2021 Sustainability Report 38Introduction...
1237,1237,report_1253.pdf.57,social,report_1253.pdf,57,Pfizer 2021 ESG Report57\n PerformanceThe Pati...
849,849,report_1430.pdf.24,environmental,report_1430.pdf,24,Introduction Our Communities Our People Our...
1865,1865,report_1826.pdf.223,other,report_1826.pdf,223,NOTES TO THE BALANCE SHEET / / NOTES TO THE CO...
914,914,report_519.pdf.37,social,report_519.pdf,37,2021 ESG Impact Report \n \n \n \n \n \n \n \...


In [None]:
# Count the rows per class once and reuse the result for slice sizes and names.
class_counts = df['class'].value_counts()

fig = px.pie(df, values=class_counts.values, names=class_counts.index)
fig.update_traces(hoverinfo='label+percent', textinfo='value')
fig.show()

# Train and Validate Transformer

In [None]:
# Fit the encoder on the class names, then store the integer ids in 'labels'.
le = LabelEncoder()
le.fit(df['class'])
df['labels'] = le.transform(df['class'])

In [None]:
# Random preview of 10 rows to check the new 'labels' column against 'class'.
df.sample(n=10)

Unnamed: 0.1,Unnamed: 0,id,class,report_name,page_number,text,labels
18,18,report_1833.pdf.187,other,report_1833.pdf,187,Anglo American plc Integrated Annual Report ...,2
594,594,report_1824.pdf.283,other,report_1824.pdf,283,ENVIRONMENTAL AND SOCIETAL REPORTING\n4\n Soci...,2
1718,1718,report_1683.pdf.21,social,report_1683.pdf,21,GOVERNANCE CEO MESSAGE INNOVATION ASSOCIATES C...,3
382,382,report_1829.pdf.79,other,report_1829.pdf,79,\n \n \n C _ Group Management Report \nAnnua...,2
1890,1890,report_1622.pdf.45,environmental,report_1622.pdf,45,Topic-specific standards\nGRI 300: ENVIRONMENT...,0
426,426,report_1155.pdf.23,social,report_1155.pdf,23,Training\nTraining and development of our team...,3
408,408,report_1611.pdf.19,social,report_1611.pdf,19,Webuildstrongconnectionswithourcustomers.Welis...,3
928,928,report_1132.pdf.16,environmental,report_1132.pdf,16,"For more than a decade, the Sunlight Advantage...",0
440,440,report_1237.pdf.39,environmental,report_1237.pdf,39,39ACCOUNTING METRIC CODE INFORMATION 2020\nTop...,0
1613,1613,report_1827.pdf.246,other,report_1827.pdf,246,DECLARA TION \nOF SUPPOR\nT\nadidas AG declare...,2


In [None]:
# Display the column names of the working frame.
df.columns

Index(['Unnamed: 0', 'id', 'class', 'report_name', 'page_number', 'text',
       'labels'],
      dtype='object')

In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
transformer_name = 'nbroad/ESG-BERT'

tokenizer = AutoTokenizer.from_pretrained(transformer_name)

# Size the classification head to THIS task's label set. Without num_labels the
# head keeps whatever output width the checkpoint was saved with, so argmax can
# produce ids the LabelEncoder cannot map back to a class name.
# NOTE(review): ESG-BERT appears to ship with its own, larger label set — confirm.
class_names = [str(name) for name in le.classes_]
model = AutoModelForSequenceClassification.from_pretrained(
    transformer_name,
    num_labels=len(class_names),
    id2label=dict(enumerate(class_names)),
    label2id={name: idx for idx, name in enumerate(class_names)},
    # Allow the checkpoint's head to be replaced by a freshly initialised one
    # when its shape differs from num_labels.
    ignore_mismatched_sizes=True,
)

if torch.cuda.is_available():
  model = model.to('cuda')

In [None]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes the 'text' column of a DataFrame on access.

    Args:
        dataframe: Frame with a 'text' column (page text) and a 'labels'
            column (integer class ids).
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length', max_length=...)``
            that returns a mapping with 'input_ids' and 'attention_mask'.
        max_length: Length every sample is truncated/padded to (default 512,
            the value that was previously hard-coded).
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows (samples) in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tokenized sample at POSITION ``index``.

        Returns:
            dict with 'input_ids', 'attention_mask' and 'labels' tensors.
        """
        # Positional lookup: label-based .loc breaks (KeyError / wrong row) as
        # soon as the frame has been filtered, shuffled or split and no longer
        # carries a default 0..n-1 index.
        text = self.data['text'].iloc[index]
        label = self.data['labels'].iloc[index]

        # Pages whose extraction failed hold None/NaN; the tokenizer only
        # accepts strings, so treat them as empty pages.
        if not isinstance(text, str):
            text = ''

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        inputs = {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'labels': torch.tensor(int(label), dtype=torch.long)
        }
        return inputs

# Wrap the full labelled frame as the training dataset.
train_dataset = CustomDataset(dataframe=df, tokenizer=tokenizer)

In [None]:
def data_collator(features):
    """Stack a list of per-sample tensor dicts into one batch dict.

    Args:
        features: Sequence of dicts, each holding 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        dict with the same three keys, each value being the samples' tensors
        stacked along a new leading dimension.
    """
    batch_keys = ('input_ids', 'attention_mask', 'labels')
    return {
        key: torch.stack([sample[key] for sample in features])
        for key in batch_keys
    }

In [None]:
# Batched loader over the training dataset (16 samples per batch).
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    collate_fn=data_collator,
)

In [None]:
def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (logits, classes on the last
            axis) and ``label_ids`` (true class ids).

    Returns:
        dict with a single entry, "f1_score".
    """
    # The predicted class is the index of the largest logit.
    predicted_ids = eval_pred.predictions.argmax(axis=-1)
    weighted_f1 = f1_score(eval_pred.label_ids, predicted_ids, average='weighted')
    return {"f1_score": weighted_f1}

In [None]:
# Fine-tuning configuration: where to write checkpoints/logs, how often to log,
# and the optimisation schedule.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=50,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# Define the trainer: model + arguments + data + batching + metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()





Step,Training Loss
50,3.935
100,1.2815
150,0.5866
200,0.4113
250,0.4257
300,0.2633
350,0.2474
400,0.1703
450,0.1902
500,0.1782


TrainOutput(global_step=615, training_loss=0.6458052200999687, metrics={'train_runtime': 985.2831, 'train_samples_per_second': 9.926, 'train_steps_per_second': 0.624, 'total_flos': 2573780616437760.0, 'train_loss': 0.6458052200999687, 'epoch': 5.0})

In [None]:
# NOTE: this scores the model on the same dataset it was trained on, so the
# reported F1 describes fit to the training data, not held-out performance.
trainer.evaluate(eval_dataset=train_dataset)

{'eval_loss': 0.04682925343513489,
 'eval_f1_score': 0.9902979018008535,
 'eval_runtime': 68.7357,
 'eval_samples_per_second': 28.457,
 'eval_steps_per_second': 3.564,
 'epoch': 5.0}

# Submission

In [None]:
# Template listing the page ids that need a predicted class.
SUBMISSION_SAMPLE_PATH = os.path.join(BASE_PATH, 'sample_submission.csv')

submission = pd.read_csv(filepath_or_buffer=SUBMISSION_SAMPLE_PATH)

# Random preview of 10 rows.
submission.sample(n=10)

Unnamed: 0,id,class
55,report_1352.pdf.63,environmental
63,report_1352.pdf.84,environmental
33,report_1179.pdf.28,environmental
461,report_1012.pdf.116,environmental
72,report_1352.pdf.50,environmental
332,report_1179.pdf.8,environmental
228,report_1145.pdf.56,environmental
9,report_576.pdf.11,environmental
172,report_1179.pdf.29,environmental
229,report_576.pdf.15,environmental


In [None]:
# An id looks like "report_1352.pdf.63": the report file name followed by the
# page number. Split once on the LAST dot so the file name is kept intact.
submission_id_parts = submission['id'].str.rsplit('.', n=1, expand=True)
submission['report_name'] = submission_id_parts[0]
submission['page_number'] = submission_id_parts[1].astype(int)

submission['text'] = [extract_text_from_pdf_with_path(os.path.join(REPORTS_PATH, pdf_name), page_number) for pdf_name, page_number in zip(submission['report_name'], submission['page_number'])]
# A failed extraction yields None, which the tokenizer cannot handle; treat
# such pages as empty text.
submission['text'] = submission['text'].fillna('')

# Encode the template's 'class' column so the frame has the 'labels' column
# that CustomDataset reads; 'class' itself is overwritten with predictions later.
submission['labels'] = le.transform(submission['class'])

In [None]:
submission.sample(10)

Unnamed: 0,id,class,report_name,page_number,text,labels
323,report_1179.pdf.13,environmental,report_1179.pdf,13,FY21 ESG Data Disclosures\n2022 Unaudited 8whi...,0
194,report_1260.pdf.14,environmental,report_1260.pdf,14,CORPORATE GOVERNANCE\nHoneywell’s Board of Dir...,0
30,report_1145.pdf.29,environmental,report_1145.pdf,29,29\nEFFICIENCY IN OPERATIONS\nOur commitment t...,0
295,report_1145.pdf.34,environmental,report_1145.pdf,34,34\nGlobal program examples:\n• Lighting: LED...,0
294,report_1179.pdf.14,environmental,report_1179.pdf,14,FY21 ESG Data Disclosures\n2022 Unaudited 9Eff...,0
63,report_1352.pdf.84,environmental,report_1352.pdf,84,LG H&H has an advanced governance structure ce...,0
215,report_1145.pdf.59,environmental,report_1145.pdf,59,59\n• Made it a top priority to drive inclusio...,0
168,report_293.pdf.92,environmental,report_293.pdf,92,Ferrexpo plc Responsible Business Report 2021 ...,0
127,report_293.pdf.55,environmental,report_293.pdf,55,SUPPORTING UKRAINE WORKFORCE ETHICAL BUSINESS ...,0
243,report_1352.pdf.89,environmental,report_1352.pdf,89,Integrated Risk Management System\nLG H&H oper...,0


In [None]:
# Tokenize the submission pages and run the fine-tuned model over them.
test_dataset = CustomDataset(dataframe=submission, tokenizer=tokenizer)
predictions = trainer.predict(test_dataset=test_dataset)

In [None]:
# Only ids 0..len(le.classes_)-1 can be mapped back to a class name. Restrict
# the argmax to those logits so a model head that is wider than the label set
# can never yield an id that makes inverse_transform fail. When the head
# already has exactly that many outputs the slice is a no-op.
num_classes = len(le.classes_)
output_labels = predictions.predictions[:, :num_classes].argmax(axis=1)
class_labels = le.inverse_transform(output_labels)
submission['class'] = class_labels

In [None]:
# Random preview of 10 rows with the predicted classes filled in.
submission.sample(n=10)

Unnamed: 0,id,class,report_name,page_number,text,labels
15,report_1352.pdf.117,environmental,report_1352.pdf,117,Waste\nNOx 6)\n Total amount \nIntensityTotal ...,0
208,report_1352.pdf.54,social,report_1352.pdf,54,Activating employee communication\nOperating c...,0
396,report_1179.pdf.41,governance,report_1179.pdf,41,FY21 ESG Data Disclosures\n2022 Unaudited 36Co...,0
339,report_1012.pdf.91,social,report_1012.pdf,91,PSEG\n2021 Sustainability and Climate Report89...,0
101,report_1352.pdf.94,governance,report_1352.pdf,94,LG H&H is improving its rules and systems to e...,0
78,report_1179.pdf.19,environmental,report_1179.pdf,19,FY21 ESG Data Disclosures\n2022 Unaudited 14EN...,0
76,report_1260.pdf.19,governance,report_1260.pdf,19,The framework is aligned to industry standards...,0
67,report_1260.pdf.74,social,report_1260.pdf,74,An essential element of Honeywell’s Corporate ...,0
425,report_576.pdf.32,governance,report_576.pdf,32,2021 MAHA ENERGY SUSTAINABILITY REPORT\n59 60R...,0
405,report_1352.pdf.86,governance,report_1352.pdf,86,Subcommittee\nAudit committee\nLG H&H operates...,0


In [None]:
# Keep only the two columns the submission format needs and write them without the index.
submission_columns = ['id', 'class']
submission.loc[:, submission_columns].to_csv('submission.csv', index=False)