<a href="https://colab.research.google.com/github/alexandrastna/AI-for-ESG/blob/main/Notebooks/4_Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Thesis 4 - ESG Classification of Extracted Sentences
This notebook classifies each extracted sentence from company reports into one of four categories: Environmental, Social, Governance, or None. It uses three pre-trained transformer models (ESGBERT) and saves the final output to Google Drive.

In [None]:
# 1 - Mount Google Drive
# Colab-only step: mounts Drive at /content/drive. Every data path used later
# in this notebook starts with /content/drive/MyDrive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# 2 - Import Libraries
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from tqdm import tqdm
tqdm.pandas()


In [None]:
# 3 - Load Extracted Sentences
# Display option first: never truncate the text shown in DataFrame previews,
# so whole sentences stay readable.
pd.options.display.max_colwidth = None

# Read the sentence table from Drive
path = "/content/drive/MyDrive/Thèse Master/Exports2/parsed_sentences.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first rows
df.head()


Unnamed: 0,company,year,document_type,sentence
0,ABB Ltd,2023,Integrated Report,ABB’s purpose is to enable a more sustainable and resource-efficient ­future with our technology leadership in electrification and automation.
1,ABB Ltd,2023,Integrated Report,"Building on over 140 years of excel­ lence, our more than 105,000 employ­ ees are committed to delivering on our purpose by driving innovations that create success for ABB and all our stakeholders."
2,ABB Ltd,2023,Integrated Report,"Together, we address the world’s energy challenges, trans­ form industries, reduce emissions, preserve natural resources, promote social progress, and push the frontiers of technology to make things possi­ ble that were not possible before."
3,ABB Ltd,2023,Integrated Report,Our solutions connect engineer­
4,ABB Ltd,2023,Integrated Report,"ing know-how and software to opti­ mize how things are manufactured, moved, powered, and operated."


In [None]:
# 4 - Set Processing Device (GPU if available)
# The pipelines below receive this value as `device`:
# 0 selects the first CUDA GPU, -1 selects the CPU.
device = -1
if torch.cuda.is_available():
    device = 0
print(f"🖥️ Using device: {'GPU' if device == 0 else 'CPU'}")


🖥️ Using device: GPU


In [None]:
# 5 - Load ESG Classification Models

def _build_esg_pipeline(name):
    """Load the tokenizer and model identified by `name` and wrap them in a
    "text-classification" pipeline running on the notebook-level `device`.

    Returns the tuple (tokenizer, model, pipeline).
    """
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForSequenceClassification.from_pretrained(name)
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=device,
    )
    return tokenizer, model, classifier


# ENVIRONMENTAL
name = "ESGBERT/EnvironmentalBERT-environmental"
tokenizer_env, model_env, pipe_env = _build_esg_pipeline(name)

# SOCIAL
name = "ESGBERT/SocialBERT-social"
tokenizer_soc, model_soc, pipe_soc = _build_esg_pipeline(name)

# GOVERNANCE
name = "ESGBERT/GovernanceBERT-governance"
tokenizer_gov, model_gov, pipe_gov = _build_esg_pipeline(name)


Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


In [None]:
# 6 - Define ESG Classification Function

def classify_all_esg(text):
    """Classify one sentence with the three ESG pipelines.

    Runs `pipe_env`, `pipe_soc` and `pipe_gov` on `text` (truncation enabled)
    and keeps the first result returned by each pipeline.

    Parameters
    ----------
    text : str
        Sentence to classify.

    Returns
    -------
    pd.Series
        Six entries: label_env, score_env, label_soc, score_soc, label_gov,
        score_gov. When the sentence is missing (None / NaN) or any pipeline
        raises, a message is printed and all three labels are "ERROR" with
        score 0, so one bad row never aborts a whole batch.
    """
    try:
        # A missing sentence (None, or a float NaN such as an empty CSV cell)
        # cannot be classified; route it to the ERROR fallback below.
        if text is None or (isinstance(text, float) and text != text):
            raise ValueError("missing sentence (None/NaN)")
        label_env = pipe_env(text, truncation=True)[0]
        label_soc = pipe_soc(text, truncation=True)[0]
        label_gov = pipe_gov(text, truncation=True)[0]
    except Exception as e:
        # str(...) keeps the handler itself from raising when `text` is not
        # sliceable (the original `text[:50]` failed on NaN/None).
        print(f"❌ Erreur sur texte : {str(text)[:50]}... | {e}")
        label_env = {"label": "ERROR", "score": 0}
        label_soc = {"label": "ERROR", "score": 0}
        label_gov = {"label": "ERROR", "score": 0}
    return pd.Series({
        "label_env": label_env["label"],
        "score_env": label_env["score"],
        "label_soc": label_soc["label"],
        "score_soc": label_soc["score"],
        "label_gov": label_gov["label"],
        "score_gov": label_gov["score"],
    })


In [None]:
# 7 - Apply Classification to a Random Sample (for testing)

# Columns produced by classify_all_esg, in the order it returns them
esg_columns = [
    "label_env", "score_env",
    "label_soc", "score_soc",
    "label_gov", "score_gov",
]

# Draw 100 sentences with a fixed seed so the sample is reproducible
df_sample = df.sample(n=100, random_state=42).copy()

# Classify each sampled sentence (tqdm progress bar) and attach the results
sample_predictions = df_sample["sentence"].progress_apply(classify_all_esg)
df_sample[esg_columns] = sample_predictions

# Preview results
df_sample.head(100)


100%|██████████| 100/100 [00:02<00:00, 39.75it/s]


Unnamed: 0,company,year,document_type,sentence,label_env,score_env,label_soc,score_soc,label_gov,score_gov
166962,Compagnie Financière Richemont,2022,Sustainability Report,"Chaired by dunhill’s CEO, the newly appointed Sustainability Committee ensures the implementation of Richemont’s strategy across the business and increases overall accountability.",environmental,0.989977,social,0.996938,none,0.774423
108086,Novartis AG,2023,Earnings Call Transcript,"What we haven't done as well perhaps at AZ and Daiichi and Seattle Genetics is get that all together in a way to, as I said earlier, thread the PK so that you can get that high efficacy without the toxicities that we've often seen with ADCs.",none,0.950158,none,0.999698,none,0.990211
58969,Roche Holding AG,2021,Earnings Call Transcript,"On the U.S. environment, difficult to judge it at this stage I guess.",environmental,0.978121,none,0.999937,none,0.997985
55848,Roche Holding AG,2023,Earnings Call Transcript,"But with that, I will hand it over to you.",none,0.992668,none,0.999892,none,0.997254
53683,Roche Holding AG,2022,Earnings Call Transcript,Next question would come from Emily Field from Barclays.,none,0.994357,none,0.999938,none,0.997646
...,...,...,...,...,...,...,...,...,...,...
44243,Lonza Group AG,2021,Earnings Call Transcript,"In line with our guidance in July, H2 CORE EBITDA margin was softer than in H1.",none,0.997605,none,0.999931,none,0.998116
134640,Swiss Re Ltd,2021,Sustainability Report,"The digitalisation of services also creates cyber risks, and the interconnectedness of mobility services adds complexity, eg regarding service interruptions and resulting loss accumulation potentials.",none,0.994699,none,0.999883,none,0.994120
24629,Holcim Ltd,2021,Integrated Report,"As part of our focus on affordable housing and infrastructure, we use innovation, technology and market- based approaches to create positive impact at increasing scale and speed.",none,0.903463,none,0.995456,none,0.994250
128731,Zurich Insurance Group AG,2021,Annual Report,The quality control procedures used depend on the nature and complexity of the invested assets.,none,0.998235,none,0.999940,none,0.997185


In [None]:
# 8 - Full Batch Processing
# Settings
output_dir = "/content/drive/MyDrive/Thèse Master/Exports2/classified_batches"
batch_size = 10000
total_sentences = df.shape[0]
# Ceiling division: a trailing partial batch still counts as one batch
num_batches = -(-total_sentences // batch_size)


In [None]:
# Create output directory if needed
# exist_ok=True makes this a no-op when the folder already exists, so the
# cell can be re-run safely.
import os
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Loop through sentence batches
# Columns produced by classify_all_esg, in the order it returns them
esg_columns = [
    "label_env", "score_env",
    "label_soc", "score_soc",
    "label_gov", "score_gov",
]

for batch_no in range(1, num_batches + 1):
    print(f"\n🔹 Traitement du batch {batch_no}/{num_batches}")

    # Positional slice for this batch; iloc clips at the end of the frame,
    # so the last (shorter) batch needs no special handling.
    first_row = (batch_no - 1) * batch_size
    df_batch = df.iloc[first_row:first_row + batch_size].copy()

    # Classify every sentence of the batch and attach the six result columns
    batch_predictions = df_batch["sentence"].progress_apply(classify_all_esg)
    df_batch[esg_columns] = batch_predictions

    # Write the batch to Drive right away so finished work is kept on disk
    output_path = f"{output_dir}/classified_batch_{batch_no:03}.csv"
    df_batch.to_csv(output_path, index=False)
    print(f"✅ Sauvé : {output_path}")


🔹 Traitement du batch 1/21


100%|██████████| 10000/10000 [03:09<00:00, 52.84it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_001.csv

🔹 Traitement du batch 2/21


100%|██████████| 10000/10000 [03:07<00:00, 53.26it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_002.csv

🔹 Traitement du batch 3/21


100%|██████████| 10000/10000 [03:05<00:00, 54.02it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_003.csv

🔹 Traitement du batch 4/21


100%|██████████| 10000/10000 [03:01<00:00, 55.13it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_004.csv

🔹 Traitement du batch 5/21


100%|██████████| 10000/10000 [02:59<00:00, 55.65it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_005.csv

🔹 Traitement du batch 6/21


100%|██████████| 10000/10000 [03:00<00:00, 55.32it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_006.csv

🔹 Traitement du batch 7/21


100%|██████████| 10000/10000 [03:01<00:00, 55.23it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_007.csv

🔹 Traitement du batch 8/21


100%|██████████| 10000/10000 [03:00<00:00, 55.29it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_008.csv

🔹 Traitement du batch 9/21


100%|██████████| 10000/10000 [02:58<00:00, 55.94it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_009.csv

🔹 Traitement du batch 10/21


100%|██████████| 10000/10000 [03:05<00:00, 53.88it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_010.csv

🔹 Traitement du batch 11/21


100%|██████████| 10000/10000 [03:04<00:00, 54.25it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_011.csv

🔹 Traitement du batch 12/21


100%|██████████| 10000/10000 [03:01<00:00, 55.16it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_012.csv

🔹 Traitement du batch 13/21


100%|██████████| 10000/10000 [02:59<00:00, 55.71it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_013.csv

🔹 Traitement du batch 14/21


100%|██████████| 10000/10000 [02:59<00:00, 55.63it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_014.csv

🔹 Traitement du batch 15/21


100%|██████████| 10000/10000 [03:06<00:00, 53.67it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_015.csv

🔹 Traitement du batch 16/21


100%|██████████| 10000/10000 [03:06<00:00, 53.48it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_016.csv

🔹 Traitement du batch 17/21


100%|██████████| 10000/10000 [03:05<00:00, 53.94it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_017.csv

🔹 Traitement du batch 18/21


100%|██████████| 10000/10000 [03:02<00:00, 54.82it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_018.csv

🔹 Traitement du batch 19/21


100%|██████████| 10000/10000 [03:06<00:00, 53.70it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_019.csv

🔹 Traitement du batch 20/21


100%|██████████| 10000/10000 [03:01<00:00, 54.96it/s]


✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_020.csv

🔹 Traitement du batch 21/21


100%|██████████| 1247/1247 [00:23<00:00, 52.92it/s]

✅ Sauvé : /content/drive/MyDrive/Thèse Master/Exports2/classified_batches/classified_batch_021.csv





In [None]:
# 9 - Merge All Batches

import glob

# Load all classified batches
# Sorting the zero-padded file names restores the original batch order.
batch_pattern = f"{output_dir}/classified_batch_*.csv"
files = glob.glob(batch_pattern)
files.sort()
batch_frames = [pd.read_csv(batch_file) for batch_file in files]
df_all = pd.concat(batch_frames, ignore_index=True)

# Add dominant label column
def assign_major_label(row):
    """Return the dominant ESG pillar code for one classified sentence.

    Each `score_*` value is the confidence of the *predicted* label, which may
    itself be "none". A pillar therefore only competes when its classifier
    predicted a real class; otherwise a sentence that is confidently "none"
    everywhere would be reported as "E".

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide label_env/score_env, label_soc/score_soc and
        label_gov/score_gov.

    Returns
    -------
    str
        "E", "S" or "G" for the pillar with the highest score among those
        predicted with a score of at least 0.5, else "NONE". Ties go to the
        first pillar in E, S, G order.
    """
    threshold = 0.5  # confidence threshold
    candidates = {}
    for code, suffix in (("E", "env"), ("S", "soc"), ("G", "gov")):
        label = row[f"label_{suffix}"]
        # Skip pillars with no positive prediction: "none", the "ERROR"
        # fallback, or a missing (non-string) label.
        if not isinstance(label, str) or label.lower() in ("none", "error"):
            continue
        score = row[f"score_{suffix}"]
        # `>=` also rejects NaN scores, which compare False.
        if score >= threshold:
            candidates[code] = score
    if not candidates:
        return "NONE"
    return max(candidates, key=candidates.get)

# Compute the dominant label row by row and store it as a new column
major_labels = df_all.apply(assign_major_label, axis=1)
df_all["label_majoritaire"] = major_labels

# Save final file
final_path = "/content/drive/MyDrive/Thèse Master/Exports2/classified_all_sentences.csv"
df_all.to_csv(final_path, index=False)
print("✅ Fichier final sauvegardé avec labels majoritaires.")


✅ Fichier final sauvegardé avec labels majoritaires.


### 10 - (Optional) Fix and Re-Assign Dominant Label in English

In [None]:
# Reload the merged file written at the end of step 9, so this section can run
# without repeating the classification. Note: this rebinds `path`, which
# pointed to parsed_sentences.csv in step 3.
path = "/content/drive/MyDrive/Thèse Master/Exports2/classified_all_sentences.csv"
df_all = pd.read_csv(path)

In [None]:
# Discard the earlier dominant-label column when present.
# errors="ignore" makes the drop a no-op if the column is absent.
df_all = df_all.drop(columns=["label_majoritaire"], errors="ignore")

# 🔁 Assign corrected dominant label in English
def assign_dominant_label(row):
    """Pick the dominant ESG category for one classified sentence.

    A category competes only when its predicted label is not "none". Among the
    competing categories the one with the highest score wins (ties go to the
    first in environmental, social, governance order).

    Returns "environmental", "social" or "governance"; returns "none" when no
    category competes or when the winning score is below 0.5.
    """
    pillars = (
        ("environmental", "env"),
        ("social", "soc"),
        ("governance", "gov"),
    )
    candidates = [
        (pillar, row[f"score_{suffix}"])
        for pillar, suffix in pillars
        if row[f"label_{suffix}"] != "none"
    ]

    if len(candidates) == 0:
        return "none"

    winner, confidence = max(candidates, key=lambda pair: pair[1])
    return "none" if confidence < 0.5 else winner

# Compute the English dominant label for every row (row-wise apply)
df_all["label_dominant"] = df_all.apply(assign_dominant_label, axis=1)

# 💾 Save final version (overwrites the merged file this section loaded)
df_all.to_csv("/content/drive/MyDrive/Thèse Master/Exports2/classified_all_sentences.csv", index=False)
# The message names the column actually written above ('label_dominant').
print("✅ Column 'label_dominant' assigned and file saved.")


✅ Column 'dominant_label' assigned and file saved.
