<a href="https://colab.research.google.com/github/alexandrastna/AI-for-ESG/blob/main/Notebooks/8_1_Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Thesis 8 - Model Benchmarking on ESG and Sentiment Classification

Step 1 - Here we create a sample in order to test our classification models (in step 2) : FinBERT classification, BERT ESG pillar classifier, and GPT-3.5 sentiment analysis. The sample is called "gold standard 150", and it's manually balanced.

In [None]:
# 📦 Imports
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the ESG dataframe (already annotated by models)
file_path = "/content/drive/MyDrive/Thèse Master/Exports2/df_esg_with_batch_results.csv"
df = pd.read_csv(file_path)

# Keep only valid rows: with dominant ESG label AND a valid sentiment from GPT
df = df[df["label_dominant"].isin(["environmental", "social", "governance"])]
df = df[df["sentiment_gpt_batch"].isin(["positive", "neutral", "negative"])]

# Sample 50 rows per sentiment for initial balance
df_pos = df[df["sentiment_gpt_batch"] == "positive"].sample(n=50, random_state=1)
df_neu = df[df["sentiment_gpt_batch"] == "neutral"].sample(n=50, random_state=2)
df_neg = df[df["sentiment_gpt_batch"] == "negative"].sample(n=50, random_state=3)

# Combine the 3 sentiment samples into one dataset
df_sample = pd.concat([df_pos, df_neu, df_neg])

# Ensure minimum of 50 samples per ESG pillar (E, S, G), complete if needed
def add_missing_pillar(df_sample, full_df, label_value, min_count):
    current_count = (df_sample["label_dominant"] == label_value).sum()
    if current_count >= min_count:
        return df_sample
    missing_n = min_count - current_count
    # Add new samples for the underrepresented pillar, avoiding duplicates
    extra = full_df[(full_df["label_dominant"] == label_value) & (~full_df["sentence"].isin(df_sample["sentence"]))].sample(n=missing_n, random_state=42)
    print(f"⚠️ Ajout de {missing_n} phrases pour pilier {label_value}")
    return pd.concat([df_sample, extra])

df_sample = add_missing_pillar(df_sample, df, "environmental", 50)
df_sample = add_missing_pillar(df_sample, df, "social", 50)
df_sample = add_missing_pillar(df_sample, df, "governance", 50)

# Ensure all document types are represented
doc_types_needed = ["Annual Report", "Sustainability Report", "Earnings Call Transcript", "Integrated Report", "Half-Year Report"]
missing_doc_types = set(doc_types_needed) - set(df_sample["document_type"].unique())
for doc_type in missing_doc_types:
    extra = df[df["document_type"] == doc_type].sample(n=1, random_state=42)
    print(f"⚠️ Ajout d’un doc manquant : {doc_type}")
    df_sample = pd.concat([df_sample, extra])

# Ensure all companies are represented at least once
missing_companies = set(df["company"].unique()) - set(df_sample["company"].unique())
if missing_companies:
    print(f"⚠️ Ajout de {len(missing_companies)} entreprises manquantes")
    extra = df[df["company"].isin(missing_companies)].groupby("company").head(1)
    df_sample = pd.concat([df_sample, extra])

# Final cleaning steps
df_sample = df_sample.drop_duplicates(subset=["sentence"])
df_sample = df_sample.sample(n=min(150, len(df_sample)), random_state=999)

# Add columns to be filled by human annotator
df_sample["sentiment_humain"] = ""
df_sample["esg_label_humain"] = ""

# Export to Excel
output_path = "/content/drive/MyDrive/Thèse Master/Exports2/df_gold_standard_final_150.xlsx"
df_sample.to_excel(output_path, index=False)
print(f"\n✅ Échantillon final exporté ici : {output_path}")

# Final checks
print("\n📌 Nb total de phrases :", len(df_sample))

# Add boolean flags for ESG pillar verification
df_sample["is_E"] = df_sample["label_dominant"] == "environmental"
df_sample["is_S"] = df_sample["label_dominant"] == "social"
df_sample["is_G"] = df_sample["label_dominant"] == "governance"

print("🔎 Nb de phrases E (is_E) :", df_sample["is_E"].sum())
print("🔎 Nb de phrases S (is_S) :", df_sample["is_S"].sum())
print("🔎 Nb de phrases G (is_G) :", df_sample["is_G"].sum())

# Document type coverage
print("📁 Types de documents couverts :", df_sample["document_type"].nunique())
print("📁 Documents présents :", df_sample["document_type"].unique())

# Company representation
print("🏢 Nombre d’entreprises présentes :", df_sample["company"].nunique())

# Sentiment distribution
df_sample = df_sample.rename(columns={"sentiment_gpt_batch": "sent_label"})  # pour compatibilité avec ancien nom
print("\n🎯 Répartition des sentiments dans l'échantillon :")
print(df_sample["sent_label"].value_counts())

# Sentiment percentages
print("\n📈 Pourcentages :")
proportions = (df_sample["sent_label"].value_counts(normalize=True) * 100).round(2).astype(str) + " %"
print(proportions)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
⚠️ Ajout de 20 phrases pour pilier social
⚠️ Ajout de 35 phrases pour pilier governance

✅ Échantillon final exporté ici : /content/drive/MyDrive/Thèse Master/Exports2/df_gold_standard_final_150.xlsx

📌 Nb total de phrases : 150
🔎 Nb de phrases E (is_E) : 77
🔎 Nb de phrases S (is_S) : 38
🔎 Nb de phrases G (is_G) : 35
📁 Types de documents couverts : 6
📁 Documents présents : ['Integrated Report' 'Sustainability Report' 'Annual Report'
 'Earnings Call Transcript' 'Half-Year Report' 'Governance Report']
🏢 Nombre d’entreprises présentes : 3

🎯 Répartition des sentiments dans l'échantillon :
sent_label
positive    56
neutral     55
negative    39
Name: count, dtype: int64

📈 Pourcentages :
sent_label
positive    37.33 %
neutral     36.67 %
negative     26.0 %
Name: proportion, dtype: object
