<a href="https://colab.research.google.com/github/alexandrastna/AI-for-ESG/blob/main/Notebooks/10_Thesis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Thesis 10: Comparing SASB and Refinitiv ESG Weights

In this notebook, we test the correlation and differences between **ESG weights** derived from **SASB** (Sustainability Accounting Standards Board) and those from **Refinitiv** for SMI companies.

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be opened through that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 📦 Required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
import seaborn as sns

# 📁 Location of the Excel workbook on Google Drive (adjust the path if needed)
file_path = "/content/drive/MyDrive/Thèse Master/Data/SMI companies.xlsx"

# Reporting year of the Refinitiv weights that are compared against SASB
REFINITIV_YEAR = 2021

# 🔹 Read the SASB and Refinitiv ESG sheets
sasb_df = pd.read_excel(file_path, sheet_name="SASB")
refinitiv_df = pd.read_excel(file_path, sheet_name="Refinitiv ESG")

# 🧼 Harmonise column names: both frames get a "Company" key and their pillar
# weights are exposed as <source>_<pillar>
sasb_df = sasb_df.rename(columns={
    "company": "Company", "Hybrid E": "SASB_E", "Hybrid S": "SASB_S", "Hybrid G": "SASB_G"
})

refinitiv_df = refinitiv_df.rename(columns={
    "company": "Company", "E Weight": "Refinitiv_E", "S Weight": "Refinitiv_S", "G Weight": "Refinitiv_G"
})

# 🎯 Keep only the Refinitiv rows for the selected year
refinitiv_2021 = refinitiv_df[refinitiv_df["year"] == REFINITIV_YEAR]

# 🔀 Inner join: only companies present in both sources are kept
df = pd.merge(
    sasb_df[["Company", "SASB_E", "SASB_S", "SASB_G"]],
    refinitiv_2021[["Company", "Refinitiv_E", "Refinitiv_S", "Refinitiv_G"]],
    on="Company", how="inner"
)

# 🧮 Per-pillar differences (Refinitiv minus SASB)
df["E_diff"] = df["Refinitiv_E"] - df["SASB_E"]
df["S_diff"] = df["Refinitiv_S"] - df["SASB_S"]
df["G_diff"] = df["Refinitiv_G"] - df["SASB_G"]

# 📏 Euclidean distance between the SASB and Refinitiv weight vectors.
# This is the L2 norm of the three difference columns, so it is computed in a
# single vectorised call rather than row by row. np.linalg.norm propagates NaN
# (a missing weight gives a missing distance) and also works when the merge
# result is empty.
df["euclidean_distance"] = np.linalg.norm(
    df[["E_diff", "S_diff", "G_diff"]].to_numpy(dtype=float),
    axis=1,
)

# 🔍 Hellinger distance (more suitable for normalized proportions)
def hellinger_distance(p, q):
    """Return the Hellinger distance between two discrete weight vectors.

    The value is ``||sqrt(p) - sqrt(q)||_2 / sqrt(2)``. When both inputs are
    non-negative and each sums to 1, the result lies in ``[0, 1]``. The inputs
    are not re-normalised here.

    Parameters
    ----------
    p, q : array-like of float
        Non-negative weights with identical shapes.

    Returns
    -------
    numpy.float64
        The distance; NaN if either input contains NaN.

    Raises
    ------
    ValueError
        If the shapes differ (NumPy would otherwise broadcast silently) or if
        any weight is negative (the square root would silently become NaN).
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)

    if p.shape != q.shape:
        raise ValueError(
            f"p and q must have the same shape, got {p.shape} and {q.shape}"
        )
    # Comparisons with NaN are False, so missing values pass through and
    # propagate as NaN instead of raising.
    if (p < 0).any() or (q < 0).any():
        raise ValueError("p and q must not contain negative weights")

    return np.linalg.norm(np.sqrt(p) - np.sqrt(q)) / np.sqrt(2)

def _hellinger_for_row(record):
    """Hellinger distance between one row's SASB and Refinitiv pillar weights."""
    sasb_weights = [record["SASB_E"], record["SASB_S"], record["SASB_G"]]
    refinitiv_weights = [record["Refinitiv_E"], record["Refinitiv_S"], record["Refinitiv_G"]]
    return hellinger_distance(sasb_weights, refinitiv_weights)


# Score every row of the merged frame with the helper above
df["hellinger_distance"] = df.apply(_hellinger_for_row, axis=1)

# 📊 Mean and standard deviation of each per-pillar difference column
for pillar in ("E_diff", "S_diff", "G_diff"):
    gap = df[pillar]
    mean, std = gap.mean(), gap.std()
    print(f"{pillar} ➤ Mean: {mean:.3f} / Std Dev: {std:.3f}")

# 📈 Radar charts for all companies
import plotly.graph_objects as go

def plot_radar(company, row):
    """Show a radar chart comparing one company's SASB and Refinitiv weights.

    Parameters
    ----------
    company
        Label inserted into the figure title.
    row
        Record indexable by "SASB_E", "SASB_S", "SASB_G", "Refinitiv_E",
        "Refinitiv_S" and "Refinitiv_G".

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    pillars = ["E", "S", "G"]
    weights_by_source = {
        "SASB": [row["SASB_E"], row["SASB_S"], row["SASB_G"]],
        "Refinitiv": [row["Refinitiv_E"], row["Refinitiv_S"], row["Refinitiv_G"]],
    }

    fig = go.Figure()
    for source, weights in weights_by_source.items():
        fig.add_trace(
            go.Scatterpolar(
                # Repeat the first point so the polygon is closed
                r=weights + weights[:1],
                theta=pillars + pillars[:1],
                fill="toself",
                name=source,
            )
        )

    fig.update_layout(
        title=f"Radar Chart – {company}",
        polar={"radialaxis": {"visible": True, "range": [0, 1]}},
        showlegend=True,
    )
    fig.show()

# 🔁 Display a radar chart for each company (or a selection)
for _, record in df.iterrows():
    plot_radar(record["Company"], record)

# 🔝 Number of companies listed in each ranking below
top_n = 5

# Largest gaps according to the Euclidean distance (descending order)
print(f"\n🏁 Top {top_n} companies with largest gaps (Euclidean distance):")
df_sorted_euclidean = df.sort_values(by="euclidean_distance", ascending=False)
euclidean_top = df_sorted_euclidean.head(top_n)
print(euclidean_top[["Company", "euclidean_distance"]])

# Largest gaps according to the Hellinger distance (descending order)
print(f"\n🏁 Top {top_n} companies with largest gaps (Hellinger distance):")
df_sorted_hellinger = df.sort_values(by="hellinger_distance", ascending=False)
hellinger_top = df_sorted_hellinger.head(top_n)
print(hellinger_top[["Company", "hellinger_distance"]])

# 🟢 Smallest gaps: the end of the descending Hellinger ranking, re-sorted
# so the closest match comes first
print(f"\n✅ Top {top_n} companies with smallest gaps (Hellinger distance):")
hellinger_bottom = df_sorted_hellinger.tail(top_n)[["Company", "hellinger_distance"]]
print(hellinger_bottom.sort_values(by="hellinger_distance"))


E_diff ➤ Mean: -0.094 / Std Dev: 0.195
S_diff ➤ Mean: 0.022 / Std Dev: 0.170
G_diff ➤ Mean: 0.072 / Std Dev: 0.056



🏁 Top 5 companies with largest gaps (Euclidean distance):
                     Company  euclidean_distance
8               Swiss Re Ltd            0.391124
5  Zurich Insurance Group AG            0.391124
9             Lonza Group AG            0.334715
7                 Holcim Ltd            0.282797
0                    ABB Ltd            0.282108

🏁 Top 5 companies with largest gaps (Hellinger distance):
                     Company  hellinger_distance
8               Swiss Re Ltd            0.255598
5  Zurich Insurance Group AG            0.255598
9             Lonza Group AG            0.206660
6               UBS Group AG            0.165659
7                 Holcim Ltd            0.164649

✅ Top 5 companies with smallest gaps (Hellinger distance):
                          Company  hellinger_distance
4  Compagnie Financière Richemont            0.085113
1                       Nestlé SA            0.120712
2                Roche Holding AG            0.139366
3                 