In [None]:
# ===============================
# Stage 3 — Output Table & Data Visualisation
# Reads the per-company JSON result files produced by Stage 2
# ===============================

from pathlib import Path
import json, pandas as pd

# Root folder that contains one subfolder per company (e.g. "001. Walmart").
# The source PDF files were uploaded to Google Drive, one folder per company.
COMPANIES_ROOT = Path("<<Google Drive Link>>")

# Leading metadata columns of the wide table, in output order.
META_COLUMNS = ["filename", "SIC division", "year"]

# Per-concept count fields copied from each Stage 2 result, in output order.
COUNT_FIELDS = ("n_qual", "n_qgen", "n_qspec")


def _as_dict(value):
    """Return `value` when it is a dict, otherwise an empty dict.

    Lets lookups such as `doc["coverage"][pillar][concept]` degrade to
    "nothing found" instead of raising when a level is missing, null, or of an
    unexpected type (list, string, ...).
    """
    return value if isinstance(value, dict) else {}


def _as_count(value):
    """Coerce a count to int; missing, null, or malformed values become 0."""
    try:
        return int(value or 0)
    except (TypeError, ValueError, OverflowError):
        return 0


def load_stage2_results(paths):
    """Read and parse each JSON file exactly once.

    Parameters
    ----------
    paths : iterable of pathlib.Path
        Files to load.

    Returns
    -------
    (documents, skipped)
        `documents` is the list of parsed top-level JSON objects (dicts), in
        input order. `skipped` is a list of `(path, reason)` tuples for files
        that could not be read or decoded, or whose top-level value is not a
        JSON object.
    """
    documents, skipped = [], []
    for path in paths:
        try:
            doc = json.loads(path.read_text(encoding="utf-8"))
        except Exception as exc:
            # Best effort: one bad file must not abort the whole table,
            # but it is recorded so the caller can report it.
            skipped.append((path, f"{type(exc).__name__}: {exc}"))
            continue
        if not isinstance(doc, dict):
            skipped.append((path, "top-level JSON value is not an object"))
            continue
        documents.append(doc)
    return documents, skipped


def discover_combos(documents):
    """Return the sorted list of (pillar, concept) pairs found under "coverage"."""
    found = set()
    for doc in documents:
        for pillar, concepts in _as_dict(doc.get("coverage")).items():
            if isinstance(concepts, dict):
                found.update((pillar, concept) for concept in concepts)
    return sorted(found)  # stable order across runs


def build_row(doc, combos):
    """Build one wide-table row (a dict) for a parsed Stage 2 document.

    Every (pillar, concept) pair in `combos` contributes one column per entry
    of COUNT_FIELDS, named "pillar | concept | field". Counts that are absent
    from `doc` are 0, which keeps the schema identical for every document
    (e.g. when a Stage 2 variant only emitted "count"/"examples").
    """
    meta = _as_dict(doc.get("meta"))
    coverage = _as_dict(doc.get("coverage"))

    row = {
        "filename": meta.get("filename"),
        "SIC division": _as_dict(doc.get("sic_division")).get("name"),
        "year": meta.get("doc_year"),
    }
    for pillar, concept in combos:
        stats = _as_dict(_as_dict(coverage.get(pillar)).get(concept))
        for field in COUNT_FIELDS:
            row[f"{pillar} | {concept} | {field}"] = _as_count(stats.get(field))
    return row


# Locate all Stage 2 result files within the esg_stage2 folder of each company
files = sorted(COMPANIES_ROOT.glob("*/esg_stage2/esg_stage2_*.json"))

# Parse every file once; both steps below work from the same parsed documents.
documents, skipped = load_stage2_results(files)
if skipped:
    print(f"Skipped {len(skipped)} unreadable Stage 2 file(s):")
    for skipped_path, reason in skipped:
        print(f"  {skipped_path}: {reason}")

# 1) Discover all pillar×concept combos from the results
combos = discover_combos(documents)

# 2) Build rows (wide)
rows = [build_row(doc, combos) for doc in documents]

# Passing the columns explicitly keeps the schema (including "filename")
# even when no file could be loaded, so later cells do not hit a KeyError.
wide_columns = META_COLUMNS + [
    f"{pillar} | {concept} | {field}"
    for pillar, concept in combos
    for field in COUNT_FIELDS
]
df_wide = pd.DataFrame(rows, columns=wide_columns)

In [None]:
# Short labels for the pillar segment of "pillar | concept | metric" column names.
PILLAR_ABBREVIATIONS = {
    "environmental": "ENV",
    "governance": "GOV",
    "social": "SOC",
}


def company_from_filename(filename):
    """Return the text before the first "_" in `filename`.

    Returns None when `filename` is not a string (e.g. missing metadata), so a
    row without a filename gets an empty company instead of raising.
    """
    if not isinstance(filename, str):
        return None
    return filename.split("_")[0]


def abbreviate_pillar(column):
    """Abbreviate the pillar segment of a "pillar | concept | metric" column name.

    Only the text before the first " | " is rewritten; the concept and metric
    segments are returned untouched, so a concept such as "social values" is
    not turned into "SOC values". Column names without a " | " separator
    (metadata columns) are returned unchanged.
    """
    pillar, separator, remainder = column.partition(" | ")
    if not separator:
        return column
    for full_name, short_name in PILLAR_ABBREVIATIONS.items():
        if full_name in pillar:
            pillar = pillar.replace(full_name, short_name)
            break
    return f"{pillar}{separator}{remainder}"


# Derive the new frame from df_wide without modifying it: drop 'filename' and
# shorten the pillar prefix of every pillar×concept column.
df_modified = df_wide.drop(columns=["filename"]).rename(columns=abbreviate_pillar)

# Add 'company' (taken from the filename) as the first column
df_modified.insert(0, "company", df_wide["filename"].map(company_from_filename))

# Display the modified DataFrame
display(df_modified.head())

# Save to CSV
df_modified.to_csv("esg_stage3_wide.csv", index=False)

Unnamed: 0,company,SIC division,year,ENV | Climate Change Vulnerability | n_qual,ENV | Climate Change Vulnerability | n_qgen,ENV | Climate Change Vulnerability | n_qspec,ENV | Financing Environmental Impact | n_qual,ENV | Financing Environmental Impact | n_qgen,ENV | Financing Environmental Impact | n_qspec,ENV | Opportunity in Clean Tech | n_qual,...,SOC | SOC misconduct | n_qspec,SOC | SOC responsibility | n_qual,SOC | SOC responsibility | n_qgen,SOC | SOC responsibility | n_qspec,SOC | SOC values | n_qual,SOC | SOC values | n_qgen,SOC | SOC values | n_qspec,SOC | well-being | n_qual,SOC | well-being | n_qgen,SOC | well-being | n_qspec
0,Apple,Manufacturing,2020.0,7,1,7,15,0,5,42,...,0,1,0,0,0,0,0,0,0,0
1,Apple,Manufacturing,2021.0,3,0,2,3,4,5,11,...,0,12,4,4,2,0,0,0,0,1
2,Apple,Manufacturing,2022.0,11,5,9,23,3,9,54,...,0,4,0,1,1,0,0,0,0,0
3,Apple,Manufacturing,2023.0,14,0,3,24,8,8,51,...,0,4,0,0,0,0,0,0,0,0
4,Apple,Manufacturing,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
