# Analysis: Scoring Real Annual Reports

## Overview
This notebook runs the complete Greenwashing Detection pipeline on real-world Annual Reports. 
It automatically finds all PDF files in the `inputs/` folder, extracts their text, and scores them using a hybrid approach:

1.  **Vagueness (VUI):** Uses rule-based NLP to count "hedging" words (e.g., "aim," "might," "intend").
2.  **Specificity (SPI):** Uses our **Fine-Tuned RoBERTa model** to detect concrete, quantitative claims.
3.  **Greenwashing Risk (GW):** Combines these two signals into a single risk score (0 to 1): higher vagueness and lower specificity both raise the score.


## Import & Setup

In [None]:
import sys
import os
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import pipeline
import spacy
from tqdm.notebook import tqdm  # progress bar

# Add the parent directory to the Python path so the local 'src' package can be imported
sys.path.append(os.path.abspath(".."))

# Import logic from thesis code
from src.parsing import extract_pages, split_sentences
from src.sectioning import section_by_headings, collect_section_sentences
from src.vui import compute_vui
from src.spi import compute_spi_rule
from src.gw import aggregate_gw

# Load the project configuration (keywords, rules, etc.).
# The file is read explicitly as UTF-8 so it is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("../config.yml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

print("Libraries and Configuration was loaded successfully.")

# Make sure the spaCy English model is available for sentence splitting.
# spacy.load raises OSError when the model package is missing; in that case
# download it once and load it again.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading Spacy model (en_core_web_sm)...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

## Load Fine-Tuned Model

In [None]:
# Checkpoint directory produced by 2_FineTuning_RoBERTa.ipynb
MODEL_PATH = "../models/gw_finetuned"

# Select device index 0 when either CUDA or Apple Metal (MPS) reports as
# available; otherwise fall back to -1 (CPU).
if torch.cuda.is_available() or torch.backends.mps.is_available():
    device = 0
else:
    device = -1
print(f"   Running on device index: {device} (0/-1)")

# Build the text-classification pipeline from the fine-tuned checkpoint.
# Truncation to 512 tokens is enforced so overly long sentences cannot crash
# the model.
classifier = pipeline(
    "text-classification",
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    device=device,
    truncation=True,
    max_length=512,
)

## Find Data

In [None]:
input_dir = "../inputs"

# Gather every PDF in the inputs folder (extension matched case-insensitively),
# sorted alphabetically so the processing order is consistent between runs
pdf_files = sorted(
    entry for entry in os.listdir(input_dir) if entry.lower().endswith(".pdf")
)

print(f" Found {len(pdf_files)} reports to analyze:")
for report_name in pdf_files:
    print(f"   - {report_name}")

## Main Processing Loop

In [None]:
# Final scores for every report: one dict per successfully scored PDF
all_report_scores = []

# Year assigned when the filename does not end in "-<year>"
DEFAULT_YEAR = 2023

# Specificity blend (weights from the current bachelor thesis config: 60% Rules, 40% AI)
SPI_RULE_WEIGHT = 0.6
SPI_AI_WEIGHT = 0.4

# Greenwashing risk blend: vagueness share vs. (1 - specificity) share
GW_VUI_WEIGHT = 0.56
GW_SPI_WEIGHT = 0.44


def _split_en(txt):
    """Split a text into sentences with the English spaCy model name used by this notebook."""
    return split_sentences(txt, "en_core_web_sm")


print(f"Starting analysis of {len(pdf_files)} reports...")

# Loop through every PDF file we have
for pdf_file in tqdm(pdf_files, desc="Processing Reports"):

    # --- Step 1: Extract Metadata (Issuer & Year) from Filename ---
    # Expected pattern: "<issuer>-...-<year>.pdf" (e.g., dws-...-2023.pdf).
    # splitext removes only the trailing extension and works for ".PDF" too,
    # matching the case-insensitive file discovery.
    filename_clean = os.path.splitext(pdf_file)[0]
    try:
        parts = filename_clean.split("-")
        year = int(parts[-1])
        # The first part is the company name (e.g., dws)
        issuer = parts[0].capitalize()
    except ValueError:
        # The last dash-separated part is not a number: use the whole name as
        # the issuer and fall back to the default year.
        print(f"Could not parse filename '{pdf_file}'. Using defaults.")
        year = DEFAULT_YEAR
        issuer = filename_clean

    # --- Step 2: Parse PDF Text ---
    pdf_path = os.path.join(input_dir, pdf_file)
    try:
        # extract_pages comes from src/parsing.py
        pages = extract_pages(pdf_path)
    except Exception as e:
        # Best effort: report the unreadable file and move on to the next one
        print(f"Error reading {pdf_file}: {e}")
        continue

    # --- Step 3: Sectioning ---
    # Only the target sections listed in config.yml are analyzed, so content
    # such as legal footers or the table of contents is left out.
    buckets = section_by_headings(pages, cfg)
    target_sections = cfg["sectioning"]["target_sections"]

    # Collect all sentences from the relevant sections
    all_sentences = []
    for sec in target_sections:
        sec_pages = buckets.get(sec, [])
        if sec_pages:
            # Split the section text into individual sentences
            all_sentences.extend(collect_section_sentences(sec_pages, _split_en))

    if not all_sentences:
        print(f"{pdf_file}: No text found in target sections (Strategy, Risk, etc). Skipping.")
        continue

    # --- Step 4: Scoring ---

    # A. Vagueness (VUI) - Rule Based
    # NOTE(review): described as the density of vague words per 1000 words —
    # confirm against src/vui.py
    vui_res = compute_vui(all_sentences, cfg)
    vui_score = vui_res["vui_norm"]

    # B. Specificity (SPI) - Hybrid Approach
    # Part 1: Rules (Does it have numbers? Dates?, ... )
    spi_res = compute_spi_rule(all_sentences, cfg)

    # Part 2: AI (Asks the model: "Is this specific?")
    # Only the raw text of each sentence object is passed to the classifier
    texts = [s["text"] for s in all_sentences]

    # Run the classifier in batches (faster than one by one)
    preds = classifier(texts, batch_size=16)

    # Share of sentences labeled "Specific" (LABEL_1).
    # all_sentences is non-empty here (checked above), so the division is safe.
    count_specific = sum(1 for p in preds if p["label"] == "LABEL_1")
    ai_spi_score = count_specific / len(all_sentences)

    # Combine Rule-based and AI-based specificity
    spi_hybrid = (SPI_RULE_WEIGHT * spi_res["spi_rule"]) + (SPI_AI_WEIGHT * ai_spi_score)

    # C. Final Greenwashing Risk Score
    # Formula: Risk increases with Vagueness and decreases with Specificity
    gw_score = (GW_VUI_WEIGHT * vui_score) + (GW_SPI_WEIGHT * (1 - spi_hybrid))

    # Save the data
    all_report_scores.append({
        "Issuer": issuer,
        "Year": year,
        "Filename": pdf_file,
        "VUI_Score": round(vui_score, 3),
        "SPI_Score": round(spi_hybrid, 3),
        "GW_Risk_Score": round(gw_score, 3),
        "Sentence_Count": len(all_sentences)
    })

print(f"\n Processing complete. Analyzed {len(all_report_scores)} reports successfully.")

## Results Table

In [None]:
# Build a Pandas DataFrame from the per-report score dicts.
# The column list is passed explicitly so the table keeps its schema even when
# no report was scored; otherwise the frame has no columns and the sort below
# raises a KeyError.
df_scores = pd.DataFrame(
    all_report_scores,
    columns=[
        "Issuer",
        "Year",
        "Filename",
        "VUI_Score",
        "SPI_Score",
        "GW_Risk_Score",
        "Sentence_Count",
    ],
)

# Save to CSV so it can be used in Notebook 4 (SFDR correlation).
# Create the output folder first so a fresh checkout does not fail on the write.
output_path = "../outputs/gw_scores_all.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_scores.to_csv(output_path, index=False)

print(f"Results saved to: {output_path}")

# Show the table sorted by Risk Score (Highest Risk first)
display(df_scores.sort_values("GW_Risk_Score", ascending=False))

## Visualization 1 - Scores by Issuer

In [None]:
# Visual style for all following plots
sns.set_theme(style="whitegrid")

fig, ax = plt.subplots(figsize=(12, 6))

# Mean GW risk per issuer (averaged in case an issuer has several years),
# ordered from lowest to highest risk
avg_scores = df_scores.groupby("Issuer")["GW_Risk_Score"].mean().sort_values()

# Traffic-light colouring: below 0.4 green, below 0.6 orange, otherwise red
colors = []
for risk in avg_scores:
    if risk < 0.4:
        colors.append('green')
    elif risk < 0.6:
        colors.append('orange')
    else:
        colors.append('red')

# Horizontal bar chart drawn on the axes created above
avg_scores.plot(kind='barh', color=colors, ax=ax)

ax.set_title("Average Greenwashing Risk by Asset Manager (2021-2024)")
ax.set_xlabel("Greenwashing Risk Score")
ax.set_ylabel("Issuer")
# Dashed vertical reference line at a score of 0.5
ax.axvline(x=0.5, color='grey', linestyle='--', alpha=0.5, label="Risk Threshold")
ax.legend()
plt.show()

## Visualization 2 - Time Series

In [None]:
# A trend line needs at least two distinct years in the data
if df_scores['Year'].nunique() <= 1:
    print("Not enough multi-year data to plot trends yet.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))

    # One line per issuer, tracking the risk score across years
    sns.lineplot(
        data=df_scores,
        x="Year",
        y="GW_Risk_Score",
        hue="Issuer",
        marker="o",
        linewidth=2.5,
        ax=ax,
    )

    ax.set_title("Greenwashing Trends Over Time")
    ax.set_ylabel("Greenwashing Risk Score")
    ax.set_xlabel("Year")

    # Put ticks only on the years present (2021, 2022...) rather than
    # fractional positions (2021.5)
    ax.set_xticks(df_scores['Year'].unique())
    ax.grid(True, alpha=0.3)
    # Place the legend outside the plotting area, to the right
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()