# Imports & Data

In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import os

In [2]:
# Load preprocessed data

sample = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/sample_300_full.csv")
labels = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/sample_llm_prelabeled.csv")

print(f"Loaded {len(labels)} rows")
print(f"Loaded {len(sample)} rows")

sample.columns

Loaded 300 rows
Loaded 300 rows


Index(['id', 'newsletter_number', 'issue_date', 'new_theme', 'text', 'domain',
       'organisation', 'org_group', 'year_quarter'],
      dtype='object')

In [3]:
#merge
labels = labels.rename(columns={"doc_id": "id"})

In [5]:
df = pd.merge(sample, labels, on=["id", "text"], how="inner")
print(f"✅ Merged dataset shape: {df.shape}")
print("Columns:", df.columns.tolist())

✅ Merged dataset shape: (300, 12)
Columns: ['id', 'newsletter_number', 'issue_date', 'new_theme', 'text', 'domain', 'organisation', 'org_group', 'year_quarter', 'llm_label', 'llm_confidence', 'llm_rationale']


In [7]:
keep_cols = [
    "id",
    "text",
    "new_theme",
    "organisation",
    "org_group",
    "year_quarter",
    "llm_label",
    "llm_confidence"
]

df = df[keep_cols].copy()

print(f"✅ Columns retained: {keep_cols}")

✅ Columns retained: ['id', 'text', 'new_theme', 'organisation', 'org_group', 'year_quarter', 'llm_label', 'llm_confidence']


# Light Preprocessing

In [8]:
df['text_for_vader'] = df['text'].fillna('')  # Ensure no NaNs

In [9]:
#remove URLs 
df['text_for_vader'] = df['text_for_vader'].str.replace(
    r'http\S+|www\S+', '', regex=True
).str.strip()

In [10]:
# Check text lengths (VADER works better on sentences/paragraphs)
print(df['text_for_vader'].str.split().str.len().describe())

count    300.000000
mean      42.133333
std       21.375029
min        6.000000
25%       26.000000
50%       37.000000
75%       52.000000
max      125.000000
Name: text_for_vader, dtype: float64


# Train-Test-Validation Split 

In [12]:
# -Train/Test/Validation Split (70/15/15) ---
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42, stratify=df["llm_label"])
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42, stratify=temp_df["llm_label"])

print(f" Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

 Train: 210 | Val: 45 | Test: 45


In [13]:
# Save splits 
train_df.to_csv("/workspaces/ERP_Newsletter/data_processed/train.csv", index=False)
val_df.to_csv("/workspaces/ERP_Newsletter/data_processed/val.csv", index=False)
test_df.to_csv("/workspaces/ERP_Newsletter/data_processed/test.csv", index=False)

print(" Saved train/val/test splits.")

 Saved train/val/test splits.


# VADER MODEL 

In [14]:
#VADER Model 

analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(df, text_col="text_for_vader"):
    """Apply VADER sentiment and return dataframe with scores and labels"""
    def get_scores(text):
        scores = analyzer.polarity_scores(str(text))
        return pd.Series({
            "vader_neg": scores["neg"],
            "vader_neu": scores["neu"],
            "vader_pos": scores["pos"],
            "vader_compound": scores["compound"]
        })
    
    vader_scores = df[text_col].apply(get_scores)
    df = pd.concat([df.reset_index(drop=True), vader_scores], axis=1)

    # Map compound scores → sentiment label
    def map_label(c):
        if c >= 0.05:
            return "positive"
        elif c <= -0.05:
            return "critical"   # use 'critical' instead of 'negative'
        else:
            return "neutral"
    
    df["vader_label"] = df["vader_compound"].apply(map_label)
    return df

In [16]:
# Apply to each split 
for name, split in {"train": train_df, "val": val_df, "test": test_df}.items():
    scored = get_vader_sentiment(split, text_col="text_for_vader")
    path = f"/workspaces/ERP_Newsletter/data_processed/{name}_with_vader.csv"
    scored.to_csv(path, index=False)
    print(f"💾 Saved {name} split with VADER → {path}")

💾 Saved train split with VADER → /workspaces/ERP_Newsletter/data_processed/train_with_vader.csv
💾 Saved val split with VADER → /workspaces/ERP_Newsletter/data_processed/val_with_vader.csv
💾 Saved test split with VADER → /workspaces/ERP_Newsletter/data_processed/test_with_vader.csv


# Evaluate Performance 

In [17]:
# Reload the test set (already scored by VADER)
test = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/test_with_vader.csv")


In [18]:
# Use your LLM labels as the comparison target
y_true = test["llm_label"].astype(str)
y_pred = test["vader_label"].astype(str)

In [19]:
# Define class order
labels_order = ["positive", "neutral", "critical"]


In [20]:
# Confusion matrix
print("\n=== Confusion Matrix (rows=True, cols=Pred) ===")
print(pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=labels_order),
    index=[f"True_{c}" for c in labels_order],
    columns=[f"Pred_{c}" for c in labels_order]
))


=== Confusion Matrix (rows=True, cols=Pred) ===
               Pred_positive  Pred_neutral  Pred_critical
True_positive              1             5              0
True_neutral               1            30              1
True_critical              2             5              0


In [21]:
# Classification metrics
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, labels=labels_order, digits=3))

print("\n=== Summary Metrics ===")
print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")
print(f"Macro F1: {f1_score(y_true, y_pred, average='macro'):.3f}")
print(f"Weighted F1: {f1_score(y_true, y_pred, average='weighted'):.3f}")


=== Classification Report ===
              precision    recall  f1-score   support

    positive      0.045     0.167     0.071         6
     neutral      0.600     0.938     0.732        32
    critical      0.000     0.000     0.000         7

   micro avg      0.373     0.689     0.484        45
   macro avg      0.215     0.368     0.268        45
weighted avg      0.433     0.689     0.530        45


=== Summary Metrics ===
Accuracy: 0.373
Macro F1: 0.201
Weighted F1: 0.287


# Inspect Errors

In [22]:
# Where VADER and LLM disagree
errors = test[test["llm_label"] != test["vader_label"]][
    ["id", "text", "llm_label", "vader_label", "vader_compound"]
]
print(f"\n❌ {len(errors)} disagreements found")
print(errors.sample(10))



❌ 52 disagreements found
                                      id  \
60                                   NaN   
82                                   NaN   
79                                   NaN   
6   b7c41fb0-5a01-4b91-a750-fdbe01ddc64c   
77                                   NaN   
50                                   NaN   
58                                   NaN   
41  7f84506f-4ce9-4c47-8955-4adb3e464e97   
42  2f4d0e26-5f5c-4883-bd7f-0951a5e9fe0d   
81                                   NaN   

                                                 text llm_label vader_label  \
60                                                NaN       NaN    positive   
82                                                NaN       NaN    critical   
79                                                NaN       NaN    critical   
6   DfE NI - Launches New Integrated Education Str...  positive     neutral   
77                                                NaN       NaN    positive   
50             

# Quick Summaries

In [23]:
summary_by_theme = test.groupby(["new_theme", "vader_label"]).size().unstack(fill_value=0)
summary_by_theme["Total"] = summary_by_theme.sum(axis=1)
summary_by_theme["% Positive"] = (summary_by_theme["positive"] / summary_by_theme["Total"] * 100).round(1)
summary_by_theme["% Critical"] = (summary_by_theme["critical"] / summary_by_theme["Total"] * 100).round(1)
summary_by_theme["% Neutral"] = (summary_by_theme["neutral"] / summary_by_theme["Total"] * 100).round(1)

print("\n📊 Sentiment by Theme:")
print(summary_by_theme.sort_values("Total", ascending=False).head(10))



📊 Sentiment by Theme:
vader_label                          critical  neutral  positive  Total  \
new_theme                                                                 
political_context_and_organisations         1       16         0     17   
teacher_rrd                                 0       12         3     15   
digital_ed                                  0       12         1     13   

vader_label                          % Positive  % Critical  % Neutral  
new_theme                                                               
political_context_and_organisations         0.0         5.9       94.1  
teacher_rrd                                20.0         0.0       80.0  
digital_ed                                  7.7         0.0       92.3  
