# Imports & Data

In [4]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import os

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.metrics import f1_score

In [5]:
# Read the two preprocessed CSVs: the sampled items and their LLM pre-labels
sample = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/sample_300_full.csv")
labels = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/sample_llm_prelabeled.csv")

# Row counts are reported for the labels file first, then the sample file
for loaded_frame in (labels, sample):
    print(f"Loaded {len(loaded_frame)} rows")

# Last expression of the cell: show the sample's column names
sample.columns

Loaded 300 rows
Loaded 300 rows


Index(['id', 'newsletter_number', 'issue_date', 'new_theme', 'text', 'domain',
       'organisation', 'org_group', 'year_quarter'],
      dtype='object')

In [6]:
# Align the key column name with `sample` ("doc_id" -> "id") ahead of the merge
labels = labels.rename({"doc_id": "id"}, axis="columns")

In [8]:
# Join each sampled item to its LLM label, matching on both id and text.
# validate="one_to_one" makes pandas raise MergeError when either side has
# duplicate (id, text) keys, instead of silently multiplying rows.
df = pd.merge(sample, labels, on=["id", "text"], how="inner", validate="one_to_one")

# An inner join silently drops rows whose key is missing on the other side;
# surface that so the evaluation set size is never reduced unnoticed.
unmatched_rows = len(sample) - len(df)
if unmatched_rows:
    print(f"⚠️ {unmatched_rows} sample rows had no matching label row and were dropped")

print(f"✅ Merged dataset shape: {df.shape}")
print("Columns:", df.columns.tolist())

✅ Merged dataset shape: (300, 12)
Columns: ['id', 'newsletter_number', 'issue_date', 'new_theme', 'text', 'domain', 'organisation', 'org_group', 'year_quarter', 'llm_label', 'llm_confidence', 'llm_rationale']


In [9]:
# Columns carried forward: identifier and text, grouping metadata, LLM label fields
keep_cols = [
    "id", "text",
    "new_theme", "organisation", "org_group", "year_quarter",
    "llm_label", "llm_confidence",
]

# Take an independent copy so later column assignments do not touch the merged frame
df = df.loc[:, keep_cols].copy()

print(f"✅ Columns retained: {keep_cols}")

✅ Columns retained: ['id', 'text', 'new_theme', 'organisation', 'org_group', 'year_quarter', 'llm_label', 'llm_confidence']


# Light Preprocessing

In [10]:
# Working copy of the text for VADER; missing values become empty strings
df["text_for_vader"] = df["text"].fillna(value="")

In [11]:
# Drop web links: any run of non-whitespace starting with "http" or "www",
# then trim whitespace left at either end of the text
url_pattern = r'http\S+|www\S+'
text_without_urls = df['text_for_vader'].str.replace(url_pattern, '', regex=True)
df['text_for_vader'] = text_without_urls.str.strip()

In [12]:
# Summarise whitespace-delimited word counts per text
# (VADER works better on sentences/paragraphs)
words_per_text = df['text_for_vader'].str.split().str.len()
print(words_per_text.describe())

count    300.000000
mean      42.133333
std       21.375029
min        6.000000
25%       26.000000
50%       37.000000
75%       52.000000
max      125.000000
Name: text_for_vader, dtype: float64


# VADER MODEL 

In [13]:
# VADER model: one module-level analyzer instance, created once here and
# read by get_vader_sentiment() when it scores each text.
analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(df, text_col="text_for_vader", sentiment_analyzer=None,
                        pos_threshold=0.05, neg_threshold=-0.05):
    """Score a text column with VADER and return a new, labelled DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    text_col : str, default "text_for_vader"
        Name of the column holding the text to score. Each value is passed
        through ``str()`` before scoring.
    sentiment_analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with the keys ``"neg"``, ``"neu"``, ``"pos"`` and ``"compound"``.
        When omitted, the module-level ``analyzer`` is used.
    pos_threshold : float, default 0.05
        A compound score greater than or equal to this is labelled "positive".
    neg_threshold : float, default -0.05
        A compound score less than or equal to this is labelled "critical".

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the columns ``vader_neg``, ``vader_neu``,
        ``vader_pos``, ``vader_compound`` and ``vader_label`` ("positive",
        "critical" or "neutral"). If those columns already exist they are
        overwritten, so the function can be re-run on its own output.

    Raises
    ------
    KeyError
        If ``text_col`` is not a column of ``df``.
    """
    # Resolve the scorer lazily so the global is only needed when no analyzer is injected.
    scorer = analyzer if sentiment_analyzer is None else sentiment_analyzer

    # One dict per row; much cheaper than building a pandas Series for every row.
    row_scores = [scorer.polarity_scores(str(text)) for text in df[text_col]]

    scored = df.copy()
    for key in ("neg", "neu", "pos", "compound"):
        # Assign positionally (plain array) so an unusual or duplicated index
        # cannot misalign rows, and an empty frame still gets a float column.
        scored[f"vader_{key}"] = pd.Series(
            [scores[key] for scores in row_scores], dtype="float64"
        ).to_numpy()

    def _label(compound):
        """Map a compound score to its sentiment label."""
        if compound >= pos_threshold:
            return "positive"
        if compound <= neg_threshold:
            return "critical"
        return "neutral"

    scored["vader_label"] = pd.Series(
        [_label(value) for value in scored["vader_compound"]], dtype=object
    ).to_numpy()
    return scored

# Score every row of the working frame
print(f"\n🔄 Applying VADER to full dataset (n={len(df)})...")
df_with_vader = get_vader_sentiment(df, text_col="text_for_vader")

# Persist the scored frame to CSV, leaving the index out of the file
output_path = "/workspaces/ERP_Newsletter/data_processed/full_dataset_with_vader.csv"
df_with_vader.to_csv(output_path, index=False)
print(f"✅ Saved complete dataset with VADER scores → {output_path}")

# Preview the first ten rows with the LLM label next to VADER's score and label
preview_cols = ["id", "text", "llm_label", "vader_compound", "vader_label"]
print("\n📊 Sample VADER Results:")
print(df_with_vader[preview_cols].head(10))

# Label counts for both labelling sources, VADER first
for heading, label_col in (
    ("\n📈 VADER Label Distribution:", "vader_label"),
    ("\n📈 LLM Label Distribution:", "llm_label"),
):
    print(heading)
    print(df_with_vader[label_col].value_counts())


🔄 Applying VADER to full dataset (n=300)...
✅ Saved complete dataset with VADER scores → /workspaces/ERP_Newsletter/data_processed/full_dataset_with_vader.csv

📊 Sample VADER Results:
                                     id  \
0  1b109222-dc42-4d14-92fc-c60320f919f7   
1  7991db15-4956-4f31-901e-a5957934fafa   
2  914f2011-717a-400c-9e22-a4463f2b3f07   
3  e5fcb0b3-f40f-435f-9c52-e357ce815ff0   
4  a0a05c90-af1c-4671-8fdc-bcaab4aac7fc   
5  4d566bdd-bb9c-4e83-beda-13171ae20bc0   
6  02a26253-a12d-457a-bf1c-7ac8c734b751   
7  d0bd333a-f888-442c-9bf7-41c64e22d345   
8  58ead779-ad80-492b-844e-00bfd566990e   
9  be4fe8a5-4b5f-41d6-9de2-4e8df14f5b47   

                                                text llm_label  \
0  FCDO - Foreign Secretary to call for internati...   neutral   
1  ChatGPT isn't the death of homework – just an ...  positive   
2  DfE - Generative AI in education: educator and...   neutral   
3  Ofsted Statement - How Ofsted looks at AI duri...   neutral   
4  DfE Cons

# Evaluate Performance 

# Diagnostics 

# Visualisations

### Class distribution

### Precision-Recall by Class 