# Imports & Data

In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import os

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.metrics import f1_score

In [2]:
# Load preprocessed data: the article sample and the LLM pre-labels for it
sample, labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/workspaces/ERP_Newsletter/data_processed/sample_300_full.csv",
        "/workspaces/ERP_Newsletter/data_processed/sample_llm_prelabeled.csv",
    )
)

# Row counts are printed labels-first, then sample
for loaded_frame in (labels, sample):
    print(f"Loaded {len(loaded_frame)} rows")

sample.columns

Loaded 300 rows
Loaded 300 rows


Index(['id', 'newsletter_number', 'issue_date', 'new_theme', 'text', 'domain',
       'organisation', 'org_group', 'year_quarter'],
      dtype='object')

In [3]:
# Rename the label file's key column so it matches `sample` ("id") for the merge below
labels = labels.rename(columns={"doc_id": "id"})

In [4]:
# Inner join on both keys: a row survives only when id and text agree in the two files
df = sample.merge(labels, how="inner", on=["id", "text"])
print(f"✅ Merged dataset shape: {df.shape}")
print("Columns:", list(df.columns))

✅ Merged dataset shape: (300, 12)
Columns: ['id', 'newsletter_number', 'issue_date', 'new_theme', 'text', 'domain', 'organisation', 'org_group', 'year_quarter', 'llm_label', 'llm_confidence', 'llm_rationale']


In [5]:
# Columns carried into the sentiment analysis; every other merged column is dropped
keep_cols = [
    "id", "text", "new_theme", "organisation",
    "org_group", "year_quarter", "llm_label", "llm_confidence",
]

# .copy() detaches the slimmed frame from the merged one before new columns are added
df = df.loc[:, keep_cols].copy()

print(f"✅ Columns retained: {keep_cols}")

✅ Columns retained: ['id', 'text', 'new_theme', 'organisation', 'org_group', 'year_quarter', 'llm_label', 'llm_confidence']


# Light Preprocessing

In [6]:
# Work on a separate column so the raw `text` stays untouched; missing texts become ''
df['text_for_vader'] = df['text'].fillna('')  # Ensure no NaNs

In [7]:
# Remove URLs: any whitespace-free run starting at "http" or "www", then trim both ends
url_pattern = r'http\S+|www\S+'
text_without_urls = df['text_for_vader'].str.replace(url_pattern, '', regex=True)
df['text_for_vader'] = text_without_urls.str.strip()

In [8]:
# Check text lengths in whitespace-separated words (VADER works better on sentences/paragraphs)
words_per_text = df['text_for_vader'].str.split().str.len()
print(words_per_text.describe())

count    300.000000
mean      42.133333
std       21.375029
min        6.000000
25%       26.000000
50%       37.000000
75%       52.000000
max      125.000000
Name: text_for_vader, dtype: float64


# Train-Test-Validation Split 

In [9]:
# Stratified 70/15/15 split: carve off 30% first, then halve that into validation and test.
# Both calls stratify on llm_label and share one seed.
split_seed = 42
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df["llm_label"], random_state=split_seed
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["llm_label"], random_state=split_seed
)

split_sizes = " | ".join(
    f"{split_name}: {len(part)}"
    for split_name, part in (("Train", train_df), ("Val", val_df), ("Test", test_df))
)
print(f" {split_sizes}")

 Train: 210 | Val: 45 | Test: 45


In [10]:
# Save splits: one CSV per split, written without the index column
for split_name, split_frame in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_frame.to_csv(
        f"/workspaces/ERP_Newsletter/data_processed/{split_name}.csv", index=False
    )

print(" Saved train/val/test splits.")

 Saved train/val/test splits.


# VADER MODEL 

In [11]:
#VADER Model 

analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(df, text_col="text_for_vader",
                        pos_threshold=0.05, neg_threshold=-0.05, sia=None):
    """Score each row's text with VADER and return a labelled copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to score. It is not modified.
    text_col : str
        Column whose values are passed through ``str()`` and scored.
    pos_threshold, neg_threshold : float
        Compound-score cut-offs. ``compound >= pos_threshold`` is labelled
        "positive"; otherwise ``compound <= neg_threshold`` is "critical";
        everything else is "neutral". The defaults reproduce the previously
        hard-coded +/-0.05.
    sia : object, optional
        Anything exposing ``polarity_scores(text)`` that returns a mapping with
        the keys "neg", "neu", "pos" and "compound". Defaults to the
        module-level ``analyzer``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``vader_neg``, ``vader_neu``, ``vader_pos``,
        ``vader_compound`` and ``vader_label`` appended. If those columns
        already exist they are overwritten, so re-scoring a frame is safe.
    """
    scorer = analyzer if sia is None else sia
    raw_scores = [scorer.polarity_scores(str(text)) for text in df[text_col]]

    out = df.copy()
    # Assign by position instead of joining on the index: this keeps working for an
    # empty frame and for a non-unique index, and avoids building a Series per row.
    for key in ("neg", "neu", "pos", "compound"):
        out[f"vader_{key}"] = np.array([score[key] for score in raw_scores], dtype=float)

    compound = out["vader_compound"].to_numpy()
    # Conditions are tried in order, so "positive" wins before "critical" is considered.
    out["vader_label"] = np.select(
        [compound >= pos_threshold, compound <= neg_threshold],
        ["positive", "critical"],
        default="neutral",
    )
    return out

In [12]:
# Apply to each split: score it with VADER and persist the result as <split>_with_vader.csv
output_template = "/workspaces/ERP_Newsletter/data_processed/{}_with_vader.csv"
for name, split in (("train", train_df), ("val", val_df), ("test", test_df)):
    path = output_template.format(name)
    scored = get_vader_sentiment(split, text_col="text_for_vader")
    scored.to_csv(path, index=False)
    print(f"Saved {name} split with VADER → {path}")

Saved train split with VADER → /workspaces/ERP_Newsletter/data_processed/train_with_vader.csv
Saved val split with VADER → /workspaces/ERP_Newsletter/data_processed/val_with_vader.csv
Saved test split with VADER → /workspaces/ERP_Newsletter/data_processed/test_with_vader.csv


# Evaluate Performance 

In [13]:
# Reload the test set (already scored by VADER)
# Read back from the CSV written by the scoring loop, so evaluation uses the persisted data
test = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/test_with_vader.csv")


In [14]:
# The LLM labels are the comparison target, VADER's labels the prediction; both are cast to str
y_true, y_pred = (test[column].astype(str) for column in ("llm_label", "vader_label"))

In [15]:
# Define class order
# Passed as `labels=` below, so it fixes the row/column order of the confusion matrix and report
labels_order = ["positive", "neutral", "critical"]


In [16]:
# Confusion matrix, wrapped in a DataFrame so rows (truth) and columns (prediction) are named
print("\n=== Confusion Matrix (rows=True, cols=Pred) ===")
cm_counts = confusion_matrix(y_true, y_pred, labels=labels_order)
row_names = [f"True_{c}" for c in labels_order]
col_names = [f"Pred_{c}" for c in labels_order]
print(pd.DataFrame(cm_counts, index=row_names, columns=col_names))


=== Confusion Matrix (rows=True, cols=Pred) ===
               Pred_positive  Pred_neutral  Pred_critical
True_positive              5             1              0
True_neutral              15            10              7
True_critical              2             1              4


In [20]:
# Classification metrics
print("\n=== Classification Report ===")
print(classification_report(y_true, y_pred, labels=labels_order, digits=3))

# Pin the F1 averages to the same label set as the report and the confusion matrix.
# Without `labels`, f1_score averages over every distinct value found in y_true/y_pred,
# so a stray value (for example the "nan" string that astype(str) makes out of a missing
# label) would silently become an extra class in the summary only.
print("\n=== Summary Metrics ===")
print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")
print(f"Macro F1: {f1_score(y_true, y_pred, labels=labels_order, average='macro'):.3f}")
print(f"Weighted F1: {f1_score(y_true, y_pred, labels=labels_order, average='weighted'):.3f}")


=== Classification Report ===
              precision    recall  f1-score   support

    positive      0.227     0.833     0.357         6
     neutral      0.833     0.312     0.455        32
    critical      0.364     0.571     0.444         7

    accuracy                          0.422        45
   macro avg      0.475     0.572     0.419        45
weighted avg      0.679     0.422     0.440        45


=== Summary Metrics ===
Accuracy: 0.422
Macro F1: 0.419
Weighted F1: 0.440


# Inspect Errors

In [21]:
# Where VADER and LLM disagree
errors = test[test["llm_label"] != test["vader_label"]][
    ["id", "text", "llm_label", "vader_label", "vader_compound"]
]
print(f"\n❌ {len(errors)} disagreements found")

# DataFrame.sample raises when asked for more rows than exist, so cap the request at the
# number of disagreements; the fixed seed makes a re-run show the same rows.
print(errors.sample(n=min(10, len(errors)), random_state=42))



❌ 26 disagreements found
                                      id  \
18  17f6595b-5405-4414-8e01-897d5686bc96   
23  9407f618-d7fc-4588-bf28-2bb6ba2a3ecc   
22  d733e51e-8a74-4fb4-8392-a511c882d9d7   
28  1d99f921-ac60-46bf-9d84-9de09359ddf1   
35  c22b3758-1b67-4a5b-a74d-2db1a67061f7   
16  2632b516-8632-4e33-ab08-d729028964e1   
4   136858de-8e48-4aac-95cf-a8fb8c005a3b   
6   b7c41fb0-5a01-4b91-a750-fdbe01ddc64c   
34  7298e4d0-00d3-46b6-bdc8-ac551434a3d8   
37  914f2011-717a-400c-9e22-a4463f2b3f07   

                                                 text llm_label vader_label  \
18  Consultation: Curriculum for Wales: continuing...   neutral    positive   
23  IFS - Support for children with disabilities a...   neutral    positive   
22  CAPE - Building a National Agenda for Regional...   neutral    positive   
28  SchoolsWeek - Schools wanted for AI lesson pla...   neutral    positive   
35  The Conversation - Should you give your child ...   neutral    critical   
16  BBC - Attai

# Quick Summaries

In [22]:
# Count VADER labels per theme. unstack() only creates columns for labels that actually
# occur, so reindex to the full set: a class VADER never predicted then shows up as a
# column of zeros instead of raising KeyError in the percentage lines below.
sentiment_classes = ["critical", "neutral", "positive"]
summary_by_theme = (
    test.groupby(["new_theme", "vader_label"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=sentiment_classes, fill_value=0)
)

# Total must be computed before the percentage columns are added
summary_by_theme["Total"] = summary_by_theme.sum(axis=1)
summary_by_theme["% Positive"] = (summary_by_theme["positive"] / summary_by_theme["Total"] * 100).round(1)
summary_by_theme["% Critical"] = (summary_by_theme["critical"] / summary_by_theme["Total"] * 100).round(1)
summary_by_theme["% Neutral"] = (summary_by_theme["neutral"] / summary_by_theme["Total"] * 100).round(1)

print("\n📊 Sentiment by Theme:")
print(summary_by_theme.sort_values("Total", ascending=False).head(10))



📊 Sentiment by Theme:
vader_label                          critical  neutral  positive  Total  \
new_theme                                                                 
political_context_and_organisations         3        6         8     17   
teacher_rrd                                 6        4         5     15   
digital_ed                                  2        2         9     13   

vader_label                          % Positive  % Critical  % Neutral  
new_theme                                                               
political_context_and_organisations        47.1        17.6       35.3  
teacher_rrd                                33.3        40.0       26.7  
digital_ed                                 69.2        15.4       15.4  


# Compare VADER with manual labels 

In [23]:
# --- Paths ---
# The three scored splits share one naming pattern; the manual-label file sits beside them
BASE = "/workspaces/ERP_Newsletter/data_processed"
p_train, p_val, p_test = (
    f"{BASE}/{split_name}_with_vader.csv" for split_name in ("train", "val", "test")
)
p_manual = f"{BASE}/sample_llm_vs_manual_labels.csv"

In [24]:
# --- Load and combine VADER splits ---
v_train, v_val, v_test = map(pd.read_csv, (p_train, p_val, p_test))

# Stack the splits in train, val, test order under a fresh 0..n-1 index
vader_all = pd.concat([v_train, v_val, v_test], ignore_index=True)

In [25]:
# Keep useful columns if present: names missing from the file are skipped, not an error
preferred_vader_cols = (
    "id", "text", "new_theme", "organisation", "org_group", "year_quarter",
    "llm_label", "llm_confidence", "text_for_vader",
    "vader_neg", "vader_neu", "vader_pos", "vader_compound", "vader_label",
)
keep_vader_cols = [col for col in preferred_vader_cols if col in vader_all.columns]

vader_all = vader_all.loc[:, keep_vader_cols].copy()

In [26]:
# `id` is the deduplication key, so stop early when the column is absent
if not {"id"}.issubset(vader_all.columns):
    raise ValueError("Column 'id' missing from VADER files. Ensure you saved it in the splits.")

# drop_duplicates keeps the first occurrence by default
vader_all = vader_all.drop_duplicates(subset="id")

print(f"VADER combined: {vader_all.shape}")

VADER combined: (300, 14)


In [30]:
# --- Load manual labels (explicit column name) ---
manual = pd.read_csv(p_manual)

# Both columns are required further down; report every missing one in a single error
needed = ["doc_id", "manual_label"]
available = set(manual.columns)
missing = [col for col in needed if col not in available]
if len(missing) > 0:
    raise ValueError(f"Missing columns in manual file: {missing}")

# One row per document: the first occurrence of each doc_id is kept (pandas default)
manual = manual.drop_duplicates(subset="doc_id")


In [36]:
manual.head(0)

Unnamed: 0.1,Unnamed: 0,doc_id,text,llm_label,manual_label,llm_confidence,llm_rationale,did,id


In [35]:
# Mirror doc_id into an `id` column; the merge with `vader_all` below joins on "id"
manual["id"] = manual["doc_id"]

In [37]:
# --- Merge manual with VADER ---
# Inner join on id; same-named columns on both sides get the _manualsrc / _vader suffixes
merged = manual.merge(vader_all, how="inner", on="id", suffixes=("_manualsrc", "_vader"))
print("Merged (manual ∩ vader):", merged.shape)

Merged (manual ∩ vader): (300, 22)


In [38]:
# Quick diagnostics: coverage — manual ids that never appear in the combined VADER frame
ids_in_manual_not_vader = set(manual["id"]).difference(vader_all["id"])
print(f"Manual-only IDs (not in VADER splits): {len(ids_in_manual_not_vader)}")

Manual-only IDs (not in VADER splits): 0


In [39]:
# --- Normalize labels ---
def norm_label(x):
    """Return a label as trimmed lower-case text, mapping "negative" to "critical".

    Missing values (anything ``pd.isna`` reports as missing) are returned unchanged.
    """
    if pd.isna(x):
        return x
    cleaned = str(x).strip().lower()
    # The notebook's other label sources say "critical", so align the naming
    return "critical" if cleaned == "negative" else cleaned

# manual_label is mandatory; the other two are normalised only if the merge kept those names
merged["manual_label"] = merged["manual_label"].apply(norm_label)
for optional_col in ("vader_label", "llm_label"):
    if optional_col in merged.columns:
        merged[optional_col] = merged[optional_col].apply(norm_label)


In [40]:
# --- Evaluate VADER vs MANUAL ---
labels_order = ["positive","neutral","critical"]
# Only rows carrying both a manual and a VADER label can be scored
eval_df = merged.dropna(subset=["manual_label","vader_label"]).copy()

y_true = eval_df["manual_label"].astype(str)
y_pred_vader = eval_df["vader_label"].astype(str)

print("\n=== Confusion Matrix (Manual vs VADER) ===")
print(pd.DataFrame(
    confusion_matrix(y_true, y_pred_vader, labels=labels_order),
    index=[f"True_{c}" for c in labels_order],
    columns=[f"Pred_{c}" for c in labels_order]
))
print("\n=== Classification Report (Manual vs VADER) ===")
print(classification_report(y_true, y_pred_vader, labels=labels_order, digits=3))
print(f"Accuracy:  {accuracy_score(y_true, y_pred_vader):.3f}")
# Restrict the F1 averages to labels_order so they describe the same classes as the
# matrix and report above; otherwise any other value present in manual_label would be
# averaged in as an extra class in these two lines only.
print(f"Macro F1:  {f1_score(y_true, y_pred_vader, labels=labels_order, average='macro'):.3f}")
print(f"Weighted F1: {f1_score(y_true, y_pred_vader, labels=labels_order, average='weighted'):.3f}")


=== Confusion Matrix (Manual vs VADER) ===
               Pred_positive  Pred_neutral  Pred_critical
True_positive             37             4              3
True_neutral              23             6              8
True_critical             22            10             37

=== Classification Report (Manual vs VADER) ===
              precision    recall  f1-score   support

    positive      0.451     0.841     0.587        44
     neutral      0.300     0.162     0.211        37
    critical      0.771     0.536     0.632        69

    accuracy                          0.533       150
   macro avg      0.507     0.513     0.477       150
weighted avg      0.561     0.533     0.515       150

Accuracy:  0.533
Macro F1:  0.477
Weighted F1: 0.515


# Diagnostics 

In [43]:
# Threshold analysis: for each LLM label, count where the compound score falls relative
# to the ±0.05 cut-offs
for true_label in ['positive', 'neutral', 'critical']:
    subset = test[test['llm_label'] == true_label]
    compound = subset['vader_compound']

    pos_count = compound.ge(0.05).sum()
    neu_count = (compound.gt(-0.05) & compound.lt(0.05)).sum()
    crit_count = compound.le(-0.05).sum()

    print(f"\n  True {true_label.upper()} (n={len(subset)}):")
    # The padding inside "Neutral: " keeps the counts aligned across the three lines
    for verdict, count in (("Positive:", pos_count), ("Neutral: ", neu_count), ("Critical:", crit_count)):
        print(f"    → VADER says {verdict} {count} ({count/len(subset)*100:.1f}%)")



  True POSITIVE (n=6):
    → VADER says Positive: 5 (83.3%)
    → VADER says Neutral:  1 (16.7%)
    → VADER says Critical: 0 (0.0%)

  True NEUTRAL (n=32):
    → VADER says Positive: 15 (46.9%)
    → VADER says Neutral:  10 (31.2%)
    → VADER says Critical: 7 (21.9%)

  True CRITICAL (n=7):
    → VADER says Positive: 2 (28.6%)
    → VADER says Neutral:  1 (14.3%)
    → VADER says Critical: 4 (57.1%)


In [44]:
# Rows where the VADER label differs from the LLM label, plus the share of the test set
mistakes = test.loc[test['llm_label'].ne(test['vader_label'])].copy()
error_rate = len(mistakes) / len(test) * 100
print(f"Total errors: {len(mistakes)} / {len(test)} ({error_rate:.1f}%)")


Total errors: 26 / 45 (57.8%)
The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [47]:
# 4. Vocabulary Analysis

analyzer = SentimentIntensityAnalyzer()

# Terms to look up in the VADER lexicon, one thematic group per row
business_terms = [
    'implementation', 'rollout', 'migration', 'transition', 'upgrade',
    'budget', 'cost', 'savings', 'efficiency', 'productivity',
    'delay', 'issue', 'problem', 'challenge', 'risk',
    'success', 'improvement', 'benefit', 'advantage', 'effective',
]

print("Common business terms in VADER lexicon:")
for term in business_terms:
    # A term that is present prints its lexicon value; an absent one is reported as such
    report_line = (
        f"  ✓ {term}: {analyzer.lexicon[term]:.2f}"
        if term in analyzer.lexicon
        else f"  ✗ {term}: NOT IN LEXICON"
    )
    print(report_line)


Common business terms in VADER lexicon:
  ✗ implementation: NOT IN LEXICON
  ✗ rollout: NOT IN LEXICON
  ✗ migration: NOT IN LEXICON
  ✗ transition: NOT IN LEXICON
  ✗ upgrade: NOT IN LEXICON
  ✗ budget: NOT IN LEXICON
  ✗ cost: NOT IN LEXICON
  ✗ savings: NOT IN LEXICON
  ✓ efficiency: 1.50
  ✗ productivity: NOT IN LEXICON
  ✓ delay: -1.30
  ✗ issue: NOT IN LEXICON
  ✓ problem: -1.70
  ✓ challenge: 0.30
  ✓ risk: -1.10
  ✓ success: 2.70
  ✓ improvement: 2.00
  ✓ benefit: 2.00
  ✓ advantage: 1.00
  ✓ effective: 2.10


In [53]:
# Try different thresholds
# Grid-search the compound-score cut-offs for the best macro F1 against the LLM labels.
# NOTE(review): the search runs on `test`, so the "optimal" figure is tuned on the very
# rows it is scored on and will be optimistic; the validation split is the usual place
# for this kind of tuning.
compound_scores = test['vader_compound'].to_numpy()

# Measure the baseline instead of hard-coding it, so the comparison line stays true
# whenever the data or the labelling rule changes.
baseline_f1 = f1_score(test['llm_label'], test['vader_label'], average='macro')

best_f1 = 0
best_thresholds = None

pos_thresholds = np.arange(0.0, 0.5, 0.05)
neg_thresholds = np.arange(-0.5, 0.0, 0.05)

for pos_thresh in pos_thresholds:
    for neg_thresh in neg_thresholds:
        if neg_thresh >= pos_thresh:
            continue

        # Same precedence as the original rule: positive is checked before critical.
        # Candidates live in a throwaway Series, so `test` gains no scratch column.
        candidate_labels = pd.Series(
            np.select(
                [compound_scores >= pos_thresh, compound_scores <= neg_thresh],
                ["positive", "critical"],
                default="neutral",
            ),
            index=test.index,
        )

        f1 = f1_score(test['llm_label'], candidate_labels, average='macro')

        # Strict '>' keeps the first pair encountered when several tie
        if f1 > best_f1:
            best_f1 = f1
            best_thresholds = (pos_thresh, neg_thresh)

print(f"Current thresholds: pos >= 0.05, crit <= -0.05 → Macro F1: {baseline_f1:.3f}")
if best_thresholds is None:
    # No candidate scored above 0, so there is no pair to index into
    print("Optimal thresholds: none found (every candidate scored Macro F1 = 0)")
else:
    print(f"Optimal thresholds: pos >= {best_thresholds[0]:.2f}, crit <= {best_thresholds[1]:.2f} → Macro F1: {best_f1:.3f}")

Current thresholds: pos >= 0.05, crit <= -0.05 → Macro F1: 0.419
Optimal thresholds: pos >= 0.25, crit <= -0.30 → Macro F1: 0.502


# Visualisations

In [55]:
# Set professional style
# NOTE(review): the 'seaborn-v0_8-' prefix looks like the style-sheet naming of newer
# Matplotlib releases — confirm it exists in the installed version
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# One fixed colour per sentiment class, looked up by label in the plots below
COLORS = {'positive': '#2ecc71', 'neutral': '#95a5a6', 'critical': '#e74c3c'}

In [56]:
# Load data: a fresh read of the scored test split; labels are used as stored
test = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/test_with_vader.csv")
y_true, y_pred = test["llm_label"], test["vader_label"]

### Confusion Matrix 

In [60]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

labels_order = ["positive", "neutral", "critical"]

# Re-label the test rows with the tuned cut-offs (pos ≥ 0.25, crit ≤ -0.30)
test['vader_optimized'] = test['vader_compound'].apply(
    lambda c: "positive" if c >= 0.25 else ("critical" if c <= -0.30 else "neutral")
)

cm_default = confusion_matrix(y_true, y_pred, labels=labels_order)
cm_optimized = confusion_matrix(y_true, test['vader_optimized'], labels=labels_order)

# Both panels share everything except the matrix, colour map and title
panels = [
    (ax1, cm_default, 'Blues', 'Default Thresholds\n(pos ≥ 0.05, crit ≤ -0.05)'),
    (ax2, cm_optimized, 'Greens', 'Optimized Thresholds\n(pos ≥ 0.25, crit ≤ -0.30)'),
]
for panel_ax, matrix, cmap_name, panel_title in panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap_name,
                xticklabels=labels_order, yticklabels=labels_order,
                cbar_kws={'label': 'Count'}, ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_ylabel('True Label', fontsize=11)
    panel_ax.set_xlabel('Predicted Label', fontsize=11)

plt.tight_layout()
plt.savefig('/workspaces/ERP_Newsletter/visualisations/1_confusion_matrices.png',
            dpi=300, bbox_inches='tight')
print("✅ Saved: 1_confusion_matrices.png")
plt.close()


✅ Saved: 1_confusion_matrices.png


### F1 Scores by Class (Before/After)

In [62]:
fig, ax = plt.subplots(figsize=(10, 6))

# Per-class F1, computed one-vs-rest: each class in turn is treated as the binary target
f1_default, f1_optimized = [], []
for label in labels_order:
    is_true = y_true == label
    f1_default.append(f1_score(is_true, y_pred == label))
    f1_optimized.append(f1_score(is_true, test['vader_optimized'] == label))

x = np.arange(len(labels_order))
width = 0.35

# Two bars per class, placed half a bar width either side of the tick
bars1 = ax.bar(x - width/2, f1_default, width, label='Default',
               color='#3498db', alpha=0.8)
bars2 = ax.bar(x + width/2, f1_optimized, width, label='Optimized',
               color='#2ecc71', alpha=0.8)

# Write each bar's value just above it
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.2f}',
            ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.set_title('Model Performance by Class: Before vs After Optimization',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Sentiment Class', fontsize=12, fontweight='bold')
ax.set_ylabel('F1 Score', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels([l.capitalize() for l in labels_order])
ax.set_ylim(0, 1)
ax.grid(axis='y', alpha=0.3)
ax.legend(fontsize=11, loc='upper left')

plt.tight_layout()
plt.savefig('/workspaces/ERP_Newsletter/visualisations/2_f1_comparison.png',
            dpi=300, bbox_inches='tight')
print("✅ Saved: 2_f1_comparison.png")
plt.close()


✅ Saved: 2_f1_comparison.png


### VADER compound score distribution by true label

In [64]:
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Dashed reference lines drawn on every panel: (x position, colour, legend entry)
threshold_lines = [
    (0.25, 'green', 'Positive threshold'),
    (-0.30, 'red', 'Critical threshold'),
]

# One histogram per LLM label, in labels_order
for panel, label in zip(axes, labels_order):
    subset = test.loc[test['llm_label'] == label, 'vader_compound']

    panel.hist(subset, bins=15, color=COLORS[label],
               alpha=0.7, edgecolor='black', linewidth=1.2)
    for cutoff, line_color, legend_text in threshold_lines:
        panel.axvline(cutoff, color=line_color, linestyle='--', linewidth=2,
                      label=legend_text, alpha=0.8)
    # Unlabelled grey line marking a compound score of zero
    panel.axvline(0, color='gray', linestyle='-', linewidth=1, alpha=0.5)

    panel.set_title(f'True Label: {label.upper()}\n(n={len(subset)})',
                    fontsize=12, fontweight='bold')
    panel.set_xlabel('VADER Compound Score', fontsize=10)
    panel.set_ylabel('Frequency', fontsize=10)
    panel.legend(fontsize=8, loc='upper right')
    panel.grid(axis='y', alpha=0.3)

plt.suptitle('Distribution of VADER Scores by True Sentiment Label',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('/workspaces/ERP_Newsletter/visualisations/3_score_distributions.png',
            dpi=300, bbox_inches='tight')
print("✅ Saved: 3_score_distributions.png")
plt.close()


✅ Saved: 3_score_distributions.png


### Overall metrics comparison 

In [65]:
fig, ax = plt.subplots(figsize=(10, 5))
ax.axis('off')

# Calculate metrics
def get_metrics(y_true, y_pred):
    """Return ``[accuracy, macro F1, weighted F1]`` for the given true and predicted labels."""
    scores = [accuracy_score(y_true, y_pred)]
    for averaging in ('macro', 'weighted'):
        scores.append(f1_score(y_true, y_pred, average=averaging))
    return scores

# Score both labelings, then express the change relative to the default thresholds
default_metrics = get_metrics(y_true, y_pred)
optimized_metrics = get_metrics(y_true, test['vader_optimized'])


def _pct_change(new, base):
    """Relative change of ``new`` over ``base`` in percent; NaN when the baseline is zero."""
    return (new - base) / base * 100 if base else float('nan')


improvement = [_pct_change(opt, def_) for opt, def_ in zip(optimized_metrics, default_metrics)]

# Create table
# The '+' format flag prints the value's real sign, so a regression reads "-3.2%"
# instead of the "+-3.2%" a hard-coded plus sign would produce.
table_data = [['Metric', 'Default', 'Optimized', 'Improvement']] + [
    [metric_name, f'{base:.3f}', f'{tuned:.3f}', f'{change:+.1f}%']
    for metric_name, base, tuned, change in zip(
        ['Accuracy', 'Macro F1', 'Weighted F1'],
        default_metrics, optimized_metrics, improvement,
    )
]

# Render table_data as a four-column table filling the whole axes
table = ax.table(cellText=table_data, cellLoc='center', loc='center',
                 colWidths=[0.25] * 4,
                 bbox=[0, 0, 1, 1])

table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1, 2.5)

# Header row: dark background, bold white text
for col in range(4):
    header_cell = table[(0, col)]
    header_cell.set_facecolor('#34495e')
    header_cell.set_text_props(weight='bold', color='white')

# Data rows alternate between two backgrounds; the last (Improvement) column is green
colors = ['#ecf0f1', '#ffffff']
for row in range(1, 4):
    for col in range(4):
        body_cell = table[(row, col)]
        if col == 3:
            body_cell.set_facecolor('#d5f4e6')
            body_cell.set_text_props(weight='bold', color='#27ae60')
        else:
            body_cell.set_facecolor(colors[row % 2])

ax.set_title('VADER Performance: Threshold Optimization Results',
             fontsize=14, fontweight='bold', pad=20)

plt.savefig('/workspaces/ERP_Newsletter/visualisations/4_metrics_summary.png',
            dpi=300, bbox_inches='tight')
print("✅ Saved: 4_metrics_summary.png")
plt.close()

✅ Saved: 4_metrics_summary.png


### Class distribution

In [67]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Load all splits
train = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/train.csv")
val = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/val.csv")

# The sample size in each panel title is taken from the data, so the titles cannot
# drift from the real split sizes if the sample or the split ratios change.
datasets = [
    (f'{split_title}\n(n={len(split_labels)})', split_labels)
    for split_title, split_labels in (
        ('Training Set', train['llm_label']),
        ('Validation Set', val['llm_label']),
        ('Test Set', test['llm_label']),
    )
]

# One pie per split showing the share of each LLM label
for idx, (title, data) in enumerate(datasets):
    counts = data.value_counts()
    colors_list = [COLORS[label] for label in counts.index]

    wedges, texts, autotexts = axes[idx].pie(
        counts.values,
        labels=[l.capitalize() for l in counts.index],
        autopct='%1.1f%%',
        startangle=90,
        colors=colors_list,
        textprops={'fontsize': 10, 'weight': 'bold'}
    )

    # Make percentage text white for better visibility
    for autotext in autotexts:
        autotext.set_color('white')

    axes[idx].set_title(title, fontsize=12, fontweight='bold', pad=10)

plt.suptitle('Class Distribution Across Dataset Splits (Stratified)',
             fontsize=14, fontweight='bold', y=0.98)
plt.tight_layout()
plt.savefig('/workspaces/ERP_Newsletter/visualisations/5_class_distribution.png',
            dpi=300, bbox_inches='tight')
print("✅ Saved: 5_class_distribution.png")
plt.close()

✅ Saved: 5_class_distribution.png


### Precision-Recall by Class 

In [71]:
fig, ax = plt.subplots(figsize=(12, 6))

# Per-class precision, recall and F1 for the labels produced with the tuned thresholds
precision, recall, f1, support = precision_recall_fscore_support(
    y_true, test['vader_optimized'], labels=labels_order
)

x = np.arange(len(labels_order))
width = 0.25

# Three bars per class: one bar width left of the tick, on the tick, and one to the right
bars1, bars2, bars3 = (
    ax.bar(x + offset, values, width, label=series_name, color=bar_color, alpha=0.8)
    for offset, values, series_name, bar_color in (
        (-width, precision, 'Precision', '#3498db'),
        (0, recall, 'Recall', '#e67e22'),
        (width, f1, 'F1 Score', '#2ecc71'),
    )
)

# Write each bar's value just above it
for bar in [*bars1, *bars2, *bars3]:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.2f}',
            ha='center', va='bottom', fontsize=9, fontweight='bold')

ax.set_title('Detailed Performance Metrics by Class (Optimized Model)',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Sentiment Class', fontsize=12, fontweight='bold')
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels([l.capitalize() for l in labels_order])
ax.set_ylim(0, 1)
ax.grid(axis='y', alpha=0.3)
ax.legend(fontsize=11, loc='upper left')

plt.tight_layout()
plt.savefig('/workspaces/ERP_Newsletter/visualisations/6_precision_recall.png',
            dpi=300, bbox_inches='tight')
print("✅ Saved: 6_precision_recall.png")
plt.close()

✅ Saved: 6_precision_recall.png
