In [1]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import os

In [2]:
# Load preprocessed data
input_path = "/workspaces/ERP_Newsletter/data_processed/data_for_preprocessing.csv"
df = pd.read_csv(input_path)
print(f"Loaded {len(df)} rows")

ERROR! Session/line number was not unique in database. History logging moved to new session 17
Loaded 1152 rows


In [3]:
df.head()

Unnamed: 0,id,newsletter_number,issue_date,new_theme,text,domain,organisation
0,c97ff62f-83ca-47ec-a4c7-b4e24157ae0a,1,11 July 2023,political_context_and_organisations,Deadline 23 August 2023 Education secretary Gi...,schoolsweek.co.uk,schools_week
1,e66a8dd2-0dce-444a-876c-3c8137ea2985,1,11 July 2023,political_context_and_organisations,Revealed: the experts advising ministers on te...,schoolsweek.co.uk,schools_week
2,2ce7fa71-0b07-417c-a7d3-8d18e8f12716,1,11 July 2023,political_context_and_organisations,"Reject fewer teacher applicants, DfE tells tra...",schoolsweek.co.uk,schools_week
3,64c74d6d-b558-4183-a42d-e7f0141ce527,1,11 July 2023,political_context_and_organisations,Ofqual and DfE studying 'feasibility' of 'full...,schoolsweek.co.uk,schools_week
4,5564ff61-bf33-482f-9010-32aea9ce1f73,1,11 July 2023,political_context_and_organisations,Revealed: The full details of Labour's educati...,schoolsweek.co.uk,schools_week


In [4]:
df['text_for_vader'] = df['text'].fillna('')  # Ensure no NaNs

In [5]:
#remove URLs 
df['text_for_vader'] = df['text_for_vader'].str.replace(
    r'http\S+|www\S+', '', regex=True
).str.strip()

In [6]:
# Check text lengths (VADER works better on sentences/paragraphs)
print(df['text_for_vader'].str.split().str.len().describe())

count    1152.00000
mean       45.41059
std        27.35227
min         4.00000
25%        28.00000
50%        38.00000
75%        55.00000
max       237.00000
Name: text_for_vader, dtype: float64


In [7]:
#VADER Model 

analyzer = SentimentIntensityAnalyzer()

def get_vader_scores(text):
    """Get VADER sentiment scores"""
    scores = analyzer.polarity_scores(text)
    return pd.Series({
        'vader_neg': scores['neg'],
        'vader_neu': scores['neu'],
        'vader_pos': scores['pos'],
        'vader_compound': scores['compound']  # This is the main score (-1 to +1)
    })

In [8]:
# Apply to text
vader_scores = df['text_for_vader'].apply(get_vader_scores)
df = pd.concat([df, vader_scores], axis=1)

In [9]:
#Classify sentiment 
df['sentiment_label'] = df['vader_compound'].apply(
    lambda x: 'positive' if x >= 0.05 else ('negative' if x <= -0.05 else 'neutral')
)

In [10]:
# Sample some results to validate
print(df[['text', 'vader_compound', 'sentiment_label']].sample(10))

                                                   text  vader_compound  \
642   FED has key questions for the new Education Se...          0.7677   
1090  ResearchGate - Who Aspires to Become a Teacher...          0.0000   
802   The i Paper - How 'naïve' Bridget Phillipson's...         -0.8201   
561   Schools Week - NEU teachers vote to accept 5.5...         -0.1128   
507   Digital Futures for Children with 5Rights-LSE ...         -0.3612   
636   SchoolsWeek - Measuring school disadvantage: a...         -0.5574   
40    This research paper brings together three of t...          0.0000   
265   This project was featured in the 30th Annivers...          0.4019   
715   UCL - AI to Enhance Knowledge Exchange UCL Pol...          0.9403   
812   CCT – EdTech Evidence Board: Contribute to the...          0.6705   

     sentiment_label  
642         positive  
1090         neutral  
802         negative  
561         negative  
507         negative  
636         negative  
40           

In [11]:
print("\n📊 Sentiment Distribution:")
print(df['sentiment_label'].value_counts())
print(f"\nCompound Score Stats:")
print(df['vader_compound'].describe())


📊 Sentiment Distribution:
sentiment_label
positive    665
negative    279
neutral     208
Name: count, dtype: int64

Compound Score Stats:
count    1152.000000
mean        0.230596
std         0.507219
min        -0.962800
25%        -0.000450
50%         0.296000
75%         0.690800
max         0.991000
Name: vader_compound, dtype: float64


In [12]:
# Save
output_dir = "/workspaces/ERP_Newsletter/data_processed"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "data_with_sentiment.csv")

df.to_csv(output_path, index=False)
print(f"\n✅ Saved results to: {output_path}")
print(f"   Shape: {df.shape}")


✅ Saved results to: /workspaces/ERP_Newsletter/data_processed/data_with_sentiment.csv
   Shape: (1152, 13)


In [13]:
# 7. Show some examples
print("\n📝 Sample Results:")
sample_cols = ['text_for_vader', 'vader_compound', 'sentiment_label']
for idx, row in df[sample_cols].sample(5).iterrows():
    text_preview = row['text_for_vader'][:100] + "..." if len(row['text_for_vader']) > 100 else row['text_for_vader']
    print(f"\nText: {text_preview}")
    print(f"Score: {row['vader_compound']:.3f} | Label: {row['sentiment_label']}")


📝 Sample Results:

Text: OECD Digital Education Outlook 2023 The OECD have published a comparative, thematic analysis of how ...
Score: 0.000 | Label: neutral

Text: Scottish Government - More funding to tackle child poverty Projects aimed at tackling child poverty ...
Score: 0.026 | Label: neutral

Text: Scottish Government - Driving the national mission to end child poverty The Child Poverty Practice A...
Score: -0.791 | Label: negative

Text: IOE blog - Teacher education, research and practice: addressing the recruitment and retention crisis...
Score: -0.625 | Label: negative

Text: UKRI - ESRC - £32m for four independent social and economic research centres Funding announced for t...
Score: 0.000 | Label: neutral


In [14]:
print(f"Total articles: {len(df)}")
print(f"Date range: {df['issue_date'].min()} to {df['issue_date'].max()}")

Total articles: 1152
Date range: 09 October 2023 to 9 May 2025


In [15]:
# ==========================================
# 1. SENTIMENT BY THEME (CATEGORY)
# ==========================================

sentiment_by_theme = df.groupby(['new_theme', 'sentiment_label']).size().unstack(fill_value=0)

# Add totals and percentages
sentiment_by_theme['Total'] = sentiment_by_theme.sum(axis=1)
sentiment_by_theme['% Positive'] = (sentiment_by_theme['positive'] / sentiment_by_theme['Total'] * 100).round(1)
sentiment_by_theme['% Neutral'] = (sentiment_by_theme['neutral'] / sentiment_by_theme['Total'] * 100).round(1)
sentiment_by_theme['% Negative'] = (sentiment_by_theme['negative'] / sentiment_by_theme['Total'] * 100).round(1)

# Add average compound score
avg_compound = df.groupby('new_theme')['vader_compound'].mean().round(3)
sentiment_by_theme['Avg_Compound_Score'] = avg_compound

# Sort by most articles
sentiment_by_theme = sentiment_by_theme.sort_values('Total', ascending=False)

print("\n" + "="*80)
print("📊 SENTIMENT DISTRIBUTION BY THEME")
print("="*80)
print(sentiment_by_theme)



📊 SENTIMENT DISTRIBUTION BY THEME
sentiment_label                      negative  neutral  positive  Total  \
new_theme                                                                 
political_context_and_organisations       157      105       352    614   
project_updates                            24       36       122    182   
teacher_rrd                                62       27        79    168   
digital_ed                                 30       35        93    158   
events_opportunities_research               6        5        19     30   

sentiment_label                      % Positive  % Neutral  % Negative  \
new_theme                                                                
political_context_and_organisations        57.3       17.1        25.6   
project_updates                            67.0       19.8        13.2   
teacher_rrd                                47.0       16.1        36.9   
digital_ed                                 58.9       22.2        19.

In [16]:
# ==========================================
# 2. SENTIMENT BY ORGANISATION
# ==========================================

sentiment_by_org = df.groupby(['organisation', 'sentiment_label']).size().unstack(fill_value=0)

# Add totals and percentages
sentiment_by_org['Total'] = sentiment_by_org.sum(axis=1)
sentiment_by_org['% Positive'] = (sentiment_by_org['positive'] / sentiment_by_org['Total'] * 100).round(1)
sentiment_by_org['% Neutral'] = (sentiment_by_org['neutral'] / sentiment_by_org['Total'] * 100).round(1)
sentiment_by_org['% Negative'] = (sentiment_by_org['negative'] / sentiment_by_org['Total'] * 100).round(1)

# Add average compound score
avg_compound_org = df.groupby('organisation')['vader_compound'].mean().round(3)
sentiment_by_org['Avg_Compound_Score'] = avg_compound_org

# Sort by most articles
sentiment_by_org = sentiment_by_org.sort_values('Total', ascending=False)

print("\n" + "="*80)
print("📰 SENTIMENT DISTRIBUTION BY ORGANISATION")
print("="*80)
print(sentiment_by_org.head(20))  # Top 20 organizations



📰 SENTIMENT DISTRIBUTION BY ORGANISATION
sentiment_label      negative  neutral  positive  Total  % Positive  \
organisation                                                          
schools_week               58       21        71    150        47.3   
ucl                        15       16        70    101        69.3   
uk_government               6       12        51     69        73.9   
conversation               13        6        13     32        40.6   
guardian                   19        4         7     30        23.3   
bera                        9        8        12     29        41.4   
epi                         7       11         9     27        33.3   
nfer                        5        4        15     24        62.5   
uk_parliament               1        9        12     22        54.5   
welsh_government            0        8        13     21        61.9   
scottish_government         6        3        12     21        57.1   
oecd                        2      

In [20]:
# ==========================================
# SELECT TOP ORGANISATIONS
# ==========================================

# Get top organisations by article count
top_orgs = df['organisation'].value_counts().head(15).index.tolist()

print(f"📰 Selected organisations: {top_orgs}\n")

# Filter to only these organisations
df_filtered = df[df['organisation'].isin(top_orgs)].copy()

# ==========================================
# CROSSTAB: ORGANISATION × THEME (Percentage)
# ==========================================

# Create crosstab with sentiment counts
crosstab_counts = pd.crosstab(
    index=df_filtered['organisation'],
    columns=[df_filtered['new_theme'], df_filtered['sentiment_label']],
    margins=True,
    margins_name='Total'
)

print("="*120)
print("📊 RAW COUNTS: ORGANISATION × THEME × SENTIMENT")
print("="*120)
print(crosstab_counts)

# ==========================================
# PERCENTAGE BREAKDOWN BY ORGANISATION-THEME
# ==========================================

# Get percentage of each sentiment within each org-theme combination
crosstab_pct = pd.crosstab(
    index=df_filtered['organisation'],
    columns=[df_filtered['new_theme'], df_filtered['sentiment_label']],
    normalize='index'  # Percentage within each organisation
) * 100

crosstab_pct = crosstab_pct.round(1)

print("\n" + "="*120)
print("📊 PERCENTAGE: ORGANISATION × THEME × SENTIMENT (% within each org)")
print("="*120)
print(crosstab_pct)


📰 Selected organisations: ['schools_week', 'ucl', 'uk_government', 'conversation', 'guardian', 'bera', 'epi', 'nfer', 'uk_parliament', 'scottish_government', 'welsh_government', 'oecd', 'bera_journals', 'twitter', 'belfast_telegraph']

📊 RAW COUNTS: ORGANISATION × THEME × SENTIMENT
new_theme           digital_ed                  events_opportunities_research  \
sentiment_label       negative neutral positive                      negative   
organisation                                                                    
belfast_telegraph            0       0        0                             0   
bera                         1       0        4                             0   
bera_journals                1       0        1                             1   
conversation                 1       1        0                             0   
epi                          0       0        0                             0   
guardian                     5       1        1                      

In [21]:
# ==========================================
# CLEANER VERSION: ONE TABLE PER SENTIMENT
# ==========================================

print("\n" + "="*120)
print("✅ POSITIVE SENTIMENT % by Organisation and Theme")
print("="*120)

positive_pivot = df_filtered.groupby(['organisation', 'new_theme', 'sentiment_label']).size().unstack(fill_value=0)
positive_pivot['Total'] = positive_pivot.sum(axis=1)
positive_pivot['% Positive'] = (positive_pivot.get('positive', 0) / positive_pivot['Total'] * 100).round(1)
positive_pivot['% Neutral'] = (positive_pivot.get('neutral', 0) / positive_pivot['Total'] * 100).round(1)
positive_pivot['% Negative'] = (positive_pivot.get('negative', 0) / positive_pivot['Total'] * 100).round(1)

positive_summary = positive_pivot[['Total', '% Positive', '% Neutral', '% Negative']].sort_values('Total', ascending=False)
print(positive_summary.head(30))



✅ POSITIVE SENTIMENT % by Organisation and Theme
sentiment_label                                          Total  % Positive  \
organisation        new_theme                                                
schools_week        political_context_and_organisations     79        45.6   
ucl                 project_updates                         73        72.6   
schools_week        teacher_rrd                             55        43.6   
uk_government       political_context_and_organisations     39        76.9   
scottish_government political_context_and_organisations     21        57.1   
welsh_government    political_context_and_organisations     21        61.9   
guardian            political_context_and_organisations     19        31.6   
epi                 political_context_and_organisations     19        26.3   
uk_government       digital_ed                              17        82.4   
belfast_telegraph   political_context_and_organisations     16        56.2   
ucl           

In [18]:
print("\n" + "="*80)
print("💡 KEY INSIGHTS")
print("="*80)

# Most positive theme
most_positive_theme = sentiment_by_theme['Avg_Compound_Score'].idxmax()
print(f"✅ Most positive theme: {most_positive_theme} "
      f"(score: {sentiment_by_theme.loc[most_positive_theme, 'Avg_Compound_Score']:.3f})")

# Most negative theme
most_negative_theme = sentiment_by_theme['Avg_Compound_Score'].idxmin()
print(f"❌ Most negative theme: {most_negative_theme} "
      f"(score: {sentiment_by_theme.loc[most_negative_theme, 'Avg_Compound_Score']:.3f})")

# Most positive organisation (min 10 articles)
org_min_articles = sentiment_by_org[sentiment_by_org['Total'] >= 10]
most_positive_org = org_min_articles['Avg_Compound_Score'].idxmax()
print(f"\n✅ Most positive organisation: {most_positive_org} "
      f"(score: {org_min_articles.loc[most_positive_org, 'Avg_Compound_Score']:.3f}, "
      f"n={org_min_articles.loc[most_positive_org, 'Total']:.0f})")

# Most negative organisation (min 10 articles)
most_negative_org = org_min_articles['Avg_Compound_Score'].idxmin()
print(f"❌ Most negative organisation: {most_negative_org} "
      f"(score: {org_min_articles.loc[most_negative_org, 'Avg_Compound_Score']:.3f}, "
      f"n={org_min_articles.loc[most_negative_org, 'Total']:.0f})")



💡 KEY INSIGHTS
✅ Most positive theme: project_updates (score: 0.351)
❌ Most negative theme: teacher_rrd (score: 0.088)

✅ Most positive organisation: ukri (score: 0.633, n=12)
❌ Most negative organisation: guardian (score: -0.179, n=30)
