In [1]:
import os 
import re 

import pandas as pd 
import numpy as np 

from ftfy import fix_text
import unicodedata as ud
from urllib.parse import urlparse

In [2]:
# Treat these text tokens as missing on read
NA_TOKENS = ["", " ", "NA", "N/A", "na", "NaN", "nan", "null", "NULL", "-"]

In [3]:
#load data 
input_path = "/workspaces/ERP_Newsletter/data_processed/newsletter_items.csv"
data_cleaning_path = "/workspaces/ERP_Newsletter/data_cleaning"
output_path = "/workspaces/ERP_Newsletter/data_processed"


df = pd.read_csv(input_path, keep_default_na=True, na_values=NA_TOKENS)

In [4]:
#inspect 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1668 non-null   object
 1   newsletter_number  1668 non-null   int64 
 2   issue_date         1668 non-null   object
 3   theme              1668 non-null   object
 4   subtheme           114 non-null    object
 5   title              1667 non-null   object
 6   description        1346 non-null   object
 7   link               1616 non-null   object
dtypes: int64(1), object(7)
memory usage: 104.4+ KB


In [5]:
print(f"Total rows: {len(df)}")
print(f"Unique newsletter: {df['newsletter_number'].nunique()}")

Total rows: 1668
Unique newsletter: 87


# Clean Up Text

In [6]:
def clean_series(s: pd.Series) -> pd.Series:
    # Use pandas "string" dtype so NaNs stay as <NA>
    s = s.astype("string")
    mask = s.notna()
    # Fix mojibake and normalize only on non-missing cells
    s.loc[mask] = s.loc[mask].apply(fix_text)
    s.loc[mask] = s.loc[mask].apply(lambda x: ud.normalize("NFKC", x))
    # Basic whitespace cleanup
    s.loc[mask] = s.loc[mask].str.replace(r"\s+", " ", regex=True).str.strip()
    return s

# Clean all object/string columns (quick and safe)
obj_cols = [c for c in df.columns if df[c].dtype == object or pd.api.types.is_string_dtype(df[c])]
for c in obj_cols:
    df[c] = clean_series(df[c])

# Quick exact replacements for the most common artifacts (optional, simple)
REPL = {
    "Â ": " ", "Â": "",
    "‚Äì": "–", "‚Äî": "—",
    "‚Äô": "’", "‚Äò": "‘",
    "‚Äú": "“", "‚Äù": "”",
    "â€“": "–", "â€”": "—",
    "â€˜": "‘", "â€™": "’",
    "â€œ": "“", "â€\x9d": "”",
    "â€¢": "•", "â€¦": "…",
}
for c in obj_cols:
    s = df[c].astype("string")
    for bad, good in REPL.items():
        s = s.str.replace(bad, good, regex=False)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    df[c] = s

# Check for Missing Values 

In [7]:
def missing_table(d: pd.DataFrame) -> pd.DataFrame:
    mc = d.isna().sum()
    return pd.DataFrame({
        "Missing Values": mc,
        "Percentage (%)": (mc / len(d)) * 100
    }).sort_values("Missing Values", ascending=False)

print("\n=== Missing values (before drop) ===")
print(missing_table(df))


=== Missing values (before drop) ===
                   Missing Values  Percentage (%)
subtheme                     1554       93.165468
description                   322       19.304556
link                           52        3.117506
title                           1        0.059952
theme                           0        0.000000
issue_date                      0        0.000000
newsletter_number               0        0.000000
id                              0        0.000000


# Remove items where description or link are missing

In [8]:
# Remove rows where 'description' or 'link' is missing
df_cleaned = df.dropna(subset=['description', 'link'])

# (Optional) Check how many rows remain
print(f"Rows before: {len(df)}")
print(f"Rows after : {len(df_cleaned)}")

df = df_cleaned

Rows before: 1668
Rows after : 1324


# Check for Duplicates 

In [9]:
#All rows identical 
total_duplicates = df.duplicated().sum()
print(f"Total duplicate rows (all columns identical): {total_duplicates}")

Total duplicate rows (all columns identical): 0


In [10]:
# Check duplicates where both title and link are the same
title_link_dupes = df[df.duplicated(subset=["title", "link"], keep=False)]

print(f"Number of duplicate title+link pairs: {title_link_dupes.shape[0]}")
title_link_dupes.sort_values(by=["title"]).head(1)

Number of duplicate title+link pairs: 89


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1301,bf6c4fd6-a5bd-48ca-9249-b5b92849e038,70,4 April 2025,Updates from the programme,,A reminder that the ESRC Education Research Pr...,"AI in Education: From chalkboards to chatbots,...",https://uk.bettshow.com/speakers/dominik-lukes


In [11]:
# Count duplicates based on title only
title_dupes = df[df.duplicated(subset=["title"], keep=False)]

print(f"Number of rows with duplicate titles: {title_dupes.shape[0]}")
title_dupes.sort_values(by="title").head(1)


Number of rows with duplicate titles: 103


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1301,bf6c4fd6-a5bd-48ca-9249-b5b92849e038,70,4 April 2025,Updates from the programme,,A reminder that the ESRC Education Research Pr...,"AI in Education: From chalkboards to chatbots,...",https://uk.bettshow.com/speakers/dominik-lukes


In [12]:
# Count duplicates based on link only
link_dupes = df[df.duplicated(subset=["link"], keep=False)]

print(f"Number of rows with duplicate links: {link_dupes.shape[0]}")
link_dupes.sort_values(by="link").head(1)

Number of rows with duplicate links: 202


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
780,5e06ebc5-9779-4c58-9367-ddc9d80d648c,45,5 September 2024,Updates from the programme,,BERJ Call for Papers - Making teaching an attr...,This special issue of BERJ is guest edited by ...,https://bera-journals.onlinelibrary.wiley.com/...


# Identify themes and subthemes

In [13]:
#Unique counts of columns 
print("Unique titles:", df["title"].nunique())
print("Unique themes:", df["theme"].nunique())
print("Unique subthemes", df["subtheme"].nunique())
print("Unique links:", df["link"].nunique())

Unique titles: 1247
Unique themes: 63
Unique subthemes 36
Unique links: 1192


In [14]:
### Add placeholders for missing themes/subhtemes

# 1) Normalize empties/whitespace/"nan"/"none" to real NA
df_norm = df.copy()
for col in ["theme", "subtheme"]:
    df_norm[col] = (
        df_norm[col]
        .astype("string")
        .replace(r"^\s*$", pd.NA, regex=True)   # empty/whitespace → NA
        .replace({"nan": pd.NA, "NaN": pd.NA, "None": pd.NA, "none": pd.NA})
    )

# 2) Create a version that fills NA with placeholders so ALL cases are counted
df_filled = df_norm.fillna({"theme": "No theme", "subtheme": "No subtheme"})

# 3) Group and count every (theme, subtheme) combo, including placeholder cases
theme_subtheme_counts = (
    df_filled
    .groupby(["theme", "subtheme"], dropna=False)
    .size()
    .reset_index(name="count")
    .sort_values(by=["theme", "subtheme"])
)

# 4) Export to Excel  
out_dir = "/workspaces/ERP_Newsletter/data_cleaning"
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "theme_subtheme_counts.xlsx")

theme_subtheme_counts.to_excel(out_path, index=False)  # <- this one
print(f"✅ Exported {len(theme_subtheme_counts)} rows to {out_path}")


✅ Exported 102 rows to /workspaces/ERP_Newsletter/data_cleaning/theme_subtheme_counts.xlsx


# Check Themes and Articles 

In [15]:
# Filter articles under themes

check_themes = df[df["theme"] == "Research – Practice – Policy"].copy()

# View a few examples
display(check_themes.head(5))

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
390,2fbf7b7b-334a-4d82-977c-d14a5bbf4778,25,16 February 2024,Research – Practice – Policy,,CAPE - Quid pro quo? Why academics meet with p...,"Patrick McAlary, CAPE coordinator, explores wh...",https://t.co/DbEx7Z1PPJ
391,7f2bc1b1-fcd3-4602-89d3-cdd1d39845de,25,16 February 2024,Research – Practice – Policy,,The SHAPE of research impact,British Academy report exploring research impa...,https://www.thebritishacademy.ac.uk/publicatio...
392,1775fa9d-d0de-4be6-9e61-f466f7131eb2,25,16 February 2024,Research – Practice – Policy,,NFER Event – Disadvantaged Policy webinar,Thursday 22 February 2024 – 11am Online,https://www.nfer.ac.uk/events/disadvantaged-po...
407,e684b418-8313-40f5-a40d-1fb8e68784bc,26,23 February 2024,Research – Practice – Policy,,Post from the Co-Production Collective - How t...,Through her lived experience of working with c...,https://www.coproductioncollective.co.uk/news/...
409,a8733168-3fb2-4eb4-b5d4-d15318b122f8,26,23 February 2024,Research – Practice – Policy,,"BERA event - Social theory, educational resear...",22 May 2024 2pm – 4pm (Free for BERA members),https://www.bera.ac.uk/event/social-theory-edu...


# Rename Themes

In [16]:
# ---------- 0) Drop rows where the entire theme is the unsubscribe text
UNSUB_THEME = (
    "You have indicated that you are happy to receive news and updates from the "
    "ESRC Education Research Programme. To unsubscribe, please email "
    "Elizabeth.hudson@ucl.ac.uk with the word UNSUBSCRIBE in the title of the email."
)
mask_unsub = df["theme"].astype(str).str.strip().eq(UNSUB_THEME)
dropped_rows = int(mask_unsub.sum())
df = df[~mask_unsub].copy()

# ---------- 1) Normalizers
def norm_theme(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    s = s.replace("—", "-").replace("–", "-")  # normalize dashes
    s = s.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    return s.lower()

def norm_key(s: str) -> str:
    """Strong normalizer for matching keys like subthemes:
       - lowercase; & -> and; remove punctuation; normalize dashes; collapse spaces
    """
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    s = s.replace("—", " ").replace("–", " ").replace("-", " ")
    s = s.replace("&", " and ")
    s = s.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    s = re.sub(r"[,\.\u00A0]", " ", s)         # remove commas, periods, NBSP
    s = re.sub(r"[^a-z0-9\s]", " ", s)         # drop other punctuation
    s = re.sub(r"\s+", " ", s).strip()
    return s

# ---------- 2) Theme mapping list: (new_theme, current_theme)
# NOTE: Moved "Four Nations" + variants AND "Update from UKRI" to political_context_and_organisations
pairs = [
    # project_updates
    ("project_updates", "Embedding children's participation rights in pedagogical practice in lower primary classrooms in Wales PI: Sarah Chicken"),
    ("project_updates", "Investigating the recruitment and retention of ethnic minority teachers PI: Stephen Gorard"),
    ("project_updates", "News from the Projects"),
    ("project_updates", "News from the projects"),
    ("project_updates", "PI Updates and Papers"),
    ("project_updates", "PI: David Lundie"),
    ("project_updates", "Programme news"),
    ("project_updates", "Programme Update"),
    ("project_updates", "Programme update"),
    ("project_updates", "Project news"),
    ("project_updates", "Rethinking teacher recruitment: New approaches to attracting prospective STEM teachers PI: Rob Klassen"),
    ("project_updates", "Sustainable school leadership: comparing approaches to the training, supply and retention of senior school leaders across the UK PI Toby Greany"),
    ("project_updates", "Toby Greany"),
    ("project_updates", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Professor Rebecca Eynon"),
    ("project_updates", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Rebecca Eynon"),
    ("project_updates", "Update from the ERP projects"),
    ("project_updates", "Update from the ESRC Education Research Programme"),
    ("project_updates", "Update from the projects"),
    ("project_updates", "Updates from David Lundie"),
    ("project_updates", "Updates from Steph Ainsworth"),
    ("project_updates", "Updates from the ERP projects"),
    ("project_updates", "Updates from the ESRC"),
    ("project_updates", "Updates from the Programme"),
    ("project_updates", "Updates from the programme"),
    ("project_updates", "Updates from the projects"),
    ("project_updates", "Decentring the 'resilient teacher': exploring interactions between individuals and their social ecologies PI: Steph Ainsworth"),
    ("project_updates", "Peer reviewed articles from the ERP projects"),
    ("project_updates", "Peer reviewed publications from the ERP projects"),
    ("project_updates", "Seminar series topics"),
    ("project_updates", "Seminar topics"),

    # digital_ed
    ("digital_ed", "EdTech"),

    # political_context_and_organisations
    ("political_context_and_organisations", "What are the politicians saying?"),
    ("political_context_and_organisations", "What Matters in Education?"),
    ("political_context_and_organisations", "What matters in education?"),
    ("political_context_and_organisations", "4 Nations"),
    ("political_context_and_organisations", "4 Nations & key organisations"),
    ("political_context_and_organisations", "Political environment and key organisations"),
    ("political_context_and_organisations", "Political landscape - the election"),
    ("political_context_and_organisations", "Political landscape & key organisations"),
    ("political_context_and_organisations", "Political landscape across Four Nations & key organisations"),
    ("political_context_and_organisations", "Research – Practice – Policy"),
    ("political_context_and_organisations", "Calls for evidence"),
    ("political_context_and_organisations", "DfE"),
    ("political_context_and_organisations", "Education, Policy & Practice"),
    ("political_context_and_organisations", "EEF"),
    ("political_context_and_organisations", "ESRC"),
    ("political_context_and_organisations", "Politics"),
    ("political_context_and_organisations", "Launch of ESRC survey on social science research skills"),
    ("political_context_and_organisations", "Updates from UKRI"),  # plural already here
    ("political_context_and_organisations", "Update from UKRI"),   # moved here (singular)
    ("political_context_and_organisations", "Four Nations"),       # moved here
    ("political_context_and_organisations", "Four Nations Landscape"),
    ("political_context_and_organisations", "Four Nations landscape"),

    # events_opportunities_research
    ("events_opportunities_research", "Conferences"),
    ("events_opportunities_research", "Opportunities"),
    ("events_opportunities_research", "Opportunities for funding"),
    ("events_opportunities_research", "Opportunities to blog"),
    ("events_opportunities_research", "Other Reports"),
    ("events_opportunities_research", "Other Research"),
    ("events_opportunities_research", "Relevant Events"),
    ("events_opportunities_research", "Relevant Research"),
    ("events_opportunities_research", "Reports"),
    ("events_opportunities_research", "Research"),
    ("events_opportunities_research", "Events"),

    # teacher_rrd
    ("teacher_rrd", "Teacher recruitment, retention & development"),
]

# ---------- 3) Build lookup (normalized)
lookup = {norm_theme(curr): new for new, curr in pairs}

# ---------- 4) Apply theme mapping (no fill yet)
theme_norm = df["theme"].map(norm_theme)
df["new_theme"] = theme_norm.map(lookup)

# ---------- 4b) Defensive keyword overrides (force correct bucket if text contains patterns)
kw_mask = (
    theme_norm.str.contains(r"\bfour nations\b", regex=True, na=False) |
    theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)
)
df.loc[kw_mask, "new_theme"] = "political_context_and_organisations"

# ---------- 5) Subtheme-based overrides
sub_norm = df["subtheme"].map(norm_key)

target_rrd = "teacher recruitment retention and development"
df.loc[sub_norm.eq(target_rrd), "new_theme"] = "teacher_rrd"  # any variant mapped earlier → normalized equals this
df.loc[sub_norm.eq("digital"), "new_theme"] = "digital_ed"

# ---------- 6) Fill any remaining unmapped with the original theme text (your previous behavior)
df["new_theme"] = df["new_theme"].fillna(df["theme"])

# ---------- 7) Export a summary
summary = (
    df.assign(theme_norm=theme_norm, subtheme_norm=sub_norm)
      .groupby(["new_theme", "theme_norm"], dropna=False)
      .size()
      .reset_index(name="count")
      .sort_values(["new_theme", "count"], ascending=[True, False])
)

out_dir = "/workspaces/ERP_Newsletter/data_cleaning"
os.makedirs(out_dir, exist_ok=True)
summary_path = os.path.join(out_dir, "theme_mapping_summary.xlsx")

with pd.ExcelWriter(summary_path) as xw:
    df.to_excel(xw, sheet_name="data_with_new_theme", index=False)
    summary.to_excel(xw, sheet_name="mapping_summary", index=False)

print(f"✅ Dropped {dropped_rows} unsubscribe row(s).")
print("✅ Mapping applied.")
print("📄 Excel written to:", summary_path)

  theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)


✅ Dropped 28 unsubscribe row(s).
✅ Mapping applied.
📄 Excel written to: /workspaces/ERP_Newsletter/data_cleaning/theme_mapping_summary.xlsx


In [17]:
# ---------- 7) View unique new_theme values and their counts
theme_counts = (
    df["new_theme"]
    .value_counts(dropna=False)
    .reset_index()
    .rename(columns={"index": "new_theme", "new_theme": "count"})
)

print("🧭 Unique new_theme values and their counts:")
print(theme_counts)

🧭 Unique new_theme values and their counts:
                                 count  count
0  political_context_and_organisations    697
1                      project_updates    203
2                          teacher_rrd    184
3                           digital_ed    176
4        events_opportunities_research     36


# Number of unique domain names 

In [18]:
# Extract domain names from the 'link' column
df["domain"] = df["link"].apply(lambda x: urlparse(str(x)).netloc if pd.notna(x) else None)

# Count unique domains
unique_domains = df["domain"].nunique()

print(f"🌐 There are {unique_domains} unique domains in this dataset.")

# Optional: see the top 10 most common domains
domain_counts = df["domain"].value_counts().reset_index()
domain_counts.columns = ["domain", "count"]
print(domain_counts.head(40))

🌐 There are 346 unique domains in this dataset.
                                   domain  count
0                       schoolsweek.co.uk    150
1                           www.ucl.ac.uk     75
2                              www.gov.uk     59
3                     theconversation.com     32
4                     www.theguardian.com     30
5                          www.bera.ac.uk     29
6                              epi.org.uk     27
7                    www.eventbrite.co.uk     25
8                          www.nfer.ac.uk     23
9                            www.gov.scot     20
10  bera-journals.onlinelibrary.wiley.com     18
11             www.belfasttelegraph.co.uk     16
12                www.education-ni.gov.uk     16
13               committees.parliament.uk     15
14             ffteducationdatalab.org.uk     14
15                            www.tes.com     14
16                      meetoecd1.zoom.us     12
17             www.nuffieldfoundation.org     12
18                   

In [19]:
#export a file to review domain names, remove irrelevant names and create a new 'organisation' name based on domain name 
review_path = "/workspaces/ERP_Newsletter/data_cleaning/domain_review.xlsx"
domain_counts.to_excel(review_path, index=False)
print("Wrote:", review_path)

Wrote: /workspaces/ERP_Newsletter/data_cleaning/domain_review.xlsx


# Add'organisation' column and remove irreleant domain names

In [20]:
# Mapping from domain → organisation
domain_to_org = {
    "schoolsweek.co.uk": "schools_week",
    "www.ucl.ac.uk": "ucl",
    "www.gov.uk": "uk_government",
    "theconversation.com": "conversation",
    "www.theguardian.com": "guardian",
    "www.bera.ac.uk": "bera",
    "epi.org.uk": "epi",
    "www.eventbrite.co.uk": "REMOVE",
    "www.nfer.ac.uk": "nfer",
    "www.gov.scot": "scottish_government",
    "bera-journals.onlinelibrary.wiley.com": "bera_journals",
    "www.belfasttelegraph.co.uk": "belfast_telegraph",
    "www.education-ni.gov.uk": "ni_government",
    "committees.parliament.uk": "uk_parliament",
    "ffteducationdatalab.org.uk": "fft_ed_datalab",
    "www.tes.com": "tes",
    "meetoecd1.zoom.us": "REMOVE",
    "www.nuffieldfoundation.org": "nuffield",
    "www.gov.wales": "welsh_government",
    "blogs.ucl.ac.uk": "ucl",
    "": "REMOVE",
    "blog.bham.ac.uk": "university_of_birmingham",
    "upen.ac.uk": "upen",
    "www.instituteforgovernment.org.uk": "ifg",
    "fed.education": "fed",
    "education.us18.list-manage.com": "REMOVE",
    "ifs.org.uk": "ifs",
    "mediacentral.ucl.ac.uk": "ucl",
    "educationwales.blog.gov.wales": "welsh_government",
    "www.childrenscommissioner.gov.uk": "childrens_commissioner",
    "www.bbc.co.uk": "bbc",
    "edtech.oii.ox.ac.uk": "oii_edtech_equity",
    "bera.us9.list-manage.com": "REMOVE",
    "teachertapp.co.uk": "teacher_tapp",
    "www.oecd.org": "oecd",
    "educationendowmentfoundation.org.uk": "eef",
    "www.ukri.org": "ukri",
    "childrens-participation.org": "childrends_participation_in_schools",
    "t.co": "twitter",
    "lordslibrary.parliament.uk": "house_of_lords_library",
    "my.chartered.college": "cct",
    "post.parliament.uk": "post_parliament",
    "lgiu.us3.list-manage.com": "REMOVE",
    "parliament.us16.list-manage.com": "REMOVE",
    "wonkhe.com": "wonkhe",
    "uk.bettshow.com": "bett_show",
    "newsletter.oecd.org": "oecd",
    "bit.ly": "twitter",
    "www.tandfonline.com": "taylor_and_francis",
    "chartered.college": "chartered_college_of_teaching",
    "teachertapp.com": "teacher_tapp",
    "www.thebritishacademy.ac.uk": "british_academy",
    "5rightsfoundation.com": "5rights_foundation",
    "www.independent.co.uk": "independent",
    "neu.org.uk": "national_education_union",
    "theippo.co.uk": "ippo",
    "www.linkedin.com": "linkedin",
    "www.eventbrite.com": "REMOVE",
    "forms.office.com": "REMOVE",
    "www.ippr.org": "ippr",
    "www.hepi.ac.uk": "hepi",
    "www.nesta.org.uk": "nesta",
    "assets.publishing.service.gov.uk": "uk_government",
    "journals.sagepub.com": "sage_journals",
    "d2tic4wvo1iusb.cloudfront.net": "REMOVE",
    "inews.co.uk": "inews",
    "discovery.ucl.ac.uk": "ucl",
    "education.gov.scot": "education_scotland",
    "email.thebritishacademy.ac.uk": "british_academy",
    "digitalpovertyalliance.org": "digital_poverty_alliance",
    "feweek.co.uk": "fe_week",
    "contacts.epi.org.uk": "REMOVE",
    "sustainableschoolleadership.uk": "sustainable_school_leadership",
    "www.oecd-ilibrary.org": "oecd",
    "www.transformingsociety.co.uk": "transforming_society",
    "www.upen.ac.uk": "upen",
    "manmetjobs.mmu.ac.uk": "manchester_metropolitan_university",
    "www.edge.co.uk": "edge_foundation",
    "www.jrf.org.uk": "joseph_rowntree_foundation",
    "www.unicef.org": "unicef",
    "www.adalovelaceinstitute.org": "ada_lovelace_institute",
    "literacytrust.org.uk": "national_literacy_trust",
    "events.teams.microsoft.com": "REMOVE",
    "engagementhub.ukri.org": "ukri",
    "cfey.org": "centre_for_education_and_youth",
    "www.nao.org.uk": "national_audit_office",
    "upen.us14.list-manage.com": "REMOVE",
    "www.edtechstrategylab.org": "edtech_strategy_lab",
    "el.wiley.com": "wiley",
    "senedd.wales": "welsh_parliament",
    "hansard.parliament.uk": "uk_parliament",
    "www.cape.ac.uk": "cape_collaboration_for_public_engagement",
    "options2040.co.uk": "options_2040_project",
    "transforming-evidence.org": "transforming_evidence",
    "www.orielsquare.co.uk": "oriel_square",
    "www.institute.global": "tony_blair_institute",
    "www.evaluation.impactedgroup.uk": "impacted_group",
    "open.spotify.com": "spotify_podcast",
    "educationhub.blog.gov.uk": "uk_government",
    "www.politicshome.com": "politics_home",
    "eprints.lse.ac.uk": "lse_repository",
    "business.senedd.wales": "welsh_parliament",
    "bigeducation.org": "big_education",
    "cpag.org.uk": "child_poverty_action_group",
    "www.tickettailor.com": "REMOVE",
    "click.communications.gse.harvard.edu": "REMOVE",
    "www.telegraph.co.uk": "daily_telegraph",
    "www.n8research.org.uk": "n8_research_partnership",
    "www.centreforyounglives.org.uk": "centre_for_young_lives",
    "cdn.prod.website-files.com": "REMOVE",
    "www.nasuwt.org.uk": "nasuwt_teachers_union",
    "www.de.ed.ac.uk": "university_of_edinburgh",
    "www.libdems.org.uk": "liberal_democrats",
    "wcpp.org.uk": "wales_centre_for_public_policy",
    "www.naht.org.uk": "national_association_head_teachers",
    "cstuk.org.uk": "charities_supporting_teachers_uk",
    "www.turing.ac.uk": "alan_turing_institute",
    "www.health-ni.gov.uk": "ni_department_of_health",
    "covidandsociety.us1.list-manage.com": "REMOVE",
    "lgiu.org": "local_government_information_unit",
    "www.mirror.co.uk": "daily_mirror",
    "drive.google.com": "REMOVE",
    "www.educationsupport.org.uk": "education_support_charity",
    "publicpolicydesign.blog.gov.uk": "uk_government",
    "epi.us15.list-manage.com": "REMOVE",
    "durhamuniversity.zoom.us": "REMOVE",
    "www.ascl.org.uk": "ascl",
    "www.nurseryworld.co.uk": "nursery_world_magazine",
    "teachingcommission.co.uk": "teaching_commission",
    "acss.org.uk": "academy_of_social_sciences",
    "defenddigitalme.org": "defend_digital_me",
    "www.researchgate.net": "researchgate",
    "crae.org.uk": "children_rights_alliance_england",
    "fairnessfoundation.com": "fairness_foundation",
    "www.durham.ac.uk": "durham_university",
    "www.gse.harvard.edu": "harvard_graduate_school_of_education",
    "politico.us8.list-manage.com": "REMOVE",
    "www.internetmatters.org": "internet_matters",
    "www.parliament.uk": "uk_parliament",
    "ripl.uk": "research_improvement_for_policy_and_learning",
    "www.youtube.com": "youtube",
    "www.coproductioncollective.co.uk": "coproduction_collective",
    "www.britishcouncil.org": "british_academy",
    "www.techuk.org": "tech_uk",
    "localed2025.org.uk": "local_ed_2025",
    "www.lgcplus.com": "local_government_chronicle",
    "lnkd.in": "linkedin",
    "www.economy-ni.gov.uk": "ni_department_for_economy",
    "techbullion.com": "techbullion",
    "ukla.us10.list-manage.com": "REMOVE",
    "educationscape.us4.list-manage.com": "REMOVE",
    "linkprotect.cudasvc.com": "REMOVE",
    "commonslibrary.parliament.uk": "house_of_commons_library",
    "play.wales": "play_wales",
    "www.edtechdigest.com": "edtech_digest",
    "cep.lse.ac.uk": "centre_for_economic_performance_lse",
    "www.elsevier.com": "elsevier",
    "www.express.co.uk": "daily_express",
    "onlinelibrary.wiley.com": "wiley",
    "institute.global": "tony_blair_institute",
    "www.insideedgetraining.co.uk": "inside_edge_training",
    "research.senedd.wales": "welsh_parliament",
    "www.frontiersin.org": "frontiers_journal",
    "openpolicy.blog.gov.uk": "uk_government",
    "www.mmu.ac.uk": "manchester_metropolitan_university",
    "blogs.uwe.ac.uk": "uwe_bristol_blog",
    "www.contractsfinder.service.gov.uk": "uk_government",
    "www.parliament.scot": "scottish_parliament",
    "njmok7zy3oa.typeform.com": "REMOVE",
    "public-api.wordpress.com": "wordpress",
    "www.sciencecampaign.org.uk": "campaign_for_science_and_engineering",
    "www.nottingham.ac.uk": "university_of_nottingham",
    "zoom.us": "REMOVE",
    "www.digit.fyi": "digit_fyi",
    "explore-education-statistics.service.gov.uk": "dfe_education_statistics",
    "www.atkinsrealis.com": "atkins_realis",
    "profbeckyallen.substack.com": "becky_allen_substack",
    "www.lse.ac.uk": "london_school_of_economics",
    "www.uwe.ac.uk": "uew_england",
    "wcpp.us12.list-manage.com": "REMOVE",
    "researchonresearch.org": "research_on_research_institute",
    "www.funding-futures.org": "funding_futures",
    "www.royalacademy.org.uk": "royal_academy",
    "unesdoc.unesco.org": "unesco",
    "srhe.ac.uk": "society_for_research_into_higher_education",
    "nfer.ac.uk": "nfer",
    "www.expressandstar.com": "express_and_star",
    "labour.org.uk": "labour_party",
    "econpapers.repec.org": "repec_econpapers",
    "uclpress.scienceopen.com": "ucl",
    "edarxiv.org": "education_arxiv",
    "mmail.dods.co.uk": "REMOVE",
    "www.unesco.org": "unesco",
    "daily.jstor.org": "jstor_daily",
    "www.jstor.org": "jstor",
    "www.thenhsa.co.uk": "northern_health_science_alliance",
    "www.ambition.org.uk": "ambition_institute",
    "www.fda.org.uk": "fda_union",
    "www.birmingham.ac.uk": "university_of_birmingham",
    "magicsmoke.substack.com": "magicsmoke_substack",
    "thebritishacademyecrn.com": "british_academy",
    "www.edtechinnovationhub.com": "edtech_innovation_hub",
    "link.news.inews.co.uk": "inews",
    "arcinstitute.org": "arc_institute",
    "www.thetimes.com": "the_times",
    "scholar.harvard.edu": "harvard_graduate_school_of_education",
    "www.the-tls.co.uk": "times_literary_supplement",
    "shadowpanel.uk": "shadow_panel_project",
    "www.oxfordschoolofthought.org": "oxford_school_of_thought",
    "www.schoolsappg.org.uk": "all_party_parliamentary_group_schools",
    "educationappg.org.uk": "education_appg",
    "benniekara.substack.com": "bennie_kara_substack",
    "www.twinkl.co.uk": "twinkl",
    "www.eyalliance.org.uk": "early_years_alliance",
    "niot.org.uk": "national_institute_of_teaching",
    "universitas21.com": "universitas_21",
    "ascl.org.uk": "ascl",
    "www.scottishai.com": "scottish_ai",
    "www.rijksoverheid.nl": "dutch_government",
    "samf.substack.com": "samf_substack",
    "londondesignbiennale.com": "london_design_biennale",
    "upp-foundation.org": "upp_foundation",
    "inclusioninpractice.org.uk": "inclusion_in_practice",
    "publications.parliament.uk": "uk_parliament",
    "observer.co.uk": "the_observer",
    "dundee.onlinesurveys.ac.uk": "university_of_dundee_surveys",
    "wonkhe.cmail20.com": "REMOVE",
    "digitalyouthindex.uk": "digital_youth_index",
    "neweconomics.org": "new_economics_foundation",
    "beyth.co.uk": "beyth_consultancy",
    "the-difference.com": "the_difference",
    "www.innovate-ed.uk": "innovate_ed",
    "www.oecd-events.org": "oecd",
    "youtu.be": "youtube",
    "ffteducationdatalab.us12.list-manage.com": "REMOVE",
    "www.centreforsocialjustice.org.uk": "centre_for_social_justice",
    "gamayo.co.uk": "gamayo",
    "profiles.ucl.ac.uk": "ucl",
    "rebeccaallen.co.uk": "rebecca_allen",
    "pod.co": "pod_co_podcast",
    "www.teachfirst.org.uk": "teach_first",
    "edsk.org": "edsk_think_tank",
    "digitalgood.net": "digital_good_network",
    "lnks.gd": "REMOVE",
    "issuu.com": "issuu",
    "adcs.org.uk": "association_of_directors_of_childrens_services",
    "www.labourtogether.uk": "labour_together",
    "downloads2.dodsmonitoring.com": "dods_monitoring",
    "www.besa.org.uk": "british_academy",
    "nepc.colorado.edu": "national_education_policy_center",
    "cipr.co.uk": "chartered_institute_of_public_relations",
    "img1.wsimg.com": "REMOVE",
    "www.leverhulme.ac.uk": "leverhulme_trust",
    "www.ons.gov.uk": "office_for_national_statistics",
    "labourlist.org": "labour_list",
    "www.centreformentalhealth.org.uk": "centre_for_mental_health",
    "lnu-se.zoom.us": "REMOVE",
    "lse.zoom.us": "REMOVE",
    "hechingerreport.org": "hechinger_report",
    "researcheracademy.elsevier.com": "elsevier_researcher_academy",
    "www.smf.co.uk": "social_market_foundation",
    "www.suttontrust.com": "sutton_trust",
    "www.research.net": "REMOVE",
    "y3r710.r.eu-west-1.awstrack.me": "REMOVE",
    "lucaf.org": "lucas_education_foundation",
    "learning.nspcc.org.uk": "nspcc_learning",
    "news.comms.nao.org.uk": "national_audit_office",
    "youthendowmentfund.org.uk": "youth_endowment_fund",
    "www.bigissue.com": "big_issue",
    "demos.co.uk": "demos",
    "links-2.govdelivery.com": "REMOVE",
    "news.chartered.college": "chartered_college_news",
    "www.workinglivesofteachers.com": "working_lives_of_teachers",
    "one.oecd.org": "oecd",
    "blog.policy.manchester.ac.uk": "policy_manchester_blog",
    "twitter.com": "twitter",
    "mcrmetropolis.uk": "manchester_metropolitan_university",
    "gtr.ukri.org": "ukri",
    "teaching-vacancies.service.gov.uk": "dfe_teaching_vacancies",
    "ari.org.uk": "ari_association_for_research_innovation",
    "kingsfundmail.org.uk": "kings_fund",
    "www.mdpi.com": "mdpi_journals",
    "blogs.gov.scot": "scottish_government",
    "parliamentlive.tv": "uk_parliament",
    "e-estonia.com": "e_estonia",
    "us9.campaign-archive.com": "REMOVE",
   "podfollow.com": "podfollow_podcast",
    "nation.cymru": "nation_cymru",
    "www.holyrood.com": "holyrood_magazine",
    "impactedgroup.us22.list-manage.com": "REMOVE",
    "teachersuccess.co.uk": "teacher_success",
    "tpea.ac.uk": "tpea_association",
    "www.fenews.co.uk": "fe_news",
    "www.qmul.ac.uk": "queen_mary_university_london",
    "www.echild.ac.uk": "echild_research_centre",
    "www.standard.co.uk": "evening_standard",
    "x.com": "twitter",
    "goodthingsfoundation.us7.list-manage.com": "REMOVE",
    "www.ocr.org.uk": "ocr_exam_board",
    "ideas.repec.org": "repec_ideas",
    "educationinspection.blog.gov.uk": "ofsted_blog",
    "medium.com": "medium",
    "niot.s3.amazonaws.com": "REMOVE",
    "lxhriqcab.cc.rs6.net": "REMOVE",
    "consult.education.gov.uk": "dfe_consultations",
    "onthinktanks.org": "on_think_tanks",
    "etat.uea.ac.uk": "university_of_east_anglia",
    "ucl.us20.list-manage.com": "REMOVE",
    "www.wsj.com": "wall_street_journal",
    "doi-org.libproxy.ucl.ac.uk": "doi_via_ucl_proxy",
    "www.kcl.ac.uk": "kings_college_london",
    "www.chandlerinstitute.org": "chandler_institute",
    "unige.zoom.us": "REMOVE",
    "insights.taylorandfrancis.com": "taylor_and_francis",
    "www.faircomment.co.uk": "fair_comment",
    "www.sciencedirect.com": "sciencedirect",
    "jacobsfoundation.us11.list-manage.com": "REMOVE",
    "www.barnardos.org.uk": "barnardos",
    "bbc.co.uk": "bbc",
    "fullfact.org": "full_fact",
    "news.sky.com": "sky_news",
    "www.magicbreakfast.com": "magic_breakfast",
    "ow.ly": "twitter",
    "durham.cloud.panopto.eu": "REMOVE",
    "adc.bmj.com": "british_medical_journal",
    "www.nationalcrimeagency.gov.uk": "national_crime_agency",
    "newvisionsforeducation.org.uk": "new_visions_for_education",
    "thestaffcollege.uk": "staff_college",
    "www.yorkshirepost.co.uk": "yorkshire_post",
    "doi.org": "REMOVE",
    "www.bristol.ac.uk": "university_of_bristol",
    "media.actionforchildren.org.uk": "action_for_children",
    "www.pearson.com": "pearson",
    "www.coe.int": "council_of_europe",
    "whatson.parliament.uk": "uk_parliament",
    "www.civilservicejobs.service.gov.uk": "uk_civil_service_jobs",
    "www.ft.com": "financial_times",
    "soundcloud.com": "soundcloud",
    "www.hmc.org.uk": "headmasters_and_headmistresses_conference",
    "app.getresponse.com": "REMOVE",
    "www.ntu.ac.uk": "nottingham_trent_university",
    "ippr-org.files.svdcdn.com": "ippr",
    "www.childreninwales.org.uk": "children_in_wales",
    "nuffieldfoundation.cmail20.com": "REMOVE",
    "www.wired-gov.net": "wired_gov",
    "nuffieldfoundation.cmail19.com": "REMOVE"
}

In [21]:
df["organisation"] = df["domain"].map(domain_to_org)

In [22]:
df = df[df["organisation"].notna()]
df = df[df["organisation"] != "REMOVE"]

In [23]:
df.head(1)


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation
1,c97ff62f-83ca-47ec-a4c7-b4e24157ae0a,1,11 July 2023,Calls for evidence,,Deadline 23 August 2023,Education secretary Gillian Keegan has launche...,https://schoolsweek.co.uk/chatgpt-keegan-launc...,political_context_and_organisations,schoolsweek.co.uk,schools_week


In [24]:
#number of unique organisations 
unique_orgs = df["organisation"].nunique()
print(f"Number of unique organisations: {unique_orgs}")

Number of unique organisations: 245


In [25]:
#Groups organisations by theme 

orgs_by_theme = (
    df.groupby("new_theme")["organisation"]
      .apply(lambda x: ", ".join(sorted(x.unique())))
      .reset_index()
)

# Save to CSV for reference
orgs_by_theme.to_csv("organisations_by_theme.csv", index=False)

print("✅ Grouped table saved as 'organisations_by_theme.csv'")

✅ Grouped table saved as 'organisations_by_theme.csv'


# Inspect "Title" and "Description" 


In [26]:
df[['title', 'description']].info()
df[['title', 'description']].isna().sum()
df[['title', 'description']].head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 1152 entries, 1 to 1666
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1151 non-null   string
 1   description  1152 non-null   string
dtypes: string(2)
memory usage: 27.0 KB


Unnamed: 0,title,description
1,Deadline 23 August 2023,Education secretary Gillian Keegan has launche...
2,Revealed: the experts advising ministers on te...,The Department for Education has appointed an ...
3,"Reject fewer teacher applicants, DfE tells tra...","Susan Acland-Hood, the DfE's permanent secreta..."
4,Ofqual and DfE studying 'feasibility' of 'full...,Some exam boards are already piloting on-scree...
6,Revealed: The full details of Labour's educati...,Entitled 'Breaking down the barrier to opportu...
7,Lib Dem,"Munira Wilson, Lib Dem spokesperson for educat..."
8,Teacher retention commission: 8 proposals to s...,Teacher wellbeing chari ty Education Support h...
9,Who's supporting school leaders to stop them h...,"Recruitment and retention difficulties, Ofsted..."
11,Digital Poverty Alliance,A charity whose vision is for everyone to acce...
12,A long read from Nuffield funded research proj...,"In brief, the report finds:"


In [27]:
df['title_length'] = df['title'].str.len()
df['description_length'] = df['description'].str.len()
df[['title_length', 'description_length']].describe()

Unnamed: 0,title_length,description_length
count,1151.0,1152.0
mean,84.141616,219.789931
std,43.560133,181.233339
min,6.0,5.0
25%,60.0,105.0
50%,77.0,176.0
75%,100.5,275.0
max,482.0,1478.0


In [28]:
#Inspect titles with >50 words 
df['title_word_count'] = df['title'].apply(lambda x: len(str(x).split()))

long_titles = df[df['title_word_count'] > 50]

output_path = "long_titles_over_50_words.csv"
long_titles.to_csv(output_path, index=False)

print(f"✅ Saved {len(long_titles)} rows with titles over 50 words to: {output_path}")

✅ Saved 6 rows with titles over 50 words to: long_titles_over_50_words.csv


In [29]:
#Inspect descriptions with <10 words 

df['description_word_count'] = df['description'].apply(lambda x: len(str(x).split()))

short_descriptions = df[df['description_word_count'] < 10]

output_path = "short_descriptions_under_10_words.csv"
short_descriptions.to_csv(output_path, index=False)

print(f"✅ Saved {len(short_descriptions)} rows with descriptions under 10 words to: {output_path}")


✅ Saved 176 rows with descriptions under 10 words to: short_descriptions_under_10_words.csv


# Create 'Text' Variable = 'Title' + 'Description'

In [30]:
#Create "Text" variable = "Title" + "Description" 
df['text'] = df['title'].fillna('') + ' ' + df['description'].fillna('')

In [31]:
# Basic info on the new column
print(df['text'].info())

# Add a column for text length (number of words or characters)
df['text_length_chars'] = df['text'].str.len()
df['text_length_words'] = df['text'].str.split().str.len()

# Summary statistics
print("\nCharacter length stats:")
print(df['text_length_chars'].describe())

<class 'pandas.core.series.Series'>
Index: 1152 entries, 1 to 1666
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
1152 non-null   string
dtypes: string(1)
memory usage: 18.0 KB
None

Character length stats:
count        1152.0
mean     304.858507
std      183.590543
min            43.0
25%           184.0
50%           261.0
75%          361.25
max          1544.0
Name: text_length_chars, dtype: Float64


In [32]:
# Check for missing or empty values
missing_mask = df['text'].isna() | (df['text'].str.strip() == '')

# Count how many
missing_count = missing_mask.sum()
print(f"Missing or empty 'text' entries: {missing_count}")

# Optionally view them
if missing_count > 0:
    print(df.loc[missing_mask, ['title', 'description']].head())


Missing or empty 'text' entries: 0


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1152 entries, 1 to 1666
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      1152 non-null   string
 1   newsletter_number       1152 non-null   int64 
 2   issue_date              1152 non-null   string
 3   theme                   1152 non-null   string
 4   subtheme                68 non-null     string
 5   title                   1151 non-null   string
 6   description             1152 non-null   string
 7   link                    1152 non-null   string
 8   new_theme               1152 non-null   object
 9   domain                  1152 non-null   object
 10  organisation            1152 non-null   object
 11  title_length            1151 non-null   Int64 
 12  description_length      1152 non-null   Int64 
 13  title_word_count        1152 non-null   int64 
 14  description_word_count  1152 non-null   int64 
 15  text     

In [34]:
df.columns

Index(['id', 'newsletter_number', 'issue_date', 'theme', 'subtheme', 'title',
       'description', 'link', 'new_theme', 'domain', 'organisation',
       'title_length', 'description_length', 'title_word_count',
       'description_word_count', 'text', 'text_length_chars',
       'text_length_words'],
      dtype='object')

# Save Files 

In [35]:
# Define output directory
out_dir = "/workspaces/ERP_Newsletter/data_processed"
os.makedirs(out_dir, exist_ok=True)

# 1️Save full cleaned dataset
data_cleaned_path = os.path.join(out_dir, "data_cleaned.csv")
df.to_csv(data_cleaned_path, index=False)
print(f"✅ Saved full cleaned dataset -> {data_cleaned_path}")

# 2️Save subset for preprocessing
cols_for_preprocessing = [
    "id",
    "newsletter_number",
    "issue_date",
    "new_theme",
    "text",
    "domain",
    "organisation"
]

# Keep only columns that exist in the DataFrame
existing_cols = [c for c in cols_for_preprocessing if c in df.columns]
missing_cols = [c for c in cols_for_preprocessing if c not in df.columns]

if missing_cols:
    print(f"⚠️ Warning: Missing columns not found in df: {missing_cols}")

# Create and save subset
df_preproc = df[existing_cols].copy()
data_preproc_path = os.path.join(out_dir, "data_for_preprocessing.csv")
df_preproc.to_csv(data_preproc_path, index=False)

# ✅Confirmation summary
print(f"✅ Saved preprocessing dataset -> {data_preproc_path}")
print(f"\nSummary:")
print(f"  • data_cleaned.csv: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"  • data_for_preprocessing.csv: {df_preproc.shape[0]} rows, {df_preproc.shape[1]} columns")

✅ Saved full cleaned dataset -> /workspaces/ERP_Newsletter/data_processed/data_cleaned.csv
✅ Saved preprocessing dataset -> /workspaces/ERP_Newsletter/data_processed/data_for_preprocessing.csv

Summary:
  • data_cleaned.csv: 1152 rows, 18 columns
  • data_for_preprocessing.csv: 1152 rows, 7 columns
