In [1]:
import os 
import re 

import pandas as pd 
import numpy as np 

from ftfy import fix_text
import unicodedata as ud
from urllib.parse import urlparse

In [2]:
# Treat these text tokens as missing on read
NA_TOKENS = ["", " ", "NA", "N/A", "na", "NaN", "nan", "null", "NULL", "-"]

In [3]:
#load data 
input_path = "/workspaces/ERP_Newsletter/data/data01_newsletter_items/newsletter_items.csv"
data_cleaning_path = "/workspaces/ERP_Newsletter/data/data02_cleaning"
output_path = "/workspaces/ERP_Newsletter/data/data03_newsletter_items_clean"


df = pd.read_csv(input_path, keep_default_na=True, na_values=NA_TOKENS)

In [4]:
#inspect 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1668 non-null   object
 1   newsletter_number  1668 non-null   int64 
 2   issue_date         1668 non-null   object
 3   theme              1668 non-null   object
 4   subtheme           114 non-null    object
 5   title              1667 non-null   object
 6   description        1346 non-null   object
 7   link               1616 non-null   object
dtypes: int64(1), object(7)
memory usage: 104.4+ KB


In [5]:
print(f"Total rows: {len(df)}")
print(f"Unique newsletter: {df['newsletter_number'].nunique()}")

Total rows: 1668
Unique newsletter: 87


#¬†Clean Up Text

In [6]:
def clean_series(s: pd.Series) -> pd.Series:
    # Use pandas "string" dtype so NaNs stay as <NA>
    s = s.astype("string")
    mask = s.notna()
    # Fix mojibake and normalize only on non-missing cells
    s.loc[mask] = s.loc[mask].apply(fix_text)
    s.loc[mask] = s.loc[mask].apply(lambda x: ud.normalize("NFKC", x))
    # Basic whitespace cleanup
    s.loc[mask] = s.loc[mask].str.replace(r"\s+", " ", regex=True).str.strip()
    return s

# Clean all object/string columns (quick and safe)
obj_cols = [c for c in df.columns if df[c].dtype == object or pd.api.types.is_string_dtype(df[c])]
for c in obj_cols:
    df[c] = clean_series(df[c])

# Quick exact replacements for the most common artifacts (optional, simple)
REPL = {
    "√Ç ": " ", "√Ç": "",
    "‚Äö√Ñ√¨": "‚Äì", "‚Äö√Ñ√Æ": "‚Äî",
    "‚Äö√Ñ√¥": "‚Äô", "‚Äö√Ñ√≤": "‚Äò",
    "‚Äö√Ñ√∫": "‚Äú", "‚Äö√Ñ√π": "‚Äù",
    "√¢‚Ç¨‚Äú": "‚Äì", "√¢‚Ç¨‚Äù": "‚Äî",
    "√¢‚Ç¨Àú": "‚Äò", "√¢‚Ç¨‚Ñ¢": "‚Äô",
    "√¢‚Ç¨≈ì": "‚Äú", "√¢‚Ç¨\x9d": "‚Äù",
    "√¢‚Ç¨¬¢": "‚Ä¢", "√¢‚Ç¨¬¶": "‚Ä¶",
}
for c in obj_cols:
    s = df[c].astype("string")
    for bad, good in REPL.items():
        s = s.str.replace(bad, good, regex=False)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    df[c] = s

# Check for Missing Values 

In [7]:
def missing_table(d: pd.DataFrame) -> pd.DataFrame:
    mc = d.isna().sum()
    return pd.DataFrame({
        "Missing Values": mc,
        "Percentage (%)": (mc / len(d)) * 100
    }).sort_values("Missing Values", ascending=False)

print("\n=== Missing values (before drop) ===")
print(missing_table(df))


=== Missing values (before drop) ===
                   Missing Values  Percentage (%)
subtheme                     1554       93.165468
description                   322       19.304556
link                           52        3.117506
title                           1        0.059952
theme                           0        0.000000
issue_date                      0        0.000000
newsletter_number               0        0.000000
id                              0        0.000000


#¬†Remove items where description, link or title are missing

In [8]:
# Remove rows where 'description' or 'link' is missing
df_cleaned = df.dropna(subset=['description', 'link', 'title'])

# (Optional) Check how many rows remain
print(f"Rows before: {len(df)}")
print(f"Rows after : {len(df_cleaned)}")

df = df_cleaned

Rows before: 1668
Rows after : 1323


#¬†Check for Duplicates 

###¬†All rows identical 

In [9]:
#All rows identical 
total_duplicates = df.duplicated().sum()
print(f"Total duplicate rows (all columns identical): {total_duplicates}")

Total duplicate rows (all columns identical): 0


### Title and link identical 

In [10]:
# Check duplicates where both title and link are the same
title_link_dupes = df[df.duplicated(subset=["title", "link"], keep=False)]

print(f"Number of duplicate title+link pairs: {title_link_dupes.shape[0]}")
title_link_dupes.sort_values(by=["title"]).head(2)

Number of duplicate title+link pairs: 89


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1301,bf6c4fd6-a5bd-48ca-9249-b5b92849e038,70,4 April 2025,Updates from the programme,,A reminder that the ESRC Education Research Pr...,"AI in Education: From chalkboards to chatbots,...",https://uk.bettshow.com/speakers/dominik-lukes
1327,0ad9176d-2b5a-4306-a9f3-2b4ffdf96be6,71,11 April 2025,Updates from the programme,,A reminder that the ESRC Education Research Pr...,"AI in Education: From chalkboards to chatbots,...",https://uk.bettshow.com/speakers/dominik-lukes


In [11]:
title_link_dupes.theme.value_counts()

theme
Updates from the programme                                                                                                                                                                                                    35
You have indicated that you are happy to receive news and updates from the ESRC Education Research Programme. To unsubscribe, please email Elizabeth.hudson@ucl.ac.uk with the word UNSUBSCRIBE in the title of the email.    28
What Matters in Education?                                                                                                                                                                                                     8
Updates from the Programme                                                                                                                                                                                                     5
Update from the ESRC Education Research Programme                                             

In [12]:
title_link_dupes[title_link_dupes.theme == "Teacher recruitment, retention & development"]

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
992,8991c36b-65fb-4011-9249-8f5917d32a0e,56,6 December 2024,"Teacher recruitment, retention & development",,DfE - Working lives of teachers and leaders: w...,A summary report of early findings from the th...,https://www.gov.uk/government/publications/wor...
1547,ac53ae25-d785-47b2-abd5-63d8e9583cbb,82,11 July 2025,"Teacher recruitment, retention & development",,DfE - Working lives of teachers and leaders: w...,Findings from the third wave of the working li...,https://www.gov.uk/government/publications/wor...


In [13]:
#drop duplicates keeping only first occurence 
df = df.drop_duplicates(subset=["title", "link"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1258 non-null   string
 1   newsletter_number  1258 non-null   int64 
 2   issue_date         1258 non-null   string
 3   theme              1258 non-null   string
 4   subtheme           85 non-null     string
 5   title              1258 non-null   string
 6   description        1258 non-null   string
 7   link               1258 non-null   string
dtypes: int64(1), string(7)
memory usage: 78.8 KB


### Title only duplicates

In [14]:
# Count duplicates based on title only
title_dupes = df[df.duplicated(subset=["title"], keep=False)]

print(f"Number of rows with duplicate titles: {title_dupes.shape[0]}")
title_dupes.sort_values(by="title").head(1)

Number of rows with duplicate titles: 20


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1240,15f17205-bafc-43d8-ba3d-b4881956e48b,87,10 October 2025,Updates from the Programme,,Addressing key issues in teacher recruitment a...,Catch up with the video of the latest in the W...,https://mediacentral.ucl.ac.uk/Play/126585


In [15]:
title_table = title_dupes[["title", "theme"]].value_counts().reset_index(name="count")
title_table

Unnamed: 0,title,theme,count
0,Making Teaching Attractive and Worthwhile (Par...,Project news,3
1,Deadline: 28 April 2025,Political environment and key organisations,2
2,What matters in education? Education after the...,Updates from the programme,2
3,Panel:,Updates from the programme,2
4,Addressing key issues in teacher recruitment a...,Updates from the Programme,2
5,What matters in education? Education in a brok...,Updates from the programme,2
6,Labour,Political landscape & key organisations,1
7,Digital Poverty Alliance,EdTech,1
8,Digital Poverty Alliance,Thematic roundup,1
9,Panel:,"Teacher recruitment, retention & development",1


In [16]:
df = df.drop_duplicates(subset=["title"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247 entries, 0 to 1246
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1247 non-null   string
 1   newsletter_number  1247 non-null   int64 
 2   issue_date         1247 non-null   string
 3   theme              1247 non-null   string
 4   subtheme           84 non-null     string
 5   title              1247 non-null   string
 6   description        1247 non-null   string
 7   link               1247 non-null   string
dtypes: int64(1), string(7)
memory usage: 78.1 KB


### link-only duplicates 

In [17]:
# Count duplicates based on link only
link_dupes = df[df.duplicated(subset=["link"], keep=False)]

print(f"Number of rows with duplicate links: {link_dupes.shape[0]}")
link_dupes.sort_values(by="link").head(1)

Number of rows with duplicate links: 114


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
450,17268e59-d380-4e8f-a2e7-964543717f17,35,10 May 2024,What matters in education?,,Big Education conference - 'Next Generation Sc...,Hear from schools across the country who are w...,https://bigeducation.org/product/next-generati...


In [18]:
pd.set_option("display.max_colwidth", None)

link_table = link_dupes[["link"]].value_counts().reset_index(name="count")
link_table

Unnamed: 0,link,count
0,https://www.ucl.ac.uk/education-research-programme/events/2023/oct/practical-policies-or-bright-ideas-how-particular-topics-get-front-policy-queue,4
1,https://www.ucl.ac.uk/education-research-programme/events/2024/mar/investing-early-years-priorities-and-challenges,4
2,https://uk.bettshow.com/speakers/dominik-lukes,3
3,https://www.ucl.ac.uk/education-research-programme/events/2024/jan/pupil-absence-questions-policy-research-and-practice,3
4,https://childrens-participation.org/,3
5,https://www.ucl.ac.uk/education-research-programme/events/2025/may/how-build-resilient-schools-place-based-approaches-supporting-teachers-and-leaders,3
6,https://edtech.oii.ox.ac.uk/cutting-through-the-conjecture,2
7,https://engagementhub.ukri.org/esrc-1/weshorizonscanningsurvey,2
8,https://education.us18.list-manage.com/track/click?u=61f408a2f9c6d02a726ce6200&id=bea3b5fbac&e=4eb2cf985e,2
9,https://epi.org.uk/events/labour-party-conference-prioritising-equality-education-policy-as-a-lever-to-tackling-disadvantage-and-inequalities,2


In [19]:
df = df.drop_duplicates(subset=["link"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186 entries, 0 to 1185
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1186 non-null   string
 1   newsletter_number  1186 non-null   int64 
 2   issue_date         1186 non-null   string
 3   theme              1186 non-null   string
 4   subtheme           80 non-null     string
 5   title              1186 non-null   string
 6   description        1186 non-null   string
 7   link               1186 non-null   string
dtypes: int64(1), string(7)
memory usage: 74.3 KB


# Identify themes and subthemes

In [20]:
#Unique counts of columns 
print("Unique titles:", df["title"].nunique())
print("Unique themes:", df["theme"].nunique())
print("Unique subthemes", df["subtheme"].nunique())
print("Unique links:", df["link"].nunique())

Unique titles: 1186
Unique themes: 62
Unique subthemes 35
Unique links: 1186


In [21]:
### Add placeholders for missing themes/subhtemes

# 1) Normalize empties/whitespace/"nan"/"none" to real NA
df_norm = df.copy()
for col in ["theme", "subtheme"]:
    df_norm[col] = (
        df_norm[col]
        .astype("string")
        .replace(r"^\s*$", pd.NA, regex=True)   # empty/whitespace ‚Üí NA
        .replace({"nan": pd.NA, "NaN": pd.NA, "None": pd.NA, "none": pd.NA})
    )

# 2) Create a version that fills NA with placeholders so ALL cases are counted
df_filled = df_norm.fillna({"theme": "No theme", "subtheme": "No subtheme"})

# 3) Group and count every (theme, subtheme) combo, including placeholder cases
theme_subtheme_counts = (
    df_filled
    .groupby(["theme", "subtheme"], dropna=False)
    .size()
    .reset_index(name="count")
    .sort_values(by=["theme", "subtheme"])
)

# 4) Export to Excel  
out_dir = data_cleaning_path
out_path = os.path.join(out_dir, "theme_subtheme_counts.xlsx")


theme_subtheme_counts.to_excel(out_path, index=False)  # <- this one
print(f"‚úÖ Exported {len(theme_subtheme_counts)} rows to {out_path}")

‚úÖ Exported 99 rows to /workspaces/ERP_Newsletter/data/data02_cleaning/theme_subtheme_counts.xlsx


# Check Themes and Articles 

In [22]:
# Filter articles under themes

check_themes = df[df["theme"] == "Research ‚Äì Practice ‚Äì Policy"].copy()

# View a few examples
display(check_themes.head(0))

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link


# Rename Themes

In [23]:
# ---------- 0) Drop rows where the entire theme is the unsubscribe text
UNSUB_THEME = (
    "You have indicated that you are happy to receive news and updates from the "
    "ESRC Education Research Programme. To unsubscribe, please email "
    "Elizabeth.hudson@ucl.ac.uk with the word UNSUBSCRIBE in the title of the email."
)
mask_unsub = df["theme"].astype(str).str.strip().eq(UNSUB_THEME)
dropped_rows = int(mask_unsub.sum())
df = df[~mask_unsub].copy()

# ---------- 1) Normalizers
def norm_theme(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    s = s.replace("‚Äî", "-").replace("‚Äì", "-")  # normalize dashes
    s = s.replace("‚Äô", "'").replace("‚Äò", "'").replace("‚Äú", '"').replace("‚Äù", '"')
    return s.lower()

def norm_key(s: str) -> str:
    """Strong normalizer for matching keys like subthemes:
       - lowercase; & -> and; remove punctuation; normalize dashes; collapse spaces
    """
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    s = s.replace("‚Äî", " ").replace("‚Äì", " ").replace("-", " ")
    s = s.replace("&", " and ")
    s = s.replace("‚Äô", "'").replace("‚Äò", "'").replace("‚Äú", '"').replace("‚Äù", '"')
    s = re.sub(r"[,\.\u00A0]", " ", s)         # remove commas, periods, NBSP
    s = re.sub(r"[^a-z0-9\s]", " ", s)         # drop other punctuation
    s = re.sub(r"\s+", " ", s).strip()
    return s

# ---------- 2) Theme mapping list: (new_theme, current_theme)
pairs = [
    # ============ erp_project ============
    ("erp_project", "Embedding children's participation rights in pedagogical practice in lower primary classrooms in Wales PI: Sarah Chicken"),
    ("erp_project", "Investigating the recruitment and retention of ethnic minority teachers PI: Stephen Gorard"),
    ("erp_project", "News from the Projects"),
    ("erp_project", "News from the projects"),
    ("erp_project", "PI Updates and Papers"),
    ("erp_project", "PI: David Lundie"),
    ("erp_project", "Programme news"),
    ("erp_project", "Programme Update"),
    ("erp_project", "Programme update"),
    ("erp_project", "Project news"),
    ("erp_project", "Rethinking teacher recruitment: New approaches to attracting prospective STEM teachers PI: Rob Klassen"),
    ("erp_project", "Sustainable school leadership: comparing approaches to the training, supply and retention of senior school leaders across the UK PI Toby Greany"),
    ("erp_project", "Toby Greany"),
    ("erp_project", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Professor Rebecca Eynon"),
    ("erp_project", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Rebecca Eynon"),
    ("erp_project", "Update from the ERP projects"),
    ("erp_project", "Update from the ESRC Education Research Programme"),
    ("erp_project", "Update from the projects"),
    ("erp_project", "Updates from David Lundie"),
    ("erp_project", "Updates from Steph Ainsworth"),
    ("erp_project", "Updates from the ERP projects"),
    ("erp_project", "Updates from the ESRC"),
    ("erp_project", "Updates from the Programme"),
    ("erp_project", "Updates from the programme"),
    ("erp_project", "Updates from the projects"),
    ("erp_project", "Decentring the 'resilient teacher': exploring interactions between individuals and their social ecologies PI: Steph Ainsworth"),
    ("erp_project", "Peer reviewed articles from the ERP projects"),
    ("erp_project", "Peer reviewed publications from the ERP projects"),

    # ============ what_matters_ed ============
    ("what_matters_ed", "What Matters in Education?"),
    ("what_matters_ed", "What matters in education?"),

    # ============ teacher_rrd ============
    ("teacher_rrd", "Teacher recruitment, retention & development"),

    # ============ edtech ============
    ("edtech", "EdTech"),

    # ============ four_nations ============
    ("four_nations", "4 Nations"),
    ("four_nations", "4 Nations & key organisations"),
    ("four_nations", "Four Nations"),
    ("four_nations", "Four Nations Landscape"),
    ("four_nations", "Four Nations landscape"),
    ("four_nations", "Political landscape across Four Nations & key organisations"),

    # ============ ppr (Policy‚ÄìPractice‚ÄìResearch) ============
    ("ppr", "Research ‚Äì Practice ‚Äì Policy"),
    ("ppr", "Education, Policy & Practice"),

    # ============ events_opportunities_research ============
    ("events_opportunities_research", "Conferences"),
    ("events_opportunities_research", "Opportunities"),
    ("events_opportunities_research", "Opportunities for funding"),
    ("events_opportunities_research", "Opportunities to blog"),
    ("events_opportunities_research", "Other Reports"),
    ("events_opportunities_research", "Other Research"),
    ("events_opportunities_research", "Relevant Events"),
    ("events_opportunities_research", "Relevant Research"),
    ("events_opportunities_research", "Reports"),
    ("events_opportunities_research", "Research"),
    ("events_opportunities_research", "Events"),
    ("events_opportunities_research", "Seminar series topics"),
    ("events_opportunities_research", "Seminar topics"),

    # ============ political_environment_key_organisations ============
    ("political_environment_key_organisations", "What are the politicians saying?"),
    ("political_environment_key_organisations", "Political environment and key organisations"),
    ("political_environment_key_organisations", "Political landscape - the election"),
    ("political_environment_key_organisations", "Political landscape & key organisations"),
    ("political_environment_key_organisations", "Calls for evidence"),
    ("political_environment_key_organisations", "DfE"),
    ("political_environment_key_organisations", "EEF"),
    ("political_environment_key_organisations", "ESRC"),
    ("political_environment_key_organisations", "Politics"),
    ("political_environment_key_organisations", "Launch of ESRC survey on social science research skills"),
    ("political_environment_key_organisations", "Updates from UKRI"),
    ("political_environment_key_organisations", "Update from UKRI"),
]

# ---------- 3) Build lookup (normalized)
lookup = {norm_theme(curr): new for new, curr in pairs}

# ---------- 4) Apply theme mapping (no fill yet)
theme_norm = df["theme"].map(norm_theme)
df["new_theme"] = theme_norm.map(lookup)

# ---------- 4b) Defensive keyword overrides (force correct bucket if text contains patterns)
kw_four_nations = theme_norm.str.contains(r"\b(4|four) nations\b", regex=True, na=False)
df.loc[kw_four_nations, "new_theme"] = "four_nations"

kw_ukri = theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)
df.loc[kw_ukri, "new_theme"] = "political_environment_key_organisations"

# ---------- 5) Subtheme-based overrides
sub_norm = df["subtheme"].map(norm_key)

target_rrd = "teacher recruitment retention and development"
df.loc[sub_norm.eq(target_rrd), "new_theme"] = "teacher_rrd"  # any variant mapped earlier ‚Üí normalized equals this
df.loc[sub_norm.eq("digital"), "new_theme"] = "edtech"

# ---------- 6) Fill any remaining unmapped with the original theme text
df["new_theme"] = df["new_theme"].fillna(df["theme"])

# ---------- 7) Export a summary
summary = (
    df.assign(theme_norm=theme_norm, subtheme_norm=sub_norm)
      .groupby(["new_theme", "theme_norm"], dropna=False)
      .size()
      .reset_index(name="count")
      .sort_values(["new_theme", "count"], ascending=[True, False])
)

out_dir = data_cleaning_path
summary_path = os.path.join(out_dir, "theme_mapping_summary.xlsx")

with pd.ExcelWriter(summary_path) as xw:
    df.to_excel(xw, sheet_name="data_with_new_theme", index=False)
    summary.to_excel(xw, sheet_name="mapping_summary", index=False)

print(f"‚úÖ Dropped {dropped_rows} unsubscribe row(s).")
print("‚úÖ Mapping applied with canonical themes.")
print("üìÑ Excel written to:", summary_path)


  kw_four_nations = theme_norm.str.contains(r"\b(4|four) nations\b", regex=True, na=False)
  kw_ukri = theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)


‚úÖ Dropped 0 unsubscribe row(s).
‚úÖ Mapping applied with canonical themes.
üìÑ Excel written to: /workspaces/ERP_Newsletter/data/data02_cleaning/theme_mapping_summary.xlsx


In [24]:
# ---------- 7) View unique new_theme values and their counts
theme_counts = (
    df["new_theme"]
    .value_counts(dropna=False)
    .reset_index()
    .rename(columns={"index": "new_theme", "new_theme": "count"})
)

print("üß≠ Unique new_theme values and their counts:")
print(theme_counts)

üß≠ Unique new_theme values and their counts:
                                     count  count
0  political_environment_key_organisations    221
1                          what_matters_ed    184
2                              teacher_rrd    178
3                                   edtech    166
4                                      ppr    147
5                              erp_project    144
6                             four_nations    112
7            events_opportunities_research     34


In [25]:
#save dataset with all themes 
all_clean_path = "/workspaces/ERP_Newsletter/data/data03_newsletter_items_clean/items_all_themes.csv"
df.to_csv(all_clean_path, index=False)

# keep only "ERP PROJECT" rows

In [26]:
df.new_theme.value_counts()

new_theme
political_environment_key_organisations    221
what_matters_ed                            184
teacher_rrd                                178
edtech                                     166
ppr                                        147
erp_project                                144
four_nations                               112
events_opportunities_research               34
Name: count, dtype: int64

In [27]:
# ---------- Keep only selected themes in new_theme
themes_to_keep = [
    "erp_project",
]

df = df[df["new_theme"].isin(themes_to_keep)].copy()

print(df["new_theme"].value_counts())

new_theme
erp_project    144
Name: count, dtype: int64


#¬†Number of unique domain names 

In [28]:
# Extract domain names
df["domain"] = df["link"].apply(lambda x: urlparse(str(x)).netloc if pd.notna(x) else None)

# Create a sorted list of unique domains
unique_domains_list = sorted(df["domain"].dropna().unique())

# Convert to DataFrame
unique_domains_df = pd.DataFrame(unique_domains_list, columns=["domain"])

# Define output path
output_file = "/workspaces/ERP_Newsletter/data/data02_cleaning/unique_domains_programme_updates.csv"

# Save to CSV
unique_domains_df.to_csv(output_file, index=False)

print(f"‚úÖ Saved {len(unique_domains_list)} unique domains to:\n{output_file}")

‚úÖ Saved 68 unique domains to:
/workspaces/ERP_Newsletter/data/data02_cleaning/unique_domains_programme_updates.csv


# Add'organisation' column and remove irreleant domain names

In [29]:
# Mapping from domain ‚Üí organisation
domain_to_org = {
    "assets.publishing.service.gov.uk": "uk_gov_publications",
    "bera-journals.onlinelibrary.wiley.com": "bera_journals",
    "bera.us9.list-manage.com": "bera_blog",
    "beyth.co.uk": "bristol_early_years_teaching_hub",
    "bit.ly": "undefined",
    "blogs.ucl.ac.uk": "ucl_blog",
    "blogs.uwe.ac.uk": "uwe_bristol_blog",
    "childrens-participation.org": "childrens_participation_in_schools",
    "daily.jstor.org": "jstor_daily",
    "discovery.ucl.ac.uk": "ucl_discovery",
    "drive.google.com": "google_drive",
    "durham.cloud.panopto.eu": "durham",
    "durhamuniversity.zoom.us": "durham",
    "edtech.oii.ox.ac.uk": "edtech_oii",
    "educationendowmentfoundation.org.uk": "eef",
    "educationscape.us4.list-manage.com": "educationscape",
    "engagementhub.ukri.org": "ukri_engagementhub",
    "etat.uea.ac.uk": "university_of_east_anglia",
    "events.teams.microsoft.com": "microsoft_teams_events",
    "forms.office.com": "microsoft_forms",
    "gamayo.co.uk": "gamayo",
    "gtr.ukri.org": "ukri_grant_tracker",
    "insights.taylorandfrancis.com": "taylor_and_francis_insights",
    "issuu.com": "undefined",
    "journals.sagepub.com": "sage_journals",
    "linkprotect.cudasvc.com": "undefined",
    "lnu-se.zoom.us": "uni_linne_zoom",
    "localed2025.org.uk": "local_ed_2025",
    "manmetjobs.mmu.ac.uk": "manchester_metropolitan_university_jobs",
    "mcrmetropolis.uk": "manchester_metropolitan_university_project",
    "mediacentral.ucl.ac.uk": "ucl_mediacentral",
    "my.chartered.college": "chartered_college_of_teaching",
    "onlinelibrary.wiley.com": "wiley_online_library",
    "ow.ly": "undefined",
    "pod.co": "pod_co_podcast",
    "profiles.ucl.ac.uk": "ucl",
    "ripl.uk": "research_in_primary_languages",
    "schoolsweek.co.uk": "schools_week",
    "sustainableschoolleadership.uk": "sustainable_school_leadership",
    "t.co": "undefined",
    "teachersuccess.co.uk": "teacher_success",
    "theconversation.com": "conversation",
    "twitter.com": "twitter",
    "uk.bettshow.com": "bett_show",
    "unige.zoom.us": "uni_geneva_zoom",
    "universitas21.com": "universitas_21",
    "wonkhe.com": "wonkhe",
    "www.bera.ac.uk": "bera",
    "www.durham.ac.uk": "durham",
    "www.eventbrite.co.uk": "eventbrite",
    "www.eventbrite.com": "eventbrite",
    "www.gov.uk": "gov_uk",
    "www.jstor.org": "jstor",
    "www.linkedin.com": "linkedin",
    "www.mdpi.com": "multidisciplinary_digital_publishing_institute",
    "www.naht.org.uk": "naht",
    "www.nottingham.ac.uk": "nottingham_uni",
    "www.sciencedirect.com": "science_direct",
    "www.tandfonline.com": "taylor_and_francis",
    "www.tes.com": "tes",
    "www.theguardian.com": "guardian",
    "www.ucl.ac.uk": "ucl",
    "www.ukri.org": "ukri",
    "www.yorkshirepost.co.uk": "yorkshire_post",
    "www.youtube.com": "youtube",
    "x.com": "x",
    "youtu.be": "youtube",
}

In [30]:
df["organisation"] = df["domain"].map(domain_to_org)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 12 to 1185
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 144 non-null    string
 1   newsletter_number  144 non-null    int64 
 2   issue_date         144 non-null    string
 3   theme              144 non-null    string
 4   subtheme           49 non-null     string
 5   title              144 non-null    string
 6   description        144 non-null    string
 7   link               144 non-null    string
 8   new_theme          144 non-null    object
 9   domain             144 non-null    object
 10  organisation       141 non-null    object
dtypes: int64(1), object(3), string(7)
memory usage: 13.5+ KB


In [32]:
# Replace missing organisation with "erp_news"
df["organisation"] = df["organisation"].fillna("erp_news")
df.loc[df["organisation"] == "REMOVE", "organisation"] = "erp_news"

In [33]:
#number of unique organisations 
unique_orgs = df["organisation"].nunique()
print(f"Number of unique organisations: {unique_orgs}")

Number of unique organisations: 59


In [34]:
org_counts = df["organisation"].value_counts()
org_5plus = org_counts[org_counts >=3]
print(org_5plus)

organisation
ucl                                   21
conversation                          12
edtech_oii                             7
undefined                              6
ucl_mediacentral                       6
bera                                   5
tes                                    4
childrens_participation_in_schools     4
bera_journals                          4
eventbrite                             3
erp_news                               3
durham                                 3
youtube                                3
sage_journals                          3
ucl_blog                               3
schools_week                           3
microsoft_teams_events                 3
Name: count, dtype: int64


# Inspect "Title" and "Description" 


In [35]:
df[['title', 'description']].info()
df[['title', 'description']].isna().sum()
df[['title', 'description']].head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 12 to 1185
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        144 non-null    string
 1   description  144 non-null    string
dtypes: string(2)
memory usage: 3.4 KB


Unnamed: 0,title,description
12,Poorer pupils do worse at school ‚Äì here's how to reduce the attainment gap,"Stephen Gorard writes for The Conversation on closing the attainment gap, specifically by looking at 'spread(ing) out the most disadvantaged students between schools' and better calibrating the funding to follow the most disadvantaged pupils."
24,A three-part series on leadership from Toby Greany and team on TES:,Part 1: Headteacher recruitment crisis: 5 tips for action
25,"Teacher recruitment, retention and development - rethinking policy and practice priorities seminar ‚Äì 3 July 2023",Inaugural event in the 'What Matters in education?' series Blog post on the topic - Looking at teacher recruitment and retention in a new light
26,Teaching for Digital Citizenship Delphi conference,View the recording of David Lundie's event (held on May 22). BERA membership required to access. https://www.bera.ac.uk/media/teaching-for-digital-citizenship-may2023
27,My corona*: listening to children in corona times,Resource shared via Sarah Chicken's website - https://childrens-participation.org/ This research paper brings together three of the young journalists who worked on The Corona Times Journal to reflect on their experiences of being involved in the project. https://www.tandfonline.com/doi/epdf/10.1080/13642987.2022.2061954?needAccess=true&role=button
39,This research paper brings together three of the young journalists who worked on The Corona Times Journal to reflect on their experiences of being involved in the project.,https://www.tandfonline.com/doi/epdf/10.1080/13642987.2022.2061954?needAccess=true&role=button
51,"19 October 2023, 5:30 pm‚Äì7:00 pm","In this online panel discussion, policymakers, researchers and practitioners will debate how education policy priorities are shaped, and the extent to which they take into account realities on the ground. Using specific examples that highlight some of the difficulties in research, policy and practice working constructively together, the panel will consider how else good policy choices might be made. https://www.ucl.ac.uk/education-research-programme/events/2023/oct/practical-policies-or-bright-ideas-how-particular-topics-get-front-policy-queue"
67,Stephen Gorard in the media,The Guardian - https://www.theguardian.com/education/2023/aug/29/lack-diversity-teaching-means-minority-ethnic-pupils-england-miss-out Science X (Phys.org) - https://phys.org/news/2023-08-ethnic-disparity-teachers-pupils-england.html TES - https://www.tes.com/magazine/analysis/secondary/north-south-schools-attainment-gap-fears
80,Artificial Intelligence and Education: A Reading List,"JStor has put out a bibliography to help educators prepare students and themselves for a future shaped by AI‚Äîwith all its opportunities and drawbacks. https://daily.jstor.org/artificial-intelligence-and-education-a-reading-list/ An article by Rebecca Eynon and colleagues is on it: Nabeel Gillani, Rebecca Eynon, Catherine Chiabaut, and Kelsey Finkel, "" Unpacking the 'Black Box' of AI in Education ,"" Educational Technology & Society 26, no. 1 (2023): 99‚Äì111."
90,An article by Rebecca Eynon and colleagues is on it:,"Nabeel Gillani, Rebecca Eynon, Catherine Chiabaut, and Kelsey Finkel, "" Unpacking the 'Black Box' of AI in Education ,"" Educational Technology & Society 26, no. 1 (2023): 99‚Äì111."


In [36]:
df['title_length'] = df['title'].str.len()
df['description_length'] = df['description'].str.len()
df[['title_length', 'description_length']].describe()

Unnamed: 0,title_length,description_length
count,144.0,144.0
mean,95.131944,283.819444
std,56.644919,264.819813
min,6.0,8.0
25%,65.0,86.75
50%,84.0,211.5
75%,114.25,433.25
max,357.0,1478.0


In [37]:
#Inspect titles with >50 words 
df['title_word_count'] = df['title'].apply(lambda x: len(str(x).split()))

long_titles = df[df['title_word_count'] > 50]
long_titles.head()



Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation,title_length,description_length,title_word_count
117,97a6ef72-0d9d-4e1f-b626-98dfad5181a5,10,09 October 2023,Programme news,,"The ESRC's Work, Education and Skills (WES) team are exploring the potential shape of a new, forward-looking education research agenda and are very keen to hear from the academic, policy, and practice communities about what you think will be the big persistent and future challenges over the coming decades in the following priority areas:","¬∑ Societal impacts on educational provision and learner experience ¬∑ Educational inequalities ¬∑ Special Educational Needs and Disability ¬∑ Skills for life For full details and the opportunity to submit your ideas , here is the link to the horizon scanning survey :",https://engagementhub.ukri.org/esrc-1/weshorizonscanningsurvey,erp_project,engagementhub.ukri.org,ukri_engagementhub,339,264,54
742,9ea38fc1-35e2-42b0-9a9a-bf23acc6cd89,59,17 January 2025,Updates from the programme,,Do you find this newsletter useful? Are you happy with its frequency? Which section do you find most useful? Any pressing topics we haven't covered? Any good sources of information you think we're missing? How could we improve? We advocate for listening to different stakeholders and so we want your views too!,Answer these short questions,https://forms.office.com/e/pTypS1MAqH,erp_project,forms.office.com,microsoft_forms,310,28,52
897,b7c2c85e-de2b-4483-93ea-0f395f3d7193,69,28 March 2025,Updates from the programme,,The ERP's double symposium entitled: ' Doing Policy Relevant Research: Using knowledge mobilisation and knowledge exchange strategies to translate findings into actionable insights.' Part One. Technology in education and Part Two. Teachers and Teaching will be presented at BERA 2025. The symposia will combine insights from seven projects in the programme.,The ERP have created a new Knowledge Exchange Resource Hub . This brings together a range of resources we are aware of that are designed to support research engagement with different stakeholders. We welcome other suggestions. Please get in touch if you are aware of other resources that we are missing.,https://www.ucl.ac.uk/education-research-programme/knowledge-exchange-resource-hub,erp_project,www.ucl.ac.uk,ucl,357,303,51


In [38]:
#Inspect descriptions with <10 words 

df['description_word_count'] = df['description'].apply(lambda x: len(str(x).split()))

short_descriptions = df[df['description_word_count'] < 10]

short_descriptions.head()


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation,title_length,description_length,title_word_count,description_word_count
24,e1f9ff04-4b3a-48da-b7db-82a12c2eb364,2,16 July 2023,PI Updates and Papers,Leadership,A three-part series on leadership from Toby Greany and team on TES:,Part 1: Headteacher recruitment crisis: 5 tips for action,https://www.tes.com/magazine/leadership/staff-management/headteacher-recruitment-crisis-applications,erp_project,www.tes.com,tes,67,57,12,9
39,cf6cb065-2904-44d4-a9d2-dd1f81ca4d47,3,20 July 2023,PI Updates and Papers,,This research paper brings together three of the young journalists who worked on The Corona Times Journal to reflect on their experiences of being involved in the project.,https://www.tandfonline.com/doi/epdf/10.1080/13642987.2022.2061954?needAccess=true&role=button,https://www.tandfonline.com/doi/epdf/10.1080/13642987.2022.2061954?needAccess=true&role=button,erp_project,www.tandfonline.com,taylor_and_francis,171,94,28,1
93,2d9bbc41-e009-4117-a7c9-98fb898a47cd,8,19 October 2023,Project news,,Digital methods and the digital native: A cautionary note for participatory researchers,Blog post from Rebecca Eynon's project https://edtech.oii.ox.ac.uk/digital-methods-and-the-digital-native-a-cautionary-note-for-participatory-researchers/,https://edtech.oii.ox.ac.uk/digital-methods-and-the-digital-native-a-cautionary-note-for-participatory-researchers,erp_project,edtech.oii.ox.ac.uk,edtech_oii,87,154,12,7
94,fb722ff1-08ff-4c1f-90fd-e83f6fb69669,8,19 October 2023,Project news,,Should you send your child to an academy or a council-run school? Why Ofsted results don't mean much,Stephen Gorard on The Conversation https://theconversation.com/should-you-send-your-child-to-an-academy-or-a-council-run-school-why-ofsted-results-dont-mean-much-211370,https://theconversation.com/should-you-send-your-child-to-an-academy-or-a-council-run-school-why-ofsted-results-dont-mean-much-211370,erp_project,theconversation.com,conversation,100,168,18,6
101,92b73099-8bd0-4555-b75e-73f5e42e4ed4,9,19 October 2023,Programme news,,Professional knowledge and research-informed practice: Time for a rethink?,Gemma Moss and Rachel France in Impact Journal https://my.chartered.college/impact_article/professional-knowledge-and-research-informed-practice-time-for-a-rethink/,https://my.chartered.college/impact_article/professional-knowledge-and-research-informed-practice-time-for-a-rethink,erp_project,my.chartered.college,chartered_college_of_teaching,74,164,9,9


#### Create 'Text' Variable = 'Title' + 'Description'

In [39]:
#Create "Text" variable = "Title" + "Description" 
df['text'] = df['title'].fillna('') + ' ' + df['description'].fillna('')

In [40]:
# Basic info on the new column
print(df['text'].info())

# Add a column for text length (number of words or characters)
df['text_length_chars'] = df['text'].str.len()
df['text_length_words'] = df['text'].str.split().str.len()

# Summary statistics
print("\nCharacter length stats:")
print(df['text_length_chars'].describe())

<class 'pandas.core.series.Series'>
Index: 144 entries, 12 to 1185
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
144 non-null    string
dtypes: string(1)
memory usage: 2.2 KB
None

Character length stats:
count         144.0
mean     379.951389
std      257.147024
min            74.0
25%           191.0
50%           321.0
75%           544.0
max          1544.0
Name: text_length_chars, dtype: Float64


In [41]:
# Check for missing or empty values
missing_mask = df['text'].isna() | (df['text'].str.strip() == '')

# Count how many
missing_count = missing_mask.sum()
print(f"Missing or empty 'text' entries: {missing_count}")

# Optionally view them
if missing_count > 0:
    print(df.loc[missing_mask, ['title', 'description']].head())


Missing or empty 'text' entries: 0


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 12 to 1185
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      144 non-null    string
 1   newsletter_number       144 non-null    int64 
 2   issue_date              144 non-null    string
 3   theme                   144 non-null    string
 4   subtheme                49 non-null     string
 5   title                   144 non-null    string
 6   description             144 non-null    string
 7   link                    144 non-null    string
 8   new_theme               144 non-null    object
 9   domain                  144 non-null    object
 10  organisation            144 non-null    object
 11  title_length            144 non-null    Int64 
 12  description_length      144 non-null    Int64 
 13  title_word_count        144 non-null    int64 
 14  description_word_count  144 non-null    int64 
 15  text     

In [43]:
df.columns

Index(['id', 'newsletter_number', 'issue_date', 'theme', 'subtheme', 'title',
       'description', 'link', 'new_theme', 'domain', 'organisation',
       'title_length', 'description_length', 'title_word_count',
       'description_word_count', 'text', 'text_length_chars',
       'text_length_words'],
      dtype='object')

# Save Files 

In [44]:
df.to_csv("/workspaces/ERP_Newsletter/data/data03_newsletter_items_clean/programme_updates.csv", index=False)

print(f"‚úÖ ‚úÖ ‚úÖ Saved")

‚úÖ ‚úÖ ‚úÖ Saved


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 12 to 1185
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      144 non-null    string
 1   newsletter_number       144 non-null    int64 
 2   issue_date              144 non-null    string
 3   theme                   144 non-null    string
 4   subtheme                49 non-null     string
 5   title                   144 non-null    string
 6   description             144 non-null    string
 7   link                    144 non-null    string
 8   new_theme               144 non-null    object
 9   domain                  144 non-null    object
 10  organisation            144 non-null    object
 11  title_length            144 non-null    Int64 
 12  description_length      144 non-null    Int64 
 13  title_word_count        144 non-null    int64 
 14  description_word_count  144 non-null    int64 
 15  text     