In [1]:
import os 
import re 

import pandas as pd 
import numpy as np 

from ftfy import fix_text
import unicodedata as ud
from urllib.parse import urlparse

In [2]:
# Treat these text tokens as missing on read
NA_TOKENS = ["", " ", "NA", "N/A", "na", "NaN", "nan", "null", "NULL", "-"]

In [3]:
#load data 
input_path = "/workspaces/ERP_Newsletter/data/0_raw/1_newsletter_short_text/newsletter_items.csv"
data_cleaning_path = "/workspaces/ERP_Newsletter/data/1_interim/0_ingestion"
output_path = "/workspaces/ERP_Newsletter/data/1_interim/1_cleaning"


df = pd.read_csv(input_path, keep_default_na=True, na_values=NA_TOKENS)

In [4]:
#inspect 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1668 non-null   object
 1   newsletter_number  1668 non-null   int64 
 2   issue_date         1668 non-null   object
 3   theme              1668 non-null   object
 4   subtheme           114 non-null    object
 5   title              1667 non-null   object
 6   description        1346 non-null   object
 7   link               1616 non-null   object
dtypes: int64(1), object(7)
memory usage: 104.4+ KB


In [5]:
print(f"Total rows: {len(df)}")
print(f"Unique newsletter: {df['newsletter_number'].nunique()}")

Total rows: 1668
Unique newsletter: 87


# Clean Up Text

In [6]:
def clean_series(s: pd.Series) -> pd.Series:
    """Return a cleaned version of ``s`` using the pandas "string" dtype.

    Every non-missing cell is passed through ``fix_text`` (mojibake repair),
    NFKC-normalized, has runs of whitespace collapsed to a single space and is
    stripped at both ends. Missing cells are left as ``<NA>``.
    """
    cleaned = s.astype("string")
    present = cleaned.notna()

    def _repair(text):
        # Same order as always: repair encoding damage first, then normalize.
        return ud.normalize("NFKC", fix_text(text))

    cleaned.loc[present] = cleaned.loc[present].apply(_repair)
    # Whitespace tidy-up, again restricted to the non-missing cells
    cleaned.loc[present] = (
        cleaned.loc[present].str.replace(r"\s+", " ", regex=True).str.strip()
    )
    return cleaned

# Run the cleaner over every column that holds text (object or string dtype)
obj_cols = [
    name
    for name in df.columns
    if df[name].dtype == object or pd.api.types.is_string_dtype(df[name])
]
for name in obj_cols:
    df[name] = clean_series(df[name])

# Literal (non-regex) substitutions for the most frequent leftover artifacts.
# They are applied in the order listed here.
REPL = {
    "√Ç ": " ", "√Ç": "",
    "‚Äö√Ñ√¨": "‚Äì", "‚Äö√Ñ√Æ": "‚Äî",
    "‚Äö√Ñ√¥": "‚Äô", "‚Äö√Ñ√≤": "‚Äò",
    "‚Äö√Ñ√∫": "‚Äú", "‚Äö√Ñ√π": "‚Äù",
    "√¢‚Ç¨‚Äú": "‚Äì", "√¢‚Ç¨‚Äù": "‚Äî",
    "√¢‚Ç¨Àú": "‚Äò", "√¢‚Ç¨‚Ñ¢": "‚Äô",
    "√¢‚Ç¨≈ì": "‚Äú", "√¢‚Ç¨\x9d": "‚Äù",
    "√¢‚Ç¨¬¢": "‚Ä¢", "√¢‚Ç¨¬¶": "‚Ä¶",
}
for column in obj_cols:
    patched = df[column].astype("string")
    for artifact in REPL:
        patched = patched.str.replace(artifact, REPL[artifact], regex=False)
    # Substitutions can leave doubled or edge spaces behind, so tidy once more
    df[column] = patched.str.replace(r"\s+", " ", regex=True).str.strip()

# Check for Missing Values 

In [7]:
def missing_table(d: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column of ``d``.

    Returns a frame indexed by column name with the number of missing cells
    ("Missing Values") and that number as a share of all rows
    ("Percentage (%)"), ordered from most to fewest missing.
    """
    null_counts = d.isna().sum()
    report = pd.DataFrame({"Missing Values": null_counts})
    report["Percentage (%)"] = (null_counts / len(d)) * 100
    return report.sort_values("Missing Values", ascending=False)

# Per-column missingness as it stands ahead of any row drop
missing_before = missing_table(df)
print("\n=== Missing values (before drop) ===")
print(missing_before)


=== Missing values (before drop) ===
                   Missing Values  Percentage (%)
subtheme                     1554       93.165468
description                   322       19.304556
link                           52        3.117506
title                           1        0.059952
theme                           0        0.000000
issue_date                      0        0.000000
newsletter_number               0        0.000000
id                              0        0.000000


# Remove items where description, link or title are missing

In [20]:
# Keep only the rows that have a description, a link and a title
required_cols = ['description', 'link', 'title']
df_cleaned = df.dropna(subset=required_cols)

# Report the row count on either side of the drop
print(f"Rows before: {len(df)}")
print(f"Rows after : {len(df_cleaned)}")

# Carry the filtered frame forward under the working name
df = df_cleaned

Rows before: 1323
Rows after : 1323


# Check for Duplicates

### All rows identical

In [10]:
# Flag rows that repeat an earlier row in every column, then count them
is_full_duplicate = df.duplicated()
total_duplicates = is_full_duplicate.sum()
print(f"Total duplicate rows (all columns identical): {total_duplicates}")

Total duplicate rows (all columns identical): 0


### Title and link identical 

In [33]:
# Rows whose (title, link) pair occurs more than once. keep=False marks every
# member of a duplicated group, not only the later repeats.
title_link_dupes = df[df.duplicated(subset=["title", "link"], keep=False)]

# The figure printed is a row count (all members of each group), so it is
# labelled as rows, in line with the title-only and link-only checks.
print(f"Number of rows with duplicate title+link pairs: {title_link_dupes.shape[0]}")
title_link_dupes.sort_values(by=["title"]).head(2)

Number of duplicate title+link pairs: 0


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link


In [34]:
# How the duplicated title+link rows are spread across themes
title_link_dupes["theme"].value_counts()

Series([], Name: count, dtype: Int64)

In [29]:
# Inspect the duplicated title+link rows filed under one specific theme
rrd_theme_rows = title_link_dupes["theme"] == "Teacher recruitment, retention & development"
title_link_dupes[rrd_theme_rows]

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
992,8991c36b-65fb-4011-9249-8f5917d32a0e,56,6 December 2024,"Teacher recruitment, retention & development",,DfE - Working lives of teachers and leaders: w...,A summary report of early findings from the th...,https://www.gov.uk/government/publications/wor...
1547,ac53ae25-d785-47b2-abd5-63d8e9583cbb,82,11 July 2025,"Teacher recruitment, retention & development",,DfE - Working lives of teachers and leaders: w...,Findings from the third wave of the working li...,https://www.gov.uk/government/publications/wor...


In [36]:
# Collapse rows sharing the same (title, link) pair, keeping the first
# occurrence, and renumber the index from zero
df = (
    df
    .drop_duplicates(subset=["title", "link"], keep="first")
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1258 non-null   string
 1   newsletter_number  1258 non-null   int64 
 2   issue_date         1258 non-null   string
 3   theme              1258 non-null   string
 4   subtheme           85 non-null     string
 5   title              1258 non-null   string
 6   description        1258 non-null   string
 7   link               1258 non-null   string
dtypes: int64(1), string(7)
memory usage: 78.8 KB


### Title only duplicates

In [37]:
# Rows whose title appears more than once (every member of each group)
has_repeated_title = df.duplicated(subset=["title"], keep=False)
title_dupes = df[has_repeated_title]

print(f"Number of rows with duplicate titles: {title_dupes.shape[0]}")
title_dupes.sort_values(by="title").head(1)

Number of rows with duplicate titles: 20


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1240,15f17205-bafc-43d8-ba3d-b4881956e48b,87,10 October 2025,Updates from the Programme,,Addressing key issues in teacher recruitment a...,Catch up with the video of the latest in the W...,https://mediacentral.ucl.ac.uk/Play/126585


In [46]:
# Tally each (title, theme) combination among the rows with repeated titles
title_theme_pairs = title_dupes[["title", "theme"]]
title_table = title_theme_pairs.value_counts().reset_index(name="count")
title_table

Unnamed: 0,title,theme,count
0,Making Teaching Attractive and Worthwhile (Par...,Project news,3
1,Deadline: 28 April 2025,Political environment and key organisations,2
2,What matters in education? Education after the...,Updates from the programme,2
3,Panel:,Updates from the programme,2
4,Addressing key issues in teacher recruitment a...,Updates from the Programme,2
5,What matters in education? Education in a brok...,Updates from the programme,2
6,Labour,Political landscape & key organisations,1
7,Digital Poverty Alliance,EdTech,1
8,Digital Poverty Alliance,Thematic roundup,1
9,Panel:,"Teacher recruitment, retention & development",1


In [47]:
# Keep the first row for each title and renumber the index from zero
df = (
    df
    .drop_duplicates(subset=["title"], keep="first")
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247 entries, 0 to 1246
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1247 non-null   string
 1   newsletter_number  1247 non-null   int64 
 2   issue_date         1247 non-null   string
 3   theme              1247 non-null   string
 4   subtheme           84 non-null     string
 5   title              1247 non-null   string
 6   description        1247 non-null   string
 7   link               1247 non-null   string
dtypes: int64(1), string(7)
memory usage: 78.1 KB


### link-only duplicates 

In [48]:
# Rows whose link appears more than once (every member of each group)
has_repeated_link = df.duplicated(subset=["link"], keep=False)
link_dupes = df[has_repeated_link]

print(f"Number of rows with duplicate links: {link_dupes.shape[0]}")
link_dupes.sort_values(by="link").head(1)

Number of rows with duplicate links: 114


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
450,17268e59-d380-4e8f-a2e7-964543717f17,35,10 May 2024,What matters in education?,,Big Education conference - 'Next Generation Sc...,Hear from schools across the country who are w...,https://bigeducation.org/product/next-generati...


In [53]:
# Lift the column-width limit so long URLs are shown in full
pd.set_option("display.max_colwidth", None)

# Occurrences of each link among the rows flagged as link duplicates
link_only = link_dupes[["link"]]
link_table = link_only.value_counts().reset_index(name="count")
link_table

Unnamed: 0,link,count
0,https://www.ucl.ac.uk/education-research-programme/events/2023/oct/practical-policies-or-bright-ideas-how-particular-topics-get-front-policy-queue,4
1,https://www.ucl.ac.uk/education-research-programme/events/2024/mar/investing-early-years-priorities-and-challenges,4
2,https://uk.bettshow.com/speakers/dominik-lukes,3
3,https://www.ucl.ac.uk/education-research-programme/events/2024/jan/pupil-absence-questions-policy-research-and-practice,3
4,https://childrens-participation.org/,3
5,https://www.ucl.ac.uk/education-research-programme/events/2025/may/how-build-resilient-schools-place-based-approaches-supporting-teachers-and-leaders,3
6,https://edtech.oii.ox.ac.uk/cutting-through-the-conjecture,2
7,https://engagementhub.ukri.org/esrc-1/weshorizonscanningsurvey,2
8,https://education.us18.list-manage.com/track/click?u=61f408a2f9c6d02a726ce6200&id=bea3b5fbac&e=4eb2cf985e,2
9,https://epi.org.uk/events/labour-party-conference-prioritising-equality-education-policy-as-a-lever-to-tackling-disadvantage-and-inequalities,2


In [54]:
# Keep the first row for each link and renumber the index from zero
df = (
    df
    .drop_duplicates(subset=["link"], keep="first")
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186 entries, 0 to 1185
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1186 non-null   string
 1   newsletter_number  1186 non-null   int64 
 2   issue_date         1186 non-null   string
 3   theme              1186 non-null   string
 4   subtheme           80 non-null     string
 5   title              1186 non-null   string
 6   description        1186 non-null   string
 7   link               1186 non-null   string
dtypes: int64(1), string(7)
memory usage: 74.3 KB


# Identify themes and subthemes

In [55]:
# Distinct-value counts for the key columns (labels printed exactly as before)
unique_report = [
    ("Unique titles:", "title"),
    ("Unique themes:", "theme"),
    ("Unique subthemes", "subtheme"),
    ("Unique links:", "link"),
]
for label, column in unique_report:
    print(label, df[column].nunique())

Unique titles: 1186
Unique themes: 62
Unique subthemes 35
Unique links: 1186


In [56]:
### Add placeholders for missing themes/subthemes

# 1) Normalize empties/whitespace/"nan"/"none" to real NA
#    (works on a copy, so `df` itself is not modified by this cell)
df_norm = df.copy()
for col in ["theme", "subtheme"]:
    df_norm[col] = (
        df_norm[col]
        .astype("string")
        .replace(r"^\s*$", pd.NA, regex=True)   # empty/whitespace -> NA
        .replace({"nan": pd.NA, "NaN": pd.NA, "None": pd.NA, "none": pd.NA})
    )

# 2) Create a version that fills NA with placeholders so ALL cases are counted
df_filled = df_norm.fillna({"theme": "No theme", "subtheme": "No subtheme"})

# 3) Group and count every (theme, subtheme) combo, including placeholder cases
theme_subtheme_counts = (
    df_filled
    .groupby(["theme", "subtheme"], dropna=False)
    .size()
    .reset_index(name="count")
    .sort_values(by=["theme", "subtheme"])
)

# 4) Export to Excel
out_dir = data_cleaning_path
# Create the target folder when it is missing; otherwise the write below fails
# on a fresh checkout where the interim directory has not been made yet.
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "theme_subtheme_counts.xlsx")

theme_subtheme_counts.to_excel(out_path, index=False)
print(f"‚úÖ Exported {len(theme_subtheme_counts)} rows to {out_path}")

‚úÖ Exported 99 rows to /workspaces/ERP_Newsletter/data/1_interim/0_ingestion/theme_subtheme_counts.xlsx


# Check Themes and Articles 

In [57]:
# Pull the articles filed under one theme for a manual spot check
in_theme = df["theme"] == "Research ‚Äì Practice ‚Äì Policy"
check_themes = df[in_theme].copy()

# Preview the first five matches
display(check_themes.head(5))

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
262,2fbf7b7b-334a-4d82-977c-d14a5bbf4778,25,16 February 2024,Research ‚Äì Practice ‚Äì Policy,,CAPE - Quid pro quo? Why academics meet with policy professionals,"Patrick McAlary, CAPE coordinator, explores what benefits academics report from giving up their time to chat with policy professionals about their policy priorities",https://t.co/DbEx7Z1PPJ
263,7f2bc1b1-fcd3-4602-89d3-cdd1d39845de,25,16 February 2024,Research ‚Äì Practice ‚Äì Policy,,The SHAPE of research impact,"British Academy report exploring research impact for the SHAPE disciplines, looking at the body of impact case studies submitted to the most recent research assessment exercise in the UK (REF21)",https://www.thebritishacademy.ac.uk/publications/the-shape-of-research-impact
264,1775fa9d-d0de-4be6-9e61-f466f7131eb2,25,16 February 2024,Research ‚Äì Practice ‚Äì Policy,,NFER Event ‚Äì Disadvantaged Policy webinar,Thursday 22 February 2024 ‚Äì 11am Online,https://www.nfer.ac.uk/events/disadvantaged-policy-webinar
276,e684b418-8313-40f5-a40d-1fb8e68784bc,26,23 February 2024,Research ‚Äì Practice ‚Äì Policy,,Post from the Co-Production Collective - How to best engage the public to participate in collaborative projects,"Through her lived experience of working with community groups, member of the Co-Production Collective Yesmin Begum shares her key principles for meaningful co-production and involvement.",https://www.coproductioncollective.co.uk/news/how-to-best-engage-the-public-to-participate-in-collaborative-projects?dm_i=2HJW%2C1ZV0V%2C7XUV43%2C74IFU%2C1
277,a8733168-3fb2-4eb4-b5d4-d15318b122f8,26,23 February 2024,Research ‚Äì Practice ‚Äì Policy,,"BERA event - Social theory, educational research and polycrisis",22 May 2024 2pm ‚Äì 4pm (Free for BERA members),https://www.bera.ac.uk/event/social-theory-educational-research-and-polycrisis-2024


# Rename Themes

In [58]:
# ---------- 0) Remove rows whose theme is nothing but the unsubscribe footer
UNSUB_THEME = (
    "You have indicated that you are happy to receive news and updates from the "
    "ESRC Education Research Programme. To unsubscribe, please email "
    "Elizabeth.hudson@ucl.ac.uk with the word UNSUBSCRIBE in the title of the email."
)
theme_text = df["theme"].astype(str).str.strip()
mask_unsub = theme_text == UNSUB_THEME
dropped_rows = int(mask_unsub.sum())  # printed in the summary at the end of this cell
df = df[~mask_unsub].copy()

# ---------- 1) Normalizers
def norm_theme(s: str) -> str:
    """Light normalizer for theme labels.

    Trims the text, collapses whitespace runs to one space, swaps the listed
    dash and quote variants for their plain ASCII forms and lowercases the
    result. Anything that is not a ``str`` (e.g. a missing value) gives "".
    """
    if not isinstance(s, str):
        return ""
    text = re.sub(r"\s+", " ", s.strip())
    swaps = (
        ("‚Äî", "-"), ("‚Äì", "-"),                                  # dashes
        ("‚Äô", "'"), ("‚Äò", "'"), ("‚Äú", '"'), ("‚Äù", '"'),  # quotes
    )
    for old, new in swaps:
        text = text.replace(old, new)
    return text.lower()

def norm_key(s: str) -> str:
    """Aggressive normalizer for matching keys such as subthemes.

    Lowercases the text, turns dashes into spaces and ``&`` into " and ",
    blanks out commas, periods, NBSP and every other character outside
    ``a-z``, ``0-9`` and whitespace, then collapses and trims whitespace.
    Anything that is not a ``str`` gives "".
    """
    if not isinstance(s, str):
        return ""
    key = s.strip().lower()
    literal_swaps = (
        ("‚Äî", " "), ("‚Äì", " "), ("-", " "),
        ("&", " and "),
        ("‚Äô", "'"), ("‚Äò", "'"), ("‚Äú", '"'), ("‚Äù", '"'),
    )
    for old, new in literal_swaps:
        key = key.replace(old, new)
    # First commas/periods/NBSP, then whatever other punctuation is left
    for pattern in (r"[,\.\u00A0]", r"[^a-z0-9\s]"):
        key = re.sub(pattern, " ", key)
    return re.sub(r"\s+", " ", key).strip()

# ---------- 2) Theme mapping list: (new_theme, current_theme)
# Each tuple is (target bucket, theme text as it appears in the data). The
# lookup built from this list is keyed on norm_theme(current_theme), so entries
# that differ only in letter case or surrounding whitespace end up as one key.
# NOTE: Moved "Four Nations" + variants AND "Update from UKRI" to political_context_and_organisations
pairs = [
    # project_updates
    ("project_updates", "Embedding children's participation rights in pedagogical practice in lower primary classrooms in Wales PI: Sarah Chicken"),
    ("project_updates", "Investigating the recruitment and retention of ethnic minority teachers PI: Stephen Gorard"),
    ("project_updates", "News from the Projects"),
    ("project_updates", "News from the projects"),
    ("project_updates", "PI Updates and Papers"),
    ("project_updates", "PI: David Lundie"),
    ("project_updates", "Programme news"),
    ("project_updates", "Programme Update"),
    ("project_updates", "Programme update"),
    ("project_updates", "Project news"),
    ("project_updates", "Rethinking teacher recruitment: New approaches to attracting prospective STEM teachers PI: Rob Klassen"),
    ("project_updates", "Sustainable school leadership: comparing approaches to the training, supply and retention of senior school leaders across the UK PI Toby Greany"),
    ("project_updates", "Toby Greany"),
    ("project_updates", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Professor Rebecca Eynon"),
    ("project_updates", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Rebecca Eynon"),
    ("project_updates", "Update from the ERP projects"),
    ("project_updates", "Update from the ESRC Education Research Programme"),
    ("project_updates", "Update from the projects"),
    ("project_updates", "Updates from David Lundie"),
    ("project_updates", "Updates from Steph Ainsworth"),
    ("project_updates", "Updates from the ERP projects"),
    ("project_updates", "Updates from the ESRC"),
    ("project_updates", "Updates from the Programme"),
    ("project_updates", "Updates from the programme"),
    ("project_updates", "Updates from the projects"),
    ("project_updates", "Decentring the 'resilient teacher': exploring interactions between individuals and their social ecologies PI: Steph Ainsworth"),
    ("project_updates", "Peer reviewed articles from the ERP projects"),
    ("project_updates", "Peer reviewed publications from the ERP projects"),
    ("project_updates", "Seminar series topics"),
    ("project_updates", "Seminar topics"),

    # digital_ed
    ("digital_ed", "EdTech"),

    # political_context_and_organisations
    ("political_context_and_organisations", "What are the politicians saying?"),
    ("political_context_and_organisations", "What Matters in Education?"),
    ("political_context_and_organisations", "What matters in education?"),
    ("political_context_and_organisations", "4 Nations"),
    ("political_context_and_organisations", "4 Nations & key organisations"),
    ("political_context_and_organisations", "Political environment and key organisations"),
    ("political_context_and_organisations", "Political landscape - the election"),
    ("political_context_and_organisations", "Political landscape & key organisations"),
    ("political_context_and_organisations", "Political landscape across Four Nations & key organisations"),
    ("political_context_and_organisations", "Research ‚Äì Practice ‚Äì Policy"),
    ("political_context_and_organisations", "Calls for evidence"),
    ("political_context_and_organisations", "DfE"),
    ("political_context_and_organisations", "Education, Policy & Practice"),
    ("political_context_and_organisations", "EEF"),
    ("political_context_and_organisations", "ESRC"),
    ("political_context_and_organisations", "Politics"),
    ("political_context_and_organisations", "Launch of ESRC survey on social science research skills"),
    ("political_context_and_organisations", "Updates from UKRI"),  # plural already here
    ("political_context_and_organisations", "Update from UKRI"),   # moved here (singular)
    ("political_context_and_organisations", "Four Nations"),       # moved here
    ("political_context_and_organisations", "Four Nations Landscape"),
    ("political_context_and_organisations", "Four Nations landscape"),

    # events_opportunities_research
    ("events_opportunities_research", "Conferences"),
    ("events_opportunities_research", "Opportunities"),
    ("events_opportunities_research", "Opportunities for funding"),
    ("events_opportunities_research", "Opportunities to blog"),
    ("events_opportunities_research", "Other Reports"),
    ("events_opportunities_research", "Other Research"),
    ("events_opportunities_research", "Relevant Events"),
    ("events_opportunities_research", "Relevant Research"),
    ("events_opportunities_research", "Reports"),
    ("events_opportunities_research", "Research"),
    ("events_opportunities_research", "Events"),

    # teacher_rrd
    ("teacher_rrd", "Teacher recruitment, retention & development"),
]

# ---------- 3) Build lookup (normalized)
# Keys are the normalized current theme texts; values are the new bucket names.
lookup = {norm_theme(curr): new for new, curr in pairs}

# ---------- 4) Apply theme mapping (no fill yet)
# Themes without an entry in `lookup` are left missing here and filled in step 6.
theme_norm = df["theme"].map(norm_theme)
df["new_theme"] = theme_norm.map(lookup)

# ---------- 4b) Defensive keyword overrides (force correct bucket if text contains patterns)
# `updates?` replaces `update(s)?`: it matches the same text, but the capturing
# group made str.contains emit a "has match groups" UserWarning.
kw_mask = (
    theme_norm.str.contains(r"\bfour nations\b", regex=True, na=False) |
    theme_norm.str.contains(r"\bupdates? from ukri\b", regex=True, na=False)
)
df.loc[kw_mask, "new_theme"] = "political_context_and_organisations"

# ---------- 5) Subtheme-based overrides
# These run after the theme mapping, so a matching subtheme takes precedence.
sub_norm = df["subtheme"].map(norm_key)

target_rrd = "teacher recruitment retention and development"
df.loc[sub_norm.eq(target_rrd), "new_theme"] = "teacher_rrd"  # subtheme variants that normalize to target_rrd
df.loc[sub_norm.eq("digital"), "new_theme"] = "digital_ed"

# ---------- 6) Fill any remaining unmapped with the original theme text (your previous behavior)
df["new_theme"] = df["new_theme"].fillna(df["theme"])

# ---------- 7) Export a summary
# One row per (new_theme, normalized original theme) with its row count.
summary = (
    df.assign(theme_norm=theme_norm, subtheme_norm=sub_norm)
      .groupby(["new_theme", "theme_norm"], dropna=False)
      .size()
      .reset_index(name="count")
      .sort_values(["new_theme", "count"], ascending=[True, False])
)

out_dir = data_cleaning_path
# Create the target folder when it is missing so the write below cannot fail
# just because the interim directory has not been made yet.
os.makedirs(out_dir, exist_ok=True)
summary_path = os.path.join(out_dir, "theme_mapping_summary.xlsx")


with pd.ExcelWriter(summary_path) as xw:
    df.to_excel(xw, sheet_name="data_with_new_theme", index=False)
    summary.to_excel(xw, sheet_name="mapping_summary", index=False)

print(f"‚úÖ Dropped {dropped_rows} unsubscribe row(s).")
print("‚úÖ Mapping applied.")
print("üìÑ Excel written to:", summary_path)

‚úÖ Dropped 0 unsubscribe row(s).
‚úÖ Mapping applied.
üìÑ Excel written to: /workspaces/ERP_Newsletter/data/1_interim/0_ingestion/theme_mapping_summary.xlsx


  theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)


In [59]:
# ---------- View unique new_theme values and their counts
# Name the index and the count column explicitly rather than renaming after
# reset_index(): the old rename map expected an "index" column and renamed
# "new_theme" to "count", which left two columns both called "count"
# (visible in the previous printout of this cell).
theme_counts = (
    df["new_theme"]
    .value_counts(dropna=False)
    .rename_axis("new_theme")
    .reset_index(name="count")
)

print("üß≠ Unique new_theme values and their counts:")
print(theme_counts)

üß≠ Unique new_theme values and their counts:
                                 count  count
0  political_context_and_organisations    664
1                          teacher_rrd    178
2                           digital_ed    166
3                      project_updates    148
4        events_opportunities_research     30


# Number of unique domain names

In [62]:
# Extract the network location (host part) of each link; missing links give None
def _netloc_or_none(link):
    if pd.isna(link):
        return None
    return urlparse(str(link)).netloc

df["domain"] = df["link"].apply(_netloc_or_none)

# Count unique domains
unique_domains = df["domain"].nunique()

print(f"üåê There are {unique_domains} unique domains in this dataset.")

# Most common domains, most frequent first (the printout shows up to 60 rows)
domain_counts = (
    df["domain"]
    .value_counts()
    .rename_axis("domain")
    .reset_index(name="count")
)
print(domain_counts.head(60))

üåê There are 346 unique domains in this dataset.
                                   domain  count
0                       schoolsweek.co.uk    143
1                              www.gov.uk     57
2                           www.ucl.ac.uk     32
3                     theconversation.com     31
4                     www.theguardian.com     28
5                          www.bera.ac.uk     26
6                              epi.org.uk     24
7                          www.nfer.ac.uk     23
8                    www.eventbrite.co.uk     20
9                            www.gov.scot     20
10  bera-journals.onlinelibrary.wiley.com     17
11                www.education-ni.gov.uk     16
12             www.belfasttelegraph.co.uk     16
13             ffteducationdatalab.org.uk     14
14               committees.parliament.uk     14
15                            www.tes.com     14
16                          www.gov.wales     12
17                        blog.bham.ac.uk     11
18             www

In [63]:
# Write the domain tally to Excel for manual review: irrelevant domains are
# flagged there and each remaining domain is given an 'organisation' name
review_path = os.path.join(data_cleaning_path, "domain_review.xlsx")
domain_counts.to_excel(review_path, index=False)

print(f"Wrote: {review_path}")

Wrote: /workspaces/ERP_Newsletter/data/1_interim/0_ingestion/domain_review.xlsx


# Add 'organisation' column and remove irrelevant domain names

In [64]:
# Mapping from domain ‚Üí organisation
domain_to_org = {
    "schoolsweek.co.uk": "schools_week",
    "www.ucl.ac.uk": "ucl",
    "www.gov.uk": "uk_government",
    "theconversation.com": "conversation",
    "www.theguardian.com": "guardian",
    "www.bera.ac.uk": "bera",
    "epi.org.uk": "epi",
    "www.eventbrite.co.uk": "REMOVE",
    "www.nfer.ac.uk": "nfer",
    "www.gov.scot": "scottish_government",
    "bera-journals.onlinelibrary.wiley.com": "bera_journals",
    "www.belfasttelegraph.co.uk": "belfast_telegraph",
    "www.education-ni.gov.uk": "ni_government",
    "committees.parliament.uk": "uk_parliament",
    "ffteducationdatalab.org.uk": "fft_ed_datalab",
    "www.tes.com": "tes",
    "meetoecd1.zoom.us": "REMOVE",
    "www.nuffieldfoundation.org": "nuffield",
    "www.gov.wales": "welsh_government",
    "blogs.ucl.ac.uk": "ucl",
    "": "REMOVE",
    "blog.bham.ac.uk": "university_of_birmingham",
    "upen.ac.uk": "upen",
    "www.instituteforgovernment.org.uk": "ifg",
    "fed.education": "fed",
    "education.us18.list-manage.com": "REMOVE",
    "ifs.org.uk": "ifs",
    "mediacentral.ucl.ac.uk": "ucl",
    "educationwales.blog.gov.wales": "welsh_government",
    "www.childrenscommissioner.gov.uk": "childrens_commissioner",
    "www.bbc.co.uk": "bbc",
    "edtech.oii.ox.ac.uk": "oii_edtech_equity",
    "bera.us9.list-manage.com": "REMOVE",
    "teachertapp.co.uk": "teacher_tapp",
    "www.oecd.org": "oecd",
    "educationendowmentfoundation.org.uk": "eef",
    "www.ukri.org": "ukri",
    "childrens-participation.org": "childrends_participation_in_schools",
    "t.co": "twitter",
    "lordslibrary.parliament.uk": "house_of_lords_library",
    "my.chartered.college": "cct",
    "post.parliament.uk": "post_parliament",
    "lgiu.us3.list-manage.com": "REMOVE",
    "parliament.us16.list-manage.com": "REMOVE",
    "wonkhe.com": "wonkhe",
    "uk.bettshow.com": "bett_show",
    "newsletter.oecd.org": "oecd",
    "bit.ly": "twitter",
    "www.tandfonline.com": "taylor_and_francis",
    "chartered.college": "chartered_college_of_teaching",
    "teachertapp.com": "teacher_tapp",
    "www.thebritishacademy.ac.uk": "british_academy",
    "5rightsfoundation.com": "5rights_foundation",
    "www.independent.co.uk": "independent",
    "neu.org.uk": "national_education_union",
    "theippo.co.uk": "ippo",
    "www.linkedin.com": "linkedin",
    "www.eventbrite.com": "REMOVE",
    "forms.office.com": "REMOVE",
    "www.ippr.org": "ippr",
    "www.hepi.ac.uk": "hepi",
    "www.nesta.org.uk": "nesta",
    "assets.publishing.service.gov.uk": "uk_government",
    "journals.sagepub.com": "sage_journals",
    "d2tic4wvo1iusb.cloudfront.net": "REMOVE",
    "inews.co.uk": "inews",
    "discovery.ucl.ac.uk": "ucl",
    "education.gov.scot": "education_scotland",
    "email.thebritishacademy.ac.uk": "british_academy",
    "digitalpovertyalliance.org": "digital_poverty_alliance",
    "feweek.co.uk": "fe_week",
    "contacts.epi.org.uk": "REMOVE",
    "sustainableschoolleadership.uk": "sustainable_school_leadership",
    "www.oecd-ilibrary.org": "oecd",
    "www.transformingsociety.co.uk": "transforming_society",
    "www.upen.ac.uk": "upen",
    "manmetjobs.mmu.ac.uk": "manchester_metropolitan_university",
    "www.edge.co.uk": "edge_foundation",
    "www.jrf.org.uk": "joseph_rowntree_foundation",
    "www.unicef.org": "unicef",
    "www.adalovelaceinstitute.org": "ada_lovelace_institute",
    "literacytrust.org.uk": "national_literacy_trust",
    "events.teams.microsoft.com": "REMOVE",
    "engagementhub.ukri.org": "ukri",
    "cfey.org": "centre_for_education_and_youth",
    "www.nao.org.uk": "national_audit_office",
    "upen.us14.list-manage.com": "REMOVE",
    "www.edtechstrategylab.org": "edtech_strategy_lab",
    "el.wiley.com": "wiley",
    "senedd.wales": "welsh_parliament",
    "hansard.parliament.uk": "uk_parliament",
    "www.cape.ac.uk": "cape_collaboration_for_public_engagement",
    "options2040.co.uk": "options_2040_project",
    "transforming-evidence.org": "transforming_evidence",
    "www.orielsquare.co.uk": "oriel_square",
    "www.institute.global": "tony_blair_institute",
    "www.evaluation.impactedgroup.uk": "impacted_group",
    "open.spotify.com": "spotify_podcast",
    "educationhub.blog.gov.uk": "uk_government",
    "www.politicshome.com": "politics_home",
    "eprints.lse.ac.uk": "lse_repository",
    "business.senedd.wales": "welsh_parliament",
    "bigeducation.org": "big_education",
    "cpag.org.uk": "child_poverty_action_group",
    "www.tickettailor.com": "REMOVE",
    "click.communications.gse.harvard.edu": "REMOVE",
    "www.telegraph.co.uk": "daily_telegraph",
    "www.n8research.org.uk": "n8_research_partnership",
    "www.centreforyounglives.org.uk": "centre_for_young_lives",
    "cdn.prod.website-files.com": "REMOVE",
    "www.nasuwt.org.uk": "nasuwt_teachers_union",
    "www.de.ed.ac.uk": "university_of_edinburgh",
    "www.libdems.org.uk": "liberal_democrats",
    "wcpp.org.uk": "wales_centre_for_public_policy",
    "www.naht.org.uk": "national_association_head_teachers",
    "cstuk.org.uk": "charities_supporting_teachers_uk",
    "www.turing.ac.uk": "alan_turing_institute",
    "www.health-ni.gov.uk": "ni_department_of_health",
    "covidandsociety.us1.list-manage.com": "REMOVE",
    "lgiu.org": "local_government_information_unit",
    "www.mirror.co.uk": "daily_mirror",
    "drive.google.com": "REMOVE",
    "www.educationsupport.org.uk": "education_support_charity",
    "publicpolicydesign.blog.gov.uk": "uk_government",
    "epi.us15.list-manage.com": "REMOVE",
    "durhamuniversity.zoom.us": "REMOVE",
    "www.ascl.org.uk": "ascl",
    "www.nurseryworld.co.uk": "nursery_world_magazine",
    "teachingcommission.co.uk": "teaching_commission",
    "acss.org.uk": "academy_of_social_sciences",
    "defenddigitalme.org": "defend_digital_me",
    "www.researchgate.net": "researchgate",
    "crae.org.uk": "children_rights_alliance_england",
    "fairnessfoundation.com": "fairness_foundation",
    "www.durham.ac.uk": "durham_university",
    "www.gse.harvard.edu": "harvard_graduate_school_of_education",
    "politico.us8.list-manage.com": "REMOVE",
    "www.internetmatters.org": "internet_matters",
    "www.parliament.uk": "uk_parliament",
    "ripl.uk": "research_improvement_for_policy_and_learning",
    "www.youtube.com": "youtube",
    "www.coproductioncollective.co.uk": "coproduction_collective",
    "www.britishcouncil.org": "british_academy",
    "www.techuk.org": "tech_uk",
    "localed2025.org.uk": "local_ed_2025",
    "www.lgcplus.com": "local_government_chronicle",
    "lnkd.in": "linkedin",
    "www.economy-ni.gov.uk": "ni_department_for_economy",
    "techbullion.com": "techbullion",
    "ukla.us10.list-manage.com": "REMOVE",
    "educationscape.us4.list-manage.com": "REMOVE",
    "linkprotect.cudasvc.com": "REMOVE",
    "commonslibrary.parliament.uk": "house_of_commons_library",
    "play.wales": "play_wales",
    "www.edtechdigest.com": "edtech_digest",
    "cep.lse.ac.uk": "centre_for_economic_performance_lse",
    "www.elsevier.com": "elsevier",
    "www.express.co.uk": "daily_express",
    "onlinelibrary.wiley.com": "wiley",
    "institute.global": "tony_blair_institute",
    "www.insideedgetraining.co.uk": "inside_edge_training",
    "research.senedd.wales": "welsh_parliament",
    "www.frontiersin.org": "frontiers_journal",
    "openpolicy.blog.gov.uk": "uk_government",
    "www.mmu.ac.uk": "manchester_metropolitan_university",
    "blogs.uwe.ac.uk": "uwe_bristol_blog",
    "www.contractsfinder.service.gov.uk": "uk_government",
    "www.parliament.scot": "scottish_parliament",
    "njmok7zy3oa.typeform.com": "REMOVE",
    "public-api.wordpress.com": "wordpress",
    "www.sciencecampaign.org.uk": "campaign_for_science_and_engineering",
    "www.nottingham.ac.uk": "university_of_nottingham",
    "zoom.us": "REMOVE",
    "www.digit.fyi": "digit_fyi",
    "explore-education-statistics.service.gov.uk": "dfe_education_statistics",
    "www.atkinsrealis.com": "atkins_realis",
    "profbeckyallen.substack.com": "becky_allen_substack",
    "www.lse.ac.uk": "london_school_of_economics",
    "www.uwe.ac.uk": "uew_england",
    "wcpp.us12.list-manage.com": "REMOVE",
    "researchonresearch.org": "research_on_research_institute",
    "www.funding-futures.org": "funding_futures",
    "www.royalacademy.org.uk": "royal_academy",
    "unesdoc.unesco.org": "unesco",
    "srhe.ac.uk": "society_for_research_into_higher_education",
    "nfer.ac.uk": "nfer",
    "www.expressandstar.com": "express_and_star",
    "labour.org.uk": "labour_party",
    "econpapers.repec.org": "repec_econpapers",
    "uclpress.scienceopen.com": "ucl",
    "edarxiv.org": "education_arxiv",
    "mmail.dods.co.uk": "REMOVE",
    "www.unesco.org": "unesco",
    "daily.jstor.org": "jstor_daily",
    "www.jstor.org": "jstor",
    "www.thenhsa.co.uk": "northern_health_science_alliance",
    "www.ambition.org.uk": "ambition_institute",
    "www.fda.org.uk": "fda_union",
    "www.birmingham.ac.uk": "university_of_birmingham",
    "magicsmoke.substack.com": "magicsmoke_substack",
    "thebritishacademyecrn.com": "british_academy",
    "www.edtechinnovationhub.com": "edtech_innovation_hub",
    "link.news.inews.co.uk": "inews",
    "arcinstitute.org": "arc_institute",
    "www.thetimes.com": "the_times",
    "scholar.harvard.edu": "harvard_graduate_school_of_education",
    "www.the-tls.co.uk": "times_literary_supplement",
    "shadowpanel.uk": "shadow_panel_project",
    "www.oxfordschoolofthought.org": "oxford_school_of_thought",
    "www.schoolsappg.org.uk": "all_party_parliamentary_group_schools",
    "educationappg.org.uk": "education_appg",
    "benniekara.substack.com": "bennie_kara_substack",
    "www.twinkl.co.uk": "twinkl",
    "www.eyalliance.org.uk": "early_years_alliance",
    "niot.org.uk": "national_institute_of_teaching",
    "universitas21.com": "universitas_21",
    "ascl.org.uk": "ascl",
    "www.scottishai.com": "scottish_ai",
    "www.rijksoverheid.nl": "dutch_government",
    "samf.substack.com": "samf_substack",
    "londondesignbiennale.com": "london_design_biennale",
    "upp-foundation.org": "upp_foundation",
    "inclusioninpractice.org.uk": "inclusion_in_practice",
    "publications.parliament.uk": "uk_parliament",
    "observer.co.uk": "the_observer",
    "dundee.onlinesurveys.ac.uk": "university_of_dundee_surveys",
    "wonkhe.cmail20.com": "REMOVE",
    "digitalyouthindex.uk": "digital_youth_index",
    "neweconomics.org": "new_economics_foundation",
    "beyth.co.uk": "beyth_consultancy",
    "the-difference.com": "the_difference",
    "www.innovate-ed.uk": "innovate_ed",
    "www.oecd-events.org": "oecd",
    "youtu.be": "youtube",
    "ffteducationdatalab.us12.list-manage.com": "REMOVE",
    "www.centreforsocialjustice.org.uk": "centre_for_social_justice",
    "gamayo.co.uk": "gamayo",
    "profiles.ucl.ac.uk": "ucl",
    "rebeccaallen.co.uk": "rebecca_allen",
    "pod.co": "pod_co_podcast",
    "www.teachfirst.org.uk": "teach_first",
    "edsk.org": "edsk_think_tank",
    "digitalgood.net": "digital_good_network",
    "lnks.gd": "REMOVE",
    "issuu.com": "issuu",
    "adcs.org.uk": "association_of_directors_of_childrens_services",
    "www.labourtogether.uk": "labour_together",
    "downloads2.dodsmonitoring.com": "dods_monitoring",
    "www.besa.org.uk": "british_academy",
    "nepc.colorado.edu": "national_education_policy_center",
    "cipr.co.uk": "chartered_institute_of_public_relations",
    "img1.wsimg.com": "REMOVE",
    "www.leverhulme.ac.uk": "leverhulme_trust",
    "www.ons.gov.uk": "office_for_national_statistics",
    "labourlist.org": "labour_list",
    "www.centreformentalhealth.org.uk": "centre_for_mental_health",
    "lnu-se.zoom.us": "REMOVE",
    "lse.zoom.us": "REMOVE",
    "hechingerreport.org": "hechinger_report",
    "researcheracademy.elsevier.com": "elsevier_researcher_academy",
    "www.smf.co.uk": "social_market_foundation",
    "www.suttontrust.com": "sutton_trust",
    "www.research.net": "REMOVE",
    "y3r710.r.eu-west-1.awstrack.me": "REMOVE",
    "lucaf.org": "lucas_education_foundation",
    "learning.nspcc.org.uk": "nspcc_learning",
    "news.comms.nao.org.uk": "national_audit_office",
    "youthendowmentfund.org.uk": "youth_endowment_fund",
    "www.bigissue.com": "big_issue",
    "demos.co.uk": "demos",
    "links-2.govdelivery.com": "REMOVE",
    "news.chartered.college": "chartered_college_news",
    "www.workinglivesofteachers.com": "working_lives_of_teachers",
    "one.oecd.org": "oecd",
    "blog.policy.manchester.ac.uk": "policy_manchester_blog",
    "twitter.com": "twitter",
    "mcrmetropolis.uk": "manchester_metropolitan_university",
    "gtr.ukri.org": "ukri",
    "teaching-vacancies.service.gov.uk": "dfe_teaching_vacancies",
    "ari.org.uk": "ari_association_for_research_innovation",
    "kingsfundmail.org.uk": "kings_fund",
    "www.mdpi.com": "mdpi_journals",
    "blogs.gov.scot": "scottish_government",
    "parliamentlive.tv": "uk_parliament",
    "e-estonia.com": "e_estonia",
    "us9.campaign-archive.com": "REMOVE",
   "podfollow.com": "podfollow_podcast",
    "nation.cymru": "nation_cymru",
    "www.holyrood.com": "holyrood_magazine",
    "impactedgroup.us22.list-manage.com": "REMOVE",
    "teachersuccess.co.uk": "teacher_success",
    "tpea.ac.uk": "tpea_association",
    "www.fenews.co.uk": "fe_news",
    "www.qmul.ac.uk": "queen_mary_university_london",
    "www.echild.ac.uk": "echild_research_centre",
    "www.standard.co.uk": "evening_standard",
    "x.com": "twitter",
    "goodthingsfoundation.us7.list-manage.com": "REMOVE",
    "www.ocr.org.uk": "ocr_exam_board",
    "ideas.repec.org": "repec_ideas",
    "educationinspection.blog.gov.uk": "ofsted_blog",
    "medium.com": "medium",
    "niot.s3.amazonaws.com": "REMOVE",
    "lxhriqcab.cc.rs6.net": "REMOVE",
    "consult.education.gov.uk": "dfe_consultations",
    "onthinktanks.org": "on_think_tanks",
    "etat.uea.ac.uk": "university_of_east_anglia",
    "ucl.us20.list-manage.com": "REMOVE",
    "www.wsj.com": "wall_street_journal",
    "doi-org.libproxy.ucl.ac.uk": "doi_via_ucl_proxy",
    "www.kcl.ac.uk": "kings_college_london",
    "www.chandlerinstitute.org": "chandler_institute",
    "unige.zoom.us": "REMOVE",
    "insights.taylorandfrancis.com": "taylor_and_francis",
    "www.faircomment.co.uk": "fair_comment",
    "www.sciencedirect.com": "sciencedirect",
    "jacobsfoundation.us11.list-manage.com": "REMOVE",
    "www.barnardos.org.uk": "barnardos",
    "bbc.co.uk": "bbc",
    "fullfact.org": "full_fact",
    "news.sky.com": "sky_news",
    "www.magicbreakfast.com": "magic_breakfast",
    "ow.ly": "twitter",
    "durham.cloud.panopto.eu": "REMOVE",
    "adc.bmj.com": "british_medical_journal",
    "www.nationalcrimeagency.gov.uk": "national_crime_agency",
    "newvisionsforeducation.org.uk": "new_visions_for_education",
    "thestaffcollege.uk": "staff_college",
    "www.yorkshirepost.co.uk": "yorkshire_post",
    "doi.org": "REMOVE",
    "www.bristol.ac.uk": "university_of_bristol",
    "media.actionforchildren.org.uk": "action_for_children",
    "www.pearson.com": "pearson",
    "www.coe.int": "council_of_europe",
    "whatson.parliament.uk": "uk_parliament",
    "www.civilservicejobs.service.gov.uk": "uk_civil_service_jobs",
    "www.ft.com": "financial_times",
    "soundcloud.com": "soundcloud",
    "www.hmc.org.uk": "headmasters_and_headmistresses_conference",
    "app.getresponse.com": "REMOVE",
    "www.ntu.ac.uk": "nottingham_trent_university",
    "ippr-org.files.svdcdn.com": "ippr",
    "www.childreninwales.org.uk": "children_in_wales",
    "nuffieldfoundation.cmail20.com": "REMOVE",
    "www.wired-gov.net": "wired_gov",
    "nuffieldfoundation.cmail19.com": "REMOVE"
}

In [65]:
# Look up each row's link domain in domain_to_org; domains that are not listed
# in the mapping come back as NaN.
organisation_labels = df["domain"].map(domain_to_org)
df["organisation"] = organisation_labels

In [66]:
# Keep only rows whose domain resolved to a real organisation: discard rows with
# no mapping (NaN) as well as those explicitly flagged with the "REMOVE" marker.
has_organisation = df["organisation"].notna()
not_flagged_for_removal = df["organisation"] != "REMOVE"
df = df[has_organisation & not_flagged_for_removal]

In [67]:
# Peek at the first remaining row to confirm the new organisation column
df.iloc[:1]


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation
0,c97ff62f-83ca-47ec-a4c7-b4e24157ae0a,1,11 July 2023,Calls for evidence,,Deadline 23 August 2023,"Education secretary Gillian Keegan has launched a call for evidence on using artificial intelligence (AI) like ChatGPT in schools ""to get the best"" out of the new technology.",https://schoolsweek.co.uk/chatgpt-keegan-launches-call-for-evidence-on-ai-in-schools,political_context_and_organisations,schoolsweek.co.uk,schools_week


In [68]:
# How many distinct (non-missing) organisation labels remain after filtering
unique_orgs = len(df["organisation"].dropna().unique())
print(f"Number of unique organisations: {unique_orgs}")

Number of unique organisations: 245


In [73]:
# Export the alphabetically sorted list of distinct organisation labels to an
# Excel file inside the ingestion folder (data_cleaning_path).
#
# The target path gets its own variable: `output_path` is the notebook-wide
# cleaning output location defined in the loading cell, and rebinding it here
# would silently change where later cells write their results.
unique_orgs_path = os.path.join(data_cleaning_path, "unique_organisations.xlsx")

# Distinct, non-missing organisations in alphabetical order
unique_org_list = sorted(df["organisation"].dropna().unique())

# Wrap in a one-column DataFrame so the spreadsheet gets a header row
unique_org_df = pd.DataFrame(unique_org_list, columns=["organisation"])

# Write without the positional index column
unique_org_df.to_excel(unique_orgs_path, index=False)

print(f"Saved {len(unique_org_list)} unique organisations to {unique_orgs_path}")

Saved 245 unique organisations to /workspaces/ERP_Newsletter/data/1_interim/0_ingestion/unique_organisations.xlsx


In [72]:
# Number of newsletter items per organisation, most frequent first
org_counts = df["organisation"].value_counts()
# Restrict to organisations that appear in at least five items
org_5plus = org_counts.loc[org_counts.ge(5)]
print(org_5plus)

organisation
schools_week                           143
uk_government                           67
ucl                                     54
conversation                            31
guardian                                28
bera                                    26
nfer                                    24
epi                                     24
scottish_government                     21
uk_parliament                           21
welsh_government                        21
oecd                                    18
bera_journals                           17
belfast_telegraph                       16
ni_government                           16
twitter                                 15
tes                                     14
fft_ed_datalab                          14
upen                                    12
university_of_birmingham                12
ukri                                    11
nuffield                                11
british_academy                         1

# Assign categories to the organisations

In [88]:
# Organisation labels grouped by the stakeholder category they are assigned to.
# Each entry is (category, organisation labels); a category may appear more than
# once because later additions were appended as separate groups.
# Keys must match the labels produced by domain_to_org exactly, including the
# upstream spellings "uew_england" and "childrends_participation_in_schools".
_ORGS_BY_CATEGORY = (
    # --- LEGISLATURE / GOVERNMENT ---
    ("Legislature/Government", (
        "all_party_parliamentary_group_schools",
        "council_of_europe",
        "dfe_consultations",
        "dfe_education_statistics",
        "dfe_teaching_vacancies",
        "dods_monitoring",
        "dutch_government",
        "education_scotland",
        "house_of_commons_library",
        "house_of_lords_library",
        "labour_party",
        "liberal_democrats",
        "local_government_chronicle",
        "local_government_information_unit",
        "national_audit_office",
        "national_crime_agency",
        "ni_department_for_economy",
        "ni_department_of_health",
        "ni_government",
        "office_for_national_statistics",
        "post_parliament",
        "scottish_government",
        "scottish_parliament",
        "uk_civil_service_jobs",
        "uk_government",
        "uk_parliament",
        "welsh_government",
        "welsh_parliament",
    )),

    # --- MEDIA ---
    ("Media", (
        "bbc",
        "belfast_telegraph",
        "big_issue",
        "daily_express",
        "daily_mirror",
        "daily_telegraph",
        "digit_fyi",
        "evening_standard",
        "express_and_star",
        "financial_times",
        "guardian",
        "hechinger_report",
        "holyrood_magazine",
        "independent",
        "inews",
        "medium",
        "nation_cymru",
        "nursery_world_magazine",
        "politics_home",
        "schools_week",
        "sky_news",
        "tes",
        "the_observer",
        "the_times",
        "times_literary_supplement",
        "wall_street_journal",
        "wired_gov",
        "wonkhe",
        "yorkshire_post",
    )),

    # Media entries added after the initial pass
    ("Media", (
        "samf_substack",
        "london_design_biennale",
    )),

    # --- KNOWLEDGE PRODUCERS ---
    ("Knowledge Producer", (
        "academy_of_social_sciences",
        "ada_lovelace_institute",
        "alan_turing_institute",
        "arc_institute",
        "ari_association_for_research_innovation",
        "bera",
        "bera_journals",
        "british_academy",
        "british_medical_journal",
        "centre_for_economic_performance_lse",
        "coproduction_collective",
        "doi_via_ucl_proxy",
        "durham_university",
        "echild_research_centre",
        "elsevier",
        "elsevier_researcher_academy",
        "frontiers_journal",
        "harvard_graduate_school_of_education",
        "jstor",
        "jstor_daily",
        "kings_college_london",
        "kings_fund",
        "london_school_of_economics",
        "lse_repository",
        "manchester_metropolitan_university",
        "mdpi_journals",
        "national_education_policy_center",
        "national_institute_of_teaching",
        "nfer",
        "northern_health_science_alliance",
        "nottingham_trent_university",
        "oecd",
        "oii_edtech_equity",
        "oxford_school_of_thought",
        "queen_mary_university_london",
        "repec_econpapers",
        "repec_ideas",
        "research_improvement_for_policy_and_learning",
        "research_on_research_institute",
        "researchgate",
        "royal_academy",
        "sage_journals",
        "sciencedirect",
        "scottish_ai",
        "taylor_and_francis",
        "ucl",
        "universitas_21",
        "university_of_birmingham",
        "university_of_bristol",
        "university_of_dundee_surveys",
        "university_of_east_anglia",
        "university_of_edinburgh",
        "university_of_nottingham",
        "wiley",
    )),

    # Knowledge producers added after the initial pass
    ("Knowledge Producer", (
        "unesco",
        "education_arxiv",
        "n8_research_partnership",
        "ocr_exam_board",
        "uwe_bristol_blog",
    )),

    # --- KNOWLEDGE MOBILISERS ---
    ("Knowledge Mobiliser", (
        "ambition_institute",
        "ascl",
        "atkins_realis",
        "beyth_consultancy",
        "big_education",
        "campaign_for_science_and_engineering",
        "cape_collaboration_for_public_engagement",
        "centre_for_education_and_youth",
        "centre_for_mental_health",
        "centre_for_social_justice",
        "centre_for_young_lives",
        "charities_supporting_teachers_uk",
        "chartered_college_news",
        "chartered_college_of_teaching",
        "chartered_institute_of_public_relations",
        "conversation",
        "defend_digital_me",
        "demos",
        "digital_good_network",
        "digital_poverty_alliance",
        "digital_youth_index",
        "edge_foundation",
        "edsk_think_tank",
        "edtech_digest",
        "edtech_innovation_hub",
        "edtech_strategy_lab",
        "education_appg",
        "education_support_charity",
        "eef",
        "epi",
        "fair_comment",
        "fairness_foundation",
        "fe_news",
        "fe_week",
        "fed",
        "fft_ed_datalab",
        "full_fact",
        "funding_futures",
        "ifg",
        "ifs",
        "impacted_group",
        "inclusion_in_practice",
        "innovate_ed",
        "inside_edge_training",
        "internet_matters",
        "ippo",
        "ippr",
        "issuu",
        "joseph_rowntree_foundation",
        "labour_list",
        "labour_together",
        "local_ed_2025",
        "lucas_education_foundation",
        "national_literacy_trust",
        "nesta",
        "new_economics_foundation",
        "new_visions_for_education",
        "ofsted_blog",
        "on_think_tanks",
        "options_2040_project",
        "oriel_square",
        "pearson",
        "play_wales",
        "policy_manchester_blog",
        "shadow_panel_project",
        "social_market_foundation",
        "society_for_research_into_higher_education",
        "staff_college",
        "sustainable_school_leadership",
        "sutton_trust",
        "teach_first",
        "teacher_success",
        "teacher_tapp",
        "teaching_commission",
        "tech_uk",
        "techbullion",
        "the_difference",
        "tony_blair_institute",
        "tpea_association",
        "transforming_evidence",
        "transforming_society",
        "twinkl",
        "uew_england",
        "upen",
        "upp_foundation",
        "wales_centre_for_public_policy",
        "working_lives_of_teachers",
        "youth_endowment_fund",
        "national_education_union",
        "cct",
    )),

    # Knowledge mobilisers added after the initial pass
    ("Knowledge Mobiliser", (
        "gamayo",
        "rebecca_allen",
        "headmasters_and_headmistresses_conference",
        "nasuwt_teachers_union",
        "chandler_institute",
        "national_association_head_teachers",
        "fda_union",
        "bett_show",
        "hepi",
        "e_estonia",
    )),

    # --- COMMISSIONERS / FUNDERS ---
    ("Commissioner/Funder", (
        "leverhulme_trust",
        "nuffield",
        "ukri",
    )),

    # --- CIVIL SOCIETY / ADVOCACY ---
    ("Civil Society/Advocacy", (
        "5rights_foundation",
        "action_for_children",
        "association_of_directors_of_childrens_services",
        "barnardos",
        "child_poverty_action_group",
        "children_in_wales",
        "children_rights_alliance_england",
        "childrends_participation_in_schools",
        "childrens_commissioner",
        "early_years_alliance",
        "magic_breakfast",
        "nspcc_learning",
        "unicef",
    )),

    # --- SOCIAL MEDIA ---
    ("Social Media", (
        "becky_allen_substack",
        "bennie_kara_substack",
        "linkedin",
        "magicsmoke_substack",
        "pod_co_podcast",
        "podfollow_podcast",
        "soundcloud",
        "spotify_podcast",
        "twitter",
        "wordpress",
        "youtube",
    )),
)

# Flat lookup used downstream: organisation label -> category name
org_to_category = {
    org_label: category_name
    for category_name, org_labels in _ORGS_BY_CATEGORY
    for org_label in org_labels
}


In [89]:
# Attach the stakeholder category for each organisation; labels absent from
# org_to_category come back as NaN.
category_labels = df["organisation"].map(org_to_category)
df["org_category"] = category_labels

In [90]:
# Zero-row slice: displays only the column headers, confirming org_category exists
df.iloc[:0]

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation,org_category


In [93]:
# Rows per category; dropna=False keeps a NaN bucket so uncategorised
# organisations would show up in the tally.
category_column = df["org_category"]
category_column.value_counts(dropna=False)

org_category
Knowledge Mobiliser       279
Media                     253
Knowledge Producer        246
Legislature/Government    191
Civil Society/Advocacy     32
Social Media               32
Commissioner/Funder        23
Name: count, dtype: int64

In [94]:
# Table pairing each organisation in org_5plus with its org_category
frequent_org_rows = df.loc[df["organisation"].isin(org_5plus.index)]

# Take the first category seen per organisation (the mapping assigns exactly
# one category to each organisation label).
category_by_org = frequent_org_rows.groupby("organisation")["org_category"].first()
top_orgs_df = category_by_org.reset_index()

print(top_orgs_df)

                           organisation            org_category
0                    5rights_foundation  Civil Society/Advocacy
1                                   bbc                   Media
2                     belfast_telegraph                   Media
3                                  bera      Knowledge Producer
4                         bera_journals      Knowledge Producer
5                       british_academy      Knowledge Producer
6                                   cct     Knowledge Mobiliser
7         chartered_college_of_teaching     Knowledge Mobiliser
8   childrends_participation_in_schools  Civil Society/Advocacy
9                childrens_commissioner  Civil Society/Advocacy
10                         conversation     Knowledge Mobiliser
11                                  eef     Knowledge Mobiliser
12                                  epi     Knowledge Mobiliser
13                                  fed     Knowledge Mobiliser
14                       fft_ed_datalab 

# Inspect "Title" and "Description" 


In [95]:
# Summarise the two free-text columns: dtypes and non-null counts, explicit
# missing-value totals, then a preview of the first ten rows.
text_columns = df[['title', 'description']]
text_columns.info()
# Only the last expression of a cell is rendered automatically, so the
# missing-value totals must be printed explicitly or they are silently dropped.
print(text_columns.isna().sum())
text_columns.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 1056 entries, 0 to 1185
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1056 non-null   string
 1   description  1056 non-null   string
dtypes: string(2)
memory usage: 24.8 KB


Unnamed: 0,title,description
0,Deadline 23 August 2023,"Education secretary Gillian Keegan has launched a call for evidence on using artificial intelligence (AI) like ChatGPT in schools ""to get the best"" out of the new technology."
1,Revealed: the experts advising ministers on teacher training reforms review,"The Department for Education has appointed an ""external steering group"" to review both the initial teacher training and early career frameworks, first launched in 2019. The group is made up of seven experts who are ""closely familiar"" with both reforms, as well as the ""underpinning evidence"". They will help ""shape the work of the review, scrutinising, supporting and challenging our thinking"", the DfE said."
2,"Reject fewer teacher applicants, DfE tells trainers","Susan Acland-Hood, the DfE's permanent secretary, told providers a 7 per cent jump in applicants this year had not led to an equivalent rise in offers for courses."
3,Ofqual and DfE studying 'feasibility' of 'fully digital' exams,"Some exam boards are already piloting on-screen assessment, but research by AQA last year found teachers' biggest barrier to digital exams was a lack of infrastructure. https://schoolsweek.co.uk/ofqual-and-dfe-studying-feasibility-of-fully-digital-exams/"
4,Revealed: The full details of Labour's education 'mission',"Entitled 'Breaking down the barrier to opportunity', Labour will 'revise delivery' of the ECF and give more details on the plan to simplify the system of teacher incentives. Full mission document here - https://schoolsweek.co.uk/wp-content/uploads/2023/07/Labour-breaking-down-barriers-document.pdf"
5,Lib Dem,"Munira Wilson, Lib Dem spokesperson for education, is currently drawing up what she says is a ""strong education offer"" in the Lib Dem manifesto. Details seem thin on the ground so far. https://schoolsweek.co.uk/munira-wilson-lib-dem-education-spokesperson/"
6,Teacher retention commission: 8 proposals to stem exodus,"Teacher wellbeing chari ty Education Support has put forward a list of proposals to boost retention in the sector (published in partnership with Public First.) Report calls for review of teacher hours, retention targets and sabbaticals for headteachers every five years."
7,Who's supporting school leaders to stop them hitting crisis point?,"Recruitment and retention difficulties, Ofsted pressures, and the dismantling of other public services is leaving heads, as Executive head Sara-Jane Bake puts it, ""exhausted trying to keep all the plates spinning. We are so busy looking after everybody else ‚Äì but we need looking after too."""
8,Digital Poverty Alliance,A charity whose vision is for everyone to access the life changing benefits that digital brings. Homepage - https://digitalpovertyalliance.org/
9,A long read from Nuffield funded research project ' Advancing Leadership Development in Early Years Education via Digitally Mediated Professional Learning',"In brief, the report finds:"


In [96]:
# Character length of each text field, stored as <column>_length
for text_col in ("title", "description"):
    df[f"{text_col}_length"] = df[text_col].str.len()

# Summary statistics for both length columns
df[["title_length", "description_length"]].describe()

Unnamed: 0,title_length,description_length
count,1056.0,1056.0
mean,83.770833,209.854167
std,40.58838,165.502571
min,6.0,5.0
25%,60.0,106.0
50%,77.0,174.0
75%,100.0,267.25
max,482.0,1478.0


In [100]:
# Flag unusually long titles (more than 50 whitespace-separated words)
title_words = df["title"].apply(lambda text: len(str(text).split()))
df["title_word_count"] = title_words

long_titles = df.loc[title_words > 50]
long_titles.head()




Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation,org_category,title_length,description_length,title_word_count
117,97a6ef72-0d9d-4e1f-b626-98dfad5181a5,10,09 October 2023,Programme news,,"The ESRC's Work, Education and Skills (WES) team are exploring the potential shape of a new, forward-looking education research agenda and are very keen to hear from the academic, policy, and practice communities about what you think will be the big persistent and future challenges over the coming decades in the following priority areas:","¬∑ Societal impacts on educational provision and learner experience ¬∑ Educational inequalities ¬∑ Special Educational Needs and Disability ¬∑ Skills for life For full details and the opportunity to submit your ideas , here is the link to the horizon scanning survey :",https://engagementhub.ukri.org/esrc-1/weshorizonscanningsurvey,project_updates,engagementhub.ukri.org,ukri,Commissioner/Funder,339,264,54
412,a27ecb62-6146-4ebd-815f-157df254f616,35,10 May 2024,Political landscape & key organisations,,"The DfE is inviting applications to its newly formed Science Advisory Council (SAC) including from those with expertise in AI and education technology, sustainable and secure school buildings and adapting to climate change, and physical and mental health challenges. The group will support the Department's Chief Scientific Adviser, ensuring that DfE has access to cutting-edge scientific evidence, analytical approaches, and expertise for robust, evidence-informed decision making.",Applications close 27 May More,https://www.civilservicejobs.service.gov.uk/csr/jobs.cgi?jcode=1907685&csource=csalerts,political_context_and_organisations,www.civilservicejobs.service.gov.uk,uk_civil_service_jobs,Legislature/Government,482,30,66
474,0c3aa38a-3031-4986-a05d-2eb822939a85,40,21 June 2024,"Teacher recruitment, retention & development",,"EPI blog - Blog: The workforce challenges facing an incoming government - this blog post argues that ""retention problems persist, leaders are leaving the profession at a growing rate, and recruitment remains a challenge. On the other hand, an improved pay settlement, the sustained retention of early career teachers, and the increase in returners to the profession offer reasons for hope.""",By James Zuccollo,https://epi.org.uk/publications-and-research/blog-the-workforce-challenges-facing-an-incoming-government,teacher_rrd,epi.org.uk,epi,Knowledge Mobiliser,390,17,61
759,8446df04-fc2b-4bd7-96b4-bcede957a2c0,60,24 January 2025,EdTech,,"The Chartered College of Teaching will pilot setting up the EdTech Evidence Board to ""explore how we effectively build evidence of AI products that work well, helping education settings feel confident that they are choosing products that work well for them and for their classrooms"" You can read more about how they will approach this here.",Mor e,https://chartered.college/2025/01/22/supporting-effective-education-through-education-technology,digital_ed,chartered.college,chartered_college_of_teaching,Knowledge Mobiliser,340,5,56
897,b7c2c85e-de2b-4483-93ea-0f395f3d7193,69,28 March 2025,Updates from the programme,,The ERP's double symposium entitled: ' Doing Policy Relevant Research: Using knowledge mobilisation and knowledge exchange strategies to translate findings into actionable insights.' Part One. Technology in education and Part Two. Teachers and Teaching will be presented at BERA 2025. The symposia will combine insights from seven projects in the programme.,The ERP have created a new Knowledge Exchange Resource Hub . This brings together a range of resources we are aware of that are designed to support research engagement with different stakeholders. We welcome other suggestions. Please get in touch if you are aware of other resources that we are missing.,https://www.ucl.ac.uk/education-research-programme/knowledge-exchange-resource-hub,project_updates,www.ucl.ac.uk,ucl,Knowledge Producer,357,303,51


In [102]:
# Inspect descriptions with fewer than 10 words.
# The sample shown below includes descriptions that are bare URLs or short
# fragments, which is why these rows are surfaced for manual review.
SHORT_DESCRIPTION_MAX_WORDS = 10

# Vectorised whitespace split. Missing descriptions are filled with '' first so
# they count as 0 words; going through str(x) would turn a missing value into a
# one-word placeholder token and report a count of 1.
df['description_word_count'] = (
    df['description'].fillna('').str.split().str.len().astype('int64')
)

short_descriptions = df[df['description_word_count'] < SHORT_DESCRIPTION_MAX_WORDS]

short_descriptions.head()


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation,org_category,title_length,description_length,title_word_count,description_word_count
9,56bbc37a-4fc1-4906-a7fa-2b55c664cff7,1,11 July 2023,Thematic roundup,Digital,A long read from Nuffield funded research project ' Advancing Leadership Development in Early Years Education via Digitally Mediated Professional Learning',"In brief, the report finds:",https://www.nuffieldfoundation.org/wp-content/uploads/2021/10/Project-Report-Advancing-Ear-Years-Leadership-Development.pdf,digital_ed,www.nuffieldfoundation.org,nuffield,Commissioner/Funder,155,27,21,5
11,b83f3825-767b-4f21-90aa-35e36ed27a11,2,16 July 2023,PI Updates and Papers,Digital,Cutting through the conjecture: How is EdTech really being used in our classrooms?,Full post - https://edtech.oii.ox.ac.uk/cutting-through-the-conjecture/,https://edtech.oii.ox.ac.uk/cutting-through-the-conjecture,digital_ed,edtech.oii.ox.ac.uk,oii_edtech_equity,Knowledge Producer,82,71,13,4
14,3187712e-c9ee-4afb-b71c-338f775c499b,2,16 July 2023,Events,,Date: Wednesday 19 th July 9.30 ‚Äì 11.30,https://epi.org.uk/events/education-priorities-for-the-next-general-election/,https://epi.org.uk/events/education-priorities-for-the-next-general-election,events_opportunities_research,epi.org.uk,epi,Knowledge Mobiliser,39,77,8,1
24,e1f9ff04-4b3a-48da-b7db-82a12c2eb364,2,16 July 2023,PI Updates and Papers,Leadership,A three-part series on leadership from Toby Greany and team on TES:,Part 1: Headteacher recruitment crisis: 5 tips for action,https://www.tes.com/magazine/leadership/staff-management/headteacher-recruitment-crisis-applications,project_updates,www.tes.com,tes,Media,67,57,12,9
38,ee262567-2f7a-417c-bae2-e24cfb5a121b,3,20 July 2023,Seminar topics,,FFT Data Labs - Should we redefine persistent absence?,https://ffteducationdatalab.org.uk/2023/07/should-we-redefine-persistent-absence/,https://ffteducationdatalab.org.uk/2023/07/should-we-redefine-persistent-absence,project_updates,ffteducationdatalab.org.uk,fft_ed_datalab,Knowledge Mobiliser,54,81,9,1


#### Create 'Text' Variable = 'Title' + 'Description'

In [103]:
# Create "Text" variable = "Title" + "Description"
# Missing parts are replaced with '' before joining; the final strip removes the
# separator space that would otherwise be left dangling when either side is empty.
df['text'] = (
    df['title'].fillna('') + ' ' + df['description'].fillna('')
).str.strip()

In [104]:
# Basic info on the new column.
# Series.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df['text'].info()

# Add columns for text length (characters and whitespace-separated words)
df['text_length_chars'] = df['text'].str.len()
df['text_length_words'] = df['text'].str.split().str.len()

# Summary statistics
print("\nCharacter length stats:")
print(df['text_length_chars'].describe())

<class 'pandas.core.series.Series'>
Index: 1056 entries, 0 to 1185
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
1056 non-null   string
dtypes: string(1)
memory usage: 16.5 KB
None

Character length stats:
count        1056.0
mean        294.625
std      167.098891
min            52.0
25%           185.0
50%           256.0
75%           352.0
max          1544.0
Name: text_length_chars, dtype: Float64


In [105]:
# Flag rows whose combined text is unusable: either missing or whitespace-only
text_is_blank = df['text'].str.strip() == ''
missing_mask = df['text'].isna() | text_is_blank

# Report the number of flagged rows
missing_count = missing_mask.sum()
print(f"Missing or empty 'text' entries: {missing_count}")

# Show a sample of the offending title/description pairs, if there are any
if missing_count > 0:
    preview_cols = ['title', 'description']
    print(df.loc[missing_mask, preview_cols].head())


Missing or empty 'text' entries: 0


In [106]:
# Review column dtypes and non-null counts now that the text columns have been added
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1056 entries, 0 to 1185
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      1056 non-null   string
 1   newsletter_number       1056 non-null   int64 
 2   issue_date              1056 non-null   string
 3   theme                   1056 non-null   string
 4   subtheme                64 non-null     string
 5   title                   1056 non-null   string
 6   description             1056 non-null   string
 7   link                    1056 non-null   string
 8   new_theme               1056 non-null   object
 9   domain                  1056 non-null   object
 10  organisation            1056 non-null   object
 11  org_category            1056 non-null   object
 12  title_length            1056 non-null   Int64 
 13  description_length      1056 non-null   Int64 
 14  title_word_count        1056 non-null   int64 
 15  descripti

In [107]:
# List the column names currently present in the frame
df.columns

Index(['id', 'newsletter_number', 'issue_date', 'theme', 'subtheme', 'title',
       'description', 'link', 'new_theme', 'domain', 'organisation',
       'org_category', 'title_length', 'description_length',
       'title_word_count', 'description_word_count', 'text',
       'text_length_chars', 'text_length_words'],
      dtype='object')

# Save Files 

In [111]:
output_path = "/workspaces/ERP_Newsletter/data/1_interim/1_cleaning"

# Create output directory ONCE
os.makedirs(output_path, exist_ok=True)

# 1) Save full cleaned dataset (index=False keeps the row index out of the CSV)
data_cleaned_path = os.path.join(output_path, "data_cleaned.csv")
df.to_csv(data_cleaned_path, index=False)
print(f"‚úÖ Saved full cleaned dataset ‚Üí {data_cleaned_path}")

# 2) Save subset for preprocessing
cols_for_preprocessing = [
    "id",
    "newsletter_number",
    "issue_date",
    "new_theme",
    "organisation",
    "org_category",
    "text",
    "domain"
]

# The export stays best-effort (only columns that exist in df are written), but
# any requested column that is absent is reported instead of being dropped
# silently, so a missing column is noticed before the next stage reads the file.
existing_cols = [c for c in cols_for_preprocessing if c in df.columns]
missing_cols = [c for c in cols_for_preprocessing if c not in df.columns]
if missing_cols:
    print(f"WARNING: columns not found in df and omitted from the preprocessing dataset: {missing_cols}")

# .copy() makes the subset an independent frame rather than a view of df
df_preproc = df[existing_cols].copy()

data_preproc_path = os.path.join(output_path, "data_for_preprocessing.csv")
df_preproc.to_csv(data_preproc_path, index=False)

print(f"‚úÖ Saved preprocessing dataset ‚Üí {data_preproc_path}")


‚úÖ Saved full cleaned dataset ‚Üí /workspaces/ERP_Newsletter/data/1_interim/1_cleaning/data_cleaned.csv
‚úÖ Saved preprocessing dataset ‚Üí /workspaces/ERP_Newsletter/data/1_interim/1_cleaning/data_for_preprocessing.csv
