In [1]:
import os 
import re 

import pandas as pd 
import numpy as np 

from ftfy import fix_text
import unicodedata as ud
from urllib.parse import urlparse

In [2]:
# Treat these text tokens as missing on read
NA_TOKENS = ["", " ", "NA", "N/A", "na", "NaN", "nan", "null", "NULL", "-"]

In [3]:
#load data 
input_path = "/workspaces/ERP_Newsletter/data/data01_newsletter_items/newsletter_items.csv"
data_cleaning_path = "/workspaces/ERP_Newsletter/data/data02_cleaning"
output_path = "/workspaces/ERP_Newsletter/data/data03_newsletter_items_clean"


df = pd.read_csv(input_path, keep_default_na=True, na_values=NA_TOKENS)

In [4]:
#inspect 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1668 non-null   object
 1   newsletter_number  1668 non-null   int64 
 2   issue_date         1668 non-null   object
 3   theme              1668 non-null   object
 4   subtheme           114 non-null    object
 5   title              1667 non-null   object
 6   description        1346 non-null   object
 7   link               1616 non-null   object
dtypes: int64(1), object(7)
memory usage: 104.4+ KB


In [5]:
print(f"Total rows: {len(df)}")
print(f"Unique newsletter: {df['newsletter_number'].nunique()}")

Total rows: 1668
Unique newsletter: 87


#¬†Clean Up Text

In [6]:
def clean_series(s: pd.Series) -> pd.Series:
    # Use pandas "string" dtype so NaNs stay as <NA>
    s = s.astype("string")
    mask = s.notna()
    # Fix mojibake and normalize only on non-missing cells
    s.loc[mask] = s.loc[mask].apply(fix_text)
    s.loc[mask] = s.loc[mask].apply(lambda x: ud.normalize("NFKC", x))
    # Basic whitespace cleanup
    s.loc[mask] = s.loc[mask].str.replace(r"\s+", " ", regex=True).str.strip()
    return s

# Clean all object/string columns (quick and safe)
obj_cols = [c for c in df.columns if df[c].dtype == object or pd.api.types.is_string_dtype(df[c])]
for c in obj_cols:
    df[c] = clean_series(df[c])

# Quick exact replacements for the most common artifacts (optional, simple)
REPL = {
    "√Ç ": " ", "√Ç": "",
    "‚Äö√Ñ√¨": "‚Äì", "‚Äö√Ñ√Æ": "‚Äî",
    "‚Äö√Ñ√¥": "‚Äô", "‚Äö√Ñ√≤": "‚Äò",
    "‚Äö√Ñ√∫": "‚Äú", "‚Äö√Ñ√π": "‚Äù",
    "√¢‚Ç¨‚Äú": "‚Äì", "√¢‚Ç¨‚Äù": "‚Äî",
    "√¢‚Ç¨Àú": "‚Äò", "√¢‚Ç¨‚Ñ¢": "‚Äô",
    "√¢‚Ç¨≈ì": "‚Äú", "√¢‚Ç¨\x9d": "‚Äù",
    "√¢‚Ç¨¬¢": "‚Ä¢", "√¢‚Ç¨¬¶": "‚Ä¶",
}
for c in obj_cols:
    s = df[c].astype("string")
    for bad, good in REPL.items():
        s = s.str.replace(bad, good, regex=False)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    df[c] = s

# Check for Missing Values 

In [7]:
def missing_table(d: pd.DataFrame) -> pd.DataFrame:
    mc = d.isna().sum()
    return pd.DataFrame({
        "Missing Values": mc,
        "Percentage (%)": (mc / len(d)) * 100
    }).sort_values("Missing Values", ascending=False)

print("\n=== Missing values (before drop) ===")
print(missing_table(df))


=== Missing values (before drop) ===
                   Missing Values  Percentage (%)
subtheme                     1554       93.165468
description                   322       19.304556
link                           52        3.117506
title                           1        0.059952
theme                           0        0.000000
issue_date                      0        0.000000
newsletter_number               0        0.000000
id                              0        0.000000


#¬†Remove items where description, link or title are missing

In [8]:
# Remove rows where 'description' or 'link' is missing
df_cleaned = df.dropna(subset=['description', 'link', 'title'])

# (Optional) Check how many rows remain
print(f"Rows before: {len(df)}")
print(f"Rows after : {len(df_cleaned)}")

df = df_cleaned

Rows before: 1668
Rows after : 1323


#¬†Check for Duplicates 

###¬†All rows identical 

In [9]:
#All rows identical 
total_duplicates = df.duplicated().sum()
print(f"Total duplicate rows (all columns identical): {total_duplicates}")

Total duplicate rows (all columns identical): 0


### Title and link identical 

In [10]:
# Check duplicates where both title and link are the same
title_link_dupes = df[df.duplicated(subset=["title", "link"], keep=False)]

print(f"Number of duplicate title+link pairs: {title_link_dupes.shape[0]}")
title_link_dupes.sort_values(by=["title"]).head(2)

Number of duplicate title+link pairs: 89


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1301,bf6c4fd6-a5bd-48ca-9249-b5b92849e038,70,4 April 2025,Updates from the programme,,A reminder that the ESRC Education Research Pr...,"AI in Education: From chalkboards to chatbots,...",https://uk.bettshow.com/speakers/dominik-lukes
1327,0ad9176d-2b5a-4306-a9f3-2b4ffdf96be6,71,11 April 2025,Updates from the programme,,A reminder that the ESRC Education Research Pr...,"AI in Education: From chalkboards to chatbots,...",https://uk.bettshow.com/speakers/dominik-lukes


In [11]:
title_link_dupes.theme.value_counts()

theme
Updates from the programme                                                                                                                                                                                                    35
You have indicated that you are happy to receive news and updates from the ESRC Education Research Programme. To unsubscribe, please email Elizabeth.hudson@ucl.ac.uk with the word UNSUBSCRIBE in the title of the email.    28
What Matters in Education?                                                                                                                                                                                                     8
Updates from the Programme                                                                                                                                                                                                     5
Update from the ESRC Education Research Programme                                             

In [12]:
title_link_dupes[title_link_dupes.theme == "Teacher recruitment, retention & development"]

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
992,8991c36b-65fb-4011-9249-8f5917d32a0e,56,6 December 2024,"Teacher recruitment, retention & development",,DfE - Working lives of teachers and leaders: w...,A summary report of early findings from the th...,https://www.gov.uk/government/publications/wor...
1547,ac53ae25-d785-47b2-abd5-63d8e9583cbb,82,11 July 2025,"Teacher recruitment, retention & development",,DfE - Working lives of teachers and leaders: w...,Findings from the third wave of the working li...,https://www.gov.uk/government/publications/wor...


In [13]:
#drop duplicates keeping only first occurence 
df = df.drop_duplicates(subset=["title", "link"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1258 non-null   string
 1   newsletter_number  1258 non-null   int64 
 2   issue_date         1258 non-null   string
 3   theme              1258 non-null   string
 4   subtheme           85 non-null     string
 5   title              1258 non-null   string
 6   description        1258 non-null   string
 7   link               1258 non-null   string
dtypes: int64(1), string(7)
memory usage: 78.8 KB


### Title only duplicates

In [14]:
# Count duplicates based on title only
title_dupes = df[df.duplicated(subset=["title"], keep=False)]

print(f"Number of rows with duplicate titles: {title_dupes.shape[0]}")
title_dupes.sort_values(by="title").head(1)

Number of rows with duplicate titles: 20


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1240,15f17205-bafc-43d8-ba3d-b4881956e48b,87,10 October 2025,Updates from the Programme,,Addressing key issues in teacher recruitment a...,Catch up with the video of the latest in the W...,https://mediacentral.ucl.ac.uk/Play/126585


In [15]:
title_table = title_dupes[["title", "theme"]].value_counts().reset_index(name="count")
title_table

Unnamed: 0,title,theme,count
0,Making Teaching Attractive and Worthwhile (Par...,Project news,3
1,Deadline: 28 April 2025,Political environment and key organisations,2
2,What matters in education? Education after the...,Updates from the programme,2
3,Panel:,Updates from the programme,2
4,Addressing key issues in teacher recruitment a...,Updates from the Programme,2
5,What matters in education? Education in a brok...,Updates from the programme,2
6,Labour,Political landscape & key organisations,1
7,Digital Poverty Alliance,EdTech,1
8,Digital Poverty Alliance,Thematic roundup,1
9,Panel:,"Teacher recruitment, retention & development",1


In [16]:
df = df.drop_duplicates(subset=["title"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247 entries, 0 to 1246
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1247 non-null   string
 1   newsletter_number  1247 non-null   int64 
 2   issue_date         1247 non-null   string
 3   theme              1247 non-null   string
 4   subtheme           84 non-null     string
 5   title              1247 non-null   string
 6   description        1247 non-null   string
 7   link               1247 non-null   string
dtypes: int64(1), string(7)
memory usage: 78.1 KB


### link-only duplicates 

In [17]:
# Count duplicates based on link only
link_dupes = df[df.duplicated(subset=["link"], keep=False)]

print(f"Number of rows with duplicate links: {link_dupes.shape[0]}")
link_dupes.sort_values(by="link").head(1)

Number of rows with duplicate links: 114


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
450,17268e59-d380-4e8f-a2e7-964543717f17,35,10 May 2024,What matters in education?,,Big Education conference - 'Next Generation Sc...,Hear from schools across the country who are w...,https://bigeducation.org/product/next-generati...


In [18]:
pd.set_option("display.max_colwidth", None)

link_table = link_dupes[["link"]].value_counts().reset_index(name="count")
link_table

Unnamed: 0,link,count
0,https://www.ucl.ac.uk/education-research-programme/events/2023/oct/practical-policies-or-bright-ideas-how-particular-topics-get-front-policy-queue,4
1,https://www.ucl.ac.uk/education-research-programme/events/2024/mar/investing-early-years-priorities-and-challenges,4
2,https://uk.bettshow.com/speakers/dominik-lukes,3
3,https://www.ucl.ac.uk/education-research-programme/events/2024/jan/pupil-absence-questions-policy-research-and-practice,3
4,https://childrens-participation.org/,3
5,https://www.ucl.ac.uk/education-research-programme/events/2025/may/how-build-resilient-schools-place-based-approaches-supporting-teachers-and-leaders,3
6,https://edtech.oii.ox.ac.uk/cutting-through-the-conjecture,2
7,https://engagementhub.ukri.org/esrc-1/weshorizonscanningsurvey,2
8,https://education.us18.list-manage.com/track/click?u=61f408a2f9c6d02a726ce6200&id=bea3b5fbac&e=4eb2cf985e,2
9,https://epi.org.uk/events/labour-party-conference-prioritising-equality-education-policy-as-a-lever-to-tackling-disadvantage-and-inequalities,2


In [19]:
df = df.drop_duplicates(subset=["link"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186 entries, 0 to 1185
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1186 non-null   string
 1   newsletter_number  1186 non-null   int64 
 2   issue_date         1186 non-null   string
 3   theme              1186 non-null   string
 4   subtheme           80 non-null     string
 5   title              1186 non-null   string
 6   description        1186 non-null   string
 7   link               1186 non-null   string
dtypes: int64(1), string(7)
memory usage: 74.3 KB


# Identify themes and subthemes

In [20]:
#Unique counts of columns 
print("Unique titles:", df["title"].nunique())
print("Unique themes:", df["theme"].nunique())
print("Unique subthemes", df["subtheme"].nunique())
print("Unique links:", df["link"].nunique())

Unique titles: 1186
Unique themes: 62
Unique subthemes 35
Unique links: 1186


In [21]:
### Add placeholders for missing themes/subhtemes

# 1) Normalize empties/whitespace/"nan"/"none" to real NA
df_norm = df.copy()
for col in ["theme", "subtheme"]:
    df_norm[col] = (
        df_norm[col]
        .astype("string")
        .replace(r"^\s*$", pd.NA, regex=True)   # empty/whitespace ‚Üí NA
        .replace({"nan": pd.NA, "NaN": pd.NA, "None": pd.NA, "none": pd.NA})
    )

# 2) Create a version that fills NA with placeholders so ALL cases are counted
df_filled = df_norm.fillna({"theme": "No theme", "subtheme": "No subtheme"})

# 3) Group and count every (theme, subtheme) combo, including placeholder cases
theme_subtheme_counts = (
    df_filled
    .groupby(["theme", "subtheme"], dropna=False)
    .size()
    .reset_index(name="count")
    .sort_values(by=["theme", "subtheme"])
)

# 4) Export to Excel  
out_dir = data_cleaning_path
out_path = os.path.join(out_dir, "theme_subtheme_counts.xlsx")


theme_subtheme_counts.to_excel(out_path, index=False)  # <- this one
print(f"‚úÖ Exported {len(theme_subtheme_counts)} rows to {out_path}")

‚úÖ Exported 99 rows to /workspaces/ERP_Newsletter/data/data02_cleaning/theme_subtheme_counts.xlsx


# Check Themes and Articles 

In [22]:
# Filter articles under themes

check_themes = df[df["theme"] == "Research ‚Äì Practice ‚Äì Policy"].copy()

# View a few examples
display(check_themes.head(0))

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link


# Rename Themes

In [23]:
# ---------- 0) Drop rows where the entire theme is the unsubscribe text
UNSUB_THEME = (
    "You have indicated that you are happy to receive news and updates from the "
    "ESRC Education Research Programme. To unsubscribe, please email "
    "Elizabeth.hudson@ucl.ac.uk with the word UNSUBSCRIBE in the title of the email."
)
mask_unsub = df["theme"].astype(str).str.strip().eq(UNSUB_THEME)
dropped_rows = int(mask_unsub.sum())
df = df[~mask_unsub].copy()

# ---------- 1) Normalizers
def norm_theme(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    s = s.replace("‚Äî", "-").replace("‚Äì", "-")  # normalize dashes
    s = s.replace("‚Äô", "'").replace("‚Äò", "'").replace("‚Äú", '"').replace("‚Äù", '"')
    return s.lower()

def norm_key(s: str) -> str:
    """Strong normalizer for matching keys like subthemes:
       - lowercase; & -> and; remove punctuation; normalize dashes; collapse spaces
    """
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    s = s.replace("‚Äî", " ").replace("‚Äì", " ").replace("-", " ")
    s = s.replace("&", " and ")
    s = s.replace("‚Äô", "'").replace("‚Äò", "'").replace("‚Äú", '"').replace("‚Äù", '"')
    s = re.sub(r"[,\.\u00A0]", " ", s)         # remove commas, periods, NBSP
    s = re.sub(r"[^a-z0-9\s]", " ", s)         # drop other punctuation
    s = re.sub(r"\s+", " ", s).strip()
    return s

# ---------- 2) Theme mapping list: (new_theme, current_theme)
pairs = [
    # ============ erp_project ============
    ("erp_project", "Embedding children's participation rights in pedagogical practice in lower primary classrooms in Wales PI: Sarah Chicken"),
    ("erp_project", "Investigating the recruitment and retention of ethnic minority teachers PI: Stephen Gorard"),
    ("erp_project", "News from the Projects"),
    ("erp_project", "News from the projects"),
    ("erp_project", "PI Updates and Papers"),
    ("erp_project", "PI: David Lundie"),
    ("erp_project", "Programme news"),
    ("erp_project", "Programme Update"),
    ("erp_project", "Programme update"),
    ("erp_project", "Project news"),
    ("erp_project", "Rethinking teacher recruitment: New approaches to attracting prospective STEM teachers PI: Rob Klassen"),
    ("erp_project", "Sustainable school leadership: comparing approaches to the training, supply and retention of senior school leaders across the UK PI Toby Greany"),
    ("erp_project", "Toby Greany"),
    ("erp_project", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Professor Rebecca Eynon"),
    ("erp_project", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Rebecca Eynon"),
    ("erp_project", "Update from the ERP projects"),
    ("erp_project", "Update from the ESRC Education Research Programme"),
    ("erp_project", "Update from the projects"),
    ("erp_project", "Updates from David Lundie"),
    ("erp_project", "Updates from Steph Ainsworth"),
    ("erp_project", "Updates from the ERP projects"),
    ("erp_project", "Updates from the ESRC"),
    ("erp_project", "Updates from the Programme"),
    ("erp_project", "Updates from the programme"),
    ("erp_project", "Updates from the projects"),
    ("erp_project", "Decentring the 'resilient teacher': exploring interactions between individuals and their social ecologies PI: Steph Ainsworth"),
    ("erp_project", "Peer reviewed articles from the ERP projects"),
    ("erp_project", "Peer reviewed publications from the ERP projects"),

    # ============ what_matters_ed ============
    ("what_matters_ed", "What Matters in Education?"),
    ("what_matters_ed", "What matters in education?"),

    # ============ teacher_rrd ============
    ("teacher_rrd", "Teacher recruitment, retention & development"),

    # ============ edtech ============
    ("edtech", "EdTech"),

    # ============ four_nations ============
    ("four_nations", "4 Nations"),
    ("four_nations", "4 Nations & key organisations"),
    ("four_nations", "Four Nations"),
    ("four_nations", "Four Nations Landscape"),
    ("four_nations", "Four Nations landscape"),
    ("four_nations", "Political landscape across Four Nations & key organisations"),

    # ============ ppr (Policy‚ÄìPractice‚ÄìResearch) ============
    ("ppr", "Research ‚Äì Practice ‚Äì Policy"),
    ("ppr", "Education, Policy & Practice"),

    # ============ events_opportunities_research ============
    ("events_opportunities_research", "Conferences"),
    ("events_opportunities_research", "Opportunities"),
    ("events_opportunities_research", "Opportunities for funding"),
    ("events_opportunities_research", "Opportunities to blog"),
    ("events_opportunities_research", "Other Reports"),
    ("events_opportunities_research", "Other Research"),
    ("events_opportunities_research", "Relevant Events"),
    ("events_opportunities_research", "Relevant Research"),
    ("events_opportunities_research", "Reports"),
    ("events_opportunities_research", "Research"),
    ("events_opportunities_research", "Events"),
    ("events_opportunities_research", "Seminar series topics"),
    ("events_opportunities_research", "Seminar topics"),

    # ============ political_environment_key_organisations ============
    ("political_environment_key_organisations", "What are the politicians saying?"),
    ("political_environment_key_organisations", "Political environment and key organisations"),
    ("political_environment_key_organisations", "Political landscape - the election"),
    ("political_environment_key_organisations", "Political landscape & key organisations"),
    ("political_environment_key_organisations", "Calls for evidence"),
    ("political_environment_key_organisations", "DfE"),
    ("political_environment_key_organisations", "EEF"),
    ("political_environment_key_organisations", "ESRC"),
    ("political_environment_key_organisations", "Politics"),
    ("political_environment_key_organisations", "Launch of ESRC survey on social science research skills"),
    ("political_environment_key_organisations", "Updates from UKRI"),
    ("political_environment_key_organisations", "Update from UKRI"),
]

# ---------- 3) Build lookup (normalized)
lookup = {norm_theme(curr): new for new, curr in pairs}

# ---------- 4) Apply theme mapping (no fill yet)
theme_norm = df["theme"].map(norm_theme)
df["new_theme"] = theme_norm.map(lookup)

# ---------- 4b) Defensive keyword overrides (force correct bucket if text contains patterns)
kw_four_nations = theme_norm.str.contains(r"\b(4|four) nations\b", regex=True, na=False)
df.loc[kw_four_nations, "new_theme"] = "four_nations"

kw_ukri = theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)
df.loc[kw_ukri, "new_theme"] = "political_environment_key_organisations"

# ---------- 5) Subtheme-based overrides
sub_norm = df["subtheme"].map(norm_key)

target_rrd = "teacher recruitment retention and development"
df.loc[sub_norm.eq(target_rrd), "new_theme"] = "teacher_rrd"  # any variant mapped earlier ‚Üí normalized equals this
df.loc[sub_norm.eq("digital"), "new_theme"] = "edtech"

# ---------- 6) Fill any remaining unmapped with the original theme text
df["new_theme"] = df["new_theme"].fillna(df["theme"])

# ---------- 7) Export a summary
summary = (
    df.assign(theme_norm=theme_norm, subtheme_norm=sub_norm)
      .groupby(["new_theme", "theme_norm"], dropna=False)
      .size()
      .reset_index(name="count")
      .sort_values(["new_theme", "count"], ascending=[True, False])
)

out_dir = data_cleaning_path
summary_path = os.path.join(out_dir, "theme_mapping_summary.xlsx")

with pd.ExcelWriter(summary_path) as xw:
    df.to_excel(xw, sheet_name="data_with_new_theme", index=False)
    summary.to_excel(xw, sheet_name="mapping_summary", index=False)

print(f"‚úÖ Dropped {dropped_rows} unsubscribe row(s).")
print("‚úÖ Mapping applied with canonical themes.")
print("üìÑ Excel written to:", summary_path)


  kw_four_nations = theme_norm.str.contains(r"\b(4|four) nations\b", regex=True, na=False)
  kw_ukri = theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)


‚úÖ Dropped 0 unsubscribe row(s).
‚úÖ Mapping applied with canonical themes.
üìÑ Excel written to: /workspaces/ERP_Newsletter/data/data02_cleaning/theme_mapping_summary.xlsx


In [24]:
# ---------- 7) View unique new_theme values and their counts
theme_counts = (
    df["new_theme"]
    .value_counts(dropna=False)
    .reset_index()
    .rename(columns={"index": "new_theme", "new_theme": "count"})
)

print("üß≠ Unique new_theme values and their counts:")
print(theme_counts)

üß≠ Unique new_theme values and their counts:
                                     count  count
0  political_environment_key_organisations    221
1                          what_matters_ed    184
2                              teacher_rrd    178
3                                   edtech    166
4                                      ppr    147
5                              erp_project    144
6                             four_nations    112
7            events_opportunities_research     34


In [25]:
#save dataset with all themes 
all_clean_path = "/workspaces/ERP_Newsletter/data/data03_newsletter_items_clean/items_all_themes.csv"
df.to_csv(all_clean_path, index=False)

# keep six themes - 

In [26]:
df.new_theme.value_counts()

new_theme
political_environment_key_organisations    221
what_matters_ed                            184
teacher_rrd                                178
edtech                                     166
ppr                                        147
erp_project                                144
four_nations                               112
events_opportunities_research               34
Name: count, dtype: int64

In [27]:
# ---------- Keep only selected themes in new_theme
themes_to_keep = [
    "political_environment_key_organisations",
    "what_matters_ed",
    "teacher_rrd",
    "edtech",
    "ppr",
    "four_nations",
]

df = df[df["new_theme"].isin(themes_to_keep)].copy()

print(df["new_theme"].value_counts())

new_theme
political_environment_key_organisations    221
what_matters_ed                            184
teacher_rrd                                178
edtech                                     166
ppr                                        147
four_nations                               112
Name: count, dtype: int64


#¬†Number of unique domain names 

In [28]:
# Extract domain names from the 'link' column
df["domain"] = df["link"].apply(lambda x: urlparse(str(x)).netloc if pd.notna(x) else None)

# Count unique domains
unique_domains = df["domain"].nunique()

print(f"üåê There are {unique_domains} unique domains in this dataset.")

# Optional: see the top 10 most common domains
domain_counts = df["domain"].value_counts().reset_index()
domain_counts.columns = ["domain", "count"]
print(domain_counts.head(60))

üåê There are 300 unique domains in this dataset.
                                   domain  count
0                       schoolsweek.co.uk    138
1                              www.gov.uk     56
2                     www.theguardian.com     25
3                              epi.org.uk     23
4                          www.nfer.ac.uk     23
5                            www.gov.scot     20
6                          www.bera.ac.uk     20
7                     theconversation.com     19
8                    www.eventbrite.co.uk     18
9                 www.education-ni.gov.uk     16
10             www.belfasttelegraph.co.uk     16
11               committees.parliament.uk     14
12             ffteducationdatalab.org.uk     13
13                          www.gov.wales     12
14             www.nuffieldfoundation.org     11
15                        blog.bham.ac.uk     11
16                            www.tes.com     10
17  bera-journals.onlinelibrary.wiley.com     10
18                

In [29]:
#export a file to review domain names, remove irrelevant names and create a new 'organisation' name based on domain name 
review_path = os.path.join(data_cleaning_path, "domain_review.xlsx")
domain_counts.to_excel(review_path, index=False)

print("Wrote:", review_path)

Wrote: /workspaces/ERP_Newsletter/data/data02_cleaning/domain_review.xlsx


# Add'organisation' column and remove irreleant domain names

In [30]:
# Mapping from domain ‚Üí organisation
domain_to_org = {
    "schoolsweek.co.uk": "schools_week",
    "www.ucl.ac.uk": "ucl",
    "www.gov.uk": "uk_government",
    "theconversation.com": "conversation",
    "www.theguardian.com": "guardian",
    "www.bera.ac.uk": "bera",
    "epi.org.uk": "epi",
    "www.eventbrite.co.uk": "REMOVE",
    "www.nfer.ac.uk": "nfer",
    "www.gov.scot": "scottish_government",
    "bera-journals.onlinelibrary.wiley.com": "bera_journals",
    "www.belfasttelegraph.co.uk": "belfast_telegraph",
    "www.education-ni.gov.uk": "ni_government",
    "committees.parliament.uk": "uk_parliament",
    "ffteducationdatalab.org.uk": "fft_ed_datalab",
    "www.tes.com": "tes",
    "meetoecd1.zoom.us": "REMOVE",
    "www.nuffieldfoundation.org": "nuffield",
    "www.gov.wales": "welsh_government",
    "blogs.ucl.ac.uk": "ucl",
    "": "REMOVE",
    "blog.bham.ac.uk": "university_of_birmingham",
    "upen.ac.uk": "upen",
    "www.instituteforgovernment.org.uk": "ifg",
    "fed.education": "fed",
    "education.us18.list-manage.com": "REMOVE",
    "ifs.org.uk": "ifs",
    "mediacentral.ucl.ac.uk": "ucl",
    "educationwales.blog.gov.wales": "welsh_government",
    "www.childrenscommissioner.gov.uk": "childrens_commissioner",
    "www.bbc.co.uk": "bbc",
    "edtech.oii.ox.ac.uk": "oii_edtech_equity",
    "bera.us9.list-manage.com": "REMOVE",
    "teachertapp.co.uk": "teacher_tapp",
    "www.oecd.org": "oecd",
    "educationendowmentfoundation.org.uk": "eef",
    "www.ukri.org": "ukri",
    "childrens-participation.org": "childrends_participation_in_schools",
    "t.co": "twitter",
    "lordslibrary.parliament.uk": "house_of_lords_library",
    "my.chartered.college": "cct",
    "post.parliament.uk": "post_parliament",
    "lgiu.us3.list-manage.com": "REMOVE",
    "parliament.us16.list-manage.com": "REMOVE",
    "wonkhe.com": "wonkhe",
    "uk.bettshow.com": "bett_show",
    "newsletter.oecd.org": "oecd",
    "bit.ly": "twitter",
    "www.tandfonline.com": "taylor_and_francis",
    "chartered.college": "chartered_college_of_teaching",
    "teachertapp.com": "teacher_tapp",
    "www.thebritishacademy.ac.uk": "british_academy",
    "5rightsfoundation.com": "5rights_foundation",
    "www.independent.co.uk": "independent",
    "neu.org.uk": "national_education_union",
    "theippo.co.uk": "ippo",
    "www.linkedin.com": "linkedin",
    "www.eventbrite.com": "REMOVE",
    "forms.office.com": "REMOVE",
    "www.ippr.org": "ippr",
    "www.hepi.ac.uk": "hepi",
    "www.nesta.org.uk": "nesta",
    "assets.publishing.service.gov.uk": "uk_government",
    "journals.sagepub.com": "sage_journals",
    "d2tic4wvo1iusb.cloudfront.net": "REMOVE",
    "inews.co.uk": "inews",
    "discovery.ucl.ac.uk": "ucl",
    "education.gov.scot": "education_scotland",
    "email.thebritishacademy.ac.uk": "british_academy",
    "digitalpovertyalliance.org": "digital_poverty_alliance",
    "feweek.co.uk": "fe_week",
    "contacts.epi.org.uk": "REMOVE",
    "sustainableschoolleadership.uk": "sustainable_school_leadership",
    "www.oecd-ilibrary.org": "oecd",
    "www.transformingsociety.co.uk": "transforming_society",
    "www.upen.ac.uk": "upen",
    "manmetjobs.mmu.ac.uk": "manchester_metropolitan_university",
    "www.edge.co.uk": "edge_foundation",
    "www.jrf.org.uk": "joseph_rowntree_foundation",
    "www.unicef.org": "unicef",
    "www.adalovelaceinstitute.org": "ada_lovelace_institute",
    "literacytrust.org.uk": "national_literacy_trust",
    "events.teams.microsoft.com": "REMOVE",
    "engagementhub.ukri.org": "ukri",
    "cfey.org": "centre_for_education_and_youth",
    "www.nao.org.uk": "national_audit_office",
    "upen.us14.list-manage.com": "REMOVE",
    "www.edtechstrategylab.org": "edtech_strategy_lab",
    "el.wiley.com": "wiley",
    "senedd.wales": "welsh_parliament",
    "hansard.parliament.uk": "uk_parliament",
    "www.cape.ac.uk": "cape_collaboration_for_public_engagement",
    "options2040.co.uk": "options_2040_project",
    "transforming-evidence.org": "transforming_evidence",
    "www.orielsquare.co.uk": "oriel_square",
    "www.institute.global": "tony_blair_institute",
    "www.evaluation.impactedgroup.uk": "impacted_group",
    "open.spotify.com": "spotify_podcast",
    "educationhub.blog.gov.uk": "uk_government",
    "www.politicshome.com": "politics_home",
    "eprints.lse.ac.uk": "lse_repository",
    "business.senedd.wales": "welsh_parliament",
    "bigeducation.org": "big_education",
    "cpag.org.uk": "child_poverty_action_group",
    "www.tickettailor.com": "REMOVE",
    "click.communications.gse.harvard.edu": "REMOVE",
    "www.telegraph.co.uk": "daily_telegraph",
    "www.n8research.org.uk": "n8_research_partnership",
    "www.centreforyounglives.org.uk": "centre_for_young_lives",
    "cdn.prod.website-files.com": "REMOVE",
    "www.nasuwt.org.uk": "nasuwt_teachers_union",
    "www.de.ed.ac.uk": "university_of_edinburgh",
    "www.libdems.org.uk": "liberal_democrats",
    "wcpp.org.uk": "wales_centre_for_public_policy",
    "www.naht.org.uk": "national_association_head_teachers",
    "cstuk.org.uk": "charities_supporting_teachers_uk",
    "www.turing.ac.uk": "alan_turing_institute",
    "www.health-ni.gov.uk": "ni_department_of_health",
    "covidandsociety.us1.list-manage.com": "REMOVE",
    "lgiu.org": "local_government_information_unit",
    "www.mirror.co.uk": "daily_mirror",
    "drive.google.com": "REMOVE",
    "www.educationsupport.org.uk": "education_support_charity",
    "publicpolicydesign.blog.gov.uk": "uk_government",
    "epi.us15.list-manage.com": "REMOVE",
    "durhamuniversity.zoom.us": "REMOVE",
    "www.ascl.org.uk": "ascl",
    "www.nurseryworld.co.uk": "nursery_world_magazine",
    "teachingcommission.co.uk": "teaching_commission",
    "acss.org.uk": "academy_of_social_sciences",
    "defenddigitalme.org": "defend_digital_me",
    "www.researchgate.net": "researchgate",
    "crae.org.uk": "children_rights_alliance_england",
    "fairnessfoundation.com": "fairness_foundation",
    "www.durham.ac.uk": "durham_university",
    "www.gse.harvard.edu": "harvard_graduate_school_of_education",
    "politico.us8.list-manage.com": "REMOVE",
    "www.internetmatters.org": "internet_matters",
    "www.parliament.uk": "uk_parliament",
    "ripl.uk": "research_improvement_for_policy_and_learning",
    "www.youtube.com": "youtube",
    "www.coproductioncollective.co.uk": "coproduction_collective",
    "www.britishcouncil.org": "british_academy",
    "www.techuk.org": "tech_uk",
    "localed2025.org.uk": "local_ed_2025",
    "www.lgcplus.com": "local_government_chronicle",
    "lnkd.in": "linkedin",
    "www.economy-ni.gov.uk": "ni_department_for_economy",
    "techbullion.com": "techbullion",
    "ukla.us10.list-manage.com": "REMOVE",
    "educationscape.us4.list-manage.com": "REMOVE",
    "linkprotect.cudasvc.com": "REMOVE",
    "commonslibrary.parliament.uk": "house_of_commons_library",
    "play.wales": "play_wales",
    "www.edtechdigest.com": "edtech_digest",
    "cep.lse.ac.uk": "centre_for_economic_performance_lse",
    "www.elsevier.com": "elsevier",
    "www.express.co.uk": "daily_express",
    "onlinelibrary.wiley.com": "wiley",
    "institute.global": "tony_blair_institute",
    "www.insideedgetraining.co.uk": "inside_edge_training",
    "research.senedd.wales": "welsh_parliament",
    "www.frontiersin.org": "frontiers_journal",
    "openpolicy.blog.gov.uk": "uk_government",
    "www.mmu.ac.uk": "manchester_metropolitan_university",
    "blogs.uwe.ac.uk": "uwe_bristol_blog",
    "www.contractsfinder.service.gov.uk": "uk_government",
    "www.parliament.scot": "scottish_parliament",
    "njmok7zy3oa.typeform.com": "REMOVE",
    "public-api.wordpress.com": "wordpress",
    "www.sciencecampaign.org.uk": "campaign_for_science_and_engineering",
    "www.nottingham.ac.uk": "university_of_nottingham",
    "zoom.us": "REMOVE",
    "www.digit.fyi": "digit_fyi",
    "explore-education-statistics.service.gov.uk": "dfe_education_statistics",
    "www.atkinsrealis.com": "atkins_realis",
    "profbeckyallen.substack.com": "becky_allen_substack",
    "www.lse.ac.uk": "london_school_of_economics",
    "www.uwe.ac.uk": "uew_england",
    "wcpp.us12.list-manage.com": "REMOVE",
    "researchonresearch.org": "research_on_research_institute",
    "www.funding-futures.org": "funding_futures",
    "www.royalacademy.org.uk": "royal_academy",
    "unesdoc.unesco.org": "unesco",
    "srhe.ac.uk": "society_for_research_into_higher_education",
    "nfer.ac.uk": "nfer",
    "www.expressandstar.com": "express_and_star",
    "labour.org.uk": "labour_party",
    "econpapers.repec.org": "repec_econpapers",
    "uclpress.scienceopen.com": "ucl",
    "edarxiv.org": "education_arxiv",
    "mmail.dods.co.uk": "REMOVE",
    "www.unesco.org": "unesco",
    "daily.jstor.org": "jstor_daily",
    "www.jstor.org": "jstor",
    "www.thenhsa.co.uk": "northern_health_science_alliance",
    "www.ambition.org.uk": "ambition_institute",
    "www.fda.org.uk": "fda_union",
    "www.birmingham.ac.uk": "university_of_birmingham",
    "magicsmoke.substack.com": "magicsmoke_substack",
    "thebritishacademyecrn.com": "british_academy",
    "www.edtechinnovationhub.com": "edtech_innovation_hub",
    "link.news.inews.co.uk": "inews",
    "arcinstitute.org": "arc_institute",
    "www.thetimes.com": "the_times",
    "scholar.harvard.edu": "harvard_graduate_school_of_education",
    "www.the-tls.co.uk": "times_literary_supplement",
    "shadowpanel.uk": "shadow_panel_project",
    "www.oxfordschoolofthought.org": "oxford_school_of_thought",
    "www.schoolsappg.org.uk": "all_party_parliamentary_group_schools",
    "educationappg.org.uk": "education_appg",
    "benniekara.substack.com": "bennie_kara_substack",
    "www.twinkl.co.uk": "twinkl",
    "www.eyalliance.org.uk": "early_years_alliance",
    "niot.org.uk": "national_institute_of_teaching",
    "universitas21.com": "universitas_21",
    "ascl.org.uk": "ascl",
    "www.scottishai.com": "scottish_ai",
    "www.rijksoverheid.nl": "dutch_government",
    "samf.substack.com": "samf_substack",
    "londondesignbiennale.com": "london_design_biennale",
    "upp-foundation.org": "upp_foundation",
    "inclusioninpractice.org.uk": "inclusion_in_practice",
    "publications.parliament.uk": "uk_parliament",
    "observer.co.uk": "the_observer",
    "dundee.onlinesurveys.ac.uk": "university_of_dundee_surveys",
    "wonkhe.cmail20.com": "REMOVE",
    "digitalyouthindex.uk": "digital_youth_index",
    "neweconomics.org": "new_economics_foundation",
    "beyth.co.uk": "beyth_consultancy",
    "the-difference.com": "the_difference",
    "www.innovate-ed.uk": "innovate_ed",
    "www.oecd-events.org": "oecd",
    "youtu.be": "youtube",
    "ffteducationdatalab.us12.list-manage.com": "REMOVE",
    "www.centreforsocialjustice.org.uk": "centre_for_social_justice",
    "gamayo.co.uk": "gamayo",
    "profiles.ucl.ac.uk": "ucl",
    "rebeccaallen.co.uk": "rebecca_allen",
    "pod.co": "pod_co_podcast",
    "www.teachfirst.org.uk": "teach_first",
    "edsk.org": "edsk_think_tank",
    "digitalgood.net": "digital_good_network",
    "lnks.gd": "REMOVE",
    "issuu.com": "issuu",
    "adcs.org.uk": "association_of_directors_of_childrens_services",
    "www.labourtogether.uk": "labour_together",
    "downloads2.dodsmonitoring.com": "dods_monitoring",
    "www.besa.org.uk": "british_academy",
    "nepc.colorado.edu": "national_education_policy_center",
    "cipr.co.uk": "chartered_institute_of_public_relations",
    "img1.wsimg.com": "REMOVE",
    "www.leverhulme.ac.uk": "leverhulme_trust",
    "www.ons.gov.uk": "office_for_national_statistics",
    "labourlist.org": "labour_list",
    "www.centreformentalhealth.org.uk": "centre_for_mental_health",
    "lnu-se.zoom.us": "REMOVE",
    "lse.zoom.us": "REMOVE",
    "hechingerreport.org": "hechinger_report",
    "researcheracademy.elsevier.com": "elsevier_researcher_academy",
    "www.smf.co.uk": "social_market_foundation",
    "www.suttontrust.com": "sutton_trust",
    "www.research.net": "REMOVE",
    "y3r710.r.eu-west-1.awstrack.me": "REMOVE",
    "lucaf.org": "lucas_education_foundation",
    "learning.nspcc.org.uk": "nspcc_learning",
    "news.comms.nao.org.uk": "national_audit_office",
    "youthendowmentfund.org.uk": "youth_endowment_fund",
    "www.bigissue.com": "big_issue",
    "demos.co.uk": "demos",
    "links-2.govdelivery.com": "REMOVE",
    "news.chartered.college": "chartered_college_news",
    "www.workinglivesofteachers.com": "working_lives_of_teachers",
    "one.oecd.org": "oecd",
    "blog.policy.manchester.ac.uk": "policy_manchester_blog",
    "twitter.com": "twitter",
    "mcrmetropolis.uk": "manchester_metropolitan_university",
    "gtr.ukri.org": "ukri",
    "teaching-vacancies.service.gov.uk": "dfe_teaching_vacancies",
    "ari.org.uk": "ari_association_for_research_innovation",
    "kingsfundmail.org.uk": "kings_fund",
    "www.mdpi.com": "mdpi_journals",
    "blogs.gov.scot": "scottish_government",
    "parliamentlive.tv": "uk_parliament",
    "e-estonia.com": "e_estonia",
    "us9.campaign-archive.com": "REMOVE",
   "podfollow.com": "podfollow_podcast",
    "nation.cymru": "nation_cymru",
    "www.holyrood.com": "holyrood_magazine",
    "impactedgroup.us22.list-manage.com": "REMOVE",
    "teachersuccess.co.uk": "teacher_success",
    "tpea.ac.uk": "tpea_association",
    "www.fenews.co.uk": "fe_news",
    "www.qmul.ac.uk": "queen_mary_university_london",
    "www.echild.ac.uk": "echild_research_centre",
    "www.standard.co.uk": "evening_standard",
    "x.com": "twitter",
    "goodthingsfoundation.us7.list-manage.com": "REMOVE",
    "www.ocr.org.uk": "ocr_exam_board",
    "ideas.repec.org": "repec_ideas",
    "educationinspection.blog.gov.uk": "ofsted_blog",
    "medium.com": "medium",
    "niot.s3.amazonaws.com": "REMOVE",
    "lxhriqcab.cc.rs6.net": "REMOVE",
    "consult.education.gov.uk": "dfe_consultations",
    "onthinktanks.org": "on_think_tanks",
    "etat.uea.ac.uk": "university_of_east_anglia",
    "ucl.us20.list-manage.com": "REMOVE",
    "www.wsj.com": "wall_street_journal",
    "doi-org.libproxy.ucl.ac.uk": "doi_via_ucl_proxy",
    "www.kcl.ac.uk": "kings_college_london",
    "www.chandlerinstitute.org": "chandler_institute",
    "unige.zoom.us": "REMOVE",
    "insights.taylorandfrancis.com": "taylor_and_francis",
    "www.faircomment.co.uk": "fair_comment",
    "www.sciencedirect.com": "sciencedirect",
    "jacobsfoundation.us11.list-manage.com": "REMOVE",
    "www.barnardos.org.uk": "barnardos",
    "bbc.co.uk": "bbc",
    "fullfact.org": "full_fact",
    "news.sky.com": "sky_news",
    "www.magicbreakfast.com": "magic_breakfast",
    "ow.ly": "twitter",
    "durham.cloud.panopto.eu": "REMOVE",
    "adc.bmj.com": "british_medical_journal",
    "www.nationalcrimeagency.gov.uk": "national_crime_agency",
    "newvisionsforeducation.org.uk": "new_visions_for_education",
    "thestaffcollege.uk": "staff_college",
    "www.yorkshirepost.co.uk": "yorkshire_post",
    "doi.org": "REMOVE",
    "www.bristol.ac.uk": "university_of_bristol",
    "media.actionforchildren.org.uk": "action_for_children",
    "www.pearson.com": "pearson",
    "www.coe.int": "council_of_europe",
    "whatson.parliament.uk": "uk_parliament",
    "www.civilservicejobs.service.gov.uk": "uk_civil_service_jobs",
    "www.ft.com": "financial_times",
    "soundcloud.com": "soundcloud",
    "www.hmc.org.uk": "headmasters_and_headmistresses_conference",
    "app.getresponse.com": "REMOVE",
    "www.ntu.ac.uk": "nottingham_trent_university",
    "ippr-org.files.svdcdn.com": "ippr",
    "www.childreninwales.org.uk": "children_in_wales",
    "nuffieldfoundation.cmail20.com": "REMOVE",
    "www.wired-gov.net": "wired_gov",
    "nuffieldfoundation.cmail19.com": "REMOVE"
}

In [31]:
df["organisation"] = df["domain"].map(domain_to_org)

In [32]:
df = df[df["organisation"].notna()]
df = df[df["organisation"] != "REMOVE"]

In [33]:
df.head(1)


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation
0,c97ff62f-83ca-47ec-a4c7-b4e24157ae0a,1,11 July 2023,Calls for evidence,,Deadline 23 August 2023,"Education secretary Gillian Keegan has launched a call for evidence on using artificial intelligence (AI) like ChatGPT in schools ""to get the best"" out of the new technology.",https://schoolsweek.co.uk/chatgpt-keegan-launches-call-for-evidence-on-ai-in-schools,political_environment_key_organisations,schoolsweek.co.uk,schools_week


In [34]:
#number of unique organisations 
unique_orgs = df["organisation"].nunique()
print(f"Number of unique organisations: {unique_orgs}")

Number of unique organisations: 218


In [35]:
# Build full file path using your existing variable
output_path = os.path.join(data_cleaning_path, "unique_organisations.xlsx")

# Get the unique organisations as a sorted list
unique_org_list = sorted(df["organisation"].dropna().unique())

# Convert to DataFrame
unique_org_df = pd.DataFrame(unique_org_list, columns=["organisation"])

# Save to Excel
unique_org_df.to_excel(output_path, index=False)

print(f"Saved {len(unique_org_list)} unique organisations to {output_path}")

Saved 218 unique organisations to /workspaces/ERP_Newsletter/data/data02_cleaning/unique_organisations.xlsx


In [36]:
org_counts = df["organisation"].value_counts()
org_5plus = org_counts[org_counts >=3]
print(org_5plus)

organisation
schools_week                            138
uk_government                            65
guardian                                 25
nfer                                     24
epi                                      23
scottish_government                      21
welsh_government                         21
uk_parliament                            21
bera                                     20
conversation                             19
oecd                                     18
ucl                                      18
belfast_telegraph                        16
ni_government                            16
fft_ed_datalab                           13
university_of_birmingham                 12
teacher_tapp                             11
upen                                     11
nuffield                                 11
tes                                      10
bera_journals                            10
bbc                                      10
british_academy    

# Assign categories to the organiastions 

In [38]:
org_to_category = {
    # ========================================
    # GOVERNMENT_PUBLIC_SECTOR
    # ========================================
    
    # government_legislature
    "all_party_parliamentary_group_schools": ("government_public_sector", "government_legislature"),
    "council_of_europe": ("government_public_sector", "government_legislature"),
    "dfe_consultations": ("government_public_sector", "government_legislature"),
    "dfe_education_statistics": ("government_public_sector", "government_legislature"),
    "dfe_teaching_vacancies": ("government_public_sector", "government_legislature"),
    "dods_monitoring": ("government_public_sector", "government_legislature"),
    "dutch_government": ("government_public_sector", "government_legislature"),
    "education_scotland": ("government_public_sector", "government_legislature"),
    "house_of_commons_library": ("government_public_sector", "government_legislature"),
    "house_of_lords_library": ("government_public_sector", "government_legislature"),
    "labour_party": ("government_public_sector", "government_legislature"),
    "liberal_democrats": ("government_public_sector", "government_legislature"),
    "local_government_chronicle": ("government_public_sector", "government_legislature"),
    "local_government_information_unit": ("government_public_sector", "government_legislature"),
    "national_audit_office": ("government_public_sector", "government_legislature"),
    "national_crime_agency": ("government_public_sector", "government_legislature"),
    "ni_department_for_economy": ("government_public_sector", "government_legislature"),
    "ni_department_of_health": ("government_public_sector", "government_legislature"),
    "ni_government": ("government_public_sector", "government_legislature"),
    "office_for_national_statistics": ("government_public_sector", "government_legislature"),
    "post_parliament": ("government_public_sector", "government_legislature"),
    "scottish_government": ("government_public_sector", "government_legislature"),
    "scottish_parliament": ("government_public_sector", "government_legislature"),
    "uk_civil_service_jobs": ("government_public_sector", "government_legislature"),
    "uk_government": ("government_public_sector", "government_legislature"),
    "uk_parliament": ("government_public_sector", "government_legislature"),
    "welsh_government": ("government_public_sector", "government_legislature"),
    "welsh_parliament": ("government_public_sector", "government_legislature"),
    
    # executive_non_departmental_public_body_ndpb
    "ofsted_blog": ("government_public_sector", "executive_non_departmental_public_body_ndpb"),
    
    # international_organisation
    "oecd": ("government_public_sector", "international_organisation"),
    "unesco": ("government_public_sector", "international_organisation"),
    "unicef": ("government_public_sector", "international_organisation"),
    


    
    # ========================================
    # ACADEMIC_SECTOR
    # ========================================
    
    # universities
    "durham_university": ("academic_sector", "universities"),
    "kings_college_london": ("academic_sector", "universities"),
    "london_school_of_economics": ("academic_sector", "universities"),
    "lse_repository": ("academic_sector", "universities"),
    "manchester_metropolitan_university": ("academic_sector", "universities"),
    "nottingham_trent_university": ("academic_sector", "universities"),
    "queen_mary_university_london": ("academic_sector", "universities"),
    "ucl": ("academic_sector", "universities"),
    "universitas_21": ("academic_sector", "universities"),
    "university_of_birmingham": ("academic_sector", "universities"),
    "university_of_bristol": ("academic_sector", "universities"),
    "university_of_dundee_surveys": ("academic_sector", "universities"),
    "university_of_east_anglia": ("academic_sector", "universities"),
    "university_of_edinburgh": ("academic_sector", "universities"),
    "university_of_nottingham": ("academic_sector", "universities"),
    "harvard_graduate_school_of_education": ("academic_sector", "universities"),
    "uwe_bristol_blog": ("academic_sector", "universities"),
    
    # academic_publisher_platform
    "elsevier": ("academic_sector", "academic_publisher_platform"),
    "elsevier_researcher_academy": ("academic_sector", "academic_publisher_platform"),
    "frontiers_journal": ("academic_sector", "academic_publisher_platform"),
    "jstor": ("academic_sector", "academic_publisher_platform"),
    "jstor_daily": ("academic_sector", "academic_publisher_platform"),
    "mdpi_journals": ("academic_sector", "academic_publisher_platform"),
    "sage_journals": ("academic_sector", "academic_publisher_platform"),
    "sciencedirect": ("academic_sector", "academic_publisher_platform"),
    "taylor_and_francis": ("academic_sector", "academic_publisher_platform"),
    "wiley": ("academic_sector", "academic_publisher_platform"),
    "british_medical_journal": ("academic_sector", "academic_publisher_platform"),
    "repec_econpapers": ("academic_sector", "academic_publisher_platform"),
    "repec_ideas": ("academic_sector", "academic_publisher_platform"),
    "researchgate": ("academic_sector", "academic_publisher_platform"),
    "doi_via_ucl_proxy": ("academic_sector", "academic_publisher_platform"),
    "education_arxiv": ("academic_sector", "academic_publisher_platform"),
    "bera_journals": ("academic_sector", "academic_publisher_platform"),
    
    # academic_network
    "bera": ("academic_sector", "academic_network"),
    "academy_of_social_sciences": ("academic_sector", "academic_network"),
    "british_academy": ("academic_sector", "academic_network"),
    "royal_academy": ("academic_sector", "academic_network"),
    "ari_association_for_research_innovation": ("academic_sector", "academic_network"),
    "society_for_research_into_higher_education": ("academic_sector", "academic_network"),
    "n8_research_partnership": ("academic_sector", "academic_network"),
    
    # ========================================
    # RESEARCH_EVIDENCE_SECTOR
    # ========================================
    
    # research_organisation
    "nfer": ("research_evidence_sector", "research_organisation"),
    "centre_for_economic_performance_lse": ("research_evidence_sector", "research_organisation"),
    "echild_research_centre": ("research_evidence_sector", "research_organisation"),
    "national_education_policy_center": ("research_evidence_sector", "research_organisation"),
    "oii_edtech_equity": ("research_evidence_sector", "research_organisation"),
    "research_improvement_for_policy_and_learning": ("research_evidence_sector", "research_organisation"),
    "oxford_school_of_thought": ("research_evidence_sector", "research_organisation"),
    "northern_health_science_alliance": ("research_evidence_sector", "research_organisation"),
    
    # research_institution
    "alan_turing_institute": ("research_evidence_sector", "research_institution"),
    "ada_lovelace_institute": ("research_evidence_sector", "research_institution"),
    "arc_institute": ("research_evidence_sector", "research_institution"),
    "kings_fund": ("research_evidence_sector", "research_institution"),
    "research_on_research_institute": ("research_evidence_sector", "research_institution"),
    "scottish_ai": ("research_evidence_sector", "research_institution"),
    
    # research_project_initiative
    "working_lives_of_teachers": ("research_evidence_sector", "research_project_initiative"),
    "coproduction_collective": ("research_evidence_sector", "research_project_initiative"),
    
    # research_funder
    "leverhulme_trust": ("research_evidence_sector", "research_funder"),
    "nuffield": ("research_evidence_sector", "research_funder"),
    "ukri": ("research_evidence_sector", "research_funder"),
    
    # ========================================
    # CIVIL_SOCIETY_NONPROFIT_SECTOR
    # ========================================
    
    # union
    "national_education_union": ("civil_society_nonprofit_sector", "labour_union"),
    "nasuwt_teachers_union": ("civil_society_nonprofit_sector", "labour_union"),
    "fda_union": ("civil_society_nonprofit_sector", "labour_union"),
    
    # charity_ngo
    "5rights_foundation": ("civil_society_nonprofit_sector", "charity_ngo"),
    "action_for_children": ("civil_society_nonprofit_sector", "charity_ngo"),
    "barnardos": ("civil_society_nonprofit_sector", "charity_ngo"),
    "child_poverty_action_group": ("civil_society_nonprofit_sector", "charity_ngo"),
    "children_in_wales": ("civil_society_nonprofit_sector", "charity_ngo"),
    "children_rights_alliance_england": ("civil_society_nonprofit_sector", "charity_ngo"),
    "childrends_participation_in_schools": ("civil_society_nonprofit_sector", "charity_ngo"),
    "childrens_commissioner": ("civil_society_nonprofit_sector", "charity_ngo"),
    "magic_breakfast": ("civil_society_nonprofit_sector", "charity_ngo"),
    "nspcc_learning": ("civil_society_nonprofit_sector", "charity_ngo"),
    "centre_for_mental_health": ("civil_society_nonprofit_sector", "charity_ngo"),
    "centre_for_social_justice": ("civil_society_nonprofit_sector", "charity_ngo"),
    "centre_for_young_lives": ("civil_society_nonprofit_sector", "charity_ngo"),
    "education_support_charity": ("civil_society_nonprofit_sector", "charity_ngo"),
    "fairness_foundation": ("civil_society_nonprofit_sector", "charity_ngo"),
    "joseph_rowntree_foundation": ("civil_society_nonprofit_sector", "charity_ngo"),
    "national_literacy_trust": ("civil_society_nonprofit_sector", "charity_ngo"),
    "youth_endowment_fund": ("civil_society_nonprofit_sector", "charity_ngo"),
    "sutton_trust": ("civil_society_nonprofit_sector", "charity_ngo"),
    "internet_matters": ("civil_society_nonprofit_sector", "charity_ngo"),
    "digital_poverty_alliance": ("civil_society_nonprofit_sector", "charity_ngo"),
    "defend_digital_me": ("civil_society_nonprofit_sector", "charity_ngo"),
    
    # professional_network
    "chartered_college_of_teaching": ("civil_society_nonprofit_sector", "professional_network"),
    "chartered_college_news": ("civil_society_nonprofit_sector", "professional_network"),
    "cct": ("civil_society_nonprofit_sector", "professional_network"),
    "ascl": ("civil_society_nonprofit_sector", "professional_network"),
    "association_of_directors_of_childrens_services": ("civil_society_nonprofit_sector", "professional_network"),
    "chartered_institute_of_public_relations": ("civil_society_nonprofit_sector", "professional_network"),
    "charities_supporting_teachers_uk": ("civil_society_nonprofit_sector", "professional_network"),
    "headmasters_and_headmistresses_conference": ("civil_society_nonprofit_sector", "professional_network"),
    "national_association_head_teachers": ("civil_society_nonprofit_sector", "professional_network"),
    
    # practitioner_organisation
    "early_years_alliance": ("civil_society_nonprofit_sector", "practitioner_organisation"),
    "play_wales": ("civil_society_nonprofit_sector", "practitioner_organisation"),
    "teach_first": ("civil_society_nonprofit_sector", "practitioner_organisation"),
    "ambition_institute": ("civil_society_nonprofit_sector", "practitioner_organisation"),
    "national_institute_of_teaching": ("civil_society_nonprofit_sector", "practitioner_organisation"),
    
    # ========================================
    # KNOWLEDGE_MOBILISER_THINK_TANK_SECTOR
    # ========================================
    
    # think_tank
    "centre_for_education_and_youth": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "demos": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "edge_foundation": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "edsk_think_tank": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "epi": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "ifg": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "ifs": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "ippr": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "nesta": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "new_economics_foundation": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "social_market_foundation": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "tony_blair_institute": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "wales_centre_for_public_policy": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "labour_together": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "on_think_tanks": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "hepi": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    "chandler_institute": ("knowledge_mobiliser_think_tank_sector", "think_tank"),
    
    # evidence_mobiliser
    "eef": ("knowledge_mobiliser_think_tank_sector", "evidence_mobiliser"),
    "fft_ed_datalab": ("knowledge_mobiliser_think_tank_sector", "evidence_mobiliser"),
    "full_fact": ("knowledge_mobiliser_think_tank_sector", "evidence_mobiliser"),
    "ippo": ("knowledge_mobiliser_think_tank_sector", "evidence_mobiliser"),
    "transforming_evidence": ("knowledge_mobiliser_think_tank_sector", "evidence_mobiliser"),
    "teacher_tapp": ("knowledge_mobiliser_think_tank_sector", "evidence_mobiliser"),
    
    # advocacy_organisation
    "campaign_for_science_and_engineering": ("knowledge_mobiliser_think_tank_sector", "advocacy_organisation"),
    "cape_collaboration_for_public_engagement": ("knowledge_mobiliser_think_tank_sector", "advocacy_organisation"),
    "education_appg": ("knowledge_mobiliser_think_tank_sector", "advocacy_organisation"),
    "fair_comment": ("knowledge_mobiliser_think_tank_sector", "advocacy_organisation"),
    "impacted_group": ("knowledge_mobiliser_think_tank_sector", "advocacy_organisation"),
    "teaching_commission": ("knowledge_mobiliser_think_tank_sector", "advocacy_organisation"),
    "shadow_panel_project": ("knowledge_mobiliser_think_tank_sector", "advocacy_organisation"),
    "options_2040_project": ("knowledge_mobiliser_think_tank_sector", "advocacy_organisation"),
    
    # ========================================
    # COMMERCIAL_PRIVATE_SECTOR
    # ========================================
    
    "atkins_realis": ("commercial_private_sector", "consultancy"),
    "beyth_consultancy": ("commercial_private_sector", "consultancy"),
    "big_education": ("commercial_private_sector", "consultancy"),
    "oriel_square": ("commercial_private_sector", "consultancy"),
    "staff_college": ("commercial_private_sector", "consultancy"),
    "pearson": ("commercial_private_sector", "edtech_education_business"),
    "twinkl": ("commercial_private_sector", "edtech_education_business"),
    "ocr_exam_board": ("commercial_private_sector", "edtech_education_business"),
    "tech_uk": ("commercial_private_sector", "industry_association"),
    "digital_good_network": ("commercial_private_sector", "industry_association"),
    "edtech_innovation_hub": ("commercial_private_sector", "industry_association"),
    "edtech_strategy_lab": ("commercial_private_sector", "industry_association"),
    "bett_show": ("commercial_private_sector", "industry_association"),
    
    # ========================================
    # MEDIA SECTOR
    # ========================================

    # news_media
    "bbc": ("media_sector", "news_media"),
    "belfast_telegraph": ("media_sector", "news_media"),
    "big_issue": ("media_sector", "news_media"),
    "daily_express": ("media_sector", "news_media"),
    "daily_mirror": ("media_sector", "news_media"),
    "daily_telegraph": ("media_sector", "news_media"),
    "evening_standard": ("media_sector", "news_media"),
    "express_and_star": ("media_sector", "news_media"),
    "financial_times": ("media_sector", "news_media"),
    "guardian": ("media_sector", "news_media"),
    "hechinger_report": ("media_sector", "news_media"),
    "holyrood_magazine": ("media_sector", "news_media"),
    "independent": ("media_sector", "news_media"),
    "inews": ("media_sector", "news_media"),
    "nation_cymru": ("media_sector", "news_media"),
    "politics_home": ("media_sector", "news_media"),
    "sky_news": ("media_sector", "news_media"),
    "the_observer": ("media_sector", "news_media"),
    "the_times": ("media_sector", "news_media"),
    "times_literary_supplement": ("media_sector", "news_media"),
    "wall_street_journal": ("media_sector", "news_media"),
    "yorkshire_post": ("media_sector", "news_media"),
    "labour_list": ("media_sector", "news_media"),

    # specialist_media
    "schools_week": ("media_sector", "specialist_media"),
    "tes": ("media_sector", "specialist_media"),
    "nursery_world_magazine": ("media_sector", "specialist_media"),
    "fe_news": ("media_sector", "specialist_media"),
    "fe_week": ("media_sector", "specialist_media"),
    "fed": ("media_sector", "specialist_media"),
    "wonkhe": ("media_sector", "specialist_media"),
    "wired_gov": ("media_sector", "specialist_media"),
    "edtech_digest": ("media_sector", "specialist_media"),
    "techbullion": ("media_sector", "specialist_media"),
    "digit_fyi": ("media_sector", "specialist_media"),

    # commentary_platform
    "conversation": ("media_sector", "commentary_platform"),
    "medium": ("media_sector", "commentary_platform"),
    "policy_manchester_blog": ("media_sector", "commentary_platform"),
    "samf_substack": ("media_sector", "commentary_platform"),
    "becky_allen_substack": ("media_sector", "commentary_platform"),
    "bennie_kara_substack": ("media_sector", "commentary_platform"),
    "magicsmoke_substack": ("media_sector", "commentary_platform"),
    "rebecca_allen": ("media_sector", "commentary_platform"),
    "wordpress": ("media_sector", "commentary_platform"),

    # ========================================
    # DIGITAL / SOCIAL MEDIA PLATFORMS
    # ========================================

    "linkedin": ("digital_social_media_platforms", "social_media"),
    "twitter": ("digital_social_media_platforms", "social_media"),
    "youtube": ("digital_social_media_platforms", "social_media"),

    "spotify_podcast": ("digital_social_media_platforms", "podcast_platform"),
    "pod_co_podcast": ("digital_social_media_platforms", "podcast_platform"),
    "podfollow_podcast": ("digital_social_media_platforms", "podcast_platform"),
    "soundcloud": ("digital_social_media_platforms", "podcast_platform"),

    # ========================================
    # OTHER / MISCELLANEOUS
    # ========================================

    "issuu": ("other_miscellaneous", "content_platform"),
    "london_design_biennale": ("other_miscellaneous", "cultural_organisation"),
    "gamayo": ("other_miscellaneous", "unclear"),
    "e_estonia": ("other_miscellaneous", "government_initiative"),
    "the_difference": ("other_miscellaneous", "unclear"),
    "teacher_success": ("other_miscellaneous", "unclear"),
    "inclusion_in_practice": ("other_miscellaneous", "unclear"),
    "innovate_ed": ("other_miscellaneous", "unclear"),
    "inside_edge_training": ("other_miscellaneous", "unclear"),
    "funding_futures": ("other_miscellaneous", "unclear"),
    "digital_youth_index": ("other_miscellaneous", "unclear"),
    "new_visions_for_education": ("other_miscellaneous", "unclear"),
    "tpea_association": ("other_miscellaneous", "unclear"),
    "transforming_society": ("other_miscellaneous", "unclear"),
    "uew_england": ("other_miscellaneous", "unclear"),
    "upen": ("other_miscellaneous", "unclear"),
    "upp_foundation": ("other_miscellaneous", "unclear"),
    "local_ed_2025": ("other_miscellaneous", "unclear"),
    "lucas_education_foundation": ("other_miscellaneous", "unclear"),
    "sustainable_school_leadership": ("other_miscellaneous", "unclear"),

}

In [39]:
#map dictionary to dataframe
mapped = df["organisation"].map(org_to_category)

In [40]:
# Turn the tuple series into two columns
df[["org_broad_category", "org_category"]] = mapped.apply(pd.Series)

In [41]:
df[["organisation", "org_broad_category", "org_category"]].head()

Unnamed: 0,organisation,org_broad_category,org_category
0,schools_week,media_sector,specialist_media
1,schools_week,media_sector,specialist_media
2,schools_week,media_sector,specialist_media
3,schools_week,media_sector,specialist_media
4,schools_week,media_sector,specialist_media


In [42]:
df["org_category"].value_counts(dropna=False)

org_category
government_legislature                         188
specialist_media                               170
news_media                                      80
think_tank                                      70
charity_ngo                                     45
universities                                    43
academic_network                                37
evidence_mobiliser                              37
research_organisation                           30
commentary_platform                             27
unclear                                         25
academic_publisher_platform                     23
international_organisation                      22
professional_network                            20
research_funder                                 18
advocacy_organisation                           12
social_media                                    11
research_institution                             9
labour_union                                     8
consultancy       

In [43]:
df["org_broad_category"].value_counts(dropna=False)

org_broad_category
media_sector                             277
government_public_sector                 211
knowledge_mobiliser_think_tank_sector    119
academic_sector                          103
civil_society_nonprofit_sector            78
research_evidence_sector                  60
other_miscellaneous                       27
digital_social_media_platforms            15
commercial_private_sector                 11
Name: count, dtype: int64

# Inspect "Title" and "Description" 


In [45]:
df[['title', 'description']].info()
df[['title', 'description']].isna().sum()
df[['title', 'description']].head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 901 entries, 0 to 1183
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        901 non-null    string
 1   description  901 non-null    string
dtypes: string(2)
memory usage: 21.1 KB


Unnamed: 0,title,description
0,Deadline 23 August 2023,"Education secretary Gillian Keegan has launched a call for evidence on using artificial intelligence (AI) like ChatGPT in schools ""to get the best"" out of the new technology."
1,Revealed: the experts advising ministers on teacher training reforms review,"The Department for Education has appointed an ""external steering group"" to review both the initial teacher training and early career frameworks, first launched in 2019. The group is made up of seven experts who are ""closely familiar"" with both reforms, as well as the ""underpinning evidence"". They will help ""shape the work of the review, scrutinising, supporting and challenging our thinking"", the DfE said."
2,"Reject fewer teacher applicants, DfE tells trainers","Susan Acland-Hood, the DfE's permanent secretary, told providers a 7 per cent jump in applicants this year had not led to an equivalent rise in offers for courses."
3,Ofqual and DfE studying 'feasibility' of 'fully digital' exams,"Some exam boards are already piloting on-screen assessment, but research by AQA last year found teachers' biggest barrier to digital exams was a lack of infrastructure. https://schoolsweek.co.uk/ofqual-and-dfe-studying-feasibility-of-fully-digital-exams/"
4,Revealed: The full details of Labour's education 'mission',"Entitled 'Breaking down the barrier to opportunity', Labour will 'revise delivery' of the ECF and give more details on the plan to simplify the system of teacher incentives. Full mission document here - https://schoolsweek.co.uk/wp-content/uploads/2023/07/Labour-breaking-down-barriers-document.pdf"
5,Lib Dem,"Munira Wilson, Lib Dem spokesperson for education, is currently drawing up what she says is a ""strong education offer"" in the Lib Dem manifesto. Details seem thin on the ground so far. https://schoolsweek.co.uk/munira-wilson-lib-dem-education-spokesperson/"
6,Teacher retention commission: 8 proposals to stem exodus,"Teacher wellbeing chari ty Education Support has put forward a list of proposals to boost retention in the sector (published in partnership with Public First.) Report calls for review of teacher hours, retention targets and sabbaticals for headteachers every five years."
7,Who's supporting school leaders to stop them hitting crisis point?,"Recruitment and retention difficulties, Ofsted pressures, and the dismantling of other public services is leaving heads, as Executive head Sara-Jane Bake puts it, ""exhausted trying to keep all the plates spinning. We are so busy looking after everybody else ‚Äì but we need looking after too."""
8,Digital Poverty Alliance,A charity whose vision is for everyone to access the life changing benefits that digital brings. Homepage - https://digitalpovertyalliance.org/
9,A long read from Nuffield funded research project ' Advancing Leadership Development in Early Years Education via Digitally Mediated Professional Learning',"In brief, the report finds:"


In [46]:
df['title_length'] = df['title'].str.len()
df['description_length'] = df['description'].str.len()
df[['title_length', 'description_length']].describe()

Unnamed: 0,title_length,description_length
count,901.0,901.0
mean,82.716981,199.061043
std,37.627868,144.306478
min,7.0,5.0
25%,61.0,107.0
50%,77.0,172.0
75%,98.0,255.0
max,482.0,1324.0


In [47]:
#Inspect titles with >50 words 
df['title_word_count'] = df['title'].apply(lambda x: len(str(x).split()))

long_titles = df[df['title_word_count'] > 50]
long_titles.head()




Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation,org_broad_category,org_category,title_length,description_length,title_word_count
412,a27ecb62-6146-4ebd-815f-157df254f616,35,10 May 2024,Political landscape & key organisations,,"The DfE is inviting applications to its newly formed Science Advisory Council (SAC) including from those with expertise in AI and education technology, sustainable and secure school buildings and adapting to climate change, and physical and mental health challenges. The group will support the Department's Chief Scientific Adviser, ensuring that DfE has access to cutting-edge scientific evidence, analytical approaches, and expertise for robust, evidence-informed decision making.",Applications close 27 May More,https://www.civilservicejobs.service.gov.uk/csr/jobs.cgi?jcode=1907685&csource=csalerts,political_environment_key_organisations,www.civilservicejobs.service.gov.uk,uk_civil_service_jobs,government_public_sector,government_legislature,482,30,66
474,0c3aa38a-3031-4986-a05d-2eb822939a85,40,21 June 2024,"Teacher recruitment, retention & development",,"EPI blog - Blog: The workforce challenges facing an incoming government - this blog post argues that ""retention problems persist, leaders are leaving the profession at a growing rate, and recruitment remains a challenge. On the other hand, an improved pay settlement, the sustained retention of early career teachers, and the increase in returners to the profession offer reasons for hope.""",By James Zuccollo,https://epi.org.uk/publications-and-research/blog-the-workforce-challenges-facing-an-incoming-government,teacher_rrd,epi.org.uk,epi,knowledge_mobiliser_think_tank_sector,think_tank,390,17,61
759,8446df04-fc2b-4bd7-96b4-bcede957a2c0,60,24 January 2025,EdTech,,"The Chartered College of Teaching will pilot setting up the EdTech Evidence Board to ""explore how we effectively build evidence of AI products that work well, helping education settings feel confident that they are choosing products that work well for them and for their classrooms"" You can read more about how they will approach this here.",Mor e,https://chartered.college/2025/01/22/supporting-effective-education-through-education-technology,edtech,chartered.college,chartered_college_of_teaching,civil_society_nonprofit_sector,professional_network,340,5,56


In [48]:
#Inspect descriptions with <10 words 

df['description_word_count'] = df['description'].apply(lambda x: len(str(x).split()))

short_descriptions = df[df['description_word_count'] < 10]

short_descriptions.head()


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation,org_broad_category,org_category,title_length,description_length,title_word_count,description_word_count
9,56bbc37a-4fc1-4906-a7fa-2b55c664cff7,1,11 July 2023,Thematic roundup,Digital,A long read from Nuffield funded research project ' Advancing Leadership Development in Early Years Education via Digitally Mediated Professional Learning',"In brief, the report finds:",https://www.nuffieldfoundation.org/wp-content/uploads/2021/10/Project-Report-Advancing-Ear-Years-Leadership-Development.pdf,edtech,www.nuffieldfoundation.org,nuffield,research_evidence_sector,research_funder,155,27,21,5
11,b83f3825-767b-4f21-90aa-35e36ed27a11,2,16 July 2023,PI Updates and Papers,Digital,Cutting through the conjecture: How is EdTech really being used in our classrooms?,Full post - https://edtech.oii.ox.ac.uk/cutting-through-the-conjecture/,https://edtech.oii.ox.ac.uk/cutting-through-the-conjecture,edtech,edtech.oii.ox.ac.uk,oii_edtech_equity,research_evidence_sector,research_organisation,82,71,13,4
57,51ee4890-cae6-4592-8b1f-c3978bd00ed6,5,4 September 2023,Political landscape & key organisations,,Labour ‚Äì Breaking down the barriers to opportunity,Labour sets out its plans for education https://labour.org.uk/wp-content/uploads/2023/07/Mission-breaking-down-barriers.pdf,https://labour.org.uk/wp-content/uploads/2023/07/Mission-breaking-down-barriers.pdf,political_environment_key_organisations,labour.org.uk,labour_party,government_public_sector,government_legislature,50,123,8,8
68,25d2c150-4e0d-4f5c-bb81-d5b7a7808ed7,6,8 September 2023,"Teacher recruitment, retention & development",,Teacher Tapp - Nearly three-fifths of teachers surveyed said cliques at school have affected staff wellbeing over the past year.,https://teachertapp.co.uk/articles/timetables-and-toxicity-uniformity-and-discovery/,https://teachertapp.co.uk/articles/timetables-and-toxicity-uniformity-and-discovery,teacher_rrd,teachertapp.co.uk,teacher_tapp,knowledge_mobiliser_think_tank_sector,evidence_mobiliser,128,84,20,1
71,044086b1-4eb7-4f74-af05-e0be287f2bb8,6,8 September 2023,EdTech,,"Artificial intelligence (AI) is still unsuitable for use in high-stakes exams, though it has the potential to reduce workload for teachers, the exam board AQA said.",TES - https://www.tes.com/magazine/news/general/artificial-intelligence-assessment-aqa-exam-board AQA - https://filestore.aqa.org.uk/content/consultation-responses/AQA-DFE-CONSULTATION-GENERATIVE-AI-AUG23.PDF,https://www.tes.com/magazine/news/general/artificial-intelligence-assessment-aqa-exam-board,edtech,www.tes.com,tes,media_sector,specialist_media,164,208,26,6


#### Create 'Text' Variable = 'Title' + 'Description'

In [49]:
#Create "Text" variable = "Title" + "Description" 
df['text'] = df['title'].fillna('') + ' ' + df['description'].fillna('')

In [50]:
# Basic info on the new column
print(df['text'].info())

# Add a column for text length (number of words or characters)
df['text_length_chars'] = df['text'].str.len()
df['text_length_words'] = df['text'].str.split().str.len()

# Summary statistics
print("\nCharacter length stats:")
print(df['text_length_chars'].describe())

<class 'pandas.core.series.Series'>
Index: 901 entries, 0 to 1183
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
901 non-null    string
dtypes: string(1)
memory usage: 14.1 KB
None

Character length stats:
count         901.0
mean     282.778024
std      145.953728
min            59.0
25%           185.0
50%           254.0
75%           346.0
max          1445.0
Name: text_length_chars, dtype: Float64


In [51]:
# Check for missing or empty values
missing_mask = df['text'].isna() | (df['text'].str.strip() == '')

# Count how many
missing_count = missing_mask.sum()
print(f"Missing or empty 'text' entries: {missing_count}")

# Optionally view them
if missing_count > 0:
    print(df.loc[missing_mask, ['title', 'description']].head())


Missing or empty 'text' entries: 0


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 901 entries, 0 to 1183
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      901 non-null    string
 1   newsletter_number       901 non-null    int64 
 2   issue_date              901 non-null    string
 3   theme                   901 non-null    string
 4   subtheme                24 non-null     string
 5   title                   901 non-null    string
 6   description             901 non-null    string
 7   link                    901 non-null    string
 8   new_theme               901 non-null    object
 9   domain                  901 non-null    object
 10  organisation            901 non-null    object
 11  org_broad_category      901 non-null    object
 12  org_category            901 non-null    object
 13  title_length            901 non-null    Int64 
 14  description_length      901 non-null    Int64 
 15  title_word

In [54]:
df.columns

Index(['id', 'newsletter_number', 'issue_date', 'theme', 'subtheme', 'title',
       'description', 'link', 'new_theme', 'domain', 'organisation',
       'org_broad_category', 'org_category', 'title_length',
       'description_length', 'title_word_count', 'description_word_count',
       'text', 'text_length_chars', 'text_length_words'],
      dtype='object')

# Save Files 

In [55]:
df.to_csv("/workspaces/ERP_Newsletter/data/data03_newsletter_items_clean/items_final_themes.csv", index=False)

print(f"‚úÖ ‚úÖ ‚úÖ Saved")

‚úÖ ‚úÖ ‚úÖ Saved


In [56]:
df.head(0)

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link,new_theme,domain,organisation,org_broad_category,org_category,title_length,description_length,title_word_count,description_word_count,text,text_length_chars,text_length_words
