In [2]:
import os 
import re 

import pandas as pd 
import numpy as np 

from ftfy import fix_text
import unicodedata as ud
from urllib.parse import urlparse

In [3]:
# Treat these text tokens as missing on read
NA_TOKENS = ["", " ", "NA", "N/A", "na", "NaN", "nan", "null", "NULL", "-"]

In [5]:
#load data 
input_path = "/workspaces/ERP_Newsletter/data/raw/newsletter_short_text/newsletter_items.csv"
data_cleaning_path = "/workspaces/ERP_Newsletter/data/interim/ingestion"
output_path = "/workspaces/ERP_Newsletter/data/interim/cleaning"


df = pd.read_csv(input_path, keep_default_na=True, na_values=NA_TOKENS)

In [4]:
#inspect 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1668 non-null   object
 1   newsletter_number  1668 non-null   int64 
 2   issue_date         1668 non-null   object
 3   theme              1668 non-null   object
 4   subtheme           114 non-null    object
 5   title              1667 non-null   object
 6   description        1346 non-null   object
 7   link               1616 non-null   object
dtypes: int64(1), object(7)
memory usage: 104.4+ KB


In [5]:
print(f"Total rows: {len(df)}")
print(f"Unique newsletter: {df['newsletter_number'].nunique()}")

Total rows: 1668
Unique newsletter: 87


#¬†Clean Up Text

In [6]:
def clean_series(s: pd.Series) -> pd.Series:
    # Use pandas "string" dtype so NaNs stay as <NA>
    s = s.astype("string")
    mask = s.notna()
    # Fix mojibake and normalize only on non-missing cells
    s.loc[mask] = s.loc[mask].apply(fix_text)
    s.loc[mask] = s.loc[mask].apply(lambda x: ud.normalize("NFKC", x))
    # Basic whitespace cleanup
    s.loc[mask] = s.loc[mask].str.replace(r"\s+", " ", regex=True).str.strip()
    return s

# Clean all object/string columns (quick and safe)
obj_cols = [c for c in df.columns if df[c].dtype == object or pd.api.types.is_string_dtype(df[c])]
for c in obj_cols:
    df[c] = clean_series(df[c])

# Quick exact replacements for the most common artifacts (optional, simple)
REPL = {
    "√Ç ": " ", "√Ç": "",
    "‚Äö√Ñ√¨": "‚Äì", "‚Äö√Ñ√Æ": "‚Äî",
    "‚Äö√Ñ√¥": "‚Äô", "‚Äö√Ñ√≤": "‚Äò",
    "‚Äö√Ñ√∫": "‚Äú", "‚Äö√Ñ√π": "‚Äù",
    "√¢‚Ç¨‚Äú": "‚Äì", "√¢‚Ç¨‚Äù": "‚Äî",
    "√¢‚Ç¨Àú": "‚Äò", "√¢‚Ç¨‚Ñ¢": "‚Äô",
    "√¢‚Ç¨≈ì": "‚Äú", "√¢‚Ç¨\x9d": "‚Äù",
    "√¢‚Ç¨¬¢": "‚Ä¢", "√¢‚Ç¨¬¶": "‚Ä¶",
}
for c in obj_cols:
    s = df[c].astype("string")
    for bad, good in REPL.items():
        s = s.str.replace(bad, good, regex=False)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    df[c] = s

# Check for Missing Values 

In [7]:
def missing_table(d: pd.DataFrame) -> pd.DataFrame:
    mc = d.isna().sum()
    return pd.DataFrame({
        "Missing Values": mc,
        "Percentage (%)": (mc / len(d)) * 100
    }).sort_values("Missing Values", ascending=False)

print("\n=== Missing values (before drop) ===")
print(missing_table(df))


=== Missing values (before drop) ===
                   Missing Values  Percentage (%)
subtheme                     1554       93.165468
description                   322       19.304556
link                           52        3.117506
title                           1        0.059952
theme                           0        0.000000
issue_date                      0        0.000000
newsletter_number               0        0.000000
id                              0        0.000000


#¬†Remove items where description, link or title are missing

In [8]:
# Remove rows where 'description' or 'link' is missing
df_cleaned = df.dropna(subset=['description', 'link', 'title'])

# (Optional) Check how many rows remain
print(f"Rows before: {len(df)}")
print(f"Rows after : {len(df_cleaned)}")

df = df_cleaned

Rows before: 1668
Rows after : 1323


#¬†Check for Duplicates 

###¬†All rows identical 

In [9]:
#All rows identical 
total_duplicates = df.duplicated().sum()
print(f"Total duplicate rows (all columns identical): {total_duplicates}")

Total duplicate rows (all columns identical): 0


### Title and link identical 

In [10]:
# Check duplicates where both title and link are the same
title_link_dupes = df[df.duplicated(subset=["title", "link"], keep=False)]

print(f"Number of duplicate title+link pairs: {title_link_dupes.shape[0]}")
title_link_dupes.sort_values(by=["title"]).head(2)

Number of duplicate title+link pairs: 89


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1301,bf6c4fd6-a5bd-48ca-9249-b5b92849e038,70,4 April 2025,Updates from the programme,,A reminder that the ESRC Education Research Pr...,"AI in Education: From chalkboards to chatbots,...",https://uk.bettshow.com/speakers/dominik-lukes
1327,0ad9176d-2b5a-4306-a9f3-2b4ffdf96be6,71,11 April 2025,Updates from the programme,,A reminder that the ESRC Education Research Pr...,"AI in Education: From chalkboards to chatbots,...",https://uk.bettshow.com/speakers/dominik-lukes


In [11]:
title_link_dupes.theme.value_counts()

theme
Updates from the programme                                                                                                                                                                                                    35
You have indicated that you are happy to receive news and updates from the ESRC Education Research Programme. To unsubscribe, please email Elizabeth.hudson@ucl.ac.uk with the word UNSUBSCRIBE in the title of the email.    28
What Matters in Education?                                                                                                                                                                                                     8
Updates from the Programme                                                                                                                                                                                                     5
Update from the ESRC Education Research Programme                                             

In [12]:
title_link_dupes[title_link_dupes.theme == "Teacher recruitment, retention & development"]

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
992,8991c36b-65fb-4011-9249-8f5917d32a0e,56,6 December 2024,"Teacher recruitment, retention & development",,DfE - Working lives of teachers and leaders: w...,A summary report of early findings from the th...,https://www.gov.uk/government/publications/wor...
1547,ac53ae25-d785-47b2-abd5-63d8e9583cbb,82,11 July 2025,"Teacher recruitment, retention & development",,DfE - Working lives of teachers and leaders: w...,Findings from the third wave of the working li...,https://www.gov.uk/government/publications/wor...


In [13]:
#drop duplicates keeping only first occurence 
df = df.drop_duplicates(subset=["title", "link"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1258 non-null   string
 1   newsletter_number  1258 non-null   int64 
 2   issue_date         1258 non-null   string
 3   theme              1258 non-null   string
 4   subtheme           85 non-null     string
 5   title              1258 non-null   string
 6   description        1258 non-null   string
 7   link               1258 non-null   string
dtypes: int64(1), string(7)
memory usage: 78.8 KB


### Title only duplicates

In [14]:
# Count duplicates based on title only
title_dupes = df[df.duplicated(subset=["title"], keep=False)]

print(f"Number of rows with duplicate titles: {title_dupes.shape[0]}")
title_dupes.sort_values(by="title").head(1)

Number of rows with duplicate titles: 20


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
1240,15f17205-bafc-43d8-ba3d-b4881956e48b,87,10 October 2025,Updates from the Programme,,Addressing key issues in teacher recruitment a...,Catch up with the video of the latest in the W...,https://mediacentral.ucl.ac.uk/Play/126585


In [15]:
title_table = title_dupes[["title", "theme"]].value_counts().reset_index(name="count")
title_table

Unnamed: 0,title,theme,count
0,Making Teaching Attractive and Worthwhile (Par...,Project news,3
1,Deadline: 28 April 2025,Political environment and key organisations,2
2,What matters in education? Education after the...,Updates from the programme,2
3,Panel:,Updates from the programme,2
4,Addressing key issues in teacher recruitment a...,Updates from the Programme,2
5,What matters in education? Education in a brok...,Updates from the programme,2
6,Labour,Political landscape & key organisations,1
7,Digital Poverty Alliance,EdTech,1
8,Digital Poverty Alliance,Thematic roundup,1
9,Panel:,"Teacher recruitment, retention & development",1


In [16]:
df = df.drop_duplicates(subset=["title"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1247 entries, 0 to 1246
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1247 non-null   string
 1   newsletter_number  1247 non-null   int64 
 2   issue_date         1247 non-null   string
 3   theme              1247 non-null   string
 4   subtheme           84 non-null     string
 5   title              1247 non-null   string
 6   description        1247 non-null   string
 7   link               1247 non-null   string
dtypes: int64(1), string(7)
memory usage: 78.1 KB


### link-only duplicates 

In [17]:
# Count duplicates based on link only
link_dupes = df[df.duplicated(subset=["link"], keep=False)]

print(f"Number of rows with duplicate links: {link_dupes.shape[0]}")
link_dupes.sort_values(by="link").head(1)

Number of rows with duplicate links: 114


Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
450,17268e59-d380-4e8f-a2e7-964543717f17,35,10 May 2024,What matters in education?,,Big Education conference - 'Next Generation Sc...,Hear from schools across the country who are w...,https://bigeducation.org/product/next-generati...


In [18]:
pd.set_option("display.max_colwidth", None)

link_table = link_dupes[["link"]].value_counts().reset_index(name="count")
link_table

Unnamed: 0,link,count
0,https://www.ucl.ac.uk/education-research-programme/events/2023/oct/practical-policies-or-bright-ideas-how-particular-topics-get-front-policy-queue,4
1,https://www.ucl.ac.uk/education-research-programme/events/2024/mar/investing-early-years-priorities-and-challenges,4
2,https://uk.bettshow.com/speakers/dominik-lukes,3
3,https://www.ucl.ac.uk/education-research-programme/events/2024/jan/pupil-absence-questions-policy-research-and-practice,3
4,https://childrens-participation.org/,3
5,https://www.ucl.ac.uk/education-research-programme/events/2025/may/how-build-resilient-schools-place-based-approaches-supporting-teachers-and-leaders,3
6,https://edtech.oii.ox.ac.uk/cutting-through-the-conjecture,2
7,https://engagementhub.ukri.org/esrc-1/weshorizonscanningsurvey,2
8,https://education.us18.list-manage.com/track/click?u=61f408a2f9c6d02a726ce6200&id=bea3b5fbac&e=4eb2cf985e,2
9,https://epi.org.uk/events/labour-party-conference-prioritising-equality-education-policy-as-a-lever-to-tackling-disadvantage-and-inequalities,2


In [19]:
df = df.drop_duplicates(subset=["link"], keep="first").reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1186 entries, 0 to 1185
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 1186 non-null   string
 1   newsletter_number  1186 non-null   int64 
 2   issue_date         1186 non-null   string
 3   theme              1186 non-null   string
 4   subtheme           80 non-null     string
 5   title              1186 non-null   string
 6   description        1186 non-null   string
 7   link               1186 non-null   string
dtypes: int64(1), string(7)
memory usage: 74.3 KB


# Identify themes and subthemes

In [20]:
#Unique counts of columns 
print("Unique titles:", df["title"].nunique())
print("Unique themes:", df["theme"].nunique())
print("Unique subthemes", df["subtheme"].nunique())
print("Unique links:", df["link"].nunique())

Unique titles: 1186
Unique themes: 62
Unique subthemes 35
Unique links: 1186


In [21]:
### Add placeholders for missing themes/subhtemes

# 1) Normalize empties/whitespace/"nan"/"none" to real NA
df_norm = df.copy()
for col in ["theme", "subtheme"]:
    df_norm[col] = (
        df_norm[col]
        .astype("string")
        .replace(r"^\s*$", pd.NA, regex=True)   # empty/whitespace ‚Üí NA
        .replace({"nan": pd.NA, "NaN": pd.NA, "None": pd.NA, "none": pd.NA})
    )

# 2) Create a version that fills NA with placeholders so ALL cases are counted
df_filled = df_norm.fillna({"theme": "No theme", "subtheme": "No subtheme"})

# 3) Group and count every (theme, subtheme) combo, including placeholder cases
theme_subtheme_counts = (
    df_filled
    .groupby(["theme", "subtheme"], dropna=False)
    .size()
    .reset_index(name="count")
    .sort_values(by=["theme", "subtheme"])
)

# 4) Export to Excel  
out_dir = data_cleaning_path
out_path = os.path.join(out_dir, "theme_subtheme_counts.xlsx")


theme_subtheme_counts.to_excel(out_path, index=False)  # <- this one
print(f"‚úÖ Exported {len(theme_subtheme_counts)} rows to {out_path}")

‚úÖ Exported 99 rows to /workspaces/ERP_Newsletter/data/1_interim/0_ingestion/theme_subtheme_counts.xlsx


# Check Themes and Articles 

In [22]:
# Filter articles under themes

check_themes = df[df["theme"] == "Research ‚Äì Practice ‚Äì Policy"].copy()

# View a few examples
display(check_themes.head(5))

Unnamed: 0,id,newsletter_number,issue_date,theme,subtheme,title,description,link
262,2fbf7b7b-334a-4d82-977c-d14a5bbf4778,25,16 February 2024,Research ‚Äì Practice ‚Äì Policy,,CAPE - Quid pro quo? Why academics meet with policy professionals,"Patrick McAlary, CAPE coordinator, explores what benefits academics report from giving up their time to chat with policy professionals about their policy priorities",https://t.co/DbEx7Z1PPJ
263,7f2bc1b1-fcd3-4602-89d3-cdd1d39845de,25,16 February 2024,Research ‚Äì Practice ‚Äì Policy,,The SHAPE of research impact,"British Academy report exploring research impact for the SHAPE disciplines, looking at the body of impact case studies submitted to the most recent research assessment exercise in the UK (REF21)",https://www.thebritishacademy.ac.uk/publications/the-shape-of-research-impact
264,1775fa9d-d0de-4be6-9e61-f466f7131eb2,25,16 February 2024,Research ‚Äì Practice ‚Äì Policy,,NFER Event ‚Äì Disadvantaged Policy webinar,Thursday 22 February 2024 ‚Äì 11am Online,https://www.nfer.ac.uk/events/disadvantaged-policy-webinar
276,e684b418-8313-40f5-a40d-1fb8e68784bc,26,23 February 2024,Research ‚Äì Practice ‚Äì Policy,,Post from the Co-Production Collective - How to best engage the public to participate in collaborative projects,"Through her lived experience of working with community groups, member of the Co-Production Collective Yesmin Begum shares her key principles for meaningful co-production and involvement.",https://www.coproductioncollective.co.uk/news/how-to-best-engage-the-public-to-participate-in-collaborative-projects?dm_i=2HJW%2C1ZV0V%2C7XUV43%2C74IFU%2C1
277,a8733168-3fb2-4eb4-b5d4-d15318b122f8,26,23 February 2024,Research ‚Äì Practice ‚Äì Policy,,"BERA event - Social theory, educational research and polycrisis",22 May 2024 2pm ‚Äì 4pm (Free for BERA members),https://www.bera.ac.uk/event/social-theory-educational-research-and-polycrisis-2024


# Rename Themes

In [23]:
# ---------- 0) Drop rows where the entire theme is the unsubscribe text
UNSUB_THEME = (
    "You have indicated that you are happy to receive news and updates from the "
    "ESRC Education Research Programme. To unsubscribe, please email "
    "Elizabeth.hudson@ucl.ac.uk with the word UNSUBSCRIBE in the title of the email."
)
mask_unsub = df["theme"].astype(str).str.strip().eq(UNSUB_THEME)
dropped_rows = int(mask_unsub.sum())
df = df[~mask_unsub].copy()

# ---------- 1) Normalizers
def norm_theme(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    s = s.replace("‚Äî", "-").replace("‚Äì", "-")  # normalize dashes
    s = s.replace("‚Äô", "'").replace("‚Äò", "'").replace("‚Äú", '"').replace("‚Äù", '"')
    return s.lower()

def norm_key(s: str) -> str:
    """Strong normalizer for matching keys like subthemes:
       - lowercase; & -> and; remove punctuation; normalize dashes; collapse spaces
    """
    if not isinstance(s, str):
        return ""
    s = s.strip().lower()
    s = s.replace("‚Äî", " ").replace("‚Äì", " ").replace("-", " ")
    s = s.replace("&", " and ")
    s = s.replace("‚Äô", "'").replace("‚Äò", "'").replace("‚Äú", '"').replace("‚Äù", '"')
    s = re.sub(r"[,\.\u00A0]", " ", s)         # remove commas, periods, NBSP
    s = re.sub(r"[^a-z0-9\s]", " ", s)         # drop other punctuation
    s = re.sub(r"\s+", " ", s).strip()
    return s

# ---------- 2) Theme mapping list: (new_theme, current_theme)
# NOTE: Moved "Four Nations" + variants AND "Update from UKRI" to political_context_and_organisations
pairs = [
    # project_updates
    ("programme_updates", "Embedding children's participation rights in pedagogical practice in lower primary classrooms in Wales PI: Sarah Chicken"),
    ("programme_updates", "Investigating the recruitment and retention of ethnic minority teachers PI: Stephen Gorard"),
    ("programme_updates", "News from the Projects"),
    ("programme_updates", "News from the projects"),
    ("programme_updates", "PI Updates and Papers"),
    ("programme_updates", "PI: David Lundie"),
    ("programme_updates", "Programme news"),
    ("programme_updates", "Programme Update"),
    ("programme_updates", "Programme update"),
    ("programme_updates", "Project news"),
    ("programme_updates", "Rethinking teacher recruitment: New approaches to attracting prospective STEM teachers PI: Rob Klassen"),
    ("programme_updates", "Sustainable school leadership: comparing approaches to the training, supply and retention of senior school leaders across the UK PI Toby Greany"),
    ("programme_updates", "Toby Greany"),
    ("programme_updates", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Professor Rebecca Eynon"),
    ("programme_updates", "Towards equity focused approaches to EdTech: a socio-technical perspective PI: Rebecca Eynon"),
    ("programme_updates", "Update from the ERP projects"),
    ("programme_updates", "Update from the ESRC Education Research Programme"),
    ("programme_updates", "Update from the projects"),
    ("programme_updates", "Updates from David Lundie"),
    ("programme_updates", "Updates from Steph Ainsworth"),
    ("programme_updates", "Updates from the ERP projects"),
    ("programme_updates", "Updates from the ESRC"),
    ("programme_updates", "Updates from the Programme"),
    ("programme_updates", "Updates from the programme"),
    ("programme_updates", "Updates from the projects"),
    ("programme_updates", "Decentring the 'resilient teacher': exploring interactions between individuals and their social ecologies PI: Steph Ainsworth"),
    ("programme_updates", "Peer reviewed articles from the ERP projects"),
    ("programme_updates", "Peer reviewed publications from the ERP projects"),


    # digital_ed
    ("digital_ed", "EdTech"),

    # political_context_and_organisations
    ("political_context_and_organisations", "What are the politicians saying?"),
    ("political_context_and_organisations", "What Matters in Education?"),
    ("political_context_and_organisations", "What matters in education?"),
    ("political_context_and_organisations", "4 Nations"),
    ("political_context_and_organisations", "4 Nations & key organisations"),
    ("political_context_and_organisations", "Political environment and key organisations"),
    ("political_context_and_organisations", "Political landscape - the election"),
    ("political_context_and_organisations", "Political landscape & key organisations"),
    ("political_context_and_organisations", "Political landscape across Four Nations & key organisations"),
    ("political_context_and_organisations", "Research ‚Äì Practice ‚Äì Policy"),
    ("political_context_and_organisations", "Calls for evidence"),
    ("political_context_and_organisations", "DfE"),
    ("political_context_and_organisations", "Education, Policy & Practice"),
    ("political_context_and_organisations", "EEF"),
    ("political_context_and_organisations", "ESRC"),
    ("political_context_and_organisations", "Politics"),
    ("political_context_and_organisations", "Launch of ESRC survey on social science research skills"),
    ("political_context_and_organisations", "Updates from UKRI"),  # plural already here
    ("political_context_and_organisations", "Update from UKRI"),   # moved here (singular)
    ("political_context_and_organisations", "Four Nations"),       # moved here
    ("political_context_and_organisations", "Four Nations Landscape"),
    ("political_context_and_organisations", "Four Nations landscape"),

    # events_opportunities_research
    ("events_opportunities_research", "Conferences"),
    ("events_opportunities_research", "Opportunities"),
    ("events_opportunities_research", "Opportunities for funding"),
    ("events_opportunities_research", "Opportunities to blog"),
    ("events_opportunities_research", "Other Reports"),
    ("events_opportunities_research", "Other Research"),
    ("events_opportunities_research", "Relevant Events"),
    ("events_opportunities_research", "Relevant Research"),
    ("events_opportunities_research", "Reports"),
    ("events_opportunities_research", "Research"),
    ("events_opportunities_research", "Events"),
    ("events_opportunities_research", "Seminar series topics"),
    ("events_opportunities_research", "Seminar topics"),

    # teacher_rrd
    ("teacher_rrd", "Teacher recruitment, retention & development"),
]

# ---------- 3) Build lookup (normalized)
lookup = {norm_theme(curr): new for new, curr in pairs}

# ---------- 4) Apply theme mapping (no fill yet)
theme_norm = df["theme"].map(norm_theme)
df["new_theme"] = theme_norm.map(lookup)

# ---------- 4b) Defensive keyword overrides (force correct bucket if text contains patterns)
kw_mask = (
    theme_norm.str.contains(r"\bfour nations\b", regex=True, na=False) |
    theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)
)
df.loc[kw_mask, "new_theme"] = "political_context_and_organisations"

# ---------- 5) Subtheme-based overrides
sub_norm = df["subtheme"].map(norm_key)

target_rrd = "teacher recruitment retention and development"
df.loc[sub_norm.eq(target_rrd), "new_theme"] = "teacher_rrd"  # any variant mapped earlier ‚Üí normalized equals this
df.loc[sub_norm.eq("digital"), "new_theme"] = "digital_ed"

# ---------- 6) Fill any remaining unmapped with the original theme text (your previous behavior)
df["new_theme"] = df["new_theme"].fillna(df["theme"])

# ---------- 7) Export a summary
summary = (
    df.assign(theme_norm=theme_norm, subtheme_norm=sub_norm)
      .groupby(["new_theme", "theme_norm"], dropna=False)
      .size()
      .reset_index(name="count")
      .sort_values(["new_theme", "count"], ascending=[True, False])
)

out_dir = data_cleaning_path
summary_path = os.path.join(out_dir, "theme_mapping_summary.xlsx")


with pd.ExcelWriter(summary_path) as xw:
    df.to_excel(xw, sheet_name="data_with_new_theme", index=False)
    summary.to_excel(xw, sheet_name="mapping_summary", index=False)

print(f"‚úÖ Dropped {dropped_rows} unsubscribe row(s).")
print("‚úÖ Mapping applied.")
print("üìÑ Excel written to:", summary_path)

  theme_norm.str.contains(r"\bupdate(s)? from ukri\b", regex=True, na=False)


‚úÖ Dropped 0 unsubscribe row(s).
‚úÖ Mapping applied.
üìÑ Excel written to: /workspaces/ERP_Newsletter/data/1_interim/0_ingestion/theme_mapping_summary.xlsx


# Explore Programme Updates 

In [24]:
#create new df with programme updates 
programme_updates_df = df[df["new_theme"] == "programme_updates"].copy()

In [25]:
programme_updates_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 12 to 1185
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 144 non-null    string
 1   newsletter_number  144 non-null    int64 
 2   issue_date         144 non-null    string
 3   theme              144 non-null    string
 4   subtheme           49 non-null     string
 5   title              144 non-null    string
 6   description        144 non-null    string
 7   link               144 non-null    string
 8   new_theme          144 non-null    object
dtypes: int64(1), object(1), string(7)
memory usage: 11.2+ KB


In [26]:
programme_updates_df.to_excel("/workspaces/ERP_Newsletter/data/0_raw/1_newsletter_short_text/programme_updates.xlsx", index=False)
print("File written to folder")

File written to folder


# Explore events_opportunities_research Updates 

In [30]:
#create new df with programme updates 
events_opportunities_research_df = df[df["new_theme"] == "events_opportunities_research"].copy()

In [31]:
programme_updates_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 12 to 1185
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 144 non-null    string
 1   newsletter_number  144 non-null    int64 
 2   issue_date         144 non-null    string
 3   theme              144 non-null    string
 4   subtheme           49 non-null     string
 5   title              144 non-null    string
 6   description        144 non-null    string
 7   link               144 non-null    string
 8   new_theme          144 non-null    object
dtypes: int64(1), object(1), string(7)
memory usage: 11.2+ KB


In [33]:
events_opportunities_research_df.to_excel("/workspaces/ERP_Newsletter/data/0_raw/1_newsletter_short_text/events_opportunities_research.xlsx", index=False)
print("File written to folder")

File written to folder
