# Imports and Data

In [21]:
#imports 
import pandas as pd
import numpy as np 

In [4]:
#data
df = pd.read_csv("/workspaces/ERP_Newsletter/data_processed/data_for_preprocessing.csv")
df.head(1)

Unnamed: 0,id,newsletter_number,issue_date,new_theme,text,domain,organisation
0,c97ff62f-83ca-47ec-a4c7-b4e24157ae0a,1,11 July 2023,political_context_and_organisations,Deadline 23 August 2023 Education secretary Gi...,schoolsweek.co.uk,schools_week


# Keep only three main themes 

In [10]:
df["new_theme"].value_counts()

new_theme
political_context_and_organisations    614
project_updates                        182
teacher_rrd                            168
digital_ed                             158
events_opportunities_research           30
Name: count, dtype: int64

In [12]:
themes_keep = [
    "political_context_and_organisations",
    "teacher_rrd",
    "digital_ed"
]
df=df[df["new_theme"].isin(themes_keep)].copy()

df["new_theme"].value_counts()    

new_theme
political_context_and_organisations    614
teacher_rrd                            168
digital_ed                             158
Name: count, dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 940 entries, 0 to 1149
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 940 non-null    object
 1   newsletter_number  940 non-null    int64 
 2   issue_date         940 non-null    object
 3   new_theme          940 non-null    object
 4   text               940 non-null    object
 5   domain             940 non-null    object
 6   organisation       940 non-null    object
dtypes: int64(1), object(6)
memory usage: 58.8+ KB


# Identify the top 20 organisations 

In [15]:
top20_orgs = df["organisation"].value_counts().head(20).index.tolist()
df["org_group"] = df["organisation"].where(df["organisation"].isin(top20_orgs), "Other")
print("\nTop 20 organisations:")
print(df["org_group"].value_counts().head(20))


Top 20 organisations:
org_group
Other                       397
schools_week                145
uk_government                67
guardian                     27
ucl                          25
epi                          25
nfer                         24
bera                         23
uk_parliament                22
welsh_government             21
scottish_government          21
conversation                 19
oecd                         19
belfast_telegraph            16
ni_government                16
upen                         13
teacher_tapp                 13
fft_ed_datalab               13
nuffield                     12
university_of_birmingham     12
Name: count, dtype: int64


# Make Time Bins 

In [18]:
df["issue_date"] = pd.to_datetime(df["issue_date"], errors="coerce")
df["year_quarter"] = df["issue_date"].dt.to_period("Q").astype(str).fillna("unknown")

In [None]:
#COLLAPSE 

# Balanced sampling plan

In [24]:
TOTAL = 300
per_theme = TOTAL // 3        # 100 per theme
share_top = 0.70              # ~70% from Top-20, 30% Other
rng = np.random.default_rng(42)

samples = []

for theme, g in df.groupby("new_theme", group_keys=False):
    target_theme = per_theme

    # Split theme pool into Top-20 vs Other
    g_top = g[g["org_group"] != "Other"].copy()
    g_other = g[g["org_group"] == "Other"].copy()

    need_top = int(round(target_theme * share_top))
    need_other = target_theme - need_top

    # Allocate need_top across the top orgs (proportional, with cap)
    CAP_PER_ORG = max(3, int(np.ceil(need_top / max(1, g_top["org_group"].nunique()))) + 2)

    parts = []

    if len(g_top) > 0 and need_top > 0:
        counts = g_top["org_group"].value_counts()

        # Proportional base allocation
        prop = (counts / counts.sum()) * need_top
        base = prop.astype(int)

        # Distribute remainder by largest fractional parts
        remainder = need_top - int(base.sum())
        if remainder > 0:
            frac = (prop - base).sort_values(ascending=False)
            for k in frac.index.tolist():
                if remainder <= 0:
                    break
                base[k] = base.get(k, 0) + 1
                remainder -= 1

        # Apply a per-org cap to avoid dominance
        base = base.clip(upper=CAP_PER_ORG)

        # If cap made us short, top up using any headroom left
        short = need_top - int(base.sum())
        if short > 0:
            headroom = pd.Series(
                {k: max(0, CAP_PER_ORG - base.get(k, 0)) for k in counts.index}
            ).sort_values(ascending=False)
            for k, room in headroom.items():
                if short <= 0:
                    break
                add = min(short, room)
                if add > 0:
                    base[k] = base.get(k, 0) + add
                    short -= add

        # Sample from each top org
        for k, n_take in base.items():
            if n_take <= 0:
                continue
            pool = g_top[g_top["org_group"] == k]
            n_take = min(n_take, len(pool))
            if n_take > 0:
                parts.append(pool.sample(n=n_take, random_state=42))

    samp_top = pd.concat(parts, ignore_index=False) if parts else g_top.iloc[0:0]

    # Sample from Other to reach target_theme
    need_other = target_theme - len(samp_top)
    if need_other > 0 and len(g_other) > 0:
        n_take_other = min(need_other, len(g_other))
        samp_other = g_other.sample(n=n_take_other, random_state=42)
    else:
        samp_other = g.iloc[0:0]

    # Combine theme samples
    samp_theme = pd.concat([samp_top, samp_other], ignore_index=False)

    # If still short (e.g., not enough Top/Other), fill from remaining rows in this theme
    shortfall = target_theme - len(samp_theme)
    if shortfall > 0:
        remaining = g.drop(samp_theme.index, errors="ignore")
        if len(remaining) > 0:
            extra = remaining.sample(n=min(shortfall, len(remaining)), random_state=42)
            samp_theme = pd.concat([samp_theme, extra], ignore_index=False)

    # Trim in the unlikely event we’re a tad over
    if len(samp_theme) > target_theme:
        samp_theme = samp_theme.sample(n=target_theme, random_state=42)

    samples.append(samp_theme)

# Final sample
sample = pd.concat(samples, ignore_index=False)

# Checks

In [25]:
print("\nPer-theme counts:")
print(sample["new_theme"].value_counts())

print("\nOrg_group (Top-20 + Other):")
print(sample["org_group"].value_counts().head(25))

print("\nTime coverage (year-quarter, top 10):")
print(sample["year_quarter"].value_counts().head(10))


Per-theme counts:
new_theme
digital_ed                             100
political_context_and_organisations    100
teacher_rrd                            100
Name: count, dtype: int64

Org_group (Top-20 + Other):
org_group
Other                       123
schools_week                 26
uk_government                20
guardian                     12
oecd                         12
nfer                         12
bera                         12
teacher_tapp                 12
uk_parliament                11
conversation                  9
ucl                           9
tes                           8
epi                           7
ni_government                 4
nuffield                      4
welsh_government              4
scottish_government           4
university_of_birmingham      3
belfast_telegraph             3
fft_ed_datalab                3
upen                          2
Name: count, dtype: int64

Time coverage (year-quarter, top 10):
year_quarter
2025Q2    53
2024Q2    39
2

# Save 

In [29]:
labeling_df = sample[[
    "id",
    "text",
    "new_theme",
    "org_group",
    "organisation",
    "issue_date"
]].copy()

labeling_df["manual_label"] = ""

out_path = "/workspaces/ERP_Newsletter/data_processed/sample_to_label.xlsx"
labeling_df.to_excel(out_path, index=False)

print(f"\n📄 Saved labeling file: {out_path}  (rows={len(labeling_df)})")


📄 Saved labeling file: /workspaces/ERP_Newsletter/data_processed/sample_to_label.xlsx  (rows=300)
