## 🧪 Intercoder Reliability (ICR) and Final Coding Sample Drawing

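This notebook draws the raw samples for ICR testing rounds 1, 2, 3 and 4, plus the final sample of 250 articles per country for manual coding, using fixed random seeds so that every draw is reproducible. The ICR 1 and ICR 2 samples are only re-saved when they match the files already on disk; the ICR 3, ICR 4 and final samples are re-drawn and overwritten on every run.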

---



In [26]:
import os
import pandas as pd
import random

# === Reproducibility Config ===
# NOTE: random.seed() only seeds Python's built-in `random` module. The pandas
# DataFrame.sample() calls in this notebook are made reproducible by the explicit
# `random_state` passed to each of them, not by this seed.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)

# === Paths ===
# Root folder that holds one sub-folder per ICR round.
BASE_PATH = os.path.expanduser(
    "~/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/annotations/coding_frames/ICR"
)
# Translated news sample including the LLM annotation columns.
INPUT_PATH = os.path.expanduser(
    "~/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/output/news_sample_translated_10000_with_llm_annotations.csv"
)

# === Load data ===
df_all = pd.read_csv(INPUT_PATH)

# Fail early with a readable message if a column used by the sampling code is
# absent. (A `.get(..., "")` fallback would collapse the filter to the scalar
# False and surface as an opaque `KeyError: False`.)
REQUIRED_COLUMNS = ("llm_label", "translated_text", "country")
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df_all.columns]
if missing_columns:
    raise KeyError(
        f"Input file {INPUT_PATH} is missing required column(s): {missing_columns}"
    )

# Sampling frame: only articles the LLM labelled "Yes".
df_yes = df_all[df_all["llm_label"] == "Yes"].copy()

# Number of articles per ICR round.
n_total = 15

# === Helper: Only save if file exists and is identical ===
def save_raw_only_if_same(df_new, path):
    """Re-save ``df_new`` to ``path`` only if it matches the CSV already stored there.

    The function never creates a new file and never replaces a file whose
    content differs from ``df_new``; in both cases it prints a message and
    leaves the disk untouched.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Freshly drawn sample to verify against the stored file.
    path : str
        Location of the existing CSV file.

    Returns
    -------
    None
        The outcome is reported through printed messages only.
    """
    import io  # local import: only needed for the in-memory CSV round-trip below

    if not os.path.exists(path):
        print(f"⛔ File doesn't exist, NOT saving: {os.path.basename(path)}")
        return
    try:
        df_existing = pd.read_csv(path)
        # DataFrame.equals() also compares column dtypes. A frame reloaded from
        # CSV can carry different dtypes than the in-memory one (e.g. an
        # all-missing object column comes back as float64), which would report
        # identical samples as different. Round-tripping the new frame through
        # CSV puts both sides through the same dtype inference.
        df_candidate = pd.read_csv(io.StringIO(df_new.to_csv(index=False)))
        if df_candidate.reset_index(drop=True).equals(df_existing.reset_index(drop=True)):
            df_new.to_csv(path, index=False)
            print(f"✅ File exists and matches, re-saving: {os.path.basename(path)}")
        else:
            print(f"❌ File differs — NOT saving: {os.path.basename(path)}")
    except Exception as e:
        # Best effort by design: any failure while reading or comparing must
        # leave the stored sample as it is.
        print(f"⚠️ Could not read {path}, NOT saving. Error: {e}")

# === ICR 1: Simple Random Sample ===
# Plain random draw of 15 articles from the "Yes"-labelled frame; the result is
# only checked against the stored file, never used to create or replace it.
print("\n🎯 Drawing ICR 1 sample...")
icr1_draw = df_yes.sample(n=15, random_state=12)
df_icr1 = icr1_draw.reset_index(drop=True)

icr1_dir = os.path.join(BASE_PATH, "ICR_test1")
icr1_path = os.path.join(icr1_dir, "icr1_sample_raw.csv")
os.makedirs(icr1_dir, exist_ok=True)

save_raw_only_if_same(df_icr1, icr1_path)

# === Helper: country-balanced draw shared by ICR rounds 2-4 ===
def draw_balanced_sample(df_pool, n_total, random_state):
    """Draw about ``n_total`` rows from ``df_pool``, spread evenly over its countries.

    Each country present in ``df_pool['country']`` gets ``n_total // k`` rows
    (``k`` = number of countries); the remainder is handed out one row each to
    the first countries in ``value_counts()`` order, i.e. the countries with
    the most articles in the pool. A country with fewer rows than its quota is
    skipped with a warning, so the result can hold fewer than ``n_total`` rows.

    Parameters
    ----------
    df_pool : pandas.DataFrame
        Candidate articles; must contain a ``country`` column.
    n_total : int
        Target number of rows across all countries.
    random_state : int
        Seed passed to every per-country ``DataFrame.sample`` call.

    Returns
    -------
    pandas.DataFrame
        The per-country draws concatenated, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the pool contains no countries to sample from.
    """
    countries_to_sample = df_pool['country'].value_counts().index.tolist()
    if not countries_to_sample:
        raise ValueError("Cannot draw a balanced sample: no articles left in the pool.")
    samples_per_country, extra = divmod(n_total, len(countries_to_sample))

    sampled_rows = []
    for i, country in enumerate(countries_to_sample):
        n = samples_per_country + (1 if i < extra else 0)
        subset = df_pool[df_pool['country'] == country]
        if len(subset) >= n:
            sampled_rows.append(subset.sample(n=n, random_state=random_state))
        else:
            print(f"⚠️ Not enough articles for {country} (needed {n}, found {len(subset)})")

    return pd.concat(sampled_rows).reset_index(drop=True)


# === ICR 2: Balanced by country, excluding ICR 1 ===
print("\n🎯 Drawing ICR 2 sample (balanced across countries, excluding ICR 1)...")
# Articles are excluded by matching on their translated text.
df_remaining_icr2 = df_yes[~df_yes['translated_text'].isin(df_icr1['translated_text'])]
df_icr2 = draw_balanced_sample(df_remaining_icr2, n_total, random_state=42)
icr2_path = os.path.join(BASE_PATH, "ICR_test2", "icr2_sample_raw.csv")
os.makedirs(os.path.dirname(icr2_path), exist_ok=True)
# Like ICR 1, the stored ICR 2 file is only verified, never replaced.
save_raw_only_if_same(df_icr2, icr2_path)

# === ICR 3: Balanced by country, excluding ICR 1 + 2 ===
print("\n🎯 Drawing ICR 3 sample (balanced across countries, excluding ICR 1 + 2)...")
already_sampled_texts = pd.concat([df_icr1, df_icr2])['translated_text']
df_remaining_icr3 = df_yes[~df_yes['translated_text'].isin(already_sampled_texts)]
df_icr3 = draw_balanced_sample(df_remaining_icr3, n_total, random_state=33)
icr3_path = os.path.join(BASE_PATH, "ICR_test3", "icr3_sample_raw.csv")
os.makedirs(os.path.dirname(icr3_path), exist_ok=True)
# Unlike rounds 1 and 2, this file is overwritten unconditionally on every run.
df_icr3.to_csv(icr3_path, index=False)
print(f"✅ ICR 3 redrawn and saved to: {icr3_path}")

# === ICR 4: Balanced by country, excluding ICR 1 + 2 + 3 ===
print("\n🎯 Drawing ICR 4 sample (balanced across countries, excluding ICR 1 + 2 + 3)...")
already_sampled_texts = pd.concat([df_icr1, df_icr2, df_icr3])['translated_text']
# NOTE: this pool still contains the rows that become the ICR 4 sample below.
df_remaining_icr4 = df_yes[~df_yes['translated_text'].isin(already_sampled_texts)]
df_icr4 = draw_balanced_sample(df_remaining_icr4, n_total, random_state=44)
icr4_path = os.path.join(BASE_PATH, "ICR_test4", "icr4_sample_raw.csv")
os.makedirs(os.path.dirname(icr4_path), exist_ok=True)
# Overwritten unconditionally on every run, same as ICR 3.
df_icr4.to_csv(icr4_path, index=False)
print(f"✅ ICR 4 redrawn and saved to: {icr4_path}")



🎯 Drawing ICR 1 sample...
✅ File exists and matches, re-saving: icr1_sample_raw.csv

🎯 Drawing ICR 2 sample (balanced across countries, excluding ICR 1)...
✅ File exists and matches, re-saving: icr2_sample_raw.csv

🎯 Drawing ICR 3 sample (balanced across countries, excluding ICR 1 + 2)...
✅ ICR 3 redrawn and saved to: /home/akroon/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/annotations/coding_frames/ICR/ICR_test3/icr3_sample_raw.csv

🎯 Drawing ICR 4 sample (balanced across countries, excluding ICR 1 + 2 + 3)...
✅ ICR 4 redrawn and saved to: /home/akroon/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/annotations/coding_frames/ICR/ICR_test4/icr4_sample_raw.csv


In [27]:
print("\n📊 Remaining 'Yes'-labeled articles by country (excluding ICR 1, 2, 3 and 4):")

# Texts already used in any of the four ICR rounds
icr_frames = [df_icr1, df_icr2, df_icr3, df_icr4]
sampled_texts = pd.concat(icr_frames)['translated_text']

# Keep only the articles whose text was not drawn for an ICR round
was_sampled = df_yes['translated_text'].isin(sampled_texts)
df_remaining = df_yes[~was_sampled]

# Per-country tally, listed alphabetically by country name
remaining_per_country = df_remaining['country'].value_counts()
country_summary = remaining_per_country.sort_index()
print(country_summary)

# Overall number of articles still available
print(f"\n🧮 Total remaining articles: {len(df_remaining)}")


📊 Remaining 'Yes'-labeled articles by country (excluding ICR 1, 2, 3 and 4):
country
Bulgaria          944
Italy             451
Netherlands       478
United_Kingdom    329
Name: count, dtype: int64

🧮 Total remaining articles: 2202


In [30]:
# Output folder for the final manual-content-analysis sample.
# NOTE: this rebinds BASE_PATH, which previously pointed at the ICR folder.
BASE_PATH = os.path.expanduser("~/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/output/data-deductive-analysis/sample-manual-content-analysis")
# Make sure the target folder exists before any file is written.
os.makedirs(BASE_PATH, exist_ok=True)

# Annotator assignments
annotator_map = {
    'Bulgaria': 'Alexander',
    'Italy': 'Luigia',
    'Netherlands': 'Assia',
    'United_Kingdom': 'Elisa'
}

# Draw 250 per country from remaining set
print("\n🎯 Drawing 250 articles per country from remaining set...")

countries = ['Bulgaria', 'Italy', 'Netherlands', 'United_Kingdom']
samples_per_country = 250
final_sample_rows = []

# The pool must exclude ALL four ICR samples. `df_remaining_icr4` only excludes
# ICR 1-3 (it is the pool ICR 4 was drawn from), so sampling from it could put
# ICR 4 articles into the final coding sample.
# NOTE(review): drawing from this pool selects different rows than drawing from
# `df_remaining_icr4`; files written from that pool will be overwritten with a
# different sample.
icr_texts = pd.concat([df_icr1, df_icr2, df_icr3, df_icr4])['translated_text']
df_final_pool = df_yes[~df_yes['translated_text'].isin(icr_texts)]

for i, country in enumerate(countries):
    subset = df_final_pool[df_final_pool['country'] == country]
    if len(subset) >= samples_per_country:
        # One distinct, fixed seed per country keeps each draw reproducible.
        sampled = subset.sample(n=samples_per_country, random_state=200 + i)
        final_sample_rows.append(sampled)

        # Save individual subset with annotator name
        annotator = annotator_map[country]
        filename_base = f"{country}_{annotator}_sample_250"
        csv_path = os.path.join(BASE_PATH, f"{filename_base}.csv")
        xlsx_path = os.path.join(BASE_PATH, f"{filename_base}.xlsx")

        sampled.to_csv(csv_path, index=False)
        sampled.to_excel(xlsx_path, index=False)

        print(f"📁 Saved: {csv_path}")
        print(f"📁 Saved: {xlsx_path}")
    else:
        print(f"⚠️ Not enough articles for {country} (needed {samples_per_country}, found {len(subset)})")

# === Save final merged sample ===
if not final_sample_rows:
    # pd.concat([]) would fail with a generic message; explain the real cause.
    raise ValueError(
        f"No country had at least {samples_per_country} remaining articles; nothing to save."
    )

df_final_sample = pd.concat(final_sample_rows).reset_index(drop=True)
final_sample_path = os.path.join(BASE_PATH, "dataset_for_manual_content_analysis_250_per_country.csv")
df_final_sample.to_csv(final_sample_path, index=False)
print(f"✅ Final sample (250 per country) saved to: {final_sample_path}")

# Swap only the file extension (str.replace would touch any ".csv" in the path).
final_sample_excel = os.path.splitext(final_sample_path)[0] + ".xlsx"
df_final_sample.to_excel(final_sample_excel, index=False)
print(f"📄 Also saved as Excel: {final_sample_excel}")



🎯 Drawing 250 articles per country from remaining set...
📁 Saved: /home/akroon/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/output/data-deductive-analysis/sample-manual-content-analysis/Bulgaria_Alexander_sample_250.csv
📁 Saved: /home/akroon/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/output/data-deductive-analysis/sample-manual-content-analysis/Bulgaria_Alexander_sample_250.xlsx
📁 Saved: /home/akroon/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/output/data-deductive-analysis/sample-manual-content-analysis/Italy_Luigia_sample_250.csv
📁 Saved: /home/akroon/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/output/data-deductive-analysis/sample-manual-content-analysis/Italy_Luigia_sample_250.xlsx
📁 Saved: /home/akroon/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/output/data-deductive-analysis/sample-manual-content-analysis/Netherlands_Assia_sample_250.csv
📁 Saved: /home/akroon/webdav/ASCOR-FMG-5580-RESPOND-news-data (Projectfolder)/outpu