In [1]:
# ============================================
# Build distribution datasets for Assignment 2 (Final + Country Filter)
# Dataset: education_conflict_merged.csv
# ============================================
from google.colab import files
import pandas as pd
import numpy as np
from pathlib import Path

# ----------------------------
# 1. Upload the dataset
# ----------------------------
print("Please upload your 'education_conflict_merged.csv' file")
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload dialog (e.g. it was cancelled); fail
    # with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; cannot build the datasets.")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))

df = pd.read_csv(file_name)
print(f"File uploaded: {file_name} | Shape: {df.shape}")

# Optional filter for selected countries
countries = [
    "Afghanistan", "Syria", "Yemen", "Sudan", "Ukraine", "Palestine",
    "Germany", "Canada", "Japan", "Brazil", "Kenya"
]
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[df["country"].isin(countries)].copy()
# Non-numeric year values become NaN rather than raising.
df["year"] = pd.to_numeric(df["year"], errors="coerce")

# ----------------------------
# 2. Create output folder
# ----------------------------
# Every CSV produced by this script is written into this directory; an
# already-existing directory is reused without error.
Path("data_distributions").mkdir(parents=False, exist_ok=True)

# ----------------------------
# 3. HISTOGRAM with TIME & COUNTRY
# ----------------------------
# Fixed bin edges 0, 5, ..., 100 (20 bins of width 5).
bins = np.arange(0, 101, 5)
hist_columns = [
    "year", "country", "conflict_status",
    "bin_mid", "bin_start", "bin_end", "count",
]
hist_records = []

# groupby splits the frame once instead of re-scanning it with a boolean mask
# for every year and every country. Years come out in ascending order;
# sort=False keeps countries in their order of first appearance within a year.
for year, subset in df.groupby("year"):
    for country, row in subset.groupby("country", sort=False):
        # A country-year is labelled with the status of its first row.
        status = row["conflict_status"].iloc[0]
        counts, edges = np.histogram(row["out_of_school_pct"], bins=bins)
        hist_records.append(pd.DataFrame({
            "year": year,
            "country": country,
            "conflict_status": status,
            "bin_mid": (edges[:-1] + edges[1:]) / 2,
            "bin_start": edges[:-1],
            "bin_end": edges[1:],
            "count": counts,
        }))

if hist_records:
    hist_country_time = pd.concat(hist_records, ignore_index=True)
else:
    # pd.concat raises on an empty list; still emit a header-only CSV when
    # no rows survive the filters.
    hist_country_time = pd.DataFrame(columns=hist_columns)

hist_country_time.to_csv("data_distributions/histogram_country_time.csv", index=False)
print("histogram_country_time.csv created (includes country & year)")

# ----------------------------
# 4. BOX PLOT STATS with TIME & COUNTRY
# ----------------------------
def box_stats(x):
    """Compute the summary statistics needed to draw a box plot.

    Missing values (NaN) are dropped before anything is computed, so they
    neither count towards ``count`` nor turn the quantiles into NaN.

    Parameters
    ----------
    x : array-like of numbers
        Observations for one group (e.g. a pandas Series).

    Returns
    -------
    pandas.Series
        Indexed by ``count``, ``median``, ``q1``, ``q3``, ``whisker_low`` and
        ``whisker_high``. The whiskers are the most extreme observations that
        lie within 1.5 * IQR of the quartiles. When there are no valid
        observations, ``count`` is 0 and every other entry is NaN.
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]

    if values.size == 0:
        return pd.Series({
            "count": 0,
            "median": np.nan,
            "q1": np.nan, "q3": np.nan,
            "whisker_low": np.nan, "whisker_high": np.nan
        })

    q1 = np.percentile(values, 25)
    q3 = np.percentile(values, 75)
    iqr = q3 - q1
    low_fence = q1 - 1.5 * iqr
    high_fence = q3 + 1.5 * iqr

    # Whiskers end on real data points, not on the fences themselves. Both
    # selections are non-empty: the maximum is >= q1 >= low_fence and the
    # minimum is <= q3 <= high_fence.
    lower = values[values >= low_fence].min()
    upper = values[values <= high_fence].max()

    return pd.Series({
        "count": values.size,
        "median": np.median(values),
        "q1": q1, "q3": q3,
        "whisker_low": lower, "whisker_high": upper
    })

# Apply stats per (year, country, conflict_status)
# The value column is selected before apply() so the applied function only
# receives non-grouping columns; applying over the grouping columns is
# deprecated in recent pandas releases and emits a warning.
box_summary_country_time = (
    df.groupby(["year", "country", "conflict_status"], group_keys=False)[["out_of_school_pct"]]
    .apply(lambda g: box_stats(g["out_of_school_pct"]))
    .reset_index()
)

# Clean any unintended index columns; errors="ignore" skips names that are
# not present instead of raising.
box_summary_country_time = box_summary_country_time.drop(
    columns=["level_0", "level_1", "level_2"], errors="ignore"
)

box_summary_country_time.to_csv("data_distributions/box_summary_country_time.csv", index=False)
print("box_summary_country_time.csv created (clean, with country & year)")

# ----------------------------
# 5. VIOLIN DATA — raw points (per country-year)
# ----------------------------
# Keep only the identifying columns plus the measured value, as an
# independent copy of the filtered frame, and write it out unaggregated.
violin = df.loc[:, ["country", "year", "conflict_status", "out_of_school_pct"]].copy()
violin.to_csv("data_distributions/violin_box_raw.csv", index=False)
print("violin_box_raw.csv created (raw values)")

Please upload your 'education_conflict_merged.csv' file


Saving education_conflict_merged.csv to education_conflict_merged.csv
File uploaded: education_conflict_merged.csv | Shape: (2120, 6)
histogram_country_time.csv created (includes country & year)
box_summary_country_time.csv created (clean, with country & year)
violin_box_raw.csv created (raw values)


  .apply(lambda g: box_stats(g["out_of_school_pct"]))
