In [1]:
# ============================================
# Data Cleaning and Merging for
# "Impact of War on Child Education (2010–2024)"
# ============================================

import pandas as pd
from pathlib import Path
from google.colab import files

# ----------------------------
# 1. Upload both CSV files
# ----------------------------
# files.upload() opens the Colab upload widget and returns a dict keyed by the
# uploaded file name. An empty result (e.g. a cancelled dialog) is rejected
# right away so the failure is not an opaque IndexError further down.
print("Upload 'share-primary-school-age-out-of-school.csv'")
edu_file = files.upload()
if not edu_file:
    raise ValueError(
        "No education CSV was uploaded; re-run this cell and select the file."
    )

print("Upload 'number-of-armed-conflicts.csv'")
conf_file = files.upload()
if not conf_file:
    raise ValueError(
        "No conflict CSV was uploaded; re-run this cell and select the file."
    )

# Use the first uploaded file name from each dialog as the path to read.
edu_name = next(iter(edu_file))
conf_name = next(iter(conf_file))

# ----------------------------
# 2. Load datasets
# ----------------------------
# Read each uploaded CSV from the working directory into its own DataFrame.
edu = pd.read_csv(edu_name)
conf = pd.read_csv(conf_name)

# Report the raw shapes and headers before any renaming happens.
for label, frame in (("Education data:", edu), ("Conflict data:", conf)):
    print(label, frame.shape)
print("\nEducation columns:", list(edu.columns))
print("Conflict columns:", list(conf.columns))

# ----------------------------
# 3. Standardize column names
# ----------------------------
# Education dataset: renamed by position, so this relies on the file having
# exactly four columns in the order entity, code, year, out-of-school rate.
edu.columns = ["country", "code", "year", "out_of_school_pct"]

# Conflict dataset: one count column per conflict type. The mapping is kept in
# one place so the rename and the total below cannot drift apart.
conflict_type_columns = {
    "Number of ongoing conflicts - Conflict type: one-sided violence": "conf_onesided",
    "Number of ongoing conflicts - Conflict type: extrasystemic": "conf_extrasystemic",
    "Number of ongoing conflicts - Conflict type: non-state conflict": "conf_nonstate",
    "Number of ongoing conflicts - Conflict type: intrastate": "conf_intrastate",
    "Number of ongoing conflicts - Conflict type: interstate": "conf_interstate",
}
conf = conf.rename(columns={
    "Entity": "country",
    "Code": "code",
    "Year": "year",
    **conflict_type_columns,
})

# Total number of conflicts = sum over all conflict types. Non-numeric or
# missing counts are coerced to NaN and then counted as 0.
conf["num_conflicts"] = sum(
    pd.to_numeric(conf[col], errors="coerce").fillna(0)
    for col in conflict_type_columns.values()
)

# Keep only the needed columns. The explicit .copy() makes the result an
# independent frame, so later column assignments on `conf` do not trigger
# pandas' SettingWithCopyWarning.
conf = conf[["country", "code", "year", "num_conflicts"]].copy()

# ----------------------------
# 4. Convert year to numeric and filter 2010–2024
# ----------------------------
first_year, last_year = 2010, 2024

# Year values that cannot be parsed become NaN and are excluded by the
# range filter below.
edu["year"] = pd.to_numeric(edu["year"], errors="coerce")
conf["year"] = pd.to_numeric(conf["year"], errors="coerce")

# Series.between is inclusive on both ends by default.
edu = edu[edu["year"].between(first_year, last_year)]
conf = conf[conf["year"].between(first_year, last_year)]

# ----------------------------
# 5. Merge datasets on country + year
# ----------------------------
# Left join: every education row is kept; rows with no matching conflict
# record end up with NaN in num_conflicts.
merge_keys = ["country", "code", "year"]
df = pd.merge(edu, conf, how="left", on=merge_keys)
print("Merged dataset:", df.shape)

# ----------------------------
# 6. Clean and prepare columns
# ----------------------------
def _conflict_label(count):
    """Return "Conflict" for a positive conflict count, otherwise "Stable".

    A missing count (NaN) is labelled "Stable" as well.
    """
    if pd.isna(count) or not count > 0:
        return "Stable"
    return "Conflict"


# Non-numeric education values become NaN so they can be dropped below.
df["out_of_school_pct"] = pd.to_numeric(df["out_of_school_pct"], errors="coerce")

# Conflict status: label each row from its total conflict count.
df["conflict_status"] = df["num_conflicts"].apply(_conflict_label)

# Drop rows that have no usable education value.
df = df.dropna(subset=["out_of_school_pct"])

# ----------------------------
# 7. Save cleaned dataset
# ----------------------------
# Create the output folder if needed, then write the merged table without
# the DataFrame index.
output_csv = Path("data_clean/education_conflict_merged.csv")
output_csv.parent.mkdir(exist_ok=True)
df.to_csv(output_csv, index=False)

print("Cleaned dataset saved as: data_clean/education_conflict_merged.csv")
print("\n Sample rows:")
# Last expression of the cell, so the notebook renders the first ten rows.
df.head(10)


Upload 'share-primary-school-age-out-of-school.csv'


Saving share-primary-school-age-out-of-school.csv to share-primary-school-age-out-of-school.csv
Upload 'number-of-armed-conflicts.csv'


Saving number-of-armed-conflicts.csv to number-of-armed-conflicts.csv
Education data: (5400, 4)
Conflict data: (7890, 8)

Education columns: ['Entity', 'Code', 'Year', 'Out-of-school rate for children of primary school age']
Conflict columns: ['Entity', 'Code', 'Year', 'Number of ongoing conflicts - Conflict type: one-sided violence', 'Number of ongoing conflicts - Conflict type: extrasystemic', 'Number of ongoing conflicts - Conflict type: non-state conflict', 'Number of ongoing conflicts - Conflict type: intrastate', 'Number of ongoing conflicts - Conflict type: interstate']
Merged dataset: (2120, 5)
Cleaned dataset saved as: data_clean/education_conflict_merged.csv

 Sample rows:


Unnamed: 0,country,code,year,out_of_school_pct,num_conflicts,conflict_status
0,Albania,ALB,2010,6.39161,0.0,Stable
1,Albania,ALB,2011,4.81195,0.0,Stable
2,Albania,ALB,2012,3.54341,0.0,Stable
3,Albania,ALB,2013,3.15807,0.0,Stable
4,Albania,ALB,2014,0.38362,0.0,Stable
5,Albania,ALB,2015,0.20883,0.0,Stable
6,Albania,ALB,2016,0.15571,0.0,Stable
7,Albania,ALB,2017,0.76753,0.0,Stable
8,Albania,ALB,2018,1.36746,0.0,Stable
9,Albania,ALB,2019,2.19485,0.0,Stable
