# Set up

In [42]:
import pandas as pd
import requests

# Loading datasets

In [35]:
# Read the three raw CSV inputs, in this order: enrollment, world education,
# and the wide "various indicators" export.
enroll, world_education, various = map(
    pd.read_csv,
    (
        "../raw/enrollment.csv",
        "../raw/world-education-data.csv",
        "../raw/various_indicators.csv",
    ),
)

### Cleaning Enrollment

In [36]:
# Give the enrollment columns the short snake_case names that the later merges
# key on ("country_code", "year"). Labels missing from the frame are ignored by
# rename, so re-running this cell is harmless.
enroll = enroll.rename(
    {
        "Entity": "country",
        "Code": "country_code",
        "Year": "year",
        "Combined total net enrolment rate, primary, both sexes": "net_enrollment",
    },
    axis="columns",
)
enroll.head()


Unnamed: 0,country,country_code,year,net_enrollment
0,Afghanistan,AFG,1820,0.0
1,Afghanistan,AFG,1825,0.0
2,Afghanistan,AFG,1830,0.0
3,Afghanistan,AFG,1835,0.0
4,Afghanistan,AFG,1840,0.0


### Cleaned world education data

In [37]:
# Preview the first five rows of the world education table.
world_education.head(n=5)


Unnamed: 0,country,country_code,year,gov_exp_pct_gdp,lit_rate_adult_pct,pri_comp_rate_pct,pupil_teacher_primary,pupil_teacher_secondary,school_enrol_primary_pct,school_enrol_secondary_pct,school_enrol_tertiary_pct
0,Afghanistan,AFG,1999,,,,33.18571,,27.298849,,
1,Afghanistan,AFG,2000,,,,,,22.162991,,
2,Afghanistan,AFG,2001,,,,,,22.90859,14.47151,
3,Afghanistan,AFG,2002,,,,,,75.959747,,
4,Afghanistan,AFG,2003,,,,,,96.55368,14.07805,1.38107


### Cleaning various indicators

In [38]:
# Reshape the indicator export from wide (one column per year) to one row per
# (country, country_code, year) with one column per indicator.
#
# NOTE: this cell overwrites `various` with the reshaped frame, so re-running it
# requires re-running the cell that loads the raw CSV first (the second run
# would no longer find the "Series Code" column).
various = (
    various
    .rename(columns={"Country Name": "country",
                     "Country Code": "country_code",
                     "Series Name": "variable"})
    .drop(columns="Series Code")
    # Every remaining non-id column is a year column; stack them into rows.
    .melt(id_vars=["country", "country_code", "variable"],
          var_name="year", value_name="value")
    .assign(
        # The year is the first run of four digits in the melted column label;
        # expand=False returns a Series rather than a one-column DataFrame.
        year=lambda frame: frame["year"].str.extract(r"(\d{4})", expand=False).astype(int),
        # Anything that cannot be parsed as a number becomes NaN.
        value=lambda frame: pd.to_numeric(frame["value"], errors="coerce"),
    )
    # Rows with no indicator name cannot be assigned to a column by the pivot.
    .dropna(subset=["variable"])
    # pivot (unlike pivot_table) raises on duplicate
    # (country, country_code, year, variable) keys instead of silently
    # averaging them, so unexpected duplicates surface as an error.
    .pivot(index=["country", "country_code", "year"], columns="variable", values="value")
    .reset_index()
)
various

variable,country,country_code,year,Access to clean fuels and technologies for cooking (% of population),Access to electricity (% of population),Adequacy of social insurance programs (% of total welfare of beneficiary households),"Average working hours of children, study and work, ages 7-14 (hours per week)","Children in employment, female (% of female children ages 7-14)","Children in employment, unpaid family workers (% of children in employment, ages 7-14)","Current education expenditure, primary (% of total expenditure in primary public institutions)","Current education expenditure, secondary (% of total expenditure in secondary public institutions)","Current education expenditure, tertiary (% of total expenditure in tertiary public institutions)",Demand for family planning satisfied by modern methods (% of married women with demand for family planning),"Educational attainment, at least completed lower secondary, population 25+, total (%) (cumulative)","Educational attainment, at least completed primary, population 25+ years, total (%) (cumulative)","Government expenditure on education, total (% of government expenditure)",People using at least basic drinking water services (% of population),Teenage mothers (% of women ages 15-19 who have had children or are currently pregnant),Trained teachers in lower secondary education (% of total teachers),"Women participating in the three decisions (own health care, major household purchases, and visiting family) (% of women age 15-49)"
0,Afghanistan,AFG,1960,,,,,,,,,,,,,,,,,
1,Afghanistan,AFG,1961,,,,,,,,,,,,,,,,,
2,Afghanistan,AFG,1962,,,,,,,,,,,,,,,,,
3,Afghanistan,AFG,1963,,,,,,,,,,,,,,,,,
4,Afghanistan,AFG,1964,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17220,Zimbabwe,ZWE,2020,30.5,52.7,,,,,,,,,,,15.666611,62.666456,,,
17221,Zimbabwe,ZWE,2021,30.5,49.0,,,,,,,,,,,,62.252798,,,
17222,Zimbabwe,ZWE,2022,30.8,50.1,,,,,,,,,,,,62.294255,,,
17223,Zimbabwe,ZWE,2023,,62.0,,,,,,,,,,,10.688962,,,,


In [39]:
# Left-join the world education indicators onto the enrollment rows by
# (country_code, year). Both inputs carry a "country" column, so the merge
# suffixes them; keep the enrollment side's name and discard the other.
enroll_world_education = (
    enroll
    .merge(world_education, how="left", on=["country_code", "year"])
    .drop(columns="country_y")
    .rename(columns={"country_x": "country"})
)

In [40]:
# Left-join the reshaped indicator table onto the enrollment/education frame by
# (country_code, year), keep only rows from 1950 onwards, and resolve the
# suffixed "country" columns in favour of the left-hand side.
final = (
    enroll_world_education
    .merge(various, how="left", on=["country_code", "year"])
    .loc[lambda merged: merged["year"] >= 1950]
    .rename(columns={"country_x": "country"})
    .drop(columns="country_y")
)

In [41]:
# Persist the merged table to the working directory. The row index is written
# too (the to_csv default, spelled out here), as the first column of the file.
final.to_csv(path_or_buf="cleaned_data.csv", index=True)

### Cleaning average learning outcomes

In [43]:
# Download the Our World in Data "learning outcomes vs GDP per capita" table
# straight from the grapher CSV endpoint (full CSV, short column names). The
# User-Agent header is passed through storage_options with the request.
avg_outcome = pd.read_csv(
    filepath_or_buffer="https://ourworldindata.org/grapher/learning-outcomes-vs-gdp-per-capita.csv?v=1&csvType=full&useColumnShortNames=true",
    storage_options={'User-Agent': 'Our World In Data data fetch/1.0'},
)

In [45]:
# Map the downloaded column names onto the notebook's snake_case names.
# 'wb_region' maps to itself, so that column keeps its name.
new_names = {
    'Entity': "country",
    'Code': "country_code",
    'Year': "year",
    'harmonized_test_scores__sex_all_students': "test_scores",
    'ny_gdp_pcap_pp_kd': "gdp_pc",
    'population_historical': "population",
    'wb_region': 'wb_region',
}
avg_outcome = avg_outcome.rename(columns=new_names)

In [48]:
# Persist the renamed learning-outcomes table to the working directory. The row
# index is written too (the to_csv default, spelled out here).
avg_outcome.to_csv(path_or_buf="avg_outcome.csv", index=True)