## Import Libraries

In [11]:
import pandas as pd
from IPython.display import display

## Read csv file to dataframe.

In [12]:
# Load the raw export into a dataframe.
# The final 5 lines of the file are footer text rather than records, so they
# are dropped; skipfooter is not supported by the C parser, hence engine='python'.
df = pd.read_csv(
    "Education_Data_Raw.csv",
    engine='python',
    skipfooter=5,
)

## Process Data

In [13]:
# Rename the year columns to "Value<year>", where <year> is the first
# whitespace-separated token of the original header. The first four columns
# keep their names. The shared "Value" prefix is the stub that
# pd.wide_to_long uses further down to find the year columns.
id_headers = list(df.columns[:4])
year_headers = ["Value" + header.split()[0] for header in df.columns[4:]]
df.columns = id_headers + year_headers

# Make every year column numeric; entries that cannot be parsed as numbers
# become NaN (errors='coerce').
yr_cols = df.columns[4:]
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in yr_cols}
)

# Fill each missing year value with the mean of that row's available year
# values. Only the year columns are touched, so a gap in one of the identifier
# columns is never overwritten with a number. A row with no data in any year
# has a NaN mean and therefore stays NaN.
row_means = df[yr_cols].mean(axis=1)
# Series.fillna aligns row_means on the row index, so every column receives
# the mean belonging to the same row.
df[yr_cols] = df[yr_cols].apply(lambda year_col: year_col.fillna(row_means))

# Go from wide to long: the "Value<year>" columns collapse into a single
# "Value" column, with the year suffix moved into a new "Year" column.
# NOTE(review): df.columns[0] is presumably the series-name column that the
# later pivot on "Series Name" relies on; confirm against the raw file's
# header order.
long_keys = ["Country Name", df.columns[0]]
df = pd.wide_to_long(df, ["Value"], i=long_keys, j="Year").reset_index()

# Spread the series back out: one row per (country, year), one column per
# distinct "Series Name", cells taken from "Value". The row keys are then
# turned back into ordinary columns.
df = (
    df.pivot(index=["Country Name", "Year"], columns="Series Name", values="Value")
    .reset_index()
)

# Attach a 0-based running key, one value per row.
df['Education_Key'] = range(len(df))

# Reorder the columns by position so they line up with the names assigned in
# the rename step that follows.
# NOTE(review): the order is purely positional; it assumes the frame has
# exactly 15 columns in the order the pivot produced. Verify if the set of
# input series ever changes.
column_order = [0, 1, 14, 5, 6, 8, 9, 7, 10, 12, 13, 11, 4, 2, 3]
cols = df.columns[column_order]
df = df[cols]

# Rename columns. The names are applied positionally, so this list must stay
# in the same order as the column rearrangement done just before it.
cols_names = [
    "Country_Ref",
    "Year_Ref",
    "Education_Key",
    "Public_Education_Spending",
    "School_Enrollment_Primary_%Gross",
    "School_Enrollment_Primary_Female_%Gross",
    "School_Enrollment_Primary_Male_%Gross",
    "School_Enrollment_Primary_%Net",
    "School_Enrollment_Secondary_%Gross",
    "School_Enrollment_Secondary_Female_%Gross",
    "School_Enrollment_Secondary_Male_%Gross",
    "School_Enrollment_Secondary_%Net",
    "Primary_Completion_Rate",
    "Primary_Completion_Rate_Female",
    "Primary_Completion_Rate_Male",
]
df.columns = cols_names

# Shorten the country label. The result is assigned back to the column rather
# than calling replace(..., inplace=True) on df['Country_Ref']: that chained
# in-place form acts on a temporary Series, triggers a warning in recent
# pandas, and leaves df unchanged once Copy-on-Write is enabled.
df['Country_Ref'] = df['Country_Ref'].replace({"United States": "USA"})
df.tail()

Unnamed: 0,Country_Ref,Year_Ref,Education_Key,Public_Education_Spending,School_Enrollment_Primary_%Gross,School_Enrollment_Primary_Female_%Gross,School_Enrollment_Primary_Male_%Gross,School_Enrollment_Primary_%Net,School_Enrollment_Secondary_%Gross,School_Enrollment_Secondary_Female_%Gross,School_Enrollment_Secondary_Male_%Gross,School_Enrollment_Secondary_%Net,Primary_Completion_Rate,Primary_Completion_Rate_Female,Primary_Completion_Rate_Male
139,USA,2016,139,,101.362862,101.276413,101.445778,95.07325,98.769928,98.474709,99.052834,92.18317,99.804649,98.662,100.897433
140,USA,2017,140,,101.821442,101.421997,102.20446,94.62633,98.952339,98.695557,99.198517,92.45022,98.832199,97.166031,100.429466
141,USA,2018,141,,101.256561,101.196373,101.314239,93.669332,99.275581,98.796059,99.735657,90.224416,100.092697,95.476341,104.510193
142,USA,2019,142,,100.9813,101.343742,100.634079,93.669332,100.063431,99.189453,100.902123,90.224416,100.489052,103.343628,97.75264
143,USA,2020,143,,100.840614,100.671481,101.002396,93.669332,97.859668,97.887492,97.834641,90.224416,99.804649,98.662,100.897433


## Write data to csv file.

In [14]:
# Save the full processed table (reference columns included); the row index
# is not written to the file.
processed_path = 'Education_Processed_Table.csv'
df.to_csv(processed_path, index=False)

## Write database file (Education_Key and measures only) to csv.

In [15]:
# Build the table for the database: everything except the two reference
# columns, i.e. the key plus the measures.
reference_columns = ["Country_Ref", "Year_Ref"]
final_table = df.drop(reference_columns, axis=1)

# Preview the first rows, then write the table with a header row and without
# the row index.
display(final_table.head())
final_table.to_csv("Education_Processed_DB.csv", header=True, index=False)

Unnamed: 0,Education_Key,Public_Education_Spending,School_Enrollment_Primary_%Gross,School_Enrollment_Primary_Female_%Gross,School_Enrollment_Primary_Male_%Gross,School_Enrollment_Primary_%Net,School_Enrollment_Secondary_%Gross,School_Enrollment_Secondary_Female_%Gross,School_Enrollment_Secondary_Male_%Gross,School_Enrollment_Secondary_%Net,Primary_Completion_Rate,Primary_Completion_Rate_Female,Primary_Completion_Rate_Male
0,0,,114.813164,114.884911,114.74427,94.96596,87.017761,84.940758,89.01384,74.68435,93.355029,93.321095,93.387371
1,1,,113.279572,113.342087,113.219544,95.07125,86.167748,84.596413,87.67733,74.05262,93.355029,93.321095,93.387371
2,2,,112.388763,112.373993,112.402939,94.89489,85.635277,84.543098,86.684174,73.1655,93.355029,93.321095,93.387371
3,3,,111.008499,110.739891,111.266388,94.08623,85.012779,84.330597,85.667778,72.09962,93.355029,93.321095,93.387371
4,4,,107.1726,106.611,107.711777,93.52178,83.461884,83.292412,83.62458,72.78282,93.355029,93.321095,93.387371
