In [92]:
import pandas as pd
from datetime import datetime


In [55]:
# Data-loading functions
# Wrapped in functions so they can be tested later, and so callers stay valid if the data source changes
def get_enrollment_data(path: str = "enrollments.xlsx") -> pd.DataFrame:
    """Load the enrollment workbook into a DataFrame.

    Args:
        path: Location of the Excel file. Defaults to ``"enrollments.xlsx"``
            in the current working directory, so existing zero-argument
            calls keep working.

    Returns:
        The frame produced by ``pd.read_excel`` for that file.
    """
    # read_excel already yields a DataFrame here; no extra wrapping is needed.
    return pd.read_excel(path)

def get_gdp_data(path: str = "Countries_GDP_1960-2020.csv") -> pd.DataFrame:
    """Load the GDP CSV into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to
            ``"Countries_GDP_1960-2020.csv"`` in the current working
            directory, so existing zero-argument calls keep working.

    Returns:
        The frame produced by ``pd.read_csv`` for that file.
    """
    # read_csv already yields a DataFrame; no extra wrapping is needed.
    return pd.read_csv(path)

In [28]:
#  Validate Schema
def validate_enrollment_schema(data: pd.DataFrame) -> bool:
    """Check that the frame's column names are exactly the expected set.

    The comparison ignores column order. Any missing or additional column
    name makes the check fail, so this list must be kept in sync with the
    source file whenever its layout changes.

    Args:
        data: Enrollment frame to inspect.

    Returns:
        True when the set of column names equals the expected set, else False.
    """
    expected_columns = {
        "country", "countrycode", "region", "incomegroup",
        "iau_id", "iau_id1", "eng_name", "orig_name",
        "foundedyr", "yrclosed", "private01",
        "coordinates", "latitude", "longitude",
        "phd_granting", "m_granting", "b_granting",
        "divisions", "total_fields", "unique_fields",
        "specialized", "merger", "noiau", "year",
        "students5_interpolated", "students5_extrapolated",
        "students5_estimated",
    }
    return set(data.columns) == expected_columns

def validate_gdp_schema(data: pd.DataFrame) -> bool:
    """Check that the GDP frame carries the identifying columns.

    Only "Country Name" and "Country Code" are required. The year columns
    are deliberately not checked because they grow as new data is added,
    and this function should not need updating every year.

    Args:
        data: GDP frame to inspect.

    Returns:
        True when every required column name is present, else False.
    """
    required_columns = {"Country Name", "Country Code"}
    # A subset test counts each required name once, so a duplicated column
    # label can neither stand in for a missing one nor cause a false failure.
    return required_columns.issubset(data.columns)


In [119]:
# Enrollment Transformations
def enrollment_convert_types(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the enrollment frame with explicit column dtypes.

    Missing values in the integer-typed columns are replaced with 0 before
    casting to ``int32``. The text columns are cast to the pandas ``string``
    dtype. Columns not listed below are returned as they came in.

    The input frame is not modified.

    Assumption: treating a missing value as 0 is acceptable for these
    columns (including the 0/1 flag columns). This has not been confirmed
    and should be raised with stakeholders before the data is relied on.

    Args:
        df: Raw enrollment frame containing at least the columns listed below.

    Returns:
        A new DataFrame with the same columns in the same order and the
        dtypes described above.
    """
    numeric_columns = {
        'private01': 'int32',
        'm_granting': 'int32',
        'b_granting': 'int32',
        'divisions': 'int32',
        'total_fields': 'int32',
        'unique_fields': 'int32',
        'merger': 'int32',
        'noiau': 'int32',
        'students5_interpolated': 'int32',
        'students5_extrapolated': 'int32',
        'students5_estimated': 'int32',
        "foundedyr": 'int32',
        'year': 'int32',
    }
    # TODO: latitude and longitude are not typed here; that work was left out
    # to stay within the time limit of the exercise.
    string_columns = {
        'country': 'string',
        'countrycode': 'string',
        'region': 'string',
        'incomegroup': 'string',
        'iau_id': 'string',
        'iau_id1': 'string',
        'eng_name': 'string',
        'orig_name': 'string',
        'coordinates': 'string'
    }

    # fillna with a per-column mapping returns a new frame, so the caller's
    # raw data keeps its missing values and this function can be re-run safely.
    filled = df.fillna({column: 0 for column in numeric_columns})

    # The two mappings cover disjoint columns, so one cast is equivalent to
    # casting the numeric and string columns in separate steps.
    return filled.astype({**numeric_columns, **string_columns})


In [131]:
# GDP Transformation
def transform_GDP(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the wide GDP table into one row per country and year.

    Every column other than "Country Name" and "Country Code" is unpivoted:
    its header becomes the "Year" value and its cell becomes the "GDP"
    value. The long layout makes the join to the enrollment data easier.

    Args:
        df: Wide GDP frame with "Country Name", "Country Code" and columns
            whose headers can be cast to integers.

    Returns:
        A long frame with columns Country Name, Country Code, Year, GDP,
        where the two country columns are ``string`` and Year is ``int32``.
    """
    key_columns = ['Country Name', 'Country Code']
    long_frame = df.melt(id_vars=key_columns, var_name="Year", value_name="GDP")

    target_dtypes = {name: 'string' for name in key_columns}
    target_dtypes['Year'] = 'int32'
    return long_frame.astype(target_dtypes)


In [138]:
def merge_data(gdp: pd.DataFrame, enrollment: pd.DataFrame) -> pd.DataFrame:
    """Attach GDP figures to the enrollment rows.

    Performs a left join from enrollment to GDP, matching enrollment
    (year, countrycode) against GDP (Year, Country Code). Every enrollment
    row is kept; rows without a GDP match get missing values in the GDP
    columns.

    Args:
        gdp: Long-format GDP frame with "Year" and "Country Code" columns.
        enrollment: Enrollment frame with "year" and "countrycode" columns.

    Returns:
        The enrollment columns followed by the GDP columns.
    """
    enrollment_keys = ['year', 'countrycode']
    gdp_keys = ['Year', 'Country Code']
    return pd.merge(
        enrollment,
        gdp,
        how='left',
        left_on=enrollment_keys,
        right_on=gdp_keys,
    )

In [128]:
# Load the raw GDP and enrollment frames from their source files.
df_gdp = get_gdp_data()
df_enrollment = get_enrollment_data()

In [129]:
# Reshape GDP to one row per country/year and cast the enrollment columns
# to their target dtypes.
df_gdp_clean = transform_GDP(df_gdp)
df_enrollment_clean = enrollment_convert_types(df_enrollment)

In [139]:
# Left-join GDP onto the enrollment rows, then spot-check 10 random rows.
df_final = merge_data(df_gdp_clean, df_enrollment_clean)
df_final.sample(10)

Unnamed: 0,country,countrycode,region,incomegroup,iau_id,iau_id1,eng_name,orig_name,foundedyr,yrclosed,...,merger,noiau,year,students5_interpolated,students5_extrapolated,students5_estimated,Country Name,Country Code,Year,GDP
89358,mexico,MEX,Latin America and Caribbean,Upper middle income,IAU-023672-023874,IAU-023672-023874-1,Calmecac University\nCoatepec Branch,Plantel Coatepec,2001,,...,0,0,2020,0,0,3630,Mexico,MEX,2020.0,1070000000000.0
29002,china,CHN,East Asia and Pacific,Upper middle income,IAU-026420,IAU-026420-1,Anyang Normal University,Anyang Shifan Xueyuan,1908,,...,0,0,1990,0,2788,2788,China,CHN,1990.0,361000000000.0
116987,russian federation,RUS,Europe and Central Asia,Upper middle income,IAU-014079,IAU-014079-4,"Moscow State Agro-Engineering University, V. P...",,1930,,...,1,0,1955,0,0,0,,,,
144158,united states,USA,North America,High income,IAU-006794,IAU-006794-1,Grinnell College,Grinnell College,1846,,...,0,0,1955,0,1126,1126,,,,
159720,venezuela,VEN,Latin America and Caribbean,Upper middle income,IAU-018106,IAU-018106-4,Antonio Jos�� The Sucre National Experimental ...,Universidad Nacional Experimental Polit��cnica...,1962,,...,1,0,2020,0,5797,0,,,,
36221,dominican republic,DOM,Latin America and Caribbean,Upper middle income,IAU-013665,IAU-013665-1,Pontifical Catholic University Mother And Master,Pontificia Universidad Cat��lica Madre y Maest...,1962,,...,0,0,1990,0,10790,10790,Dominican Republic,DOM,1990.0,7073676000.0
23645,china,CHN,East Asia and Pacific,Upper middle income,IAU-009930,IAU-009930-1,Jilin University Of Finance And Economics,Jilin Caijing Daxue,1946,,...,0,0,1965,0,0,918,China,CHN,1965.0,70436270000.0
68763,japan,JPN,East Asia and Pacific,High income,IAU-006945,IAU-006945-1,Search Oh University,Hakuoh Daigaku,1915,,...,0,0,1970,0,1137,1137,Japan,JPN,1970.0,213000000000.0
52611,india,IND,South Asia,Lower middle income,IAU-007784,IAU-007784-1,"Indian Institute Of Technology, Madras","Indian Institute of Technology, Chennai (IIT M...",1959,,...,0,0,1990,2386,2386,2386,India,IND,1990.0,321000000000.0
9633,brazil,BRA,Latin America and Caribbean,Upper middle income,IAU-005278,IAU-005278-1,Faculty Of Medicine Of Santa Casa De S��O Paulo,Faculdade de Ci��ncias M��dicas da Santa Casa ...,1963,,...,0,0,1985,0,0,1889,,,,


In [133]:
# Apply the info() column limit only for this call. The context manager puts
# back whatever value was set before (not the library default), and does so
# even if info() raises.
with pd.option_context("display.max_info_columns", 30):
    df_final.info(verbose=True)
# Known issue: Year is float64 in the merged frame because the left join
# leaves it missing for enrollment rows with no GDP match. Left as-is for now
# to stay within the time limit.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161560 entries, 0 to 161559
Data columns (total 31 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   country                 string 
 1   countrycode             string 
 2   region                  string 
 3   incomegroup             string 
 4   iau_id                  string 
 5   iau_id1                 string 
 6   eng_name                string 
 7   orig_name               string 
 8   foundedyr               int32  
 9   yrclosed                float64
 10  private01               int32  
 11  coordinates             string 
 12  latitude                object 
 13  longitude               object 
 14  phd_granting            float64
 15  m_granting              int32  
 16  b_granting              int32  
 17  divisions               int32  
 18  total_fields            int32  
 19  unique_fields           int32  
 20  specialized             float64
 21  merger                  int32  
 