In [92]:
import pandas as pd
from datetime import datetime


In [55]:
# Data-loading functions
# Wrapped in functions so they can be tested later, and so the calling cells stay valid if the data source changes
def get_enrollment_data(path: str = "enrollments.xlsx") -> pd.DataFrame:
    """Load the enrollment workbook into a DataFrame.

    Args:
        path: Location of the Excel file. Defaults to ``enrollments.xlsx``
            in the current working directory, so existing zero-argument
            calls keep working.

    Returns:
        The first sheet of the workbook, as returned by ``pd.read_excel``.
    """
    # read_excel already returns a DataFrame, so no re-wrapping is needed.
    return pd.read_excel(path)

def get_gdp_data(path: str = "Countries_GDP_1960-2020.csv") -> pd.DataFrame:
    """Load the GDP CSV file into a DataFrame.

    Args:
        path: Location of the CSV file. Defaults to
            ``Countries_GDP_1960-2020.csv`` in the current working
            directory, so existing zero-argument calls keep working.

    Returns:
        The parsed file, as returned by ``pd.read_csv``.
    """
    # read_csv already returns a DataFrame, so no re-wrapping is needed.
    return pd.read_csv(path)

In [28]:
#  Validate Schema
def validate_enrollment_schema(data: pd.DataFrame) -> bool:
    """Check that the enrollment frame has exactly the expected columns.

    The comparison ignores column order but is otherwise strict: a missing
    column or an unexpected extra column both make the check fail. If the
    source file's layout changes, the expected names below must be updated.

    Args:
        data: The enrollment DataFrame to check.

    Returns:
        True when the set of column names equals the expected set,
        False otherwise.
    """
    expected_names = {
        # location / classification
        "country", "countrycode", "region", "incomegroup",
        # institution identity
        "iau_id", "iau_id1", "eng_name", "orig_name",
        "foundedyr", "yrclosed", "private01",
        # position
        "coordinates", "latitude", "longitude",
        # degrees and fields
        "phd_granting", "m_granting", "b_granting",
        "divisions", "total_fields", "unique_fields", "specialized",
        # bookkeeping flags
        "merger", "noiau",
        # yearly student figures
        "year",
        "students5_interpolated", "students5_extrapolated", "students5_estimated",
    }
    return expected_names == set(data.columns)

def validate_gdp_schema(data: pd.DataFrame) -> bool:
    """Check that the GDP frame contains the required identifier columns.

    Only "Country Name" and "Country Code" are required. The remaining
    columns are per-year values whose names change as more data is added,
    so they are deliberately not validated here.

    Args:
        data: The GDP DataFrame to check.

    Returns:
        True when every required column is present, False otherwise.
    """
    required_columns = {"Country Name", "Country Code"}
    # A subset test (rather than counting matches) stays correct when a
    # column label is duplicated: two "Country Name" columns must not make
    # up for a missing "Country Code".
    return required_columns.issubset(data.columns)


In [90]:
# Enrollment Transformations
def enrollment_convert_types(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the enrollment data with explicit column dtypes.

    Steps, in order:
      1. Clean ``latitude`` / ``longitude``: strip every character that is
         not a digit or a dot, keeping the sign of values that start with
         a minus.
      2. Fill missing values in all numeric columns with 0.
      3. Cast the numeric columns to int32 / float32 and the text columns
         to the pandas ``string`` dtype.

    ASSUMPTION (carried over from the original author): replacing missing
    values with 0 is acceptable for this exercise. It should be confirmed
    with stakeholders before the data is used for real decisions.

    Args:
        df: Enrollment data containing at least the columns listed in
            ``numeric_columns`` and ``string_columns`` below. It is not
            modified.

    Returns:
        A new DataFrame with the converted columns; columns not listed
        below keep their original dtype.

    Raises:
        ValueError: If a value cannot be converted to its target dtype
            (for example a coordinate with no digits in it).
    """
    # NOTE(review): 'phd_granting' is not converted although 'm_granting'
    # and 'b_granting' are - confirm whether that omission is intentional.
    numeric_columns = {
        'private01': 'int32',
        'm_granting': 'int32',
        'b_granting': 'int32',
        'divisions': 'int32',
        'total_fields': 'int32',
        'unique_fields': 'int32',
        'merger': 'int32',
        'noiau': 'int32',
        'students5_interpolated': 'int32',
        'students5_extrapolated': 'int32',
        'students5_estimated': 'int32',
        "foundedyr": 'int32',
        'latitude': 'float32',
        'longitude': 'float32'
    }
    string_columns = {
        'country': 'string',
        'countrycode': 'string',
        'region': 'string',
        'incomegroup': 'string',
        'iau_id': 'string',
        'iau_id1': 'string',
        'eng_name': 'string',
        'orig_name': 'string',
        'coordinates': 'string'
    }

    # Work on a copy so the caller's frame is left untouched and the
    # function gives the same result when re-run on the same input.
    df = df.copy()

    # Strip out non-numeric characters from the coordinates before casting.
    for column in ('latitude', 'longitude'):
        raw = df[column]
        if pd.api.types.is_numeric_dtype(raw):
            # Already numeric: nothing to strip, and .str would raise.
            continue
        text = raw.str.strip()
        # Remember the sign first: the clean-up regex below removes '-'.
        # NOTE(review): hemisphere letters (N/S/E/W) are still discarded;
        # confirm the source never encodes the sign that way.
        is_negative = text.str.startswith('-', na=False)
        digits = text.str.replace(r'[^\d.]', '', regex=True)
        # .str methods yield NA for cells that are not strings (e.g. real
        # floats in a mixed column); fall back to the original value there
        # so genuine numbers are not zero-filled later.
        magnitude = digits.where(digits.notna(), raw).astype('float64')
        df[column] = magnitude.mask(is_negative, -magnitude)

    fill_na_column_names = list(numeric_columns)
    df[fill_na_column_names] = df[fill_na_column_names].fillna(0)

    df = df.astype(numeric_columns)
    df = df.astype(string_columns)
    return df


In [97]:
# GDP Transformation
def transform_GDP(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the GDP table from one column per year to one row per year.

    Every column other than "Country Name" and "Country Code" is un-pivoted
    into a ("Year", "GDP") pair, which makes the result easier to join to
    the enrollment data.

    Args:
        df: GDP data with "Country Name", "Country Code" and one column
            per year.

    Returns:
        A long-format DataFrame with columns "Country Name",
        "Country Code", "Year" and "GDP"; the first three are cast to the
        pandas ``string`` dtype.
    """
    id_columns = ['Country Name', 'Country Code']
    text_dtypes = dict.fromkeys(id_columns + ['Year'], 'string')
    return (
        df
        .melt(id_vars=id_columns, var_name="Year", value_name="GDP")
        .astype(text_dtypes)
    )


In [95]:
# Load the raw GDP table once so later cells can reuse it without re-reading the file
df_gdp = get_gdp_data()

In [98]:
# Un-pivot the GDP table and inspect the schema of the result
test_df = transform_GDP(df_gdp)
# option_context puts the previous setting back on exit, even if info() raises,
# instead of resetting the option to the library default afterwards
with pd.option_context("display.max_info_columns", 26):
    test_df.info(verbose=True)
# Last expression of the cell: show a random sample of rows as the cell output
test_df.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7320 entries, 0 to 7319
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  7320 non-null   string 
 1   Country Code  7320 non-null   string 
 2   Year          7320 non-null   string 
 3   GDP           7319 non-null   float64
dtypes: float64(1), string(3)
memory usage: 228.9 KB


Unnamed: 0,Country Name,Country Code,Year,GDP
1310,Turkey,TUR,1970,17086960000.0
3141,"Congo, Rep.",COG,1986,1849268000.0
6603,Austria,AUT,2015,382000000000.0
3822,Chad,TCD,1991,1877138000.0
1663,East Asia & Pacific (IDA & IBRD countries),TEA,1973,195000000000.0
901,Sri Lanka,LKA,1967,1859465000.0
2116,Nicaragua,NIC,1977,2239857000.0
4248,IDA & IBRD total,IBT,1995,5300000000000.0
1131,IDA only,IDX,1969,45818570000.0
5691,IDA only,IDX,2007,699000000000.0
