In [50]:
import pandas as pd
import unicodedata


In [55]:
# Data-loading functions
# Wrapped in functions so they can be tested later, and so callers stay unchanged if the data source changes
def get_enrollment_data(path: str = "enrollments.xlsx") -> pd.DataFrame:
    """Load the enrollment workbook into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the Excel file. Defaults to ``"enrollments.xlsx"`` in the
        current working directory, which is what the notebook originally read.

    Returns
    -------
    pd.DataFrame
        The first sheet of the workbook, exactly as parsed by ``pd.read_excel``.
    """
    # read_excel already returns a DataFrame for a single sheet, so no re-wrapping is needed.
    return pd.read_excel(path)

def get_gdp_data(path: str = "Countries_GDP_1960-2020.csv") -> pd.DataFrame:
    """Load the GDP CSV into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``"Countries_GDP_1960-2020.csv"``
        in the current working directory, which is what the notebook originally read.

    Returns
    -------
    pd.DataFrame
        The file contents exactly as parsed by ``pd.read_csv``.
    """
    # read_csv already returns a DataFrame, so no re-wrapping is needed.
    return pd.read_csv(path)

In [28]:
#  Validate Schema
def validate_enrollment_schema(data: pd.DataFrame) -> bool:
    """Check that the frame's column names are exactly the expected enrollment columns.

    The comparison is set-based: column order is ignored, but any missing or
    extra column makes the check fail. If the source file's layout changes,
    the expected set below has to be updated to match.
    """
    expected = {
        "country", "countrycode", "region", "incomegroup",
        "iau_id", "iau_id1", "eng_name", "orig_name",
        "foundedyr", "yrclosed", "private01",
        "coordinates", "latitude", "longitude",
        "phd_granting", "m_granting", "b_granting",
        "divisions", "total_fields", "unique_fields",
        "specialized", "merger", "noiau", "year",
        "students5_interpolated",
        "students5_extrapolated",
        "students5_estimated",
    }
    return set(data.columns) == expected

def validate_gdp_schema(data: pd.DataFrame) -> bool:
    """Check that the GDP frame contains the identifying columns.

    Only ``"Country Name"`` and ``"Country Code"`` are required. The year
    columns grow as new data is published, so they are deliberately not
    validated and this function does not need updating every year.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to validate.

    Returns
    -------
    bool
        True when every required column is present, False otherwise.
    """
    required_columns = {"Country Name", "Country Code"}
    # Subset test instead of counting matches: counting gives the wrong answer
    # when a label is duplicated (two "Country Name" columns would be counted
    # as two hits even though "Country Code" is absent).
    return required_columns.issubset(data.columns)


In [90]:
# Enrollment Transformations

def enrollment_convert_types(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the enrollment frame with numeric and string dtypes applied.

    Steps:
      1. Clean ``latitude``/``longitude`` text (keep digits, ``.`` and ``-``) and
         parse it to numbers.
      2. Fill missing values in every numeric column with 0.
      3. Cast numeric columns to ``int32``/``float32`` and text columns to ``string``.

    Assumption carried over from the original exercise: missing numeric values
    are replaced with 0. This should be confirmed with stakeholders before the
    data is used for anything beyond the exercise.

    Parameters
    ----------
    df : pd.DataFrame
        Raw enrollment data. It is not modified; the function works on a copy
        so the cell can be re-run safely.

    Returns
    -------
    pd.DataFrame
        New frame with converted dtypes. Columns not listed below (for example
        ``year``, ``yrclosed``, ``phd_granting``, ``specialized``) keep their
        incoming dtype.

    Raises
    ------
    ValueError
        If a value cannot be parsed or cast to its target numeric type.
    """
    numeric_columns = {
        'private01': 'int32',
        'm_granting': 'int32',
        'b_granting': 'int32',
        'divisions': 'int32',
        'total_fields': 'int32',
        'unique_fields': 'int32',
        'merger': 'int32',
        'noiau': 'int32',
        'students5_interpolated': 'int32',
        'students5_extrapolated': 'int32',
        'students5_estimated': 'int32',
        "foundedyr": 'int32',
        'latitude': 'float32',
        'longitude': 'float32'
    }
    # NOTE(review): 'phd_granting' and 'specialized' are not converted although
    # the sibling '*_granting' flags are -- confirm whether that is intentional.
    string_columns = {
        'country': 'string',
        'countrycode': 'string',
        'region': 'string',
        'incomegroup': 'string',
        'iau_id': 'string',
        'iau_id1': 'string',
        'eng_name': 'string',
        'orig_name': 'string',
        'coordinates': 'string'
    }

    # Work on a copy: the original implementation wrote the cleaned columns
    # back into the caller's frame, so re-running the cell changed its input.
    df = df.copy()

    # Strip non-numeric characters from the coordinates before parsing.
    # The minus sign is kept so southern/western coordinates stay negative.
    # NOTE(review): removing every other character also removes separators; if
    # the source uses e.g. a decimal comma or degree/minute notation the parsed
    # magnitude will be wrong -- inspect the raw values to confirm the format.
    for coordinate in ('latitude', 'longitude'):
        if pd.api.types.is_numeric_dtype(df[coordinate]):
            # Already numeric (e.g. the frame was converted before): nothing to clean.
            continue
        cleaned = df[coordinate].str.replace(r'[^\d.-]', '', regex=True)
        # Parse to numbers now so the NA fill below operates on a numeric
        # column instead of a mix of strings and integers.
        df[coordinate] = pd.to_numeric(cleaned)

    fill_na_column_names = list(numeric_columns)
    df[fill_na_column_names] = df[fill_na_column_names].fillna(0)

    df = df.astype(numeric_columns)
    df = df.astype(string_columns)
    return df


In [56]:
# Load the raw enrollment data once; the conversion cell below takes this frame as input.
df_enrollment = get_enrollment_data()

In [91]:
# Convert dtypes, then inspect the resulting schema and a sample of longitudes.
test_df = enrollment_convert_types(df_enrollment)
# option_context restores the previous display setting afterwards, even if
# info() raises; the limit is sized to the frame rather than hard-coded.
with pd.option_context("display.max_info_columns", test_df.shape[1]):
    test_df.info(verbose=True)
# Fixed seed so the displayed sample is the same on every re-run.
test_df['longitude'].sample(10, random_state=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161560 entries, 0 to 161559
Data columns (total 27 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   country                 string 
 1   countrycode             string 
 2   region                  string 
 3   incomegroup             string 
 4   iau_id                  string 
 5   iau_id1                 string 
 6   eng_name                string 
 7   orig_name               string 
 8   foundedyr               int32  
 9   yrclosed                float64
 10  private01               int32  
 11  coordinates             string 
 12  latitude                float32
 13  longitude               float32
 14  phd_granting            float64
 15  m_granting              int32  
 16  b_granting              int32  
 17  divisions               int32  
 18  total_fields            int32  
 19  unique_fields           int32  
 20  specialized             float64
 21  merger                  int32  
 

62431     1.160861e+09
124383    3.251453e+08
149451    0.000000e+00
96454     7.952479e+08
29267     1.174323e+08
103814    1.225840e+08
48461     1.658758e+06
738       0.000000e+00
85923     1.034518e+09
28588     0.000000e+00
Name: longitude, dtype: float32