# Project Census WFH   
The goal of this workbook is to compare the efficacy of machine learning models based on different levels of spatial consolidation, based on the [Australian Statistical Geography Standard](https://www.abs.gov.au/websitedbs/D3310114.nsf/home/Australian+Statistical+Geography+Standard+(ASGS)) framework. The key questions are:   

- Can we find a predictive model with an R2 value > 0.7? (Based on [prior work](https://github.com/blkemp/ABS-Region-Data/tree/BKSubmission), I suspect the answer is yes).
- Using this model, what is the impact on accuracy for feeding in data that is consolidated at a different level (e.g. neighborhood vs city vs county)?
- How do models trained at differing levels of granularity compare in both baseline accuracy and generalisability? I.e. Are models trained with the most fine grained data more accurate, or are they prone to overfitting?

As a starting point I will be utilising the [Australian Bureau of Statistics 2016 Census Datapacks](https://datapacks.censusdata.abs.gov.au/datapacks/) and attempting to predict "working from home" behaviours by region. Why this particular response vector? a) It just seems interesting and b) I suspect that demographic information available within the census itself (gender, age, profession and industry) will all be strongly related to both individuals' propensity to undertake working from home and their ability to do so with support from employers.

In [1]:
# Import statements
# Declare Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import os
from textwrap import wrap
import operator

# Set a variable for current notebook's path for various loading/saving mechanisms
nb_path = os.getcwd()

In [2]:
def feature_plot_h(model, X_train, n_features):
    '''
    Takes a trained model and outputs a horizontal bar chart showing the "importance" of the
    most impactful n features, together with their cumulative importance.
    
    INPUTS
    model = Trained model in sklearn with  variable ".feature_importances_". Trained supervised learning model.
    X_train = Pandas Dataframe object. Feature set the training was completed using.
    n_features = Int. Top n features you would like to plot. Capped at the number of
                 features the model exposes.
    
    RAISES
    ValueError if fewer than one feature would be plotted.
    '''
    importances = np.asarray(model.feature_importances_)
    
    # Never request more bars than there are features; otherwise the bar positions
    # and the bar values end up with different lengths.
    n_features = min(int(n_features), len(importances))
    if n_features < 1:
        raise ValueError("n_features must be at least 1")
    
    # Identify the n most important features (largest importance first)
    indices = np.argsort(importances)[::-1]
    columns = X_train.columns.values[indices[:n_features]]
    values = importances[indices][:n_features]
    
    # Wrap long feature names so the tick labels stay readable
    columns = ['\n'.join(wrap(c, 20)) for c in columns]
    positions = np.arange(n_features)
    
    # Create the plot; the figure height grows with the number of bars
    plt.figure(figsize = (9, n_features))
    plt.title("Normalized Weights for {} Most Predictive Features".format(n_features), fontsize = 16)
    plt.barh(positions, values, height = 0.6, align = "center", color = '#00A000',
             label = "Feature Weight")
    plt.barh(positions - 0.3, np.cumsum(values), height = 0.2, align = "center", color = '#00A0A0',
             label = "Cumulative Feature Weight")
    plt.yticks(positions, columns)
    plt.xlabel("Weight", fontsize = 12)
    
    plt.legend(loc = 'upper right')
    
    # Put the most important feature at the top of the chart
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

In [3]:
def feature_impact_plot(model, X_train, n_features, y_label):
    '''
    Takes a trained model and training dataset and simulates shifting each of the top n features
    to show its relationship to the response vector (i.e. how a change in the feature changes
    the prediction). Draws one subplot per feature showing the relative change in the median
    prediction; the changes for min, 1Q, 3Q and max are computed as well but not plotted.
    
    INPUTS
    model = Trained model in sklearn with  variable ".feature_importances_". Trained supervised learning model.
    X_train = Pandas Dataframe object. Feature set the training was completed using.
    n_features = Int. Top n features you would like to plot.
    y_label = String. Description of response variable for axis labelling.
    '''
    # Imported locally because only this function needs the tick formatter
    from matplotlib.ticker import FuncFormatter
    
    # Pick the n most important features (largest importance first)
    indices = np.argsort(model.feature_importances_)[::-1]
    columns = X_train.columns.values[indices[:n_features]]
    
    quantile_points = [0, 25, 50, 75, 100]
    
    # The baseline does not depend on the feature being shifted, so predict it once
    base_pred = model.predict(X_train)
    base_percentiles = np.percentile(base_pred, quantile_points)
    
    sim_var = []
    
    for col in columns:
        # Fresh copy per feature so only one column is ever shifted at a time
        df_copy = X_train.copy()
        
        spread = X_train[col].std()
        # A constant (or all-NaN) column would give np.arange a zero/NaN step
        if not spread > 0:
            continue
        
        # Shift the feature from -1 to +1 standard deviations in steps of std/50
        for val in np.arange(-spread, spread, spread / 50):
            df_copy[col] = X_train[col] + val
            predictions = model.predict(df_copy)
            
            # Relative change of each percentile versus the baseline prediction
            shifted_percentiles = np.percentile(predictions, quantile_points)
            relative_change = (shifted_percentiles - base_percentiles) / base_percentiles
            sim_var.append([val, col] + relative_change.tolist())
    
    # Create a dataframe based off the rows collected above
    df_predictions = pd.DataFrame(sim_var, columns = ['Value', 'Feature'] + quantile_points)
    
    # Create a subplot grid based on the number of features actually plotted
    num_cols = 2
    subplot_rows = max(1, -(-len(columns) // num_cols))
    # squeeze=False keeps axs two-dimensional even when there is a single row
    fig, axs = plt.subplots(nrows = subplot_rows, ncols = num_cols, sharey = True,
                            squeeze = False, figsize = (15, 5 * subplot_rows))
    
    nlines = 1
    percent_formatter = FuncFormatter(lambda tick, _pos: '{:,.2%}'.format(tick))
    
    # Plot the feature variance impacts
    for i, ax in enumerate(axs.flat):
        # If there is a "spare" plot, hide the axis so it simply shows as an empty space
        if i >= len(columns):
            ax.axis('off')
            continue
        
        feature_rows = df_predictions[df_predictions['Feature'] == columns[i]]
        ax.plot(feature_rows['Value'], feature_rows[50])
        
        ax.set_title("\n".join(wrap(columns[i], int(100 / num_cols))))
        
        # Track the tallest title so spacing can allow for really long chart titles
        nlines = max(nlines, ax.get_title().count('\n'))
        
        ax.set_xlabel('Simulated +/- change to feature')
        
        # Format the y-axis as %. A formatter (rather than fixed tick labels) keeps
        # the labels correct when the shared axis rescales for later subplots.
        if i % num_cols == 0:
            ax.yaxis.set_major_formatter(percent_formatter)
            ax.set_ylabel('% change to {}'.format(y_label))
    
    # Apply spacing between subplots in case of very big headers
    fig.subplots_adjust(hspace = 0.5 * nlines)
    
    # Show the plot
    plt.tight_layout()
    plt.show()

In [4]:
def sort_series_abs(S):
    '''
    Takes a pandas Series object and returns the series sorted by absolute value,
    largest magnitude first. The input series itself is left untouched.
    '''
    # Work on a one-column frame so the helper column travels with the values
    frame = pd.DataFrame(S)
    magnitudes = frame.iloc[:, 0].abs()
    ranked = frame.assign(abs=magnitudes).sort_values('abs', ascending=False)
    return ranked.iloc[:, 0]

## Begin importing and exploring data

In [5]:
# Import metadata sheets
# Build the workbook path with os.path.join so it resolves on any operating system
# (the previous '\Data\Metadata\...' literal only worked with Windows separators).
metadata_workbook = os.path.join(nb_path, 'Data', 'Metadata', 'Metadata_2016_GCP_DataPack.xlsx')

# One row per DataPack table: number, name and population
df_meta_tables = pd.read_excel(metadata_workbook,
                               sheet_name = 'Table number, name, population',
                               skiprows=9)
# One row per measure (cell descriptor) within the DataPack tables
df_meta_measures = pd.read_excel(metadata_workbook,
                                 sheet_name = 'Cell descriptors information',
                                 skiprows=10)

In [6]:
# Preview the last rows of the table-level metadata
df_meta_tables.tail()

Unnamed: 0,Table number,Table name,Table population
54,G55,Total Family Income (Weekly) by Labour Force S...,Couple families with children
55,G56,Total Family Income (Weekly) by Labour Force S...,One parent families
56,G57,Occupation by Age by Sex,Employed persons aged 15 years and over
57,G58,Occupation by Hours Worked by Sex,Employed persons aged 15 years and over
58,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over


In [7]:
# Derive the table number from the first three characters of the DataPack file
# code (e.g. 'G10A' -> 'G10') so measures can be joined to the table metadata
df_meta_measures['Table number'] = df_meta_measures['DataPack file'].str[:3]

In [8]:
# Attach the table-level metadata columns to every measure, matching on 'Table number'
# (pd.merge default join, so measures without a matching table row are dropped)
df_meta_measures = pd.merge(df_meta_measures, df_meta_tables, on='Table number')

In [9]:
# Check the (rows, columns) size of the merged metadata
df_meta_measures.shape

(15535, 9)

In [10]:
# Come up with some heuristics on what fields to drop and what to keep
# E.g. do you really want Age splits? If so, then you need to filter out any "Total" Lines for them
# if not then you need to filter out any lines with a metadata Long cell descriptor including the word "[A/a]ge"
# Similar for splits by sex, whether you choose to delete the categories of "Persons" or only keep this field

# Come up with a function to show the levels of data available within a table, i.e. age, sex, occupation, etc.
# Build off this to create a function to filter for these levels as desired

In [11]:
import re

In [12]:
def lower_all_except_first_char(str_item):
    '''
    Rewrites every occurrence of str_item in the 'Long' column of the global
    df_meta_measures dataframe so that only its first character keeps its case
    and the remainder is lower-cased (e.g. 'Total_Responses' -> 'Total_responses').
    
    INPUTS
    str_item = String. Exact text to search for within the 'Long' descriptors.
    
    Modifies df_meta_measures in place and returns nothing.
    '''
    str_replace = '{}{}'.format(str_item[:1], str_item[1:].lower())
    # regex=False: str_item is matched literally, independent of the pandas
    # version's default for the regex argument.
    df_meta_measures['Long'] = df_meta_measures['Long'].str.replace(str_item, str_replace, regex=False)

In [13]:
# Normalise the upper-case 'YEARS' token in the long descriptors to 'years'
df_meta_measures['Long'] = df_meta_measures['Long'].str.replace('YEARS', 'years')

In [14]:
# I think I should go back through this and remove the "Total" lines
# Also sort the list by len in order to ensure substrings of other strings requiring change are done last
#
# Multi-word phrases found in the 'Long' descriptors that should be treated as a
# single category: everything after the first character of each phrase is lower-cased.
replace_capitalisation_list = ['Not_stated',
                               'None',
                               'No_children',
                               'Aboriginal_and_or_Torres_Strait_Islander',
                               'Both_Aboriginal_and_Torres_Strait_Islander',
                               'Torres_Strait_Islander',
                               'Non_Indigenous',
                               'New_Zealand',
                               'South_Africa',
                               'Sri_Lanka',
                               'Bosnia_and_Herzegovina',
                               'China_excludes_SARs_and_Taiwan',
                               'Hong_Kong_SAR_of_China',
                               'Korea_Republic_of_South',
                               'Northern_Ireland',
                               'Papua_New_Guinea',
                               'South_Eastern_Europe',
                               'Total_Responses',
                               'Census_Night',
                               'Elsewhere_in_Australia',
                               'Birthplace_Australia',
                               'Birthplace_Elsewhere',
                               'Other_Language',
                               'Age_of_Persons',
                               'Count_of_Persons',
                               'Average_number_of_Persons_per_bedroom',
                               'Visitor_from_Different_SA2',
                               'Visitor_from_Same_Statistical_Area_Level_2_SA2',
                               'Western_Australia',
                               'Northern_Territory',
                               'Australian_Capital_Territory',
                               'New_South_Wales',
                               'Other_Territories',
                               'China_excl_SARs_and_Taiwan',
                               'United_Kingdom_Channel_Islands_and_Isle_of_Man',
                               'The_Former_Yugoslav_Republic_of_Macedonia',
                               'United_States_of_America',
                               'Year_of_arrival_Before',
                               'Speaks_English',
                               'speaks_English',
                               'Proficiency_in_English',
                               'Proficiency_in_english',
                               '_Before_2000',
                               'Total_Year_of_arrival_not_stated',
                               'Australian_Indigenous_Languages',
                               'Chinese_Languages_Cantonese',
                               'Chinese_Languages_Mandarin',
                               'Chinese_Languages_Other',
                               'Chinese_languages_Other',
                               'Chinese_Languages_Total',
                               'Indo_Aryan_Languages_Bengali',
                               'Indo_Aryan_Languages_Hindi',
                               'Indo_Aryan_Languages_Punjabi',
                               'Indo_Aryan_Languages_Sinhalese',
                               'Indo_Aryan_Languages_Urdu',
                               'Indo_Aryan_Languages_Other',
                               'Indo_Aryan_Languages_Total',
                               'Persian_excluding_Dari',
                               'Southeast_Asian_Austronesian_Languages_Filipino',
                               'Southeast_Asian_Austronesian_Languages_Indonesian',
                               'Southeast_Asian_Austronesian_Languages_Tagalog',
                               'Southeast_Asian_Austronesian_Languages_Other',
                               'Southeast_Asian_Austronesian_Languages_Total',
                               'Christianity_Anglican',
                               'Christianity_Assyrian_Apostolic',
                               'Christianity_Baptist',
                               'Christianity_Brethren',
                               'Christianity_Catholic',
                               'Christianity_Churches_of_Christ',
                               'Christianity_Eastern_Orthodox',
                               'Christianity_Jehovahs_Witnesses',
                               'Christianity_Latter_day_Saints',
                               'Christianity_Lutheran',
                               'Christianity_Oriental_Orthodox',
                               'Christianity_Other_Protestant',
                               'Christianity_Pentecostal',
                               'Christianity_Presbyterian_and_Reformed',
                               'Christianity_Salvation_Army',
                               'Christianity_Seventh_day_Adventist',
                               'Christianity_Uniting_Church',
                               'Christianity_Christianity_nfd',
                               'Christianity_Other_Christian',
                               'Christianity_Total',
                               'Other_Religions_Australian_Aboriginal_Traditional_Religions',
                               'Other_Religions_Sikhism',
                               'Other_Religions_Other',
                               'Other_Religions_Total',
                               'Secular_Beliefs_and_Other_Spiritual_Beliefs_and_No_Religious_Affiliation_No_Religion_So_Described',
                               'Secular_Beliefs_and_Other_Spiritual_Beliefs_and_No_Religious_Affiliation_Secular_Beliefs',
                               'Secular_Beliefs_and_Other_Spiritual_Beliefs_and_No_Religious_Affiliation_Other_Spiritual_Beliefs',
                               'Secular_Beliefs_and_Other_Spiritual_Beliefs_and_No_Religious_Affiliation_Total',
                               'Infants_Primary',
                               'Other_Non_Government',
                               'Technical_or_Further_Educational_institution',
                               'Full_Part_time',
                               'University_or_other_Tertiary_Institution',
                               'Males_Negative_Nil_income', #maybe look at this whole section
                               'Females_Negative_Nil_income', #maybe look at this whole section
                               'Persons_Negative_Nil_income', #maybe look at this whole section
                               'Males_Personal_income_not_stated',#maybe look at this whole section
                               'Females_Personal_income_not_stated',#maybe look at this whole section
                               'Persons_Personal_income_not_stated',#maybe look at this whole section
                               'Cared_for_Own',
                               'Cared_for_Other',
                               'Visitor_from_within_Australia',
                               'born_in_Australia',
                               'Real_Estate_Agent',
                               'with_Children',
                               'with__No_children',
                               'Flat_or_Apartment',
                               'Graduate_Diploma_and_Graduate_Certificate_Level_Graduate_Diploma_Level', 
                               'Advanced_Diploma_and_Diploma_Level_Advanced_Diploma_and_Associate_Degree_Level',
                               'Advanced_Diploma_and_Diploma_Advanced_Diploma_and_Associate_Degree_Level',
                               'Certificate_Level_Certificate_III_and_IV_Level',
                               'Certificate_Level_Certificate_I_and_II_Level',
                               'Certificate_Level_Certificate_Level_nfd',
                               'Graduate_Diploma_and_Graduate_Certificate_Level',
                               'Advanced_Diploma_and_Diploma_Level_Diploma_Level',
                               'Occupation_Inadequately_described',
                               'Postgraduate_Degree_Level',
                               'Master_Degree_Level',
                               'Doctoral_Degree_Level',
                               'Certificate_I_and_II_Level',
                               'Certificate_III_and_IV_Level',
                               'Advanced_Diploma_and_Diploma_Level',
                               'Graduate_Certificate_Level',
                               'Bachelor_Degree_Level',
                               'Certificate_Level',
                               'Different_SA2',
                               'Same_Statistical_Area_Level_2',
                               'Lone_Parent',
                               'Worked_Full_Time',
                               'Worked_Part_Time',
                               'Away_From_Work',
                               'Age_Of_Dependent_Children',
                               '4_Years',
                               '9_Years',
                               '2_Years',
                               '7_Years',
                               '0_Years',
                               'Hours_Worked',
                               'Labour_Force_Status_Not_Stated',
                               'Not_In_The_Labour_Force',
                               'Labour_Force',
                               'Looking_For_Full_Time_Work',
                               'Looking_For_Part_Time_Work',
                               'Natural_and_Physical_Sciences',
                               'Information_Technology',
                               'Engineering_and_Related_Technologies',
                               'Architecture_and_Building',
                               'Agriculture_Environmental_and_Related_Studies',
                               'Management_and_Commerce',
                               'Society_and_Culture',
                               'Creative_Arts',
                               'Food_Hospitality_and_Personal_Services',
                               'Mixed_Field_Programmes',
                               'Male_Parent',
                               'Looking_For',
                               'Hours_Worked_Not_Stated',
                               'Inadequately_described_Not_stated',
                               'Number_of_hours_worked_None',
                               'Number_of_hours_worked_Not_stated',
                               'Dependent_children_In_Couple_Families',
                               'Negative_Nil',
                               'Never_Married'
                              ]


# maybe have a rule for splitting directly after Males_/Females_/Persons_?

In [15]:
# Process the longest phrases first so that a shorter phrase which is a substring
# of a longer one does not rewrite part of it before the longer phrase is handled
replace_capitalisation_list.sort(key = len, reverse=True)
for correction in replace_capitalisation_list:
    # Lower-cases everything after the first character of each matched phrase
    lower_all_except_first_char(correction)

In [16]:
# Inspect the last 20 measures to eyeball the corrected 'Long' descriptors
df_meta_measures.tail(20)

Unnamed: 0,Sequential,Short,Long,DataPack file,Profile table,Column heading description in profile,Table number,Table name,Table population
15515,G15516,Three_met_Bs_2_ot_met_ex_tr_F,Three_methods_Bus_and_two_other_methods_exclud...,G59,G59,Females,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15516,G15517,Three_met_Bs_2_ot_met_ex_tr_P,Three_methods_Bus_and_two_other_methods_exclud...,G59,G59,Persons,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15517,G15518,Three_meth_Othr_three_meth_M,Three_methods_Other_three_methods_Males,G59,G59,Males,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15518,G15519,Three_meth_Othr_three_meth_F,Three_methods_Other_three_methods_Females,G59,G59,Females,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15519,G15520,Three_meth_Othr_three_meth_P,Three_methods_Other_three_methods_Persons,G59,G59,Persons,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15520,G15521,Three_meth_Tot_three_meth_M,Three_methods_Total_three_methods_Males,G59,G59,Males,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15521,G15522,Three_meth_Tot_three_meth_F,Three_methods_Total_three_methods_Females,G59,G59,Females,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15522,G15523,Three_meth_Tot_three_meth_P,Three_methods_Total_three_methods_Persons,G59,G59,Persons,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15523,G15524,Worked_home_M,Worked_at_home_Males,G59,G59,Males,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over
15524,G15525,Worked_home_F,Worked_at_home_Females,G59,G59,Females,G59,Method of Travel to Work by Sex,Employed persons aged 15 years and over


In [17]:
# replace any words following the word "and" with a lowercase version
# Collect every distinct '_and_<word>' fragment (first occurrence per descriptor)
replace_ands = set()
for cats in df_meta_measures.Long.unique():
    # Skip missing / non-text descriptors explicitly rather than swallowing errors
    if not isinstance(cats, str):
        continue
    match = re.search(r'(?<=_and_)\w+', cats)
    if match is None:
        continue
    # \w also matches underscores, so keep only the first word after '_and_'
    word = match.group(0).split('_')[0]
    replace_ands.add('_and_{}'.format(word))

# Lower-case each collected fragment wherever it appears (literal match, not regex)
for repl in replace_ands:
    df_meta_measures['Long'] = df_meta_measures['Long'].str.replace(repl, repl.lower(), regex=False)

In [18]:
# replace any words following the word "Occupation" with a lowercase version
# Collect every distinct 'ccupation_<word>' fragment (first occurrence per descriptor);
# the leading 'O'/'o' is left out so both capitalisations are covered
replace_occs = set()
for cats in df_meta_measures.Long.unique():
    # Skip missing / non-text descriptors explicitly rather than swallowing errors
    if not isinstance(cats, str):
        continue
    match = re.search(r'(?<=ccupation_)\w+', cats)
    if match is None:
        continue
    # \w also matches underscores, so keep only the first word after 'ccupation_'
    word = match.group(0).split('_')[0]
    replace_occs.add('ccupation_{}'.format(word))

# Lower-case each collected fragment wherever it appears (literal match, not regex)
for repl in replace_occs:
    df_meta_measures['Long'] = df_meta_measures['Long'].str.replace(repl, repl.lower(), regex=False)

In [19]:
# Spot-check a single measure to see how its 'Long' descriptor looks after the corrections
df_meta_measures[df_meta_measures['Short'] == 'China_exc_SARs_Taiw_Bef_1946']

Unnamed: 0,Sequential,Short,Long,DataPack file,Profile table,Column heading description in profile,Table number,Table name,Table population
2846,G2847,China_exc_SARs_Taiw_Bef_1946,China_excl_sars_and_taiwan_Year_of_arrival_bef...,G10A,G10a,Before 1946,G10,Country of Birth of Person by Year of Arrival ...,Persons born overseas


In [20]:
# Split every long descriptor into fragments that each begin at a capital letter,
# giving one list of category fragments per measure
measure_cats = [re.findall('[A-Z][^A-Z]*', category)
                for category in df_meta_measures.Long.tolist()]

In [21]:
#export list to csv for framework to build reference table
import csv

# Writes out.csv relative to the current working directory, one row per entry of
# measure_cats; newline="" is the csv-module convention so the writer controls line endings
with open("out.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(measure_cats)

In [14]:
# Import SA2 level data 
    # (start modelling at this level as a nice mid-point [plus it's the level I worked with in the previous project])

In [22]:
# Generalise this importation method to allow easy imports based on folder name (for SA level) 
# and a list of datapack files you want to amalgamate into a single dataframe
def load_census_csv(table_list, statistical_area_code):
    '''
    Loads one or more census DataPack csv files for a given statistical area level
    and merges them into a single dataframe.
    
    Files are read from
    <nb_path>/Data/<statistical_area_code>/AUST/2016Census_<table>_AUS_<statistical_area_code>.csv
    
    INPUTS
    table_list = Iterable of strings. DataPack file codes to load, e.g. ['G59', 'G53A'].
    statistical_area_code = String. Statistical area level folder/suffix, e.g. 'SA2'.
    
    OUTPUT
    Pandas Dataframe with the tables merged on the first column of the first table
    (pd.merge default join, so only regions present in every table are kept).
    
    RAISES
    ValueError if table_list is empty.
    '''
    df = None
    for table in table_list:
        #
        # Probably want to put in a function here to check if the table is in the "DataPack File" group
        # or just a raw table name. If not in the list, replace with the datapack names and loop through these
        # if not in any list, print an error and continue.
        #
        # os.path.join keeps the path valid on every operating system
        csv_path = os.path.join(nb_path,
                                'Data',
                                statistical_area_code,
                                'AUST',
                                '2016Census_{}_AUS_{}.csv'.format(table, statistical_area_code))
        table_df = pd.read_csv(csv_path, engine='python')
        
        if df is None:
            # The first table becomes the base that later tables are merged onto
            df = table_df
        else:
            merge_col = df.columns[0]
            df = pd.merge(df, table_df, on=merge_col)
    
    if df is None:
        raise ValueError('table_list must contain at least one table name')
    
    return df

In [23]:
# Load DataPack files G59 (Method of Travel to Work by Sex) and G53A at SA2 level,
# merged into one dataframe
df_base_data = load_census_csv(['G59','G53A'],'SA2')

In [5]:
# Create new "Work From Home Participation Rate" vector to ensure consistency across regions
# Base this off population who worked from home divided by total working population in the region
# NOTE(review): this cell is still a placeholder - it only echoes a column name and
# does not compute the rate yet. TODO: confirm 'Worked_home_P' is the intended numerator.
'Worked_home_P'

In [None]:
# Investigate correlations to check out items which stand out as potential drivers
# TODO: replace the placeholder with the name of the response column in df_base_data
response_vector = 'INSERTVECTORHERE'
# Drop regions with no response value, correlate every column against the response,
# then list the strongest correlations by magnitude (position 0 is skipped because
# it is the response correlated with itself)
sort_series_abs(df_base_data.dropna(subset=[response_vector]).corr().loc[:,response_vector])[1:50]

### Train model

In [6]:
# Create X & y

In [None]:
# Split the 'features' and 'response' vectors into training and testing sets
# (80% train / 20% test, fixed seed so the split is reproducible)
# NOTE(review): X and y are not assigned in any visible cell yet - the "Create X & y"
# cell above is empty - so this will fail until they are defined.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
# Base regressor; fixed seed for reproducible forests
rf = RandomForestRegressor(random_state=42)

# Hyper-parameter grid to search (max_depth is currently left out of the search)
parameters = {'n_estimators':[10,20,40,80],
              #'max_depth':[4,8,16,32,64],
              'min_samples_leaf':[1,2,4]
             }

# Score candidate models by R2
scorer = make_scorer(r2_score)

# Grid search over the parameter grid using 'scorer' as the scoring method
# (no cv argument is passed, so GridSearchCV's default cross-validation is used)
grid_obj = GridSearchCV(rf, param_grid=parameters, scoring=scorer, verbose = 2)

# Fit the grid search object to the training data to find the best parameter combination
grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator selected by the search
best_rf = grid_fit.best_estimator_

# Make predictions on the held-out test set using the selected model
y_pred = best_rf.predict(X_test)

# Display the selected estimator and its parameters
best_rf

In [None]:
# Scatter of test-set predictions (x-axis) against actual values (y-axis)
plt.scatter(y_pred, y_test)
plt.xlabel('Predictions')
plt.ylabel('Actuals')

In [None]:
# Plot the 5 most important features. feature_plot_h reads
# model.feature_importances_ itself, so it must be given the fitted model,
# not the importances array.
feature_plot_h(best_rf, X_train, 5)

In [None]:
# R2 on the held-out test set. r2_score expects (y_true, y_pred) and is not
# symmetric, so the actual values must come first.
r2_score(y_test, y_pred)

## Space for initial thoughts