# Census Data Cleaning
Keanna Knebel

---

In [380]:
# import packages
import pandas as pd
import re

In [381]:
def clean_census(file, year):
    
    column_names= ['Variable', 'Arbutus-Ridge', 'Downtown', 'Dunbar-Southlands',
               'Fairview', 'Grandview-Woodland', 'Hastings-Sunrise',
               'Kensington-Cedar Cottage', 'Kerrisdale', 'Killarney', 'Kitsilano',
               'Marpole', 'Mount Pleasant', 'Oakridge', 'Renfrew-Collingwood',
               'Riley Park', 'Shaughnessy', 'South Cambie', 'Strathcona',
               'Sunset', 'Victoria-Fraserview', 'West End', 'West Point Grey',
               'Vancouver CSD', 'Vancouver CMA']
   
    # read in csv file as dataframe
    df = pd.read_csv(file, encoding='latin-1', skiprows=4)
    
    # remove 'ID' column if present
    df.drop(columns='ID', inplace=True, errors='ignore')
    
    # rename columns 
    df.set_axis(column_names, axis=1, inplace=True)
    
    # remove empty rows
    df.dropna(0, 'all', inplace=True)
    
    # remove leading whitespace from variables
    df.Variable = df.Variable.apply(lambda x: (x.lstrip()).rstrip())
    df.drop(df[df.Variable.str.contains('20%.*data', flags=re.IGNORECASE) ].index , inplace=True)
    
    # convert all data to strings except NaN values
    df.iloc[:,1:25] = df.iloc[:,1:25].applymap(lambda x: str(x) if x == x else x)
    
    # convert data to float
    df.iloc[:,1:25] = df.iloc[:,1:25].applymap(lambda x: re.sub("[-]", "0", x) if x == x else x)
    df.iloc[:,1:25] = df.iloc[:,1:25].applymap(lambda x: float(re.sub("[,$]", "", x)) if x == x else x)
    
    # divide the census datasets into subgroups
    sub_dict = create_subgroup_dict(df, year)
    
    # clean the datatables by topics
    census_dict = clean_age(sub_dict, year)
    census_dict = clean_marital_status(census_dict, year)
    census_dict = clean_private_households(census_dict, year)
    census_dict = clean_couple_fam_structure(census_dict, year)
    census_dict = clean_language_detailed(census_dict, year)
    census_dict = clean_official_language(census_dict, year)
    census_dict = clean_structural_dwelling_type(census_dict, year)
    census_dict = clean_household_size(census_dict, year)
    census_dict = clean_lone_parent(census_dict, year)
    census_dict = clean_immigration_age(census_dict, year)
    census_dict = clean_immigration_period(census_dict, year)
    
    #### INCOMPLETE HELPER FUNCTIONS ####
    #census_dict = clean_household_type(census_dict, year)
    #census_dict = clean_census_family_children(census_dict, year)
    #census_dict = clean_aboriginal(census_dict, year)
    #census_dict = clean_citizenship(census_dict, year)
    #census_dict = clean_worker_class(census_dict, year)
    #census_dict = clean_education(census_dict, year)
    #census_dict = clean_ethnic_origin(census_dict, year)
    #census_dict = clean_time_worked(census_dict, year)
    #census_dict = clean_generation_status(census_dict, year)
    #census_dict = clean_household_char(census_dict, year)
    #census_dict = clean_birth_place(census_dict, year)
    #census_dict = clean_household_income(census_dict, year)
    #census_dict = clean_individual_income(census_dict, year)
    #census_dict = clean_industry(census_dict, year)
    #census_dict = clean_labour_force_status(census_dict, year)
    #census_dict = clean_commute_time(census_dict, year)
    #census_dict = clean_mobility(census_dict, year)
    #census_dict = clean_transport_mode(census_dict, year)
    #census_dict = clean_occupation(census_dict, year)
    #census_dict = clean_workplace_status(census_dict, year)
    #census_dict = clean_religion(census_dict, year)
    #census_dict = clean_shelter_cost(census_dict, year)
    #census_dict = clean_visible_minority(census_dict, year)
    #census_dict = clean_work_activity(census_dict, year)
        
    return census_dict

In [382]:
def create_subgroup_dict(df, year):
    
    # separate dataframe by 'Variables' containing regex expressions:
    if year == 2001:
        re1 = ['total.*by', 
               'population.*by', 
               'common-law couples',
               '^Male', 
               '^Female', 
               'total - male', 
               'total - female']
        
    elif year == 2006:
        re1 = ['total.*by', 
               'population.*by',
               'common-law couples',
               '^Male[s\s,]', 
               '^Female[s\s,]',
              'total - mobility',
              'Average number of children']
        
    elif year == 2011:
        df.drop(index=201 , inplace=True)
        re1 = ['total.*by', 
               'population.*by', 
               'common-law couples',
               'males',
               'Total population excluding institutional residents',  
               'Total.*in private households'] 
    elif year == 2016:
        re1 = ['^total', 'population.*by', 'males']
    
    subgroup = list(df[df.Variable.str.contains('|'.join(re1), flags=re.IGNORECASE)].index)
    subgroup.append(len(df.Variable)+1)
    subgroup = subgroup[1:]
    
    # create census dictionary of sub datasets
    # initialize variables for the lookup dictionary
    start = 0
    census_dict = {}
    
    for s in subgroup:
        sub_df = df.loc[start:s-1]

        # transpose dataframe and rename column
        sub_df = sub_df.set_index('Variable').T.reset_index().rename(columns={'index': 'LocalArea'})

        # clean up names and store dataframes into the dictionary
        census_dict[df.Variable[start].rstrip().lstrip()] = sub_df
        start = s
        
    return census_dict


In [383]:
def clean_age(census_dict, year):
    
    if year == 2001:
        column_names = ['LocalArea', 'Type', 'Total', '0 to 4 years', 
                   '5 to 9 years', '10 to 14 years', '15 to 19 years',
                   '20 to 24 years', '25 to 29 years', '30 to 34 years', 
                   '35 to 39 years', '40 to 44 years', '45 to 49 years', 
                   '50 to 54 years', '55 to 59 years', '60 to 64 years', 
                   '65 to 69 years', '70 to 74 years', '75 to 79 years', 
                   '80 to 84 years', '85 to 89 years', '90 to 94 years',
                   '95 to 99 years', '100 years and over']

        male = census_dict['Male']
        female = census_dict['Female']
    
        female.insert(1, 'Type', 'female')
        female.set_axis(column_names, axis=1, inplace=True)
        male.insert(1, 'Type', 'male')
        male.set_axis(column_names, axis=1, inplace=True)
    
        merged = pd.concat([female, male])
        merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
        total = merged.groupby('LocalArea').sum()
        total['Type'] = 'total'
        total.reset_index(inplace=True)
        merged = pd.concat([merged, total])
    
    else:
        if year == 2006:
            
            column_names = ['LocalArea', 'Type', 'Total', '0 to 4 years', 
                       '5 to 9 years', '10 to 14 years', '15 to 19 years',
                       '20 to 24 years', '25 to 29 years', '30 to 34 years', 
                       '35 to 39 years', '40 to 44 years', '45 to 49 years', 
                       '50 to 54 years', '55 to 59 years', '60 to 64 years', 
                        '65 to 69 years', '70 to 74 years', '75 to 79 years', 
                       '80 to 84 years', '85 to 89 years', '90 to 94 years',
                       '95 to 99 years', '100 years and over', 'Median Age']
            
            total = census_dict['Male & Female, Total']
            male = census_dict['Male, Total']
            female = census_dict['Female, Total']
        
        elif year == 2011:
            
            column_names = ['LocalArea', 'Type', 'Total', '0 to 4 years', 
                       '5 to 9 years', '10 to 14 years', '15 to 19 years',
                       '15 years', '16 years', '17 years', '18 years', '19 years',
                       '20 to 24 years', '25 to 29 years', '30 to 34 years', 
                       '35 to 39 years', '40 to 44 years', '45 to 49 years', 
                       '50 to 54 years', '55 to 59 years', '60 to 64 years', 
                       '65 to 69 years', '70 to 74 years', '75 to 79 years', 
                       '80 to 84 years', '85 years and over', 'Median age',
                       '% of the population aged 15 and over']
        
            total = census_dict['Total population by age groups']
            male = census_dict['Males, total']
            female = census_dict['Females, total']

        elif year == 2016:
            
            column_names = ['LocalArea', 'Type', 'Total', '0 to 14 years', 
                       '0 to 4 years', '5 to 9 years', '10 to 14 years',
                       '15 to 64 years', '15 to 19 years', '20 to 24 years', 
                       '25 to 29 years', '30 to 34 years', '35 to 39 years', 
                       '40 to 44 years', '45 to 49 years', '50 to 54 years', 
                       '55 to 59 years', '60 to 64 years', '65 years and over', 
                       '65 to 69 years', '70 to 74 years', '75 to 79 years', 
                       '80 to 84 years', '85 years and over','85 to 89 years', 
                       '90 to 94 years', '95 to 99 years', '100 years and over']
        
            total = census_dict['Total - Age groups and average age of the population - 100% data']
            male = census_dict['Total - Age groups and average age of males - 100% data']
            female = census_dict['Total - Age groups and average age of females - 100% data']
         
        female.insert(1, 'Type', 'female')
        female.set_axis(column_names, axis=1, inplace=True)
        male.insert(1, 'Type', 'male')
        male.set_axis(column_names, axis=1, inplace=True)
        total.insert(1, 'Type', 'total')
        total.set_axis(column_names, axis=1, inplace=True)
    
        merged = pd.concat([female, male, total])
    
    merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
    census_dict['population by age and sex'] = merged
    merged.to_csv('../../data/processed/census_' + str(year) + '/population_age_sex.csv')
    
    return census_dict

In [384]:
def clean_marital_status(census_dict, year):
    
    if year in [2001, 2006]:
        column_names = ['LocalArea', 'Total population 15 years and over',
                        'Single (never legally married)',
                       'Married', 'Separated', 'Divorced',
                       'Widowed', 'total x', 'Not living common law',
                       'Living common law']
        
        columns_ordered = ['LocalArea', 'Total population 15 years and over',
                       'Married or living with a or common-law partner',
                       'Married', 'Living common law',
                       'Not living with a married spouse or common-law partner',
                       'Single (never legally married)', 'Separated', 'Divorced',
                       'Widowed']
        
        df1 = census_dict['Total population 15 years and over by legal marital status']
        df2 = census_dict['Total population 15 years and over by common-law status']
        
        merged = pd.merge(df1, df2, on=['LocalArea'])
        merged.set_axis(column_names, axis=1, inplace=True)

        merged['Married or living with a or common-law partner'] = merged['Married'] + merged['Living common law']
        merged['Not living with a married spouse or common-law partner'] = merged['Total population 15 years and over'] - merged['Married or living with a or common-law partner']
        merged = merged[columns_ordered]
        
    
    else:
        if year == 2011:
            total = census_dict['Total population 15 years and over by marital status']
            male = census_dict['Males 15 years and over by marital status']
            female = census_dict['Females 15 years and over by marital status']
        elif year == 2016:
            total = census_dict['Total - Marital status for the population aged 15 years and over - 100% data']
            male = census_dict['Total - Marital status for males aged 15 years and over - 100% data']
            female = census_dict['Total - Marital status for females aged 15 years and over - 100% data']
            
        column_names = ['LocalArea', 'Type', 'Total population 15 years and over',
                       'Married or living with a or common-law partner',
                       'Married', 'Living common law',
                       'Not living with a married spouse or common-law partner',
                       'Single (never legally married)', 'Separated', 'Divorced',
                       'Widowed']
        
        female.insert(1, 'Type', 'female')
        female.set_axis(column_names, axis=1, inplace=True)
        male.insert(1, 'Type', 'male')
        male.set_axis(column_names, axis=1, inplace=True)
        total.insert(1, 'Type', 'total')
        total.set_axis(column_names, axis=1, inplace=True)
    
        merged = pd.concat([female, male, total])
        merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
        
    census_dict['marital status'] = merged   
    merged.to_csv('../../data/processed/census_' + str(year) + '/marital_status.csv')
    return census_dict

In [385]:
def clean_private_households(census_dict, year):
    if year == 2011:
        column_names = ['LocalArea', 'Type', 'Persons in private households',
                   'Persons not in census families', 'Living with relatives',
                   'Living with non-relatives only', 'Living alone',
                   'Number of census family persons']
        
        total = census_dict['Total number of persons in private households']
        male = census_dict['Number of males in private households']
        female = census_dict['Number of females in private households']
        total_65 = census_dict['Total number of persons aged 65 years and over in private households']
        male_65 = census_dict['Number of males aged 65 years and over in private households']
        female_65 = census_dict['Number of females aged 65 years and over in private households']
        
        female.insert(1, 'Type', 'female')
        female.set_axis(column_names, axis=1, inplace=True)
        male.insert(1, 'Type', 'male')
        male.set_axis(column_names, axis=1, inplace=True)
        total.insert(1, 'Type', 'total')
        total.set_axis(column_names, axis=1, inplace=True)
        
        female_65.insert(1, 'Type', '65+ female')
        female_65.set_axis(column_names, axis=1, inplace=True)
        male_65.insert(1, 'Type', '65+ male')
        male_65.set_axis(column_names, axis=1, inplace=True)
        total_65.insert(1, 'Type', '65+ total')
        total_65.set_axis(column_names, axis=1, inplace=True)
    
        merged = pd.concat([female, male, total, female_65, male_65, total_65])
        merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
        census_dict['private households - individuals'] = merged
        merged.to_csv('../../data/processed/census_' + str(year) + '/private_households.csv')

    return census_dict

In [386]:
def clean_couple_fam_structure(census_dict, year):
    
    column_names = ['LocalArea', 'Type', 'Total', 
                    'Without children at home',
                    'With children at home', '1 child', '2 children',
                    '3 or more children']

    if year == 2016:
        total = census_dict['Total - Couple census families in private households - 100% data']
        total.insert(1, 'Type', 'total couples')
        total.set_axis(column_names, axis=1, inplace=True)
    
        census_dict['couples - family structure'] = total
        total.to_csv('../../data/processed/census_' + str(year) + '/couples_family_structure.csv')
        
    else:
        if year in [2011, 2006]:
            married = census_dict['Total couple families by family structure and number of children'] 
            married = married[['LocalArea', 'Married couples',
                               'Without children at home',
                               'With children at home', '1 child', '2 children',
                               '3 or more children']]
            common_law = census_dict['Common-law couples']
            
        elif year == 2001:
            married = census_dict['Total couple families by family structure']
            married = married[['LocalArea','Married couples', 
                               'Without children at home', 'With children at home',
                               '1 child', '2 children', '3 or more children']]
            common_law = census_dict['Common-law couples']
            
        married.insert(1, 'Type', 'married couples')
        married.set_axis(column_names, axis=1, inplace=True)
        common_law.insert(1, 'Type', 'common-law couples')
        common_law.set_axis(column_names, axis=1, inplace=True)
        
        merged = pd.concat([married, common_law])
        total = merged.groupby('LocalArea').sum()
        total['Type'] = 'total couples'
        total.reset_index(inplace=True)
        merged = pd.concat([merged, total])
        
        merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
        census_dict['couples - family structure'] = merged
        merged.to_csv('../../data/processed/census_' + str(year) + '/couples_family_structure.csv')
    
    return census_dict
        

In [387]:
def clean_language_detailed(census_dict, year):
    
    if year == 2006:
        mt_total = census_dict['Total population by mother tongue']
        home_total = census_dict['Total population by language spoken most often at home']
        home_total = home_total.iloc[:,0:104].copy()
        work_total = census_dict['Total population 15 years and over who worked since January 1, 2005 by language used most often at work']
        
        mt_total.rename(columns={mt_total.columns[1]:'Total'}, inplace=True)
        mt_total.insert(1, 'Type', 'mother tongue - total')
        home_total.rename(columns={home_total.columns[1]:'Total'}, inplace=True)
        home_total.insert(1, 'Type', 'language most often spoken at home - total')
        work_total.rename(columns={work_total.columns[1]:'Total'}, inplace=True)
        work_total.insert(1, 'Type', 'language most often spoken at work - total')
        
        merged = pd.concat([mt_total, home_total, work_total])
    
    elif year == 2001:
        mt_total = census_dict['Total population by mother tongue']
        home_total = census_dict['Total population by home language']
        home_total = home_total.groupby(home_total.columns, axis=1).sum()
        
        mt_total.rename(columns={mt_total.columns[1]:'Total'}, inplace=True)
        mt_total.insert(1, 'Type', 'mother tongue - total')
        home_total.rename(columns={'Total population by home language':'Total'}, inplace=True)
        home_total.insert(1, 'Type', 'language most often spoken at home - total')
        
        merged = pd.concat([mt_total, home_total])
    
    else:
        if year == 2011:
            mt_total = census_dict['Detailed mother tongue - Total population excluding institutional residents']
            mt_male = census_dict['Detailed mother tongue - Males excluding institutional residents']
            mt_female = census_dict['Detailed mother tongue - Females excluding institutional residents']
        
            home_total = census_dict['Detailed language spoken most often at home - Total population excluding institutional residents']
            home_male = census_dict['Detailed language spoken most often at home - Males excluding institutional residents']
            home_female = census_dict['Detailed language spoken most often at home - Females excluding institutional residents']
        
            home2_total = census_dict['Detailed other language spoken regularly at home - Total population excluding institutional residents']
            home2_male = census_dict['Detailed other language spoken regularly at home - Males excluding institutional residents']
            home2_female = census_dict['Detailed other language spoken regularly at home - Females excluding institutional residents']
        
        elif year == 2016:
            mt_total = census_dict['Total - Mother tongue for the total population excluding institutional residents - 100% data']
            mt_male = census_dict['Total - Mother tongue for males excluding institutional residents - 100% data']
            mt_female = census_dict['Total - Mother tongue for females excluding institutional residents - 100% data']
        
            home_total = census_dict['Total - Language spoken most often at home for the total population excluding institutional residents - 100% data']
            home_male = census_dict['Total - Language spoken most often at home for males excluding institutional residents - 100% data']
            home_female = census_dict['Total - Language spoken most often at home for females excluding institutional residents - 100% data']
        
            home2_total = census_dict['Total - Other language(s) spoken regularly at home for the total population excluding institutional residents - 100% data']
            home2_male = census_dict['Total - Other language(s) spoken regularly at home for males excluding institutional residents - 100% data']
            home2_female = census_dict['Total - Other language(s) spoken regularly at home for females excluding institutional residents - 100% data']
        
        mt_female.rename(columns={mt_female.columns[1]:'Total'}, inplace=True)
        mt_female.insert(1, 'Type', 'mother tongue - female')
        mt_male.rename(columns={mt_male.columns[1]:'Total'}, inplace=True)
        mt_male.insert(1, 'Type', 'mother tongue - male')
        mt_total.rename(columns={mt_total.columns[1]:'Total'}, inplace=True)
        mt_total.insert(1, 'Type', 'mother tongue - total')
        
        home_female.rename(columns={home_female.columns[1]:'Total'}, inplace=True)
        home_female.insert(1, 'Type', 'language most often spoken at home - female')
        home_male.rename(columns={home_male.columns[1]:'Total'}, inplace=True)
        home_male.insert(1, 'Type', 'language most often spoken at home - male')
        home_total.rename(columns={home_total.columns[1]:'Total'}, inplace=True)
        home_total.insert(1, 'Type', 'language most often spoken at home - total')
        
        home2_female.rename(columns={home2_female.columns[1]:'Total'}, inplace=True)
        home2_female.insert(1, 'Type', 'other language spoken at home - female')
        home2_male.rename(columns={home2_male.columns[1]:'Total'}, inplace=True)
        home2_male.insert(1, 'Type', 'other language spoken at home - male')
        home2_total.rename(columns={home2_total.columns[1]:'Total'}, inplace=True)
        home2_total.insert(1, 'Type', 'other language spoken at home - total')
        
        merged = pd.concat([mt_female, mt_male, mt_total, home_female, home_male, home_total, home2_female, home2_male, home2_total])
    
    merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
    census_dict['detailed language'] = merged
    merged.to_csv('../../data/processed/census_' + str(year) + '/detailed_language.csv')
        
    return census_dict
    

In [388]:
def clean_official_language(census_dict, year):
    column_names = ['LocalArea', 'Type', 'Total',
                     'English', 'French', 'English and French',
                     'Neither English nor French']
    if year == 2016:
        known = census_dict['Total - Knowledge of official languages for the total population excluding institutional residents - 100% data']
        first = census_dict['Total - First official language spoken for the total population excluding institutional residents - 100% data']
    
    elif year == 2011:
        known = census_dict['Knowledge of official languages - Total population excluding institutional residents']
        first = census_dict['First official language spoken - Total population excluding institutional residents']
    
    elif year in [2001, 2006]:
        known = census_dict['Total population by knowledge of official languages']
        first = census_dict['Total population by first official language spoken']
            
        
    known.insert(1, 'Type', 'knowledge of official languages')
    known.set_axis(column_names, axis=1, inplace=True)
    first.insert(1, 'Type', 'first official language spoken')
    first.set_axis(column_names, axis=1, inplace=True)  
    
    merged = pd.concat([known, first])
    merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
    census_dict['official language'] = merged
    merged.to_csv('../../data/processed/census_' + str(year) + '/official_language.csv')
    
    return census_dict

In [389]:
def clean_structural_dwelling_type(census_dict, year):
          
    if year == 2006:
        column_names = ['LocalArea', 'Total',
       'Single-detached house', 'Semi-detached house', 'Row house',
       'Apartment, duplex',
       'Apartment, building that has five or more storeys']
        
        df = census_dict['Total number of occupied private dwellings by structural type of dwelling']
    
    elif year in [2001, 2011, 2016]:
        column_names = ['LocalArea', 'Total',
       'Single-detached house', 'Semi-detached house', 'Row house',
       'Apartment, detached duplex',
       'Apartment, building that has five or more storeys',
       'Apartment, building that has fewer than five storeys',
       'Other single-attached house', 'Movable dwelling']
        
        if year == 2001:
            df = census_dict['Total number of occupied private dwellings by structural type of dwelling']
            df = df.iloc[:,0:10].copy()
        
        elif year == 2011:
            df = census_dict['Total number of occupied private dwellings by structural type of dwelling']
            df = df[['LocalArea',
                    'Total number of occupied private dwellings by structural type of dwelling',
                    'Single-detached house', 'Semi-detached house', 'Row house',
                    'Apartment, duplex', 
                    'Apartment, building that has five or more storeys', 
                    'Apartment, building that has fewer than five storeys',
                    'Other single-attached house', 'Movable dwelling']].copy()
    
        elif year == 2016:
            df = census_dict['Total - Occupied private dwellings by structural type of dwelling - 100% data']
            df = df[['LocalArea',
            'Total - Occupied private dwellings by structural type of dwelling - 100% data',
           'Single-detached house', 'Semi-detached house', 'Row house',
           'Apartment or flat in a duplex',
           'Apartment in a building that has five or more storeys',
           'Apartment in a building that has fewer than five storeys',
           'Other single-attached house', 'Movable dwelling']].copy()
        
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['structual dwelling type'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/structual_dwelling_type.csv')
    
    return census_dict

In [390]:
def clean_household_size(census_dict, year):
    if year == 2001:
        column_names = ['LocalArea', 'Total households',
       '1 person', '2 persons', '3 persons', '4 to 5 persons',
       '6 or more persons',
       'Average household size']
    
        df = census_dict['Total number of private households by household size']
        
    elif year in [2006, 2011]:
        
        column_names = ['LocalArea', 'Total households',
       '1 person', '2 persons', '3 persons', '4 to 5 persons',
       '6 or more persons', 'Number of persons in private households',
       'Average household size']
    
        df = census_dict['Total number of private households by household size']
        
    elif year == 2016:
        
        column_names = ['LocalArea', 'Total households',
       '1 person', '2 persons', '3 persons', '4 persons', '5 or more persons',
       'Number of persons in private households', 'Average household size']
        
        df = census_dict['Total - Private households by household size - 100% data']
   
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['household size'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/household_size.csv')
    
    return census_dict

In [391]:
def clean_lone_parent(census_dict, year):
    column_names = ['LocalArea', 'Total lone-parent families',
                 'Female parent', 'Male parent',
                 '1 child', '2 children', '3 or more children']
    if year == 2016:
        df1 = census_dict["Total lone-parent families by sex of parent"]
        df2 = census_dict["Total - Lone-parent census families in private households - 100% data"]
        
        df = pd.concat([df1, df2], axis=1)
        df = df.groupby(df.columns, axis=1).first()
        df = df[['LocalArea', 'Total lone-parent families by sex of parent',
                 'Female parent', 'Male parent',
                 '1 child', '2 children', '3 or more children']].copy()
    
    elif year == 2011:
        df = census_dict['Total lone-parent families by sex of parent and number of children']
        df = df.groupby(df.columns, axis=1).sum()
        df = df[['LocalArea', 'Total lone-parent families by sex of parent and number of children',
                  'Female parent', 'Male parent',
                 '1 child', '2 children', '3 or more children']].copy()
    
    elif year == 2006:
        df1 = census_dict['Total lone-parent families by sex of parent and number of children']
        df2 = census_dict['Female parent']
        df2 = df2.iloc[:,1:5].copy()
        df3 = census_dict['Male parent']
        df3 = df3.iloc[:,1:5].copy()
        
        df = pd.concat([df1, df2, df3], axis=1)
        df = df.groupby(df.columns, axis=1).sum()
        df = df[['LocalArea', 'Total lone-parent families by sex of parent and number of children',
                  'Female parent', 'Male parent',
                 '1 child', '2 children', '3 or more children']].copy()
    
    elif year == 2001:
        df1 = census_dict['Total lone-parent families by sex of parent']
        df2 = census_dict['Female parent']
        df2 = df2.iloc[:,1:5].copy()
        df3 = census_dict['Male parent']
        df3 = df3.iloc[:,1:5].copy()
        
        df = pd.concat([df1, df2, df3], axis=1)
        df = df.groupby(df.columns, axis=1).sum()
        df = df[['LocalArea', 'Total lone-parent families by sex of parent',
                  'Female parent', 'Male parent',
                 '1 child', '2 children', '3 or more children']].copy()

    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['lone_parent'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/lone_parent.csv')
        
    return census_dict

In [392]:
def clean_immigration_age(census_dict, year):
    
    if year in [2006, 2016]:
        column_names = ['LocalArea', 
                 'Total immigrant population',
                 'Under 5 years', '5 to 14 years', '15 to 24 years', 
                 '25 to 44 years', '45 years and over']
        
        if year == 2006:
            df = census_dict['Total immigrant population by age at immigration']
            
        elif year == 2016:
            df = census_dict['Total - Age at immigration for the immigrant population in private households - 25% sample data']
    
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 
                 'Total immigrant population',
                 'Under 5 years', '5 to 14 years', '15 to 24 years', 
                 '25 to 44 years', '45 years and over']
        
        df = pd.read_csv('../../data/processed/nhs/Age at immigration.csv', index_col=0)
        df = df[['LocalArea', 'Type', 
                 'Total immigrant population in private households by age at immigration',
                 'Under 5 years', '5 to 14 years', '15 to 24 years', 
                 '25 to 44 years', '45 years and over']].copy()
    
    elif year == 2001:
        column_names = ['LocalArea', 
                 'Total immigrant population',
                 'Under 5 years', '5 to 19 years', '20 years and over']
        df = census_dict['Total immigrant population by age at immigration']
       
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['immigration_age'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/immigration_age.csv')
            
    return census_dict

In [393]:
def clean_immigration_period(census_dict, year):
    
    if year == 2001:
        column_names = ['LocalArea', 
                 'Total immigrant population','Before 1961', 
                        '1961 to 1970', '1971 to 1980', '1981 to 1990', 
                        '1991 to 1995', '1996 to 2001']
        df = census_dict['Total immigrant population by period of immigration']
    
    elif year == 2006:
        column_names = ['LocalArea', 
                 'Total immigrant population',
                 'Before 1961', '1961 to 1970', '1971 to 1980', '1981 to 1990',
                 '1991 to 2000', '1991 to 1995', '1996 to 2000', '2001 to 2006']
        df = census_dict['Total immigrant population by period of immigration']
            
    elif year == 2016:
        column_names = ['LocalArea', 'Total population',
           'Non-immigrants', 'Non-permanent residents', 'Immigrants', 
           'Before 1981', '1981 to 1990', '1991 to 2000', '2001 to 2010', 
           '2001 to 2005', '2006 to 2010','2011 to 2016']
                        
        df = census_dict['Total - Immigrant status and period of immigration for the population in private households - 25% sample data']
        df = df[['LocalArea',
           'Total - Immigrant status and period of immigration for the population in private households - 25% sample data',
           'Non-immigrants', 'Non-permanent residents', 'Immigrants', 'Before 1981', '1981 to 1990',
           '1991 to 2000', '2001 to 2010', '2001 to 2005', '2006 to 2010',
           '2011 to 2016']].copy()
    
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population',
                  'Non-immigrants', 'Non-permanent residents', 'Immigrants',
                 'Before 1971', '1971 to 1980', '1981 to 1990', '1991 to 2000', '2001 to 2005']
        
        df = pd.read_csv('../../data/processed/nhs/Immigrant status and period of immigration.csv', index_col=0)
        df = df[['LocalArea', 'Type', 
                 'Total population in private households by immigrant status and period of immigration',
                  'Non-immigrants', 'Non-permanent residents', 'Immigrants',
                 'Before 1971', '1971 to 1980', '1981 to 1990', '1991 to 2000', '2001 to 2005']].copy()
    
       
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['immigration_period'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'immigration_period.csv')
    
    return census_dict

In [409]:
def clean_visible_minority(census_dict, year):
    
    column_names = ['LocalArea', 'Total population',
                    'Not a visible minority', 'Total visible minority population',
                    'Arab', 'Black', 'Chinese', 'Filipino', 'Japanese', 'Korean',
                    'Latin American', 'West Asian', 'South Asian', 'Southeast Asian', 
                    'Multiple visible minorities', 'Other visible minority']
    
    if year == 2001:
        
        df = census_dict['Total population by visible minority groups']
        df = df[['LocalArea', 'Total population by visible minority groups',
                 'All others', 'Total visible minority population', 
                 'Arab', 'Black', 'Chinese',  'Filipino', 'Japanese',
                 'Korean', 'Latin American', 'West Asian', 'South Asian',
                 'Southeast Asian', 'Multiple visible minorities',
                 'Visible minority, n.i.e.']].copy()
        
    elif year == 2006:
        
        df = census_dict['Total population by visible minority groups']
        df = df[['LocalArea', 'Total population by visible minority groups',
                 'Not a visible minority', 'Total visible minority population',
                 'Arab', 'Black', 'Chinese', 'Filipino', 'Japanese', 'Korean',
                 'Latin American', 'West Asian', 'South Asian', 'Southeast Asian', 
                 'Multiple visible minority', 'Visible minority, n.i.e.']]
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population',
                       'Not a visible minority', 'Total visible minority population',
                       'Arab', 'Black', 'Chinese', 'Filipino', 'Japanese', 'Korean',
                       'Latin American', 'West Asian', 'South Asian', 'Southeast Asian', 
                       'Multiple visible minorities', 'Other visible minority']
        
        df = pd.read_csv('../../data/processed/nhs/Visible minority population.csv', index_col=0)
        df = df[['LocalArea', 'Type', 
                 'Total population in private households by visible minority',
                 'Not a visible minority', 'Total visible minority population',
                 'Arab', 'Black', 'Chinese', 'Filipino', 'Japanese', 'Korean',
                 'Latin American', 'West Asian', 'South Asian', 'Southeast Asian', 
                 'Multiple visible minorities', 'Visible minority, n.i.e.']].copy()
        
    elif year == 2016:
        df = census_dict['Total visible minority population']
        df = df[['LocalArea', 'Total visible minority population',
                 'Not a visible minority', 'Total visible minority population',
                 'Arab', 'Black', 'Chinese', 'Filipino', 'Japanese', 'Korean',
                 'Latin American', 'West Asian', 'South Asian', 'Southeast Asian', 
                 'Multiple visible minorities', 'Visible minority, n.i.e.']]
       
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['visible_minority'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'visible_minority.csv')
    
    return census_dict

# Incomplete Helper Functions

In [326]:
def clean_household_type(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['household_type'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'household_type.csv')
    
    return census_dict

In [328]:
def clean_census_family_children(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['census_family_children'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'census_family_children.csv')
    
    return census_dict

In [330]:
def clean_aboriginal(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
        
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['aboriginal'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'aboriginal.csv')
    
    return census_dict

In [332]:
def clean_citizenship(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['citizenship'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'citizenship.csv')
    
    return census_dict

In [333]:
def clean_worker_class(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['worker_class'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'worker_class.csv')
    
    return census_dict

In [334]:
def clean_education(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
        
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['education'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'education.csv')
    
    return census_dict

In [335]:
def clean_ethnic_origin(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['ethnic_origin'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'ethnic_origin.csv')
    
    return census_dict

In [288]:
def clean_time_worked(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['time_worked'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'time_worked.csv')
    
    return census_dict

In [289]:
def clean_generation_status(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['generation_status'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'generation_status.csv')
    
    return census_dict

In [290]:
def clean_household_char(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['household_characteristics'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'household_characteristics.csv')
    
    return census_dict

In [359]:
def clean_birth_place(census_dict, year):
    if year == 2001:
        column_names = ['LocalArea', 
                 'Total immigrant population','Before 1961', 
                        '1961 to 1970', '1971 to 1980', '1981 to 1990', 
                        '1991 to 1995', '1996 to 2001']
        df = census_dict['Total immigrant population by period of immigration']
    
    elif year == 2006:
        column_names = ['LocalArea', 
                 'Total immigrant population',
                 'Before 1961', '1961 to 1970', '1971 to 1980', '1981 to 1990',
                 '1991 to 2000', '1991 to 1995', '1996 to 2000', '2001 to 2006']
        df = census_dict['Total immigrant population by period of immigration']
            
    elif year == 2016:
        column_names = ['LocalArea', 'Total population',
           'Non-immigrants', 'Non-permanent residents', 'Immigrants', 
           'Before 1981', '1981 to 1990', '1991 to 2000', '2001 to 2010', 
           '2001 to 2005', '2006 to 2010','2011 to 2016']
                        
        df = census_dict['Total - Immigrant status and period of immigration for the population in private households - 25% sample data']
        df = df[['LocalArea',
           'Total - Immigrant status and period of immigration for the population in private households - 25% sample data',
           'Non-immigrants', 'Non-permanent residents', 'Immigrants', 'Before 1981', '1981 to 1990',
           '1991 to 2000', '2001 to 2010', '2001 to 2005', '2006 to 2010',
           '2011 to 2016']].copy()
    
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population',
                 'Non-immigrants',
                 'Born in province of residence',
                 'Born outside province of residence',
                 'Non-permanent residents', 'Immigrants',
                 'Afghanistan', 'Africa', 'Algeria', 'Americas', 'Asia',
                 'Bangladesh', 'Bosnia and Herzegovina', 'Chile',
                 'China', 'Colombia', 'Croatia', 'Egypt', 'El Salvador', 'Ethiopia',
                 'Europe', 'Fiji', 'France', 'Germany', 'Greece', 'Guyana', 'Haiti',
                 'Hong Kong', 'Hungary', 'India', 'Iran', 'Iraq', 'Ireland', 'Italy', 'Jamaica',
                 'Japan', 'Kenya', 'South Korea', 'Lebanon', 'Mexico', 'Morocco',
                 'Netherlands', 'Nigeria', 'Pakistan', 'Peru', 'Philippines', 'Poland',
                 'Portugal', 'Romania', 'Russia', 'Serbia', 'South Africa',
                 'Sri Lanka', 'Taiwan', 'Trinidad and Tobago', 'Turkey', 'Ukraine', 'United Kingdom',
                'United States', 'Viet Nam', 'Oceania and other',
                'Other places of birth in Africa', 'Other places of birth in Americas',
                'Other places of birth in Asia', 'Other places of birth in Europe', 
                'Other places of birth']
        
        df = pd.read_csv('../../data/processed/nhs/Immigrant status and selected places of birth.csv', index_col=0)
        df = df[['LocalArea', 'Type',  
                 'Total population in private households by immigrant status and selected places of birth',
                 'Non-immigrants',
                 'Born in province of residence',
                 'Born outside province of residence',
                 'Non-permanent residents', 'Immigrants',
                 'Afghanistan', 'Africa', 'Algeria', 'Americas', 'Asia',
                 'Bangladesh', 'Bosnia and Herzegovina', 'Chile',
                 'China', 'Colombia', 'Croatia', 'Egypt', 'El Salvador', 'Ethiopia',
                 'Europe', 'Fiji', 'France', 'Germany', 'Greece', 'Guyana', 'Haiti',
                 'Hong Kong Special Administrative Region', 'Hungary',
                 'India', 'Iran', 'Iraq', 'Ireland, Republic of', 'Italy', 'Jamaica',
                 'Japan', 'Kenya', 'Korea, South', 'Lebanon', 'Mexico', 'Morocco',
                 'Netherlands', 'Nigeria', 'Pakistan', 'Peru', 'Philippines', 'Poland',
                 'Portugal', 'Romania', 'Russian Federation', 'Serbia', 'South Africa, Republic of',
                 'Sri Lanka', 'Taiwan', 'Trinidad and Tobago', 'Turkey', 'Ukraine', 'United Kingdom',
                'United States', 'Viet Nam', 'Oceania and other',
                'Other places of birth in Africa', 'Other places of birth in Americas',
                'Other places of birth in Asia', 'Other places of birth in Europe', 
                 'Other places of birth']].copy()
    
       
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['immigration_birth_place'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'immigration_birth_place.csv')
    
    return census_dict

In [339]:
def clean_household_income(census_dict, year):
    if year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Income of households in 2010.csv', index_col=0)
        df = df[['LocalArea', 'Type', 
                 'After-tax income of households in 2010 of private households',
                 '$5,000 to $9,999', '$10,000 to $14,999', '$15,000 to $19,999',  
                 '$20,000 to $29,999', '$30,000 to $39,999', '$40,000 to $49,999', 
                 '$50,000 to $59,999', '$60,000 to $79,999', '$80,000 to $99,999',
                 '$100,000 and over', '$100,000 to $124,999', '$125,000 and over',
                 '$125,000 to $149,999', '$150,000 and over',
       'Average after-tax household income ($)',
       'Average household total income ($)',
       'Average household total income ($)',
       'Household income in 2010 of private households',
       'Household total income in 2010 of private households',
       'Median after-tax household income ($)',
       'Median household total income ($)',
       'Median household total income ($)', 'One-person private households',
       'Two-or-more-persons private households', 'Under $5,000', 'LocalArea']]
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['household_income'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'household_income.csv')
    
    return census_dict

In [340]:
def clean_individual_income(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Income of individuals in 2010.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['individual_income'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'individual_income.csv')
    
    return census_dict

In [341]:
def clean_industry(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Industry.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['industry'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'industry.csv')
    
    return census_dict

In [342]:
def clean_labour_force_status(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Labour force status.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['labour_force_status'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'labour_force_status.csv')
    
    return census_dict

In [343]:
def clean_commute_time(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Median commuting duration.csv', index_col=0)
        
    elif year == 2016:
        
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['commute_time'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'commute_time.csv')
    
    return census_dict

In [344]:
def clean_mobility(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Mobility.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['mobility'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'mobility.csv')
    
    return census_dict

In [345]:
def clean_transport_mode(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Mode of transportation.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['transport_mode'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'transport_mode.csv')
    
    return census_dict

In [346]:
def clean_occupation(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Occupation.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['occupation'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'occupation.csv')
    
    return census_dict

In [347]:
def clean_workplace_status(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Place of work status.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['workplace_status'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'workplace_status.csv')
    
    return census_dict

In [348]:
# no data for 2006
def clean_religion(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Religion.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['clean_religion'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'clean_religion.csv')
    
    return census_dict

In [349]:
def clean_shelter_cost(census_dict, year):
    
    column_names = ['LocalArea', 'Total number of dwellings', 'Owned',
                       'Rented', 'Band housing']

    if year == 2001:
        df = census_dict['Total number of occupied private dwellings by tenure']
        
    elif year == 2006:
        df = census_dict['Total number of occupied private dwellings by housing tenure']
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total number of dwellings', 
                        'Owned','Rented']
        
        df = pd.read_csv('../../data/processed/nhs/Shelter costs.csv', index_col=0)
        df = df[['LocalArea','Type',
                 'Total number of owner and tenant households with household total income greater than zero, in non-farm, non-reserve private dwellings by shelter-cost-to-income ratio',
                 'Number of owner households in non-farm, non-reserve private dwellings',
                 'Number of tenant households in non-farm, non-reserve private dwellings']].copy()
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['shelter_cost'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'shelter_cost.csv')
    
    return census_dict

In [352]:
df_2001 = clean_census('../../data/raw/census_2001.csv', 2001)
df_2006 = clean_census('../../data/raw/census_2006.csv', 2006)
df_2011 = clean_census('../../data/raw/census_2011.csv', 2011)
df_2016 = clean_census('../../data/raw/census_2016.csv', 2016)

In [423]:
for key in df_2016.keys():
    print(key)

Total - Age groups and average age of the population - 100% data
Total - Age groups and average age of males - 100% data
Total - Age groups and average age of females - 100% data
Total - Distribution (%) of the population by broad age groups - 100% data
Total - Distribution (%) of males by broad age groups - 100% data
Total - Distribution (%) of females by broad age groups - 100% data
Average age of males
Median age of males
Average age of females
Median age of females
Total - Marital status for the population aged 15 years and over - 100% data
Total - Marital status for males aged 15 years and over - 100% data
Total - Marital status for females aged 15 years and over - 100% data
Total - Census families in private households by family size - 100% data
Total number of census families in private households - 100% data
Total couple families
Total lone-parent families by sex of parent
Total - Couple census families in private households - 100% data
Total - Lone-parent census families in pr

In [424]:
df_2016['Total -  Owner and tenant households with household total income greater than zero, in non-farm, non-reserve private dwellings by shelter-cost-to-income ratio - 25% sample data']

Variable,LocalArea,"Total - Owner and tenant households with household total income greater than zero, in non-farm, non-reserve private dwellings by shelter-cost-to-income ratio - 25% sample data",Spending less than 30% of income on shelter costs,Spending 30% or more of income on shelter costs,30% to less than 100%
0,Arbutus-Ridge,6080,3370,2710,1760
1,Downtown,34985,19470,15515,11080
2,Dunbar-Southlands,6930,4675,2255,1365
3,Fairview,19460,12960,6495,5080
4,Grandview-Woodland,15035,9755,5275,4425
5,Hastings-Sunrise,12715,8915,3800,3250
6,Kensington-Cedar Cottage,17855,12325,5530,4615
7,Kerrisdale,5475,3320,2155,1455
8,Killarney,10650,7775,2875,2425
9,Kitsilano,22870,14895,7980,6015


In [420]:
df_2006['Total number of occupied private dwellings by housing tenure'].columns

Index(['LocalArea',
       'Total number of occupied private dwellings by housing tenure', 'Owned',
       'Rented', 'Band housing'],
      dtype='object', name='Variable')

In [422]:
df_2001['Total number of occupied private dwellings by tenure'].columns

Index(['LocalArea', 'Total number of occupied private dwellings by tenure',
       'Owned', 'Rented', 'Band housing'],
      dtype='object', name='Variable')

In [414]:
df = pd.read_csv('../../data/processed/nhs/Shelter costs.csv', index_col=0)
df

Unnamed: 0,Type,% of owner households spending 30% or more of household total income on shelter costs,% of owner households with a mortgage,% of tenant households in subsidized housing,% of tenant households spending 30% or more of household total income on shelter costs,Average monthly shelter costs for owned dwellings ($),Average monthly shelter costs for rented dwellings ($),Average value of dwellings ($),Median monthly shelter costs for owned dwellings ($),Median monthly shelter costs for rented dwellings ($),Median value of dwellings ($),"Number of owner households in non-farm, non-reserve private dwellings","Number of tenant households in non-farm, non-reserve private dwellings",Spending 30% or more of household total income on shelter costs,Spending 30% to less than 100% of household total income on shelter costs,Spending less than 30% of household total income on shelter costs,"Total number of owner and tenant households with household total income greater than zero, in non-farm, non-reserve private dwellings by shelter-cost-to-income ratio",LocalArea
0,Total,134.8,223.2,75.0,189.8,5877.0,4600.0,4079188.0,3759.0,4313.0,3998640.0,4265.0,2335.0,2055.0,1635.0,4540.0,6590.0,Victoria-Fraserview
1,Male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Victoria-Fraserview
2,Female,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Victoria-Fraserview
0,Total,88.9,153.7,2.5,150.6,7446.0,6665.0,7250055.0,4022.0,5702.0,6952411.0,6085.0,1425.0,1935.0,1360.0,5560.0,7485.0,Dunbar-Southlands
1,Male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Dunbar-Southlands
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,Male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Killarney
2,Female,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Killarney
0,Total,74.6,70.7,9.4,117.0,3102.0,2980.0,3247722.0,1949.0,2806.0,2901767.0,2280.0,1320.0,1615.0,1060.0,1965.0,3580.0,Oakridge
1,Male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Oakridge


In [410]:
df_2006['structual dwelling type'].head()

Unnamed: 0,LocalArea,Total,Single-detached house,Semi-detached house,Row house,"Apartment, duplex","Apartment, building that has five or more storeys"
0,Arbutus-Ridge,6100,2410,60,255,630,1385
1,Downtown,25025,15,10,575,80,22355
2,Dunbar-Southlands,7405,5285,10,30,1730,0
3,Fairview,17390,50,120,655,45,4695
4,Grandview-Woodland,14320,1405,380,260,1960,715


In [411]:
df_2001['Total population by immigrant status and place of birth'].head()

Variable,LocalArea,Total population by immigrant status and place of birth,Non-immigrant population,Born in province of residence,Born outside province of residence
0,Arbutus-Ridge,14040,6640,4390,2245
1,Downtown,27725,16105,7715,8390
2,Dunbar-Southlands,21215,13680,9670,4010
3,Fairview,27550,18275,9235,9040
4,Grandview-Woodland,28975,18250,10490,7765


In [194]:
df_2006['Total recent immigrants by selected places of birth']

Variable,LocalArea,Total recent immigrants by selected places of birth,"China, People's Republic of",India,Philippines,Pakistan,United States of America,"Korea, South",Romania,Iran,...,Living with relatives,Living with non-relatives only,Living alone,Number of census family persons,Total number of persons aged 65 years and over,Number of persons not in census families aged 65 years and over,Living with relatives.1,Living with non-relatives only.1,Living alone.1,Number of census family persons aged 65 years and over
0,Arbutus-Ridge,2375,1305,20,10,0,50,300,0,40,...,385,495,1805,12890.0,2620,1170,165,55,950,1450
1,Downtown,3375,625,35,100,0,170,340,15,405,...,780,3835,13550,23785.0,3450,1510,65,105,1340,1940
2,Dunbar-Southlands,1010,320,10,30,0,80,170,0,0,...,455,850,1245,18745.0,2850,760,160,45,550,2085
3,Fairview,1280,105,30,85,0,115,25,15,10,...,375,2200,9405,16445.0,3240,1725,30,45,1650,1510
4,Grandview-Woodland,1335,460,10,205,10,75,10,10,10,...,725,3045,6495,17540.0,2855,1405,155,95,1155,1455
5,Hastings-Sunrise,1765,830,25,255,10,25,10,0,40,...,1410,1395,2715,27460.0,5140,1515,540,90,885,3625
6,Kensington-Cedar Cottage,3200,1200,110,900,60,80,15,10,0,...,1940,2460,3490,36395.0,5590,1695,680,175,840,3895
7,Kerrisdale,1665,885,15,35,0,0,135,0,10,...,310,565,1525,12210.0,2110,745,100,35,605,1370
8,Killarney,2150,1170,25,150,0,30,105,10,40,...,875,800,2085,23085.0,3595,1265,435,75,760,2330
9,Kitsilano,1585,150,10,10,0,240,60,20,30,...,695,4530,10315,24665.0,3485,1610,85,110,1415,1875


In [191]:
df_im_2011 = pd.read_csv('../../data/processed/nhs_Immigrant status and selected places of birth.csv', index_col=0)
df_im_2011.columns

Index(['Type', 'Afghanistan', 'Africa', 'Algeria', 'Americas', 'Asia',
       'Bangladesh', 'Born in province of residence',
       'Born outside province of residence', 'Bosnia and Herzegovina', 'Chile',
       'China', 'Colombia', 'Croatia', 'Egypt', 'El Salvador', 'Ethiopia',
       'Europe', 'Fiji', 'France', 'Germany', 'Greece', 'Guyana', 'Haiti',
       'Hong Kong Special Administrative Region', 'Hungary', 'Immigrants',
       'India', 'Iran', 'Iraq', 'Ireland, Republic of', 'Italy', 'Jamaica',
       'Japan', 'Kenya', 'Korea, South', 'Lebanon', 'Mexico', 'Morocco',
       'Netherlands', 'Nigeria', 'Non-immigrants', 'Non-permanent residents',
       'Oceania and other', 'Other places of birth',
       'Other places of birth in Africa', 'Other places of birth in Americas',
       'Other places of birth in Asia', 'Other places of birth in Europe',
       'Pakistan', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania',
       'Russian Federation', 'Serbia', 'South Africa, Republi

In [128]:
df_2006['Total population by Aboriginal and non-Aboriginal identity population'].head(2)
#df_2006['Total population by Registered Indian status']

Variable,LocalArea,Total population by Aboriginal and non-Aboriginal identity population,Total Aboriginal identity population,North American Indian single response,Métis single response,Inuit single response,Multiple Aboriginal identity responses,Aboriginal responses not included elsewhere,Non-Aboriginal identity population
0,Arbutus-Ridge,15835,65,15,25,0,0,25,15765
1,Downtown,42580,1380,945,380,10,10,40,41195


In [123]:
df_2001['Total population by aboriginal and non-aboriginal population'].head()

Variable,LocalArea,Total population by aboriginal and non-aboriginal population,Total aboriginal origins population,North American Indian single origin,North American Indian and non-aboriginal origins,Métis single origin,Métis and non-aboriginal origins,Inuit single origin,Inuit and non-aboriginal origins,Other aboriginal multiple origins,Total non-aboriginal population
0,Arbutus-Ridge,14040,110,10,95,0,10,0,0,0,13925
1,Downtown,27725,1330,585,535,35,115,15,15,30,26390
2,Dunbar-Southlands,21215,650,485,120,0,30,0,0,0,20565
3,Fairview,27545,565,55,335,10,145,0,10,20,26980
4,Grandview-Woodland,28975,2955,1480,1055,60,290,10,0,65,26015


In [116]:
df_ab_2011 = pd.read_csv('../../data/processed/nhs_Aboriginal population.csv', index_col=0)
df_ab_2011.head(2)

Unnamed: 0,Type,Aboriginal ancestry,Aboriginal identities not included elsewhere,Aboriginal identity,First Nations (North American Indian) Aboriginal ancestry,First Nations (North American Indian) single identity,Inuit ancestry,Inuk (Inuit) single identity,Multiple Aboriginal identities,Métis ancestry,Métis single identity,Non-Aboriginal ancestry only,Non-Aboriginal identity,Not a Registered or Treaty Indian,Registered or Treaty Indian,Total population in private households by Aboriginal ancestry,Total population in private households by Aboriginal identity,Total population in private households by Registered or Treaty Indian status,LocalArea
0,Total,355.0,0.0,255.0,210.0,205.0,0.0,0.0,0.0,45.0,25.0,20255.0,20345.0,20520.0,55.0,20610.0,20610.0,20620.0,Victoria-Fraserview
1,Male,120.0,0.0,25.0,35.0,0.0,0.0,0.0,0.0,15.0,10.0,9790.0,9850.0,9915.0,0.0,9940.0,9945.0,9940.0,Victoria-Fraserview


In [119]:
df_2016['Total - Aboriginal identity for the population in private households - 25% sample data'].head()

Variable,LocalArea,Total - Aboriginal identity for the population in private households - 25% sample data,Aboriginal identity,Single Aboriginal responses,First Nations (North American Indian),Métis,Inuk (Inuit),Multiple Aboriginal responses,Aboriginal responses not included elsewhere,Non-Aboriginal identity
0,Arbutus-Ridge,15070,140,130,60,70,0,10,10,14930
1,Downtown,58855,1430,1375,795,560,20,30,25,57425
2,Dunbar-Southlands,21285,880,875,835,40,0,0,0,20405
3,Fairview,32725,590,575,305,265,10,0,10,32135
4,Grandview-Woodland,29000,2265,2215,1760,445,10,25,25,26740


In [120]:
df_2016['Total - Aboriginal ancestry for the population in private households - 25% sample data']

Variable,LocalArea,Total - Aboriginal ancestry for the population in private households - 25% sample data,Aboriginal ancestry (only),Single Aboriginal ancestry (only),First Nations (North American Indian) single ancestry,Métis single ancestry,Inuit single ancestry,Multiple Aboriginal ancestries (only),First Nations (North American Indian) and Métis ancestries,First Nations (North American Indian) and Inuit ancestries,...,Single Aboriginal and non-Aboriginal ancestries,First Nations (North American Indian) and non-Aboriginal ancestries,Métis and non-Aboriginal ancestries,Inuit and non-Aboriginal ancestries,Multiple Aboriginal and non-Aboriginal ancestries,"First Nations (North American Indian), Métis and non-Aboriginal ancestries","First Nations (North American Indian), Inuit and non-Aboriginal ancestries","Métis, Inuit and non-Aboriginal ancestries","First Nations (North American Indian), Métis, Inuit and non-Aboriginal ancestries",Non-Aboriginal ancestry (only)
0,Arbutus-Ridge,15075.0,15,15,15,0,0,0,0,0,...,245,190,55,0,0,0,0,0,0,14810.0
1,Downtown,58855.0,415,410,340,55,10,10,0,0,...,1255,860,395,0,70,75,0,0,0,57115.0
2,Dunbar-Southlands,21285.0,610,605,595,15,0,10,10,0,...,400,315,90,0,0,0,0,0,0,20275.0
3,Fairview,32720.0,60,60,50,10,0,0,0,0,...,825,530,285,10,45,45,0,0,0,31800.0
4,Grandview-Woodland,29000.0,1005,990,950,40,0,15,10,0,...,1350,1105,240,10,85,80,10,0,0,26560.0
5,Hastings-Sunrise,34115.0,225,205,180,25,10,15,15,0,...,1050,790,255,0,50,40,0,0,0,32800.0
6,Kensington-Cedar Cottage,48865.0,235,230,205,20,0,10,10,0,...,910,635,250,20,40,45,0,0,0,47685.0
7,Kerrisdale,13900.0,10,15,10,0,0,0,0,0,...,100,65,30,0,10,10,0,0,0,13780.0
8,Killarney,28930.0,95,100,85,10,0,0,0,0,...,490,395,90,0,20,20,0,0,0,28330.0
9,Kitsilano,42755.0,70,70,50,25,0,0,0,0,...,910,650,255,0,50,45,10,0,0,41720.0


# MISSING MATCHING DATA ACROSS ALL CENSUS YEARS

In [351]:
### NO MATCHING DATA FOR CENSUS YEARS ###

def clean_work_activity(census_dict, year):
    
    if year == 2001:
        column_names = ['LocalArea',
       'Total population with employment income',
       'Total - average employment income',
       'Total standard error of average employment income',
       'Worked full year, full time', 'Full time - average employment income',
       'Full time - standard error of average employment income',
       'Worked part year or part time', 'Part time - average employment income',
       'Part time - standard error of average employment income']
        
        df = census_dict['Total population 15 years and over with employment income, by sex and work activity']
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 
                 'Total labour force',
                 'Did not work in past year', 'Worked in past year',
                 '1 to 13 weeks', '14 to 26 weeks', '27 to 39 weeks',
                 '40 to 48 weeks', '49 to 52 weeks', 
                 'Average weeks worked in past year']
        
        df = pd.read_csv('../../data/processed/nhs/Work activity.csv', index_col=0)
        df = df[['LocalArea', 'Type', 
                 'Total labour force aged 15 years and over by work activity in 2010',
                 'Did not work in 2010', 'Worked in 2010',
                 '1 to 13 weeks', '14 to 26 weeks', '27 to 39 weeks',
                 '40 to 48 weeks', '49 to 52 weeks', 
                 'Average weeks worked in 2010']].copy()
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['work_activity'] = df
    df.to_csv('../../data/processed/census_' + str(year) + 'work_activity.csv')
    
    return census_dict