# Census Data Cleaning
Keanna Knebel

---

In [1]:
# import packages
import pandas as pd
import re
import os

In [2]:
# Options
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [3]:
census_2001 = "../../data/raw/census_2001.csv"
census_2006 = "../../data/raw/census_2006.csv"
census_2011 = "../../data/raw/census_2011.csv"
census_2016 = "../../data/raw/census_2016.csv"

In [132]:
def clean_census(file, year):
    
    column_names= ['Variable', 'Arbutus-Ridge', 'Downtown', 'Dunbar-Southlands',
               'Fairview', 'Grandview-Woodland', 'Hastings-Sunrise',
               'Kensington-Cedar Cottage', 'Kerrisdale', 'Killarney', 'Kitsilano',
               'Marpole', 'Mount Pleasant', 'Oakridge', 'Renfrew-Collingwood',
               'Riley Park', 'Shaughnessy', 'South Cambie', 'Strathcona',
               'Sunset', 'Victoria-Fraserview', 'West End', 'West Point Grey',
               'Vancouver CSD', 'Vancouver CMA']
   
    # read in csv file as dataframe
    df = pd.read_csv(file, encoding='latin-1', skiprows=4)
    
    # remove 'ID' column if present
    df.drop(columns='ID', inplace=True, errors='ignore')
    
    # rename columns 
    df.set_axis(column_names, axis=1, inplace=True)
    
    # remove empty rows
    df.dropna(0, 'all', inplace=True)
    
    # remove leading whitespace from variables
    df.Variable = df.Variable.apply(lambda x: (x.lstrip()).rstrip())
    df.drop(df[df.Variable.str.contains('20%.*data', flags=re.IGNORECASE) ].index , inplace=True)
    
    # convert all data to strings except NaN values
    df.iloc[:,1:25] = df.iloc[:,1:25].applymap(lambda x: str(x) if x == x else x)
    
    # convert data to float
    df.iloc[:,1:25] = df.iloc[:,1:25].applymap(lambda x: re.sub("[-]", "0", x) if x == x else x)
    df.iloc[:,1:25] = df.iloc[:,1:25].applymap(lambda x: float(re.sub("[,$]", "", x)) if x == x else x)
    
    # Create the census subdirectory for given year if it doesn't exist
    os.makedirs('../../data/processed/census_' + str(year), exist_ok=True) 
    
    # divide the census datasets into subgroups
    sub_dict = create_subgroup_dict(df, year)
    
    # clean the datatables by topics
    census_dict = clean_age(sub_dict, year)
    #census_dict = clean_marital_status(census_dict, year)
    #census_dict = clean_private_households(census_dict, year)
    #census_dict = clean_couple_fam_structure(census_dict, year)
    #census_dict = clean_language_detailed(census_dict, year)
    #census_dict = clean_official_language(census_dict, year)
    #census_dict = clean_structural_dwelling_type(census_dict, year)
    #census_dict = clean_household_size(census_dict, year)
    #census_dict = clean_lone_parent(census_dict, year)
    #census_dict = clean_immigration_age(census_dict, year)
    #census_dict = clean_immigration_period(census_dict, year)
    #census_dict = clean_birth_place(census_dict, year)
    #census_dict = clean_shelter_tenure(census_dict, year)
    #census_dict = clean_visible_minority(census_dict, year)
    
    #### INCOMPLETE HELPER FUNCTIONS ####
    #census_dict = clean_household_type(census_dict, year)
    #census_dict = clean_census_family_children(census_dict, year)
    #census_dict = clean_aboriginal(census_dict, year)
    #census_dict = clean_citizenship(census_dict, year)
    #census_dict = clean_worker_class(census_dict, year)
    #census_dict = clean_education(census_dict, year)
    #census_dict = clean_ethnic_origin(census_dict, year)
    #census_dict = clean_time_worked(census_dict, year)
    #census_dict = clean_generation_status(census_dict, year)
    #census_dict = clean_household_char(census_dict, year)
    #census_dict = clean_household_income(census_dict, year)
    #census_dict = clean_individual_income(census_dict, year)
    #census_dict = clean_industry(census_dict, year)
    #census_dict = clean_labour_force_status(census_dict, year)
    #census_dict = clean_commute_time(census_dict, year)
    census_dict = clean_mobility(census_dict, year)
    census_dict = clean_transport_mode(census_dict, year)
    census_dict = clean_occupation(census_dict, year)
    census_dict = clean_workplace_status(census_dict, year)
    
        
    return census_dict

In [133]:
dict_2001 = clean_census(census_2001, 2001)
dict_2006 = clean_census(census_2006, 2006)
dict_2011 = clean_census(census_2011, 2011)
dict_2016 = clean_census(census_2016, 2016)

In [8]:
def create_subgroup_dict(df, year):
    
    # separate dataframe by 'Variables' containing regex expressions:
    if year == 2001:
        re1 = ['total.*by', 
               'population.*by', 
               'common-law couples',
               '^Male', 
               '^Female', 
               'total - male', 
               'total - female']
        
    elif year == 2006:
        re1 = ['total.*by', 
               'population.*by',
               'common-law couples',
               '^Male[s\s,]', 
               '^Female[s\s,]',
              'total - mobility',
              'Average number of children']
        
    elif year == 2011:
        df.drop(index=201 , inplace=True)
        re1 = ['total.*by', 
               'population.*by', 
               'common-law couples',
               'males',
               'Total population excluding institutional residents',  
               'Total.*in private households'] 
    elif year == 2016:
        re1 = ['^total', 'population.*by', 'males']
    
    subgroup = list(df[df.Variable.str.contains('|'.join(re1), flags=re.IGNORECASE)].index)
    subgroup.append(len(df.Variable)+1)
    subgroup = subgroup[1:]
    
    # create census dictionary of sub datasets
    # initialize variables for the lookup dictionary
    start = 0
    census_dict = {}
    
    for s in subgroup:
        sub_df = df.loc[start:s-1]

        # transpose dataframe and rename column
        sub_df = sub_df.set_index('Variable').T.reset_index().rename(columns={'index': 'LocalArea'})

        # check for duplicates and store dataframes into the dictionary
        if df.Variable[start] in census_dict:
            start = s
        else:
            census_dict[df.Variable[start]] = sub_df
            start = s
        
    return census_dict


In [9]:
def clean_age(census_dict, year):
    
    if year == 2001:
        column_names = ['LocalArea', 'Type', 'Total', '0 to 4 years', 
                   '5 to 9 years', '10 to 14 years', '15 to 19 years',
                   '20 to 24 years', '25 to 29 years', '30 to 34 years', 
                   '35 to 39 years', '40 to 44 years', '45 to 49 years', 
                   '50 to 54 years', '55 to 59 years', '60 to 64 years', 
                   '65 to 69 years', '70 to 74 years', '75 to 79 years', 
                   '80 to 84 years', '85 to 89 years', '90 to 94 years',
                   '95 to 99 years', '100 years and over']

        male = census_dict['Male']
        female = census_dict['Female']
    
        female.insert(1, 'Type', 'female')
        female.set_axis(column_names, axis=1, inplace=True)
        male.insert(1, 'Type', 'male')
        male.set_axis(column_names, axis=1, inplace=True)
    
        merged = pd.concat([female, male])
        merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
        total = merged.groupby('LocalArea').sum()
        total['Type'] = 'total'
        total.reset_index(inplace=True)
        merged = pd.concat([merged, total])
    
    else:
        if year == 2006:
            
            column_names = ['LocalArea', 'Type', 'Total', '0 to 4 years', 
                       '5 to 9 years', '10 to 14 years', '15 to 19 years',
                       '20 to 24 years', '25 to 29 years', '30 to 34 years', 
                       '35 to 39 years', '40 to 44 years', '45 to 49 years', 
                       '50 to 54 years', '55 to 59 years', '60 to 64 years', 
                        '65 to 69 years', '70 to 74 years', '75 to 79 years', 
                       '80 to 84 years', '85 to 89 years', '90 to 94 years',
                       '95 to 99 years', '100 years and over', 'Median Age']
            
            total = census_dict['Male & Female, Total']
            male = census_dict['Male, Total']
            female = census_dict['Female, Total']
        
        elif year == 2011:
            
            column_names = ['LocalArea', 'Type', 'Total', '0 to 4 years', 
                       '5 to 9 years', '10 to 14 years', '15 to 19 years',
                       '15 years', '16 years', '17 years', '18 years', '19 years',
                       '20 to 24 years', '25 to 29 years', '30 to 34 years', 
                       '35 to 39 years', '40 to 44 years', '45 to 49 years', 
                       '50 to 54 years', '55 to 59 years', '60 to 64 years', 
                       '65 to 69 years', '70 to 74 years', '75 to 79 years', 
                       '80 to 84 years', '85 years and over', 'Median age',
                       '% of the population aged 15 and over']
        
            total = census_dict['Total population by age groups']
            male = census_dict['Males, total']
            female = census_dict['Females, total']

        elif year == 2016:
            
            column_names = ['LocalArea', 'Type', 'Total', '0 to 14 years', 
                       '0 to 4 years', '5 to 9 years', '10 to 14 years',
                       '15 to 64 years', '15 to 19 years', '20 to 24 years', 
                       '25 to 29 years', '30 to 34 years', '35 to 39 years', 
                       '40 to 44 years', '45 to 49 years', '50 to 54 years', 
                       '55 to 59 years', '60 to 64 years', '65 years and over', 
                       '65 to 69 years', '70 to 74 years', '75 to 79 years', 
                       '80 to 84 years', '85 years and over','85 to 89 years', 
                       '90 to 94 years', '95 to 99 years', '100 years and over']
        
            total = census_dict['Total - Age groups and average age of the population - 100% data']
            male = census_dict['Total - Age groups and average age of males - 100% data']
            female = census_dict['Total - Age groups and average age of females - 100% data']
         
        female.insert(1, 'Type', 'female')
        female.set_axis(column_names, axis=1, inplace=True)
        male.insert(1, 'Type', 'male')
        male.set_axis(column_names, axis=1, inplace=True)
        total.insert(1, 'Type', 'total')
        total.set_axis(column_names, axis=1, inplace=True)
    
        merged = pd.concat([female, male, total])
    
    merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
    census_dict['population by age and sex'] = merged
    merged.to_csv('../../data/processed/census_' + str(year) + '/population_age_sex.csv')
    
    return census_dict

# Incomplete Helper Functions

In [14]:
def clean_household_type(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['household_type'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/household_type.csv')
    
    return census_dict

IndentationError: expected an indented block (<ipython-input-14-c1d5e90acc76>, line 5)

In [328]:
def clean_census_family_children(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['census_family_children'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/census_family_children.csv')
    
    return census_dict

In [330]:
def clean_aboriginal(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Aboriginal population.csv', index_col=0)
        
    elif year == 2016:
        
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['aboriginal'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/aboriginal.csv')
    
    return census_dict

In [332]:
def clean_citizenship(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
         column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Citizenship.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['citizenship'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/citizenship.csv')
    
    return census_dict

In [333]:
def clean_worker_class(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
         column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Class of worker.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['worker_class'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/worker_class.csv')
    
    return census_dict

In [334]:
def clean_education(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
         column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Education.csv', index_col=0)
        df = df[['LocalArea', 'Type', 
                 'Total population aged 15 years and over by highest certificate, diploma or degree',
                 'Total population aged 25 to 64 years by highest certificate, diploma or degree',
                 'Apprenticeship or trades certificate or diploma',
                 'College, CEGEP or other non-university certificate or diploma',
                 'University certificate or diploma below bachelor level',
                 'University certificate, diploma or degree above bachelor level',
                 'University certificate, diploma or degree at bachelor level or above',
                 "Bachelor's degree",
                 'High school diploma or equivalent',
                 'No certificate, diploma or degree',
                 'No postsecondary certificate, diploma or degree',
                 'Postsecondary certificate, diploma or degree',
                 'With postsecondary certificate, diploma or degree',
                 'Total population aged 15 years and over by location of study compared with province or territory of residence',
                  'Location of study inside Canada', 'Location of study outside Canada',
                  'Same as province or territory of residence',
                 'Another province or territory',
                 'Total population aged 15 years and over by major field of study - Classification of Instructional Programs (CIP) 2011',
                  'Agriculture, natural resources and conservation',
                  'Architecture, engineering, and related technologies',
                 'Business, management and public administration',
                 'Education', 'Health and related fields', 'Humanities',
                 'Mathematics, computer and information sciences',
                 'Personal, protective and transportation services',
                 'Physical and life sciences and technologies',
                 'Social and behavioural sciences and law',
                 'Visual and performing arts, and communications technologies',
                 'Other fields of study']]
        
    elif year == 2016:
        
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['education'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/education.csv')
    
    return census_dict

In [335]:
def clean_ethnic_origin(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
         column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Ethnic origin population.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['ethnic_origin'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/ethnic_origin.csv')
    
    return census_dict

In [288]:
def clean_time_worked(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['time_worked'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/time_worked.csv')
    
    return census_dict

In [289]:
def clean_generation_status(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
         column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Generation status.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['generation_status'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/generation_status.csv')
    
    return census_dict

In [290]:
def clean_household_char(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
         column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Household characteristics.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['household_characteristics'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/household_characteristics.csv')
    
    return census_dict

In [339]:
def clean_household_income(census_dict, year):
    if year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Income of households in 2010.csv', index_col=0)
        df = df[['LocalArea', 'Type', 
                 'After-tax income of households in 2010 of private households',
                 '$5,000 to $9,999', '$10,000 to $14,999', '$15,000 to $19,999',  
                 '$20,000 to $29,999', '$30,000 to $39,999', '$40,000 to $49,999', 
                 '$50,000 to $59,999', '$60,000 to $79,999', '$80,000 to $99,999',
                 '$100,000 and over', '$100,000 to $124,999', '$125,000 and over',
                 '$125,000 to $149,999', '$150,000 and over',
       'Average after-tax household income ($)',
       'Average household total income ($)',
       'Average household total income ($)',
       'Household income in 2010 of private households',
       'Household total income in 2010 of private households',
       'Median after-tax household income ($)',
       'Median household total income ($)',
       'Median household total income ($)', 'One-person private households',
       'Two-or-more-persons private households', 'Under $5,000', 'LocalArea']]
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['household_income'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/household_income.csv')
    
    return census_dict

In [340]:
def clean_individual_income(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Income of individuals in 2010.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['individual_income'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/individual_income.csv')
    
    return census_dict

In [255]:
def clean_industry(census_dict, year):
    
    if year == 2011:
        df = pd.read_csv('../../data/processed/nhs/Industry.csv', index_col=0).query('Type == "Total"')
        label = list(df.columns[[-1, -3]])
        
    else:
        if year == 2001:
            df = census_dict['Total labour force 15 years and over by industry - 1997 North American Industry Classification System']
        elif year == 2006:
            df = census_dict['Total labour force 15 years and over by industry - North American Industry Classification System 2002']
        else:
            df = census_dict['Total Labour Force population aged 15 years and over by Industry - North American Industry Classification System (NAICS) 2012 - 25% sample data']
        
        label = list(df.columns[[0, 2]])
        
    industries = list(df.columns)
    industries_original = [i for i in industries if re.match(r'^[0-9]', i)]
    industries = [re.findall(r'^[0-9 -]*(.*)', i)[0] for i in industries_original]
    
    column_names = ['LocalArea', 'Industry not applicable'] + industries
        
    cols = label + industries_original
    df = df.loc[: ,cols]
        
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    #census_dict['industry'] = df
    #df.to_csv('../../data/processed/census_' + str(year) + '/industry.csv')
    
    return df

In [342]:
def clean_labour_force_status(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Labour force status.csv', index_col=0)
        
    elif year == 2016:
    
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['labour_force_status'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/labour_force_status.csv')
    
    return census_dict

In [343]:
def clean_commute_time(census_dict, year):
    
    if year == 2001:
        
    elif year == 2006:
        
    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total population']
        
        df = pd.read_csv('../../data/processed/nhs/Median commuting duration.csv', index_col=0)
        
    elif year == 2016:
        
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['commute_time'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/commute_time.csv')
    
    return census_dict

In [131]:
def clean_mobility(census_dict, year):

    column_names = ['LocalArea', 'Migrants', 'Non-migrants', 'Non-movers']
    
    if year == 2011:

        df = pd.read_csv('../../data/processed/nhs/Mobility.csv',
                         index_col=0).query('Type == "Total"').iloc[:, [-1, 5, 7, 8]]

    else:
        if year == 2001:
            yr1 = census_dict['Total population 1 year and over by mobility status 1 year ago']
            yr5 = census_dict['Total population 5 years and over by mobility status 5 years ago']
        elif year == 2006:
            yr1 = census_dict['Total - Mobility status 1 year ago']
            yr5 = census_dict['Total - Mobility status 5 years ago']
        else:
            yr1 = census_dict['Total - Mobility status 1 year ago - 25% sample data']
            yr5 = census_dict['Total - Mobility status 5 years ago - 25% sample data']
        
        yr1 = yr1.loc[:, column_names]
        yr5 = yr5.loc[:, column_names]
        
        df = pd.concat([yr1, yr5]).groupby(['LocalArea']).sum().reset_index()

    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['mobility'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/mobility.csv')

    return census_dict


In [13]:
def clean_transport_mode(census_dict, year):

    column_names = ['LocalArea', 'Type', 'Total population aged 15', 'Bicycle',
                    'Car as driver', 'Car as passenger',
                    'Other methods', 'Public transit', 'Walked']

    if year == 2011:
        df = pd.read_csv('../../data/processed/nhs/Mode of transportation.csv',
                         index_col=0).iloc[:, [-1, 0, -3, 1, 2, 3, 4, 5, -2]]

    else:
        if year == 2016:
            order = [0, 1, -2, 2, 3, -1, 4, 5]
            male = census_dict['Total - Main mode of commuting for the male employed labour force aged 15 years and over in private households with a usual place of work or no fixed workplace address - 25% sample data']
            female = census_dict['Total - Main mode of commuting for the female employed labour force aged 15 years and over in private households with a usual place of work or no fixed workplace address - 25% sample data']

            male = male.iloc[:, order]
            female = female.iloc[:, order]

            male.insert(1, 'Type', 'Male')
            female.insert(1, 'Type', 'Female')
        
        else:
            order = [0, -1, 1, 6, 2, 3, -2, 4, 5]
            if year == 2001:
                male = census_dict['Males with a usual place of work or no fixed workplace address']
            else:
                male = census_dict['Males with usual place of work or no fixed workplace address']
                
            female = census_dict['Females with usual place of work or no fixed workplace address']
                
            male['Type'] = ['Male']*len(male)
            male['Other method'] = male['Other method'] + \
                male['Taxicab'] + male['Motorcycle']
            male = male.iloc[:, order]

            female['Type'] = ['Female']*len(female)
            female['Other method'] = female['Other method'] + \
                female['Taxicab'] + female['Motorcycle']
            female = female.iloc[:, order]
        
        male.columns = column_names
        female.columns = column_names
        
        df = pd.concat([male, female])

        total = df.groupby(['LocalArea']).sum().reset_index()
        total['Type'] = ['Total'] * len(total)

        df = pd.concat([df, total])
        
    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['transport_mode'] = df
    df.to_csv('../../data/processed/census_' +
              str(year) + '/transport_mode.csv')

    return census_dict


In [11]:
def clean_occupation(census_dict, year):

    column_names = ['LocalArea', 'Type', 'Total', 'All occupations', 'Occupations n/a',
                    'Management', 'Business and finance', 'Natural and applied sciences',
                    'Health', 'Social Sience and education', 'Art', 'Sales and service',
                    'Trades and transport', 'Natural resources and agriculture', 'Manufacturing and utilities']

    if year == 2001 or year == 2006:
        if year == 2001:
            female = census_dict['Female labour force 15 years and over - Occupation']
            male = census_dict['Male labour force 15 years and over - Occupation']
        else:
            female = census_dict['Female labour force 15 years and over by occupation - National Occupational Classification for Statistics 2006']
            male = census_dict['Male labour force 15 years and over by occupation - National Occupational Classification for Statistics 2006']

        occupations = [i for i in female.columns if re.match(r'^[A-Z] ', i)]
        female['Type'] = ['Female'] * len(female)
        female = pd.concat(
            [female.iloc[:, [0, -1, 1, 3, 2]], female.loc[:, occupations]], axis=1)
        female.columns = column_names

        male['Type'] = ['Male'] * len(male)
        male = pd.concat(
            [male.iloc[:, [0, -1, 1, 3, 2]], male.loc[:, occupations]], axis=1)
        male.columns = column_names

        df = pd.concat([female, male])

        total = df.groupby(['LocalArea']).sum().reset_index()
        total['Type'] = ['Total'] * len(total)

        df = pd.concat([df, total])

    elif year == 2011:
        df = pd.read_csv(
            '../../data/processed/nhs/Occupation.csv', index_col=0)

        df = pd.concat(
            (df.iloc[:, [14, 0, 13, 11, 12]], df.iloc[:, 1:11]), axis=1)

    elif year == 2016:
        total = census_dict['Total labour force population aged 15 years and over by occupation - National Occupational Classification (NOC) 2016 - 25% sample data']
        female = census_dict['Total female labour force population aged 15 years and over by occupation - National Occupational Classification (NOC) 2016 - 25% sample data']
        male = census_dict['Total male labour force population aged 15 years and over by occupation - National Occupational Classification (NOC) 2016 - 25% sample data']
        
        male['Type'] = ['Male'] * len(male)
        total['Type'] = ['Total'] * len(total)
        female['Type'] = ['Female'] * len(female)
        
        total = pd.concat(
            (total.iloc[:, [0, -1, 1, 3, 2]], total.iloc[:, 4:14]), axis=1)
        female = pd.concat(
            (female.iloc[:, [0, -1, 1, 3, 2]], female.iloc[:, 4:14]), axis=1)
        male = pd.concat(
            (male.iloc[:, [0, -1, 1, 3, 2]], male.iloc[:, 4:14]), axis=1)
        
        female.columns, male.columns, total.columns = column_names, column_names, column_names
        
        df = pd.concat([total, female, male])

    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['occupation'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/occupation.csv')

    return census_dict


In [55]:
def clean_workplace_status(census_dict, year):

    column_names = ['LocalArea', 'Type', 'Total employed population aged 15',
                    'Worked at home', 'Worked at usual place', 'Worked outside Canada',
                    'No fixed workplace']

    if year == 2011:

        df = pd.read_csv('../../data/processed/nhs/Place of work status.csv',
                         index_col=0).iloc[:, [-1, 0, 2, 3, 4, 5, 1]]

    else:
        if year == 2001 or year == 2006:
            order = [0, 1, -3, 2, -2, -1]
            male = census_dict['Males'].iloc[:, order]
            female = census_dict['Females'].iloc[:, order]
        else:
            order = [0, 1, 2, -1, 3, 4]
            male = census_dict['Total - Place of work status for the male employed labour force aged 15 years and over in private households - 25% sample data'].iloc[:, order]
            female = census_dict['Total - Place of work status for the female employed labour force aged 15 years and over in private households - 25% sample data'].iloc[:, order]

        male.insert(1, 'Type', 'Male')
        female.insert(1, 'Type', 'Female')

        male.columns = column_names
        female.columns = column_names

        df = pd.concat([female, male])

        total = df.groupby(['LocalArea']).sum().reset_index()
        total['Type'] = ['Total'] * len(total)

        df = pd.concat([df, total])

    df.set_axis(column_names, axis=1, inplace=True)
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['workplace_status'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/workplace_status.csv')

    return census_dict
