# Census Data Cleaning
Keanna Knebel

---

In [3]:
# import packages
import pandas as pd
import re
import os

In [4]:
# Relative paths to the raw census CSV files, one per census year.
# Each path is handed to clean_census together with its year.
census_2001 = "../../data/raw/census_2001.csv"
census_2006 = "../../data/raw/census_2006.csv"
census_2011 = "../../data/raw/census_2011.csv"
census_2016 = "../../data/raw/census_2016.csv"

In [5]:
def _census_value_to_float(value):
    """Convert one raw census cell to a float; NaN cells pass through unchanged."""
    if value != value:  # NaN is the only value that differs from itself
        return value
    text = str(value)
    # A cell made only of dashes is read as zero.  The dash test is anchored to
    # the whole cell so that a minus sign or an exponent ("-5", "1e-05") is not
    # rewritten into a different number.
    if re.fullmatch(r'\s*-+\s*', text):
        return 0.0
    return float(re.sub(r'[,$]', '', text))


def clean_census(file, year):
    """Read one raw census CSV, normalise it and run the topic cleaners.

    Parameters
    ----------
    file : str
        Path of the raw census CSV (read with latin-1 encoding, skipping the
        first four lines).
    year : int
        Census year; forwarded to every helper and used to name the output
        directory ``../../data/processed/census_<year>``.

    Returns
    -------
    dict
        The dictionary produced by ``create_subgroup_dict`` after it has been
        passed through each cleaner listed below, in order.
    """
    column_names = ['Variable', 'Arbutus-Ridge', 'Downtown', 'Dunbar-Southlands',
                    'Fairview', 'Grandview-Woodland', 'Hastings-Sunrise',
                    'Kensington-Cedar Cottage', 'Kerrisdale', 'Killarney', 'Kitsilano',
                    'Marpole', 'Mount Pleasant', 'Oakridge', 'Renfrew-Collingwood',
                    'Riley Park', 'Shaughnessy', 'South Cambie', 'Strathcona',
                    'Sunset', 'Victoria-Fraserview', 'West End', 'West Point Grey',
                    'Vancouver CSD', 'Vancouver CMA']

    # read in csv file as dataframe
    df = pd.read_csv(file, encoding='latin-1', skiprows=4)

    # remove 'ID' column if present
    df.drop(columns='ID', inplace=True, errors='ignore')

    # rename columns (plain assignment works on every pandas version, whereas
    # set_axis(..., inplace=True) is rejected by pandas >= 2.0)
    df.columns = column_names

    # remove empty rows (keyword form: positional arguments are rejected by pandas >= 2.0)
    df.dropna(axis=0, how='all', inplace=True)

    # remove leading and trailing whitespace from variables
    df['Variable'] = df['Variable'].apply(lambda x: x.strip())

    # drop the rows whose label matches the "20% ... data" note
    sample_note = df['Variable'].str.contains('20%.*data', flags=re.IGNORECASE)
    df.drop(df[sample_note].index, inplace=True)

    # convert every data column (columns 1..24) to float, keeping NaN as NaN
    value_columns = list(df.columns[1:25])
    df[value_columns] = df[value_columns].apply(
        lambda column: column.map(_census_value_to_float))

    # Create the census subdirectory for given year if it doesn't exist
    os.makedirs('../../data/processed/census_' + str(year), exist_ok=True)

    # divide the census datasets into subgroups
    census_dict = create_subgroup_dict(df, year)

    # clean the datatables by topics; each cleaner receives and returns the dictionary
    cleaners = (
        clean_age,
        clean_marital_status,
        clean_private_households,
        clean_couple_fam_structure,
        clean_language_detailed,
        clean_official_language,
        clean_structural_dwelling_type,
        clean_household_size,
        clean_lone_parent,
        clean_immigration_age,
        clean_immigration_period,
        clean_birth_place,
        clean_shelter_tenure,
        clean_visible_minority,

        #### INCOMPLETE HELPER FUNCTIONS ####
        clean_household_type,
        # clean_census_family_children,
        # clean_aboriginal,
        clean_citizenship,
        clean_worker_class,
        # clean_education,
        # clean_ethnic_origin,
        clean_time_worked,
        clean_generation_status,
        # clean_household_char,
        # clean_household_income,
        # clean_individual_income,
        clean_industry,
        clean_labour_force_status,
        # clean_commute_time,
        clean_mobility,
        clean_transport_mode,
        clean_occupation,
        clean_workplace_status,
    )
    for cleaner in cleaners:
        census_dict = cleaner(census_dict, year)

    return census_dict

In [20]:
# Run clean_census once per census year and keep each returned dictionary.
dict_2001=clean_census(census_2001, 2001)
dict_2006=clean_census(census_2006, 2006)
dict_2011=clean_census(census_2011, 2011)
dict_2016=clean_census(census_2016, 2016)

NameError: name 'clean_marital_status' is not defined

In [7]:
def create_subgroup_dict(df, year):
    """Split the census frame into one transposed table per topic.

    Rows whose ``Variable`` matches one of the year's regular expressions are
    treated as the first row of a new group.  Each group is transposed so the
    original data columns become rows (their names stored in ``LocalArea``) and
    is stored under the ``Variable`` text of its first row.  When the same key
    appears again, the first table is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Census frame with a ``Variable`` column and an integer row index.
        For 2011 the row labelled 201 is dropped from ``df`` in place.
    year : int
        One of 2001, 2006, 2011 or 2016.

    Returns
    -------
    dict
        Maps a group's heading text to its transposed DataFrame.

    Raises
    ------
    ValueError
        If ``year`` is not one of the supported census years.
    """
    # separate dataframe by 'Variables' containing regex expressions
    # (raw strings so that "\s" reaches the regex engine untouched)
    patterns_by_year = {
        2001: [r'total.*by',
               r'population.*by',
               r'common-law couples',
               r'^Male',
               r'^Female',
               r'total - male',
               r'total - female'],
        2006: [r'total.*by',
               r'population.*by',
               r'common-law couples',
               r'^Male[s\s,]',
               r'^Female[s\s,]',
               r'total - mobility',
               r'Average number of children'],
        2011: [r'total.*by',
               r'population.*by',
               r'common-law couples',
               r'males',
               r'Total population excluding institutional residents',
               r'Total.*in private households'],
        2016: [r'^total', r'population.*by', r'males'],
    }
    if year not in patterns_by_year:
        raise ValueError('unsupported census year: ' + str(year))

    if year == 2011:
        df.drop(index=201, inplace=True)

    if df.empty:
        return {}

    re1 = patterns_by_year[year]
    is_heading = df.Variable.str.contains('|'.join(re1), flags=re.IGNORECASE)
    subgroup = list(df[is_heading].index)

    # End sentinel: one past the largest row label.  The groups are sliced by
    # label, and the labels may have gaps where rows were removed earlier, so
    # the row count cannot be used as the end of the last group.
    subgroup.append(df.index.max() + 1)

    # the first heading opens the first group, so it is not a group end
    subgroup = subgroup[1:]

    # create census dictionary of sub datasets
    start = 0
    census_dict = {}

    for stop in subgroup:
        heading = df.Variable[start]

        # keep only the first table seen for a given heading
        if heading not in census_dict:
            sub_df = df.loc[start:stop - 1]
            # transpose dataframe and rename column
            census_dict[heading] = (sub_df.set_index('Variable')
                                          .T
                                          .reset_index()
                                          .rename(columns={'index': 'LocalArea'}))
        start = stop

    return census_dict


In [8]:
def clean_age(census_dict, year):
    """Build the population-by-age-and-sex table for one census year.

    The per-sex tables are read from ``census_dict``, copied, given a ``Type``
    column and the year's column names, stacked and sorted by ``LocalArea`` and
    ``Type``.  For 2001 there is no total table, so the total rows are computed
    by summing the female and male rows per ``LocalArea``.  The result is
    stored under ``'population by age and sex'`` and written to
    ``population_age_sex.csv`` in the year's processed directory.

    The source tables in ``census_dict`` are not modified.

    Raises
    ------
    ValueError
        If ``year`` is not one of 2001, 2006, 2011 or 2016.
    """
    five_year_groups = ['LocalArea', 'Type', 'Total', '0 to 4 years',
                        '5 to 9 years', '10 to 14 years', '15 to 19 years',
                        '20 to 24 years', '25 to 29 years', '30 to 34 years',
                        '35 to 39 years', '40 to 44 years', '45 to 49 years',
                        '50 to 54 years', '55 to 59 years', '60 to 64 years',
                        '65 to 69 years', '70 to 74 years', '75 to 79 years',
                        '80 to 84 years', '85 to 89 years', '90 to 94 years',
                        '95 to 99 years', '100 years and over']

    if year == 2001:
        column_names = five_year_groups
        keys = {'female': 'Female', 'male': 'Male'}

    elif year == 2006:
        column_names = five_year_groups + ['Median Age']
        keys = {'female': 'Female, Total',
                'male': 'Male, Total',
                'total': 'Male & Female, Total'}

    elif year == 2011:
        column_names = ['LocalArea', 'Type', 'Total', '0 to 4 years',
                        '5 to 9 years', '10 to 14 years', '15 to 19 years',
                        '15 years', '16 years', '17 years', '18 years', '19 years',
                        '20 to 24 years', '25 to 29 years', '30 to 34 years',
                        '35 to 39 years', '40 to 44 years', '45 to 49 years',
                        '50 to 54 years', '55 to 59 years', '60 to 64 years',
                        '65 to 69 years', '70 to 74 years', '75 to 79 years',
                        '80 to 84 years', '85 years and over', 'Median age',
                        '% of the population aged 15 and over']
        keys = {'female': 'Females, total',
                'male': 'Males, total',
                'total': 'Total population by age groups'}

    elif year == 2016:
        column_names = ['LocalArea', 'Type', 'Total', '0 to 14 years',
                        '0 to 4 years', '5 to 9 years', '10 to 14 years',
                        '15 to 64 years', '15 to 19 years', '20 to 24 years',
                        '25 to 29 years', '30 to 34 years', '35 to 39 years',
                        '40 to 44 years', '45 to 49 years', '50 to 54 years',
                        '55 to 59 years', '60 to 64 years', '65 years and over',
                        '65 to 69 years', '70 to 74 years', '75 to 79 years',
                        '80 to 84 years', '85 years and over', '85 to 89 years',
                        '90 to 94 years', '95 to 99 years', '100 years and over']
        keys = {'female': 'Total - Age groups and average age of females - 100% data',
                'male': 'Total - Age groups and average age of males - 100% data',
                'total': 'Total - Age groups and average age of the population - 100% data'}

    else:
        raise ValueError('unsupported census year: ' + str(year))

    # look every table up first so a missing key fails before any work is done
    sources = {label: census_dict[key] for label, key in keys.items()}

    labelled = []
    for label, table in sources.items():
        # work on a copy so running the cleaner twice on one dictionary is safe
        table = table.copy()
        table.insert(1, 'Type', label)
        table.columns = column_names
        labelled.append(table)
    merged = pd.concat(labelled)

    if year == 2001:
        # no total table for this year: add female and male per local area
        total = merged.drop(columns='Type').groupby('LocalArea').sum()
        total.reset_index(inplace=True)
        total.insert(1, 'Type', 'total')
        merged = pd.concat([merged, total])

    merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
    census_dict['population by age and sex'] = merged
    merged.to_csv('../../data/processed/census_' + str(year) + '/population_age_sex.csv')

    return census_dict

# Incomplete Helper Functions

In [9]:
def clean_household_type(census_dict, year):
    """Extract the household-type table for one census year.

    Selects ``LocalArea``, the year's total column and three household-type
    columns from the year's source table, sorts the rows by ``LocalArea``,
    stores the result under ``'household_type'`` and writes it to
    ``household_type.csv`` in the year's processed directory.  The column
    names are kept as they appear in the source table, so they differ between
    census years.

    The source table in ``census_dict`` is not modified.

    Raises
    ------
    ValueError
        If ``year`` is not one of 2001, 2006, 2011 or 2016.
    """
    households = 'Total number of private households by household type'
    households_2016 = 'Total - Private households by household type - 100% data'

    if year in (2001, 2006):
        source_key = households
        column_names = ['LocalArea', households,
                        'One-family households', 'Multiple-family households',
                        'Non-family households']

    elif year == 2011:
        # no useful information in nhs
        source_key = households
        column_names = ['LocalArea', households,
                        'One-family only households', 'Couple family households',
                        'Other family households']

    elif year == 2016:
        source_key = households_2016
        column_names = ['LocalArea', households_2016,
                        'One-census-family households',
                        'Multiple-census-family households',
                        'Non-census-family households']

    else:
        raise ValueError('unsupported census year: ' + str(year))

    # copy the selection so the in-place sort does not act on a slice of the
    # source table
    df = census_dict[source_key][column_names].copy()
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['household_type'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/household_type.csv')

    return census_dict

In [10]:
def _worker_class_table(table, label, column_names, total_column=None):
    """Return a copy of a class-of-worker table with a Type column and new names.

    When ``total_column`` is given, ``LocalArea``, that column,
    ``'Class of worker - Not applicable'`` and ``'All classes of worker'`` are
    selected by name; otherwise the first four columns are taken by position.
    """
    if total_column is None:
        picked = table.iloc[:, 0:4].copy()
    else:
        picked = table[['LocalArea', total_column,
                        'Class of worker - Not applicable',
                        'All classes of worker']].copy()
    picked.insert(1, 'Type', label)
    picked.columns = column_names
    return picked


def clean_worker_class(census_dict, year):
    """Build the class-of-worker table (total, male, female) for one census year.

    For 2001, 2006 and 2016 the three source tables are read from
    ``census_dict``; for 2011 the table is read from
    ``../../data/processed/nhs/Class of worker.csv``.  The result is sorted by
    ``LocalArea`` and ``Type``, stored under ``'worker_class'`` and written to
    ``worker_class.csv`` in the year's processed directory.

    Raises
    ------
    ValueError
        If ``year`` is not one of 2001, 2006, 2011 or 2016.
    """
    column_names = ['LocalArea', 'Type',
                    'Total labour force aged 15 years and over by class of worker',
                    'Class of worker - not applicable', 'All classes of worker']

    # (Type label, dictionary key) per year; for 2001 and 2006 the key is also
    # the name of the total column inside the table
    sources = {
        2001: [('total', 'Total labour force 15 years and over  by class of worker'),
               ('male', 'Males labour force 15 years and over  by class of worker'),
               ('female', 'Females labour force 15 years and over  by class of worker')],
        2006: [('total', 'Total labour force 15 years and over by class of worker'),
               ('male', 'Male labour force 15 years and over - class of worker'),
               ('female', 'Female labour force 15 years and over - class of worker')],
        2016: [('total', 'Total labour force aged 15 years and over by class of worker - 25% sample data'),
               ('male', 'Total male labour force aged 15 years and over by class of worker - 25% sample data'),
               ('female', 'Total female labour force aged 15 years and over by class of worker - 25% sample data')],
    }

    if year == 2011:
        df = pd.read_csv('../../data/processed/nhs/Class of worker.csv', index_col=0)
        merged = df[column_names].copy()
    elif year in sources:
        # look every table up first so a missing key fails before any work is done
        tables = [(label, key, census_dict[key]) for label, key in sources[year]]
        merged = pd.concat([
            _worker_class_table(table, label, column_names,
                                total_column=None if year == 2016 else key)
            for label, key, table in tables
        ])
    else:
        raise ValueError('unsupported census year: ' + str(year))

    merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
    census_dict['worker_class'] = merged
    merged.to_csv('../../data/processed/census_' + str(year) + '/worker_class.csv')
    return census_dict

In [11]:
def clean_generation_status(census_dict, year):
    """Build the generation-status table for one census year.

    For 2001, 2006 and 2016 the first five columns of the year's source table
    are taken by position; for 2011 the rows with ``Type == 'Total'`` are read
    from ``../../data/processed/nhs/Generation status.csv``.  The columns are
    renamed to a shared set of names, the rows are sorted by ``LocalArea``, and
    the result is stored under ``'generation_status'`` and written to
    ``generation_status.csv`` in the year's processed directory.

    Raises
    ------
    ValueError
        If ``year`` is not one of 2001, 2006, 2011 or 2016.
    """
    total = 'Total Population 15 years and older by generation status'
    column_names = ['LocalArea', total,
                    '1st generation', '2nd generation', '3rd generation and over']

    source_keys = {
        2001: 'Total population 15 years and over by generation status',
        2006: 'Total population 15 years and older by generation status',
        2016: 'Total - Generation status for the population in private households - 25% sample data',
    }

    if year == 2011:
        df = pd.read_csv('../../data/processed/nhs/Generation status.csv', index_col=0)
        df = df.loc[df['Type'] == 'Total'].reset_index(drop=True).drop(columns='Type')
        # the remaining columns are named by position, then put in the shared order
        df.columns = ['1st generation', '2nd generation', '3rd generation and over',
                      total, 'LocalArea']
        df = df[column_names].copy()
    elif year in source_keys:
        df = census_dict[source_keys[year]].iloc[:, 0:5].copy()
        df.columns = column_names
    else:
        raise ValueError('unsupported census year: ' + str(year))

    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['generation_status'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/generation_status.csv')
    return census_dict

In [12]:
def clean_time_worked(census_dict, year):
    """Build the full-time / part-time work table for one census year.

    Produces one row per ``LocalArea`` and ``Type`` (total, male, female) with
    the population, full-time and part-time counts plus a remainder column
    (population minus the counted categories).  The result is sorted by
    ``LocalArea`` and ``Type``, stored under ``'time_worked'`` and written to
    ``time_worked.csv`` in the year's processed directory, for every supported
    year.

    For 2006 there is no total table, so the total rows are the female and
    male values added per ``LocalArea`` (a missing value counts as zero).
    For 2011 the table is read from
    ``../../data/processed/nhs/Full-time or part-time weeks worked.csv`` and
    negative remainders are set to zero.

    The source tables in ``census_dict`` are not modified.

    Raises
    ------
    ValueError
        If ``year`` is not one of 2001, 2006, 2011 or 2016.
    """
    population = 'Population 15 years and over by work activity'
    remainder = 'Worked partialy full time and partially part time'
    value_names = [population, 'full time', 'part time']
    column_names = ['LocalArea', 'Type'] + value_names

    if year == 2001:
        sources = [
            ('total', 'Total population 15 years and over with employment income, by sex and work activity'),
            ('male', 'Males 15 years and over with employment income by work activity'),
            ('female', 'Females 15 years and over with employment income by work activity'),
        ]
        parts = []
        for label, key in sources:
            # the dictionary key is also the name of the table's total column
            part = census_dict[key][['LocalArea', key,
                                     'Worked full year, full time',
                                     'Worked part year or part time']].copy()
            part.insert(1, 'Type', label)
            part.columns = column_names
            part[remainder] = part[population] - part['full time'] - part['part time']
            parts.append(part)
        merged = pd.concat(parts)

    elif year == 2006:
        by_sex = {}
        for label, key in (('female', 'Females 15 years and over with employment income'),
                           ('male', 'Males 15 years and over with employment income')):
            part = census_dict[key][['LocalArea', key,
                                     'Worked full year, full time',
                                     'Worked part year or part time']].copy()
            part.columns = ['LocalArea'] + value_names
            by_sex[label] = part

        # Calculate total: female (_x) plus male (_y) for every value column
        total = pd.merge(by_sex['female'], by_sex['male'],
                         on='LocalArea', suffixes=('_x', '_y'))
        for name in value_names:
            total[name] = total[name + '_x'].fillna(0) + total[name + '_y'].fillna(0)
        total = total[['LocalArea'] + value_names].copy()

        by_sex['female'].insert(1, 'Type', 'female')
        by_sex['male'].insert(1, 'Type', 'male')
        total.insert(1, 'Type', 'total')
        merged = pd.concat([total, by_sex['male'], by_sex['female']])
        merged[remainder] = merged[population] - merged['full time'] - merged['part time']

    elif year == 2011:
        nhs = pd.read_csv('../../data/processed/nhs/Full-time or part-time weeks worked.csv',
                          index_col=0)
        nhs = nhs.iloc[:, 0:7].copy()
        # 'x' and 'y' are placeholder names for the two columns that are not kept
        nhs.columns = ['Type', 'x', population, 'full time', 'y', 'part time', 'LocalArea']
        nhs[remainder] = nhs[population] - nhs['full time'] - nhs['part time'] - nhs['x']
        nhs.loc[nhs[remainder] < 0, remainder] = 0
        merged = nhs[column_names + [remainder]].copy()

    elif year == 2016:
        sources = [
            ('female', 'Females aged 15 years and over by work activity during the reference year - 25% sample data'),
            ('male', 'Males aged 15 years and over by work activity during the reference year - 25% sample data'),
            ('total', 'Total population aged 15 years and over by work activity during the reference year - 25% sample data'),
        ]
        # look every table up first so a missing key fails before any work is done
        tables = [(label, key, census_dict[key]) for label, key in sources]
        parts = []
        for label, key, raw in tables:
            part = raw[['LocalArea', key,
                        'Worked full year, full time',
                        'Worked part year and/or part time']].copy()
            part[remainder] = raw[key] - raw['Did not work'] - raw['Worked']
            part.columns = ['LocalArea'] + value_names + [remainder]
            part.insert(1, 'Type', label)
            parts.append(part)
        merged = pd.concat(parts)

    else:
        raise ValueError('unsupported census year: ' + str(year))

    # common tail for every year: sort, store, write and return the dictionary
    merged.sort_values(by=['LocalArea', 'Type'], inplace=True)
    census_dict['time_worked'] = merged
    merged.to_csv('../../data/processed/census_' + str(year) + '/time_worked.csv')
    return census_dict


In [13]:
def clean_citizenship(census_dict, year):
    """Build the citizenship table for one census year.

    The output always has the columns ``LocalArea``, ``Canadian citizens`` and
    ``Not Canadian citizens``.  For 2001 the source columns
    ``Canadian Citizenship`` and ``Citizenship other than Canadian`` are
    selected and renamed to those names.  For 2011 the table is read from
    ``../../data/processed/nhs/Citizenship.csv``.  The result is sorted by
    ``LocalArea``, stored under ``'citizenship'`` and written to
    ``citizenship.csv`` in the year's processed directory.

    The source tables in ``census_dict`` are not modified.

    Raises
    ------
    ValueError
        If ``year`` is not one of 2001, 2006, 2011 or 2016, or if the 2016
        table has to be renamed by position and does not have exactly three
        columns.
    """
    column_names = ['LocalArea', 'Canadian citizens', 'Not Canadian citizens']

    if year == 2001:
        source = census_dict['Total population by citizenship']
        df = source[['LocalArea', 'Canadian Citizenship',
                     'Citizenship other than Canadian']].copy()
    elif year == 2006:
        df = census_dict['Total population by citizenship'][column_names].copy()
    elif year == 2011:
        df = pd.read_csv('../../data/processed/nhs/Citizenship.csv', index_col=0)
        df = df[column_names].copy()
    elif year == 2016:
        source = census_dict['Total - Citizenship for the population in private households - 25% sample data']
        # NOTE(review): assumes the 2016 table carries the same two column
        # names as 2006/2011 - confirm against the raw data.  When it does not,
        # fall back to renaming the whole table by position.
        if set(column_names).issubset(source.columns):
            df = source[column_names].copy()
        else:
            df = source.copy()
    else:
        raise ValueError('unsupported census year: ' + str(year))

    df.columns = column_names
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['citizenship'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/citizenship.csv')

    return census_dict

In [14]:
def clean_industry(census_dict, year):
    """Build the labour-force-by-industry table for one census year.

    Keeps the ``LocalArea`` column, the "industry not applicable" column and
    every column whose name starts with a digit.  The leading digits, spaces
    and hyphens are stripped from those names.  For 2011 the rows with
    ``Type == "Total"`` are read from ``../../data/processed/nhs/Industry.csv``;
    for 2001 and 2006 the year's table is taken from ``census_dict``, and any
    other year uses the 2016 table.  The result is sorted by ``LocalArea``,
    stored under ``'industry'`` and written to ``industry.csv`` in the year's
    processed directory.

    The source table in ``census_dict`` is not modified.
    """
    if year == 2011:
        df = pd.read_csv('../../data/processed/nhs/Industry.csv', index_col=0).query('Type == "Total"')
        # positions of the LocalArea and "not applicable" columns in the NHS file
        label_positions = [-1, -3]
    else:
        if year == 2001:
            df = census_dict['Total labour force 15 years and over by industry - 1997 North American Industry Classification System']
        elif year == 2006:
            df = census_dict['Total labour force 15 years and over by industry - North American Industry Classification System 2002']
        else:
            df = census_dict['Total Labour Force population aged 15 years and over by Industry - North American Industry Classification System (NAICS) 2012 - 25% sample data']
        # positions of the LocalArea and "not applicable" columns in the census table
        label_positions = [0, 2]

    label = list(df.columns[label_positions])

    # industry columns are the ones whose name starts with a digit
    industries_original = [name for name in df.columns if re.match(r'^[0-9]', name)]
    industries = [re.findall(r'^[0-9 -]*(.*)', name)[0] for name in industries_original]

    # copy the selection so the rename and in-place sort do not act on a slice
    df = df.loc[:, label + industries_original].copy()
    df.columns = ['LocalArea', 'Industry not applicable'] + industries

    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['industry'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/industry.csv')

    return census_dict

In [15]:
def clean_labour_force_status(census_dict, year):
    """Build the labour-force-status table for one census year.

    For 2011 the table is read from
    ``../../data/processed/nhs/Labour force status.csv``.  For 2001 and 2006
    the year's total, male and female tables are taken from ``census_dict``,
    and any other year uses the 2016 tables.  Eight columns are picked from
    each table by position, a ``Type`` column (Total / Male / Female) is
    added, and the three tables are stacked.  The result is sorted by
    ``LocalArea`` and ``Type``, stored under ``'labour_force_status'`` and
    written to ``labour_force_status.csv`` in the year's processed directory.

    The source tables in ``census_dict`` are not modified.
    """
    column_names = ['LocalArea', 'Type', 'Employed', 'Employment rate', 'In the labour force',
                    'Not in the labour force', 'Participation rate', 'Unemployed', 'Unemployment rate']

    if year == 2011:
        df = pd.read_csv(
            '../../data/processed/nhs/Labour force status.csv', index_col=0)
        # selecting the named columns also leaves out the overall total column
        df = df[column_names].copy()

    else:
        # source positions of LocalArea followed by the seven measures, listed
        # in the order of column_names (without Type)
        order = [0, 3, 7, 2, 5, 6, 4, 8]

        if year == 2001:
            sources = [
                ('Total', 'Population - 15 years and over by labour force activity'),
                ('Male', 'Total - Males 15 years and over'),
                ('Female', 'Total - Females 15 years and over'),
            ]
        elif year == 2006:
            sources = [
                ('Total', 'Total population 15 years and over by labour force activity'),
                ('Male', 'Males 15 years and over - Labour force activity'),
                ('Female', 'Females 15 years and over - Labour force activity'),
            ]
        else:
            sources = [
                ('Total', 'Total - Population aged 15 years and over by Labour force status - 25% sample data'),
                ('Male', 'Total - Males aged 15 years and over by Labour force status - 25% sample data'),
                ('Female', 'Total - Females aged 15 years and over by Labour force status - 25% sample data'),
            ]

        # look every table up first so a missing key fails before any work is done
        tables = [(label, census_dict[key]) for label, key in sources]

        parts = []
        for label, table in tables:
            part = table.iloc[:, order].copy()
            part.insert(1, 'Type', label)
            part.columns = column_names
            parts.append(part)

        df = pd.concat(parts)

    # sort on Type as well so the row order within a local area is fixed
    df.sort_values(by=['LocalArea', 'Type'], inplace=True)
    census_dict['labour_force_status'] = df
    df.to_csv('../../data/processed/census_' +
              str(year) + '/labour_force_status.csv')

    return census_dict


In [16]:
def clean_mobility(census_dict, year):
    """Build the mobility table for one census year.

    For 2011 the rows with ``Type == "Total"`` are read from
    ``../../data/processed/nhs/Mobility.csv`` and six columns are picked by
    position.  For 2001 and 2006 the year's 1-year and 5-year mobility tables
    are taken from ``census_dict``, and any other year uses the 2016 tables.
    Their ``Migrants``, ``Non-migrants`` and ``Non-movers`` columns are added
    together per ``LocalArea``, and the second column of each table is
    attached as that table's total, matched on ``LocalArea``.  The result is
    sorted by ``LocalArea``, stored under ``'mobility'`` and written to
    ``mobility.csv`` in the year's processed directory.

    The source tables in ``census_dict`` are not modified.
    """
    total_1yr = 'Total - Mobility status 1 year ago'
    total_5yr = 'Total - Mobility status 5 years ago'
    column_names = ['LocalArea', 'Migrants', 'Non-migrants', 'Non-movers',
                    total_1yr, total_5yr]
    sel_col = ['LocalArea', 'Migrants', 'Non-migrants', 'Non-movers']

    if year == 2011:

        df = pd.read_csv('../../data/processed/nhs/Mobility.csv',
                         index_col=0).query('Type == "Total"').iloc[:, [-1, 5, 7, 8, -3, -2]]

    else:
        if year == 2001:
            yr1 = census_dict['Total population 1 year and over by mobility status 1 year ago']
            yr5 = census_dict['Total population 5 years and over by mobility status 5 years ago']
        elif year == 2006:
            yr1 = census_dict['Total - Mobility status 1 year ago']
            yr5 = census_dict['Total - Mobility status 5 years ago']
        else:
            yr1 = census_dict['Total - Mobility status 1 year ago - 25% sample data']
            yr5 = census_dict['Total - Mobility status 5 years ago - 25% sample data']

        df = (pd.concat([yr1.loc[:, sel_col], yr5.loc[:, sel_col]])
                .groupby(['LocalArea'])
                .sum()
                .reset_index())

        # Attach each table's total by LocalArea.  The grouped frame has a
        # fresh 0..n-1 index in LocalArea order, so assigning the source
        # column directly would pair totals with the wrong areas.
        for name, table in ((total_1yr, yr1), (total_5yr, yr5)):
            totals = dict(zip(table['LocalArea'], table.iloc[:, 1]))
            df[name] = df['LocalArea'].map(totals)

    df.columns = column_names
    df.sort_values(by=['LocalArea'], inplace=True)
    census_dict['mobility'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/mobility.csv')

    return census_dict


In [17]:
def clean_transport_mode(census_dict, year):
    """Build the commuting-mode table by LocalArea and Type, store and save it.

    For 2011 the table is read from the pre-processed NHS file
    ``../../data/processed/nhs/Mode of transportation.csv``. For every other
    year the male and female tables in ``census_dict`` are reordered to the
    output layout, tagged with a 'Type' of 'Male' / 'Female', and a 'Total'
    row per LocalArea is added as the sum of the two. For years other than
    2011 and 2016 the 'Taxicab' and 'Motorcycle' columns are folded into
    'Other method' first.

    Parameters
    ----------
    census_dict : dict
        Maps census variable names to DataFrames; columns are selected by
        position, with column 0 used as 'LocalArea'. The source tables are
        copied before being changed, so the caller's frames are untouched.
    year : int
        Census year; selects the dictionary keys, the column order and the
        output directory.

    Returns
    -------
    dict
        ``census_dict`` with the result stored under ``'transport_mode'``.
        The same frame is written to
        ``../../data/processed/census_<year>/transport_mode.csv``.
    """
    column_names = ['LocalArea', 'Type', 'Bicycle',
                    'Car as driver', 'Car as passenger',
                    'Other methods', 'Public transit', 'Walked']

    if year == 2011:
        df = pd.read_csv('../../data/processed/nhs/Mode of transportation.csv',
                         index_col=0).iloc[:, [-1, 0, 1, 2, 3, 4, 5, -2]]

    else:
        if year == 2016:
            order = [0, -2, 2, 3, -1, 4, 5]
            male = census_dict['Total - Main mode of commuting for the male employed labour force aged 15 years and over in private households with a usual place of work or no fixed workplace address - 25% sample data']
            female = census_dict['Total - Main mode of commuting for the female employed labour force aged 15 years and over in private households with a usual place of work or no fixed workplace address - 25% sample data']

            male = male.iloc[:, order].copy()
            female = female.iloc[:, order].copy()

            male.insert(1, 'Type', 'Male')
            female.insert(1, 'Type', 'Female')

        else:
            # -1 is the 'Type' column appended below, -2 the last source column.
            order = [0, -1, 6, 2, 3, -2, 4, 5]
            if year == 2001:
                male_key = 'Males with a usual place of work or no fixed workplace address'
            else:
                male_key = 'Males with usual place of work or no fixed workplace address'
            female_key = 'Females with usual place of work or no fixed workplace address'

            # Work on copies: adding 'Type' and rewriting 'Other method' on
            # the frames held in census_dict would double-count Taxicab and
            # Motorcycle if this function were run a second time.
            male = census_dict[male_key].copy()
            female = census_dict[female_key].copy()

            male['Type'] = 'Male'
            male['Other method'] = male['Other method'] + \
                male['Taxicab'] + male['Motorcycle']
            male = male.iloc[:, order]

            female['Type'] = 'Female'
            female['Other method'] = female['Other method'] + \
                female['Taxicab'] + female['Motorcycle']
            female = female.iloc[:, order]

        male.columns = column_names
        female.columns = column_names

        df = pd.concat([male, female])

        # Drop the label column before summing so the aggregation does not
        # depend on how the installed pandas treats string columns.
        total = df.drop(columns='Type').groupby(['LocalArea']).sum().reset_index()
        total.insert(1, 'Type', 'Total')

        df = pd.concat([df, total])

    # Plain attribute assignment works on every pandas version;
    # set_axis(..., inplace=True) was removed in pandas 2.0.
    df.columns = column_names
    df = df.sort_values(by=['LocalArea'])
    census_dict['transport_mode'] = df
    df.to_csv('../../data/processed/census_' +
              str(year) + '/transport_mode.csv')

    return census_dict


In [18]:
def clean_occupation(census_dict, year):
    """Build the occupation table by LocalArea and Type, store and save it.

    * 2001 / 2006: the female and male tables are reduced to columns 0, 3
      and 2 plus every column whose name starts with a capital letter and a
      space; a 'Total' row per LocalArea is added as the sum of both.
    * 2011: the table is read from the pre-processed NHS file
      ``../../data/processed/nhs/Occupation.csv`` and reordered.
    * 2016: the total, female and male tables are reduced to columns 0, 3, 2
      and 4 to 13 and stacked.

    Parameters
    ----------
    census_dict : dict
        Maps census variable names to DataFrames; column 0 is used as
        'LocalArea'. The source tables are copied before the 'Type' column
        is added, so the caller's frames are untouched.
    year : int
        Census year: 2001, 2006, 2011 or 2016.

    Returns
    -------
    dict
        ``census_dict`` with the result stored under ``'occupation'``. The
        same frame is written to
        ``../../data/processed/census_<year>/occupation.csv``.

    Raises
    ------
    ValueError
        If ``year`` is not one of the supported census years.
    """
    # NOTE(review): 'Social Sience and education' is misspelled, but it is
    # the header written to the CSV, so it is kept for downstream consumers.
    column_names = ['LocalArea', 'Type', 'All occupations', 'Occupations n/a',
                    'Management', 'Business and finance', 'Natural and applied sciences',
                    'Health', 'Social Sience and education', 'Art', 'Sales and service',
                    'Trades and transport', 'Natural resources and agriculture', 'Manufacturing and utilities']

    def with_type(frame, label):
        """Return a copy of *frame* with a trailing 'Type' column set to *label*."""
        tagged = frame.copy()
        tagged['Type'] = label
        return tagged

    if year == 2001 or year == 2006:
        if year == 2001:
            female_key = 'Female labour force 15 years and over - Occupation'
            male_key = 'Male labour force 15 years and over - Occupation'
        else:
            female_key = 'Female labour force 15 years and over by occupation - National Occupational Classification for Statistics 2006'
            male_key = 'Male labour force 15 years and over by occupation - National Occupational Classification for Statistics 2006'

        # Occupation categories are the columns named "<capital letter> ...".
        occupations = [i for i in census_dict[female_key].columns
                       if re.match(r'^[A-Z] ', i)]

        female = with_type(census_dict[female_key], 'Female')
        female = pd.concat(
            [female.iloc[:, [0, -1, 3, 2]], female.loc[:, occupations]], axis=1)
        female.columns = column_names

        male = with_type(census_dict[male_key], 'Male')
        male = pd.concat(
            [male.iloc[:, [0, -1, 3, 2]], male.loc[:, occupations]], axis=1)
        male.columns = column_names

        df = pd.concat([female, male])

        # Drop the label column before summing so the aggregation does not
        # depend on how the installed pandas treats string columns.
        total = df.drop(columns='Type').groupby(['LocalArea']).sum().reset_index()
        total.insert(1, 'Type', 'Total')

        df = pd.concat([df, total])

    elif year == 2011:
        df = pd.read_csv(
            '../../data/processed/nhs/Occupation.csv', index_col=0)

        df = pd.concat(
            (df.iloc[:, [14, 0, 11, 12]], df.iloc[:, 1:11]), axis=1)

    elif year == 2016:
        total = with_type(census_dict['Total labour force population aged 15 years and over by occupation - National Occupational Classification (NOC) 2016 - 25% sample data'], 'Total')
        female = with_type(census_dict['Total female labour force population aged 15 years and over by occupation - National Occupational Classification (NOC) 2016 - 25% sample data'], 'Female')
        male = with_type(census_dict['Total male labour force population aged 15 years and over by occupation - National Occupational Classification (NOC) 2016 - 25% sample data'], 'Male')

        total = pd.concat(
            (total.iloc[:, [0, -1, 3, 2]], total.iloc[:, 4:14]), axis=1)
        female = pd.concat(
            (female.iloc[:, [0, -1, 3, 2]], female.iloc[:, 4:14]), axis=1)
        male = pd.concat(
            (male.iloc[:, [0, -1, 3, 2]], male.iloc[:, 4:14]), axis=1)

        total.columns = column_names
        female.columns = column_names
        male.columns = column_names

        df = pd.concat([total, female, male])

    else:
        raise ValueError('unsupported census year: ' + str(year))

    # Plain attribute assignment works on every pandas version;
    # set_axis(..., inplace=True) was removed in pandas 2.0.
    df.columns = column_names
    df = df.sort_values(by=['LocalArea'])
    census_dict['occupation'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/occupation.csv')

    return census_dict


In [19]:
def clean_workplace_status(census_dict, year):
    """Build the place-of-work table by LocalArea and Type, store and save it.

    For 2011 the table is read from the pre-processed NHS file
    ``../../data/processed/nhs/Place of work status.csv``. For every other
    year the male and female tables in ``census_dict`` are reordered to the
    output layout, tagged with a 'Type' of 'Male' / 'Female', and a 'Total'
    row per LocalArea is added as the sum of the two.

    Parameters
    ----------
    census_dict : dict
        Maps census variable names to DataFrames; columns are selected by
        position, with column 0 used as 'LocalArea'.
    year : int
        Census year; 2001 and 2006 use the 'Males' / 'Females' tables, 2011
        the NHS file, any other value the 2016 keys.

    Returns
    -------
    dict
        ``census_dict`` with the result stored under ``'workplace_status'``.
        The same frame is written to
        ``../../data/processed/census_<year>/workplace_status.csv``.
    """
    column_names = ['LocalArea', 'Type',
                    'Worked at home', 'Worked at usual place', 'Worked outside Canada',
                    'No fixed workplace']

    if year == 2011:

        df = pd.read_csv('../../data/processed/nhs/Place of work status.csv',
                         index_col=0).iloc[:, [-1, 0, 3, 4, 5, 1]]

    else:
        if year == 2001 or year == 2006:
            order = [0, -3, 2, -2, -1]
            male_key = 'Males'
            female_key = 'Females'
        else:
            order = [0, 2, -1, 3, 4]
            male_key = 'Total - Place of work status for the male employed labour force aged 15 years and over in private households - 25% sample data'
            female_key = 'Total - Place of work status for the female employed labour force aged 15 years and over in private households - 25% sample data'

        # Explicit copies keep the frames stored in census_dict untouched.
        male = census_dict[male_key].iloc[:, order].copy()
        female = census_dict[female_key].iloc[:, order].copy()

        male.insert(1, 'Type', 'Male')
        female.insert(1, 'Type', 'Female')

        male.columns = column_names
        female.columns = column_names

        df = pd.concat([female, male])

        # Drop the label column before summing so the aggregation does not
        # depend on how the installed pandas treats string columns.
        total = df.drop(columns='Type').groupby(['LocalArea']).sum().reset_index()
        total.insert(1, 'Type', 'Total')

        df = pd.concat([df, total])

    # Plain attribute assignment works on every pandas version;
    # set_axis(..., inplace=True) was removed in pandas 2.0.
    df.columns = column_names
    df = df.sort_values(by=['LocalArea'])
    census_dict['workplace_status'] = df
    df.to_csv('../../data/processed/census_' + str(year) + '/workplace_status.csv')

    return census_dict
