In [1]:
import zipfile
import pandas as pd
import geopandas as gpd
import os

In [7]:
# extract and read boundary data from zip file
boundaries_zip_path = "../../data/raw/census_boundaries_2011.zip"
# open the archive in a context manager so its file handle is closed
# as soon as extraction finishes (it was previously left open)
with zipfile.ZipFile(boundaries_zip_path) as z:
    z.extractall(path="../../data/raw/") # extract to folder
nhs_boundaries = gpd.read_file("../../data/raw/gct_000b11a_e.shp")

# read 2011 NHS data for BC from zip file
nhs_zip_path = "../../data/raw/nhs_census_2011.zip"
with zipfile.ZipFile(nhs_zip_path,"r") as z:
    with z.open("99-004-XWE2011001-401-BC.csv") as f:
        # usecols keeps only the CSV columns at these positions; later cells
        # rely on Geo_Code, Topic, Characteristic, Total, Male and Female
        nhs_2011 = pd.read_csv(f, encoding='latin-1', usecols=[0,2,3,4,5,6,8,10,12])

# read in local areas boundaries
areas = gpd.read_file("../../data/raw/local_area_boundary.geojson")

In [None]:
# select only census tracts in Vancouver, keeping just the needed columns.
# Everything is built as a new frame (no inplace edits on a slice of
# nhs_boundaries), which avoids pandas' SettingWithCopyWarning and makes
# the cell safe to re-run.
van_bound = (
    nhs_boundaries
    .loc[nhs_boundaries['CMANAME'] == 'Vancouver', ['CTUID', 'geometry']]
    .rename(columns={'CTUID': 'Geo_Code'})
    .reset_index(drop=True)
)

# Geo_Code is converted to float; it is the key used to merge with the NHS data later
van_bound['Geo_Code'] = van_bound['Geo_Code'].astype(float)

# 'None' is a placeholder string meaning "no local area matched yet"
van_bound['LocalArea'] = 'None'
van_bound

In [None]:
# Assign each census tract to the first local area whose polygon contains
# the tract's centroid. Tracts that fall in no polygon keep their existing
# 'LocalArea' value.
# NOTE(review): positions are used as row ids, which matches van_bound after
# its reset_index; assumes `areas` also has a default 0..n-1 index — confirm.
local_areas = list(van_bound['LocalArea'])

for row, tract_geom in enumerate(van_bound.geometry):
    tract_centre = tract_geom.centroid
    for area_name, area_geom in zip(areas.name, areas.geometry):
        if area_geom.contains(tract_centre):
            local_areas[row] = str(area_name)
            break

In [3]:
# Write the matched local-area names back onto the tracts, then discard
# every tract still carrying the 'None' placeholder.
van_bound['LocalArea'] = local_areas
has_local_area = van_bound['LocalArea'] != 'None'
van_bound = van_bound.loc[has_local_area]
van_bound

NameError: name 'local_areas' is not defined

In [16]:
# merge local area to NHS data (inner join on Geo_Code), keep only the
# columns used downstream, and trim surrounding whitespace from the
# Characteristic labels.
# Built as one chain so no column is assigned onto a slice (avoids
# SettingWithCopyWarning); .str.strip() also tolerates missing labels,
# where calling lstrip()/rstrip() on each value would raise.
merged_df = (
    nhs_2011
    .merge(van_bound, on='Geo_Code')
    .loc[:, ['LocalArea', 'Topic', 'Characteristic', 'Total', 'Male', 'Female']]
    .assign(Characteristic=lambda d: d['Characteristic'].str.strip())
)
merged_df

Unnamed: 0,LocalArea,Topic,Characteristic,Total,Male,Female
0,Victoria-Fraserview,Citizenship,Total population in private households by citi...,3015.0,1395.0,1620.0
1,Victoria-Fraserview,Citizenship,Canadian citizens,2670.0,1230.0,1440.0
2,Victoria-Fraserview,Citizenship,Canadian citizens aged under 18,410.0,175.0,235.0
3,Victoria-Fraserview,Citizenship,Canadian citizens aged 18 and over,2255.0,1050.0,1210.0
4,Victoria-Fraserview,Citizenship,Not Canadian citizens,345.0,160.0,185.0
...,...,...,...,...,...,...
111355,Kitsilano,Income of individuals in 2010,Prevalence of low income in 2010 based on afte...,20.3,24.2,16.5
111356,Kitsilano,Income of individuals in 2010,Less than 18 years (%),24.5,32.3,11.1
111357,Kitsilano,Income of individuals in 2010,Less than 6 years (%),35.7,0.0,25.0
111358,Kitsilano,Income of individuals in 2010,18 to 64 years (%),20.2,23.8,17.0


In [17]:
# Partition the merged frame into one sub-frame per distinct Topic.
# `local_areas` (distinct area names) and the empty `sub_group` dict are
# prepared here for the export cell that follows.
topics = list(merged_df['Topic'].unique())
local_areas = list(merged_df['LocalArea'].unique())
sub_group = {}
sub_dataframes = {
    topic_name: merged_df[merged_df['Topic'] == topic_name]
    for topic_name in topics
}

In [18]:
# Create the nhs directory if it doesn't exist
nhs_out_dir = '../../data/processed/nhs/'
os.makedirs(nhs_out_dir, exist_ok=True)

# For every topic: sum Total/Male/Female per (LocalArea, Characteristic),
# reshape each area so characteristics become columns and Total/Male/Female
# become rows (labelled by 'Type'), stack all areas, and write one CSV.
# NOTE(review): .sum() is applied to every characteristic, including rows
# whose labels look like percentages or medians — confirm that is intended.
for topic in topics:
    topic_grouped = (
        sub_dataframes[topic]
        .drop(columns=['Topic'])
        .groupby(by=['LocalArea', 'Characteristic'])
        .sum()
        .reset_index()
    )

    # collect the per-area frames and concatenate once, instead of growing
    # a DataFrame with pd.concat on every iteration (quadratic copying)
    area_frames = []
    for area in local_areas:
        df = (
            topic_grouped[topic_grouped['LocalArea'] == area]
            .drop(columns=['LocalArea'])
            .set_index('Characteristic')
            .T
            .reset_index()
            .rename(columns={'index': 'Type'})
        )
        df['LocalArea'] = str(area)
        area_frames.append(df)

    # pd.concat raises on an empty list, so fall back to an empty frame
    sub_group[str(topic)] = pd.concat(area_frames) if area_frames else pd.DataFrame()
    sub_group[str(topic)].to_csv(nhs_out_dir + str(topic) + '.csv')

In [19]:
# show the topic names that were processed (one entry in sub_group per topic)
sub_group.keys()

dict_keys(['Citizenship', 'Immigrant status and period of immigration', 'Age at immigration', 'Immigrant status and selected places of birth', 'Recent immigrants by selected place of birth', 'Generation status', 'Visible minority population', 'Ethnic origin population', 'Religion', 'Aboriginal population', 'Non-official languages spoken', 'Mobility', 'Education', 'Language used most often at work', 'Labour force status', 'Class of worker', 'Occupation', 'Industry', 'Work activity', 'Full-time or part-time weeks worked', 'Place of work status', 'Mode of transportation', 'Median commuting duration', 'Time leaving for work', 'Occupied private dwelling characteristics', 'Household characteristics', 'Shelter costs', 'Income of individuals in 2010', 'Income of households in 2010'])