In [1]:
import zipfile
import pandas as pd
import geopandas as gpd

In [2]:
# Extract the boundary shapefile from its zip archive, then read it.
# The archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes, matching how the NHS archive is handled below.
boundaries_zip_path = "../data/raw/census_boundaries_2011.zip"
with zipfile.ZipFile(boundaries_zip_path) as z:
    z.extractall(path="../data/raw/")  # extract to folder
nhs_boundaries = gpd.read_file("../data/raw/gct_000b11a_e.shp")

# Read the 2011 NHS data for BC straight out of its zip archive (no extraction).
nhs_zip_path = "../data/raw/nhs_census_2011.zip"
with zipfile.ZipFile(nhs_zip_path, "r") as z:
    with z.open("99-004-XWE2011001-401-BC.csv") as f:
        # Only a positional subset of the CSV columns is loaded.
        nhs_2011 = pd.read_csv(f, encoding='latin-1', usecols=[0,2,3,4,5,6,8,10,12])

# read in local areas boundaries
areas = gpd.read_file("../data/raw/local_area_boundary.geojson")

In [3]:
# Select only the census tracts whose CMANAME is 'Vancouver', keeping just the
# tract id and geometry. Built as one chain that yields a fresh frame, so no
# column is assigned on a slice of `nhs_boundaries` (avoids
# SettingWithCopyWarning) and re-running the cell gives the same result.
van_bound = (
    nhs_boundaries.loc[nhs_boundaries['CMANAME'] == 'Vancouver', ['CTUID', 'geometry']]
    .rename(columns={'CTUID': 'Geo_Code'})
    .reset_index(drop=True)  # renumber rows 0..n-1 after filtering
)
# Tract ids are converted to floats.
# NOTE(review): presumably to match the dtype of Geo_Code in the NHS table — confirm.
van_bound['Geo_Code'] = van_bound['Geo_Code'].astype(float)
# The string 'None' is the placeholder for "no local area assigned yet".
van_bound['LocalArea'] = 'None'
van_bound

Unnamed: 0,Geo_Code,geometry,LocalArea
0,9330147.01,"POLYGON ((-123.11399 49.17010, -123.11398 49.1...",
1,9330187.14,"POLYGON ((-122.76776 49.15303, -122.76776 49.1...",
2,9330203.00,"POLYGON ((-122.93060 49.20761, -122.93129 49.2...",
3,9330134.00,"POLYGON ((-123.13981 49.33816, -123.14067 49.3...",
4,9330403.05,"POLYGON ((-122.63942 49.22053, -122.64141 49.2...",
...,...,...,...
452,9330188.08,"POLYGON ((-122.76747 49.19152, -122.76827 49.1...",
453,9330170.03,"POLYGON ((-122.80112 49.03118, -122.80112 49.0...",
454,9330226.03,"POLYGON ((-122.99420 49.22992, -122.99420 49.2...",
455,9330147.08,"POLYGON ((-123.12469 49.17019, -123.12469 49.1...",


In [4]:
# Find the local area of each census tract based on geometry: a tract gets the
# name of the first local area whose polygon contains the tract's centroid.
# Tracts whose centroid falls inside no local area keep their current
# placeholder value from van_bound['LocalArea'].
# NOTE(review): assumes both layers share the same CRS — confirm.
local_areas = list(van_bound['LocalArea'])

# Pair each area polygon with its name once, iterating positionally so the
# lookup does not depend on either frame having a 0..n-1 index.
area_shapes = list(zip(areas.geometry, areas['name']))

for row, tract_geom in enumerate(van_bound.geometry):
    tract_centroid = tract_geom.centroid  # computed once per tract, not once per area
    for area_geom, area_name in area_shapes:
        if area_geom.contains(tract_centroid):
            local_areas[row] = str(area_name)
            break

In [5]:
# Attach the matched local-area names, then keep only the census tracts that
# were actually matched (the string 'None' marks a tract with no local area).
van_bound = van_bound.assign(LocalArea=local_areas)
is_matched = van_bound['LocalArea'].ne('None')
van_bound = van_bound.loc[is_matched]
van_bound

Unnamed: 0,Geo_Code,geometry,LocalArea
7,9330028.00,"POLYGON ((-123.12721 49.25712, -123.12723 49.2...",Shaughnessy
14,9330027.01,"POLYGON ((-123.15077 49.25725, -123.15079 49.2...",Arbutus-Ridge
26,9330060.02,"POLYGON ((-123.12918 49.27908, -123.12970 49.2...",West End
28,9330053.02,"POLYGON ((-123.02339 49.28114, -123.02339 49.2...",Hastings-Sunrise
29,9330036.02,"POLYGON ((-123.02353 49.26154, -123.02354 49.2...",Renfrew-Collingwood
...,...,...,...
433,9330015.01,"POLYGON ((-123.02351 49.22923, -123.02351 49.2...",Killarney
434,9330058.00,"POLYGON ((-123.07735 49.28741, -123.07735 49.2...",Strathcona
436,9330004.02,"POLYGON ((-123.09094 49.21827, -123.09097 49.2...",Sunset
437,9330008.01,"POLYGON ((-123.19708 49.23505, -123.19708 49.2...",Dunbar-Southlands


In [6]:
# Join every NHS record to its census tract's local area through the shared
# Geo_Code key (inner join), then keep only the columns used downstream.
keep_cols = ['LocalArea', 'Topic', 'Characteristic', 'Total', 'Male', 'Female']
merged_df = pd.merge(nhs_2011, van_bound, on='Geo_Code').loc[:, keep_cols]
merged_df

Unnamed: 0,LocalArea,Topic,Characteristic,Total,Male,Female
0,Victoria-Fraserview,Citizenship,Total population in private households by citi...,3015.0,1395.0,1620.0
1,Victoria-Fraserview,Citizenship,Canadian citizens,2670.0,1230.0,1440.0
2,Victoria-Fraserview,Citizenship,Canadian citizens aged under 18,410.0,175.0,235.0
3,Victoria-Fraserview,Citizenship,Canadian citizens aged 18 and over,2255.0,1050.0,1210.0
4,Victoria-Fraserview,Citizenship,Not Canadian citizens,345.0,160.0,185.0
...,...,...,...,...,...,...
111355,Kitsilano,Income of individuals in 2010,Prevalence of low income in 2010 based on afte...,20.3,24.2,16.5
111356,Kitsilano,Income of individuals in 2010,Less than 18 years (%),24.5,32.3,11.1
111357,Kitsilano,Income of individuals in 2010,Less than 6 years (%),35.7,0.0,25.0
111358,Kitsilano,Income of individuals in 2010,18 to 64 years (%),20.2,23.8,17.0


In [7]:
# Split the merged frame into one sub-frame per topic.
topics = list(merged_df['Topic'].unique())
# `local_areas` is rebound here: from now on it holds the distinct local-area
# names present in the merged data.
local_areas = list(merged_df['LocalArea'].unique())
sub_group = dict()  # filled later with one reshaped table per topic
sub_dataframes = {
    name: merged_df[merged_df['Topic'] == name]
    for name in topics
}

In [8]:
# For every topic, build one wide table with a row per (local area, Type),
# where Type is one of the summed value columns (Total / Male / Female), and
# one column per Characteristic.
# NOTE(review): values are summed over all tracts of a local area; that looks
# questionable for characteristics that are percentages or medians — confirm.
for topic in topics:
    topic_grouped = (
        sub_dataframes[topic]
        .drop(columns=['Topic'])
        .groupby(by=['LocalArea', 'Characteristic'])
        .sum()
        .reset_index()
    )

    # Collect the per-area tables and concatenate once at the end instead of
    # growing a DataFrame inside the loop (which re-copies all rows each time).
    area_frames = []
    for area in local_areas:
        df = (
            topic_grouped[topic_grouped['LocalArea'] == area]
            .drop(columns=['LocalArea'])
            .set_index('Characteristic')
            .T  # characteristics become columns; the value columns become rows
            .reset_index()
            .rename(columns={'index': 'Type'})
        )
        df['LocalArea'] = str(area)
        area_frames.append(df)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    # The per-area row index (0, 1, 2, ...) is kept as-is, as before.
    sub_group[str(topic)] = pd.concat(area_frames) if area_frames else pd.DataFrame()

In [9]:
# list the topic names for which a per-area table was built
sub_group.keys()

dict_keys(['Citizenship', 'Immigrant status and period of immigration', 'Age at immigration', 'Immigrant status and selected places of birth', 'Recent immigrants by selected place of birth', 'Generation status', 'Visible minority population', 'Ethnic origin population', 'Religion', 'Aboriginal population', 'Non-official languages spoken', 'Mobility', 'Education', 'Language used most often at work', 'Labour force status', 'Class of worker', 'Occupation', 'Industry', 'Work activity', 'Full-time or part-time weeks worked', 'Place of work status', 'Mode of transportation', 'Median commuting duration', 'Time leaving for work', 'Occupied private dwelling characteristics', 'Household characteristics', 'Shelter costs', 'Income of individuals in 2010', 'Income of households in 2010'])

In [10]:
# Show the Citizenship table with a fresh 0..n-1 row index.
# The index is replaced on the frame stored in `sub_group` itself (same object),
# so the dictionary entry is renumbered too.
df = sub_group['Citizenship']
df.index = pd.RangeIndex(len(df))
df

Characteristic,Type,Canadian citizens aged 18 and over,Canadian citizens aged under 18,Canadian citizens,Not Canadian citizens,Total population in private households by citizenship,LocalArea
0,Total,14725.0,3335.0,18055.0,2550.0,20615.0,Victoria-Fraserview
1,Male,7070.0,1700.0,8780.0,1150.0,9945.0,Victoria-Fraserview
2,Female,7655.0,1630.0,9285.0,1385.0,10660.0,Victoria-Fraserview
3,Total,14930.0,4440.0,19355.0,2365.0,21735.0,Dunbar-Southlands
4,Male,7115.0,2360.0,9480.0,1155.0,10635.0,Dunbar-Southlands
...,...,...,...,...,...,...,...
61,Male,11205.0,3005.0,14210.0,1475.0,15685.0,Killarney
62,Female,12480.0,2700.0,15175.0,2090.0,17270.0,Killarney
63,Total,6435.0,1320.0,7755.0,1820.0,9575.0,Oakridge
64,Male,2830.0,685.0,3520.0,895.0,4415.0,Oakridge
