In [None]:
# This line tells iPython to not display warnings.
import warnings
warnings.filterwarnings('ignore')

# RUN THIS CELL FIRST or the notebook won't work
import numpy as np
import pandas as pd
import geopandas as gpd
import datetime as dt
import requests
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO
from IPython.display import display
import matplotlib.pyplot as plt

# Plot configuration: render matplotlib figures inline in the notebook output
# and use a large default figure size (width 30, height 20) so maps and
# histograms are legible.
%matplotlib inline
plt.rcParams['figure.figsize'] = [30, 20]

In [None]:
# Toronto neighbourhood polygons.
# Source: https://open.toronto.ca/dataset/neighbourhoods/
# NOTE(review): the "4326" in the file name presumably means the layer is
# already in EPSG:4326 -- confirm with `neighbourhoods_gpd.crs`.
neighbourhoods_gpd = gpd.read_file(
    "neighbourhoods/Neighbourhoods - 4326.shp"
)
neighbourhoods_gpd.head(n=5)

In [None]:
# Geometries carrying a `GeoUID` column; they are joined to the neighbourhood
# polygons as census tracts further down.
# NOTE(review): the path points into a checkout under ~/git -- adjust it if the
# course repository lives elsewhere on your machine.
toronto_gpd = gpd.read_file(
    '~/git/cp101.github.io/labs/lab10/shapefiles/toronto/toronto.shp'
)
toronto_gpd.head(n=5)

In [None]:
# Inspect the coordinate reference system the shapefile was loaded with.
toronto_gpd.crs

In [None]:
# Reproject the geometries to EPSG:4326, the same CRS that is assigned to the
# transit stops layer, so the spatial joins compare like with like.
toronto_gpd = toronto_gpd.to_crs(epsg=4326)

In [None]:
# Spatially join the neighbourhood polygons to the census tracts and keep one
# row per (neighbourhood, tract) pair.
# A GeoUID can fall in several neighbourhoods; census tract data is later
# grouped by neighbourhood and averaged.
toronto_neighbourhoods_ct = (
    pd.DataFrame(
        neighbourhoods_gpd.sjoin(toronto_gpd[['geometry', 'GeoUID']])
    )
    .loc[:, ['AREA_NA7', 'GeoUID']]
    .drop_duplicates()
)
toronto_neighbourhoods_ct.head(n=5)

In [None]:
# Read the stops table straight out of the schedule archive. Identifiers are
# forced to strings and coordinates to floats.
with ZipFile("../lab10/ttc/routes_schedules.zip") as myzip:
    stops_df = pd.read_csv(
        myzip.open("stops.txt"),
        dtype=dict(
            stop_id='str',
            stop_code='str',
            stop_name='str',
            stop_lat='float',
            stop_lon='float',
        ),
    )

# Build point geometries from longitude (x) / latitude (y) and label them as
# EPSG:4326. This only needs the in-memory table, not the open archive.
stops_gdf = gpd.GeoDataFrame(
    stops_df,
    geometry=gpd.points_from_xy(x=stops_df["stop_lon"], y=stops_df["stop_lat"]),
).set_crs(epsg=4326)

In [None]:
# Preview the first rows of the stops layer (attributes plus point geometry).
stops_gdf.head()

In [None]:
# Match every stop to the neighbourhood polygon it joins with (sjoin's default
# predicate) and keep only the neighbourhood name and the stop id.
neighbourhood_stops = pd.DataFrame(
    neighbourhoods_gpd.sjoin(stops_gdf)
).loc[:, ['AREA_NA7', 'stop_id']]
neighbourhood_stops.head(n=5)

In [None]:
# Pair every census tract with every stop that shares its neighbourhood
# (inner join on the neighbourhood name).
census_neighbourhood_stops = pd.merge(
    toronto_neighbourhoods_ct,
    neighbourhood_stops,
    how="inner",
    on="AREA_NA7",
)
census_neighbourhood_stops

In [None]:
def clean_can_census_data(can_df, n_id_cols=4):
    """Fill missing values, coerce value columns to numeric and pad ``GeoUID``.

    Parameters
    ----------
    can_df : pandas.DataFrame
        Census extract. Must contain a ``GeoUID`` column; every column from
        position ``n_id_cols`` onward is expected to hold numeric data.
    n_id_cols : int, default 4
        Number of leading identifier columns that are left untouched by the
        numeric conversion.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which NaN, ``'NA'`` and
        ``''`` are replaced by 0, the value columns have a numeric dtype, and
        ``GeoUID`` is a string right-padded with ``"0"`` to 10 characters.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_numeric`` when a value column still contains
        text that cannot be parsed as a number.
    """
    can_df = can_df.fillna(0).replace({'NA': 0}).replace({'': 0})

    # Rebuild the frame instead of assigning through ``.iloc[:, k:] = ...``:
    # positional assignment writes the converted values into the existing
    # columns in place, so a column that started out as ``object`` can keep
    # that dtype. Concatenating guarantees the converted dtypes are kept.
    id_part = can_df.iloc[:, :n_id_cols]
    value_part = can_df.iloc[:, n_id_cols:].apply(pd.to_numeric)
    can_df = pd.concat([id_part, value_part], axis=1)

    # pad to get correct geouid length
    # NOTE(review): presumably this restores trailing zeros lost when the id
    # was parsed as a float (e.g. 5350001.0 -> "5350001.00") -- confirm.
    can_df["GeoUID"] = can_df["GeoUID"].astype(str).str.ljust(10, "0")
    return can_df

In [None]:
# Lab 1 census extract, cleaned (missing values -> 0, numeric value columns,
# padded GeoUID).
lab01_data = clean_can_census_data(
    pd.read_csv('~/git/cp101.github.io/labs/lab01/lab01_data.csv')
)
lab01_data.head(n=5)

In [None]:
# Lab 3 census extract, cleaned the same way as lab 1.
lab03_data = clean_can_census_data(
    pd.read_csv('~/git/cp101.github.io/labs/lab03/census21_data.csv')
)
# Drop every column lab 1 already provides so the later merge does not create
# duplicated columns, but always keep the GeoUID join key.
lab03_data = lab03_data.loc[
    :,
    (lab03_data.columns == 'GeoUID')
    | ~lab03_data.columns.isin(lab01_data.columns),
]
lab03_data.head(n=5)

In [None]:
# Lab 10 census extract: fix missingness and data types.
lab10_data = clean_can_census_data(
    pd.read_csv('~/git/cp101.github.io/labs/lab10/census21_data.csv')
)
# Drop every column already provided by lab 1 or lab 3, but always keep the
# GeoUID join key.
lab10_data = lab10_data.loc[
    :,
    (lab10_data.columns == 'GeoUID')
    | ~(
        lab10_data.columns.isin(lab01_data.columns)
        | lab10_data.columns.isin(lab03_data.columns)
    ),
]
lab10_data.head(n=5)

In [None]:
# Union of all columns used so far in the course: the three lab extracts are
# joined on GeoUID (default inner join, so only tracts present in all three
# remain).
all_can_census = pd.merge(
    pd.merge(lab01_data, lab03_data, on='GeoUID'),
    lab10_data,
    on='GeoUID',
)
all_can_census.head(n=5)

In [None]:
def div_0(n, d):
    """Return ``n / d``, or 0 when the division raises ``ZeroDivisionError``.

    Parameters
    ----------
    n, d : number, numpy array or pandas Series
        Numerator and denominator.

    Returns
    -------
    The quotient ``n / d``; ``0`` if Python raises ``ZeroDivisionError``.

    Notes
    -----
    Only ``ZeroDivisionError`` is handled. Any other failure (wrong types,
    misaligned data, keyboard interrupt, ...) now propagates instead of being
    silently turned into 0.

    NumPy arrays and pandas Series do not raise on division by zero; they
    produce ``inf`` / ``NaN`` element-wise, so for those inputs the fallback
    never triggers and callers must handle non-finite values themselves.
    """
    try:
        return n / d
    except ZeroDivisionError:
        return 0

In [None]:
# Exploration helper: list every column name of the merged census table so the
# long variable labels used in the next cells can be copied verbatim.
all_can_census.columns.values

In [None]:
# Exploration helper: case-insensitive search for columns whose label contains
# "average".
all_can_census.columns[all_can_census.columns.str.lower().str.contains("average")]

In [None]:
# create variables
# Each assignment below adds one derived column to `all_can_census`.
# NOTE: the number and order of active (uncommented) assignments matters --
# the subset step that follows picks the derived columns by position
# (`iloc[:, -21:]`), so enabling or removing a line here requires updating
# that count.
#all_can_census['total_pop'] = all_can_census['Population']
# Values copied as-is from the census columns.
all_can_census['pop_density'] = all_can_census['v_CA21_6: Population density per square kilometre']
all_can_census['avg_monthly_shelter_rented'] = all_can_census['v_CA21_4318: Average monthly shelter costs for rented dwellings ($) (59)']
all_can_census['avg_monthly_shelter_owned'] = all_can_census['v_CA21_4310: Average monthly shelter costs for owned dwellings ($) (59)']
all_can_census['avg_value_owned'] = all_can_census['v_CA21_4312: Average value of dwellings ($) (60)']

#all_can_census['br0'] = all_can_census['v_CA21_4245: No bedrooms']
#all_can_census['br1'] = all_can_census['v_CA21_4246: 1 bedroom']
#all_can_census['br2'] = all_can_census['v_CA21_4247: 2 bedrooms']
#all_can_census['br3'] = all_can_census['v_CA21_4248: 3 bedrooms']
#all_can_census['br4plus'] = all_can_census['v_CA21_4249: 4 or more bedrooms']

#all_can_census['pct_pop_change_16_21'] = all_can_census['v_CA21_3: Population percentage change, 2016 to 2021'] / 100
all_can_census['gini_after_tax'] = all_can_census['v_CA21_1142: Gini index on adjusted household after-tax income']
#all_can_census['gini_total'] = all_can_census['v_CA21_1140: Gini index on adjusted household total income']
#all_can_census['gini_market'] = all_can_census['v_CA21_1141: Gini index on adjusted household market income']

# Shares computed as count / total through div_0.
# NOTE(review): the operands are pandas Series, so a zero denominator presumably
# yields inf/NaN rather than div_0's fallback of 0 -- confirm.
all_can_census['pct_rent'] = div_0(all_can_census['v_CA21_4239: Renter'] , all_can_census['v_CA21_4237: Total - Private households by tenure'])

all_can_census['pct_visible_minority'] = div_0(all_can_census['v_CA21_4875: Total visible minority population'] , all_can_census['v_CA21_4872: Total - Visible minority for the population in private households'])

all_can_census['pct_seniors'] = div_0(all_can_census['v_CA21_251: 65 years and over'] , all_can_census['v_CA21_8: Total - Age'])
# Multi-family share: duplex flats plus both apartment-building categories over all occupied dwellings.
all_can_census['pct_multifam'] = div_0(all_can_census['v_CA21_438: Apartment or flat in a duplex'] + all_can_census['v_CA21_439: Apartment in a building that has fewer than five storeys'] + all_can_census['v_CA21_440: Apartment in a building that has five or more storeys'], all_can_census['v_CA21_434: Occupied private dwellings by structural type of dwelling data'])
#all_can_census['avg_rooms'] = all_can_census['v_CA21_4256: Average number of rooms per dwelling']

# Vacancy share: 1 minus (dwellings occupied by usual residents / total private dwellings).
all_can_census['pct_vacant'] = 1 - (div_0(all_can_census['v_CA21_5: Private dwellings occupied by usual residents'], all_can_census['v_CA21_4: Total private dwellings']))

all_can_census['pct_before1960'] = div_0(all_can_census['v_CA21_4264: 1960 or before'] , all_can_census['v_CA21_4263: Total - Occupied private dwellings by period of construction'])

# Columns the census reports as percentages are divided by 100 to get fractions.
all_can_census['pct_limat'] = all_can_census['v_CA21_1040: Prevalence of low income based on the Low-income measure, after tax (LIM-AT) (%)'] / 100
all_can_census['pct_licoat'] = all_can_census['v_CA21_1085: Prevalence of low income based on the Low-income cut-offs, after tax (LICO-AT) (%)'] / 100

all_can_census['pct_mortgage'] = all_can_census['v_CA21_4306: % of owner households with a mortgage (58)'] / 100
all_can_census['pct_owner_rent_burden'] = all_can_census['v_CA21_4307: % of owner households spending 30% or more of its income on shelter costs (55)'] / 100
all_can_census['pct_owner_ichn'] = all_can_census['v_CA21_4308: % in core housing need (57)'] / 100

all_can_census['pct_renter_subsidized'] = all_can_census['v_CA21_4314: % of tenant households in subsidized housing (61)'] / 100
all_can_census['pct_renter_rent_burden'] = all_can_census['v_CA21_4315: % of tenant households spending 30% or more of its income on shelter costs (55)'] / 100
all_can_census['pct_renter_ichn'] = all_can_census['v_CA21_4316: % in core housing need (57)'] / 100

all_can_census['pct_noncitizen'] = div_0(all_can_census['v_CA21_4401: Not Canadian citizens'], all_can_census['v_CA21_4389: Total - Citizenship for the population in private households'])

all_can_census['pct_public_transit'] = div_0(all_can_census['v_CA21_7644: Public transit'] , all_can_census['v_CA21_7632: Total - Main mode of commuting for the employed labour force aged 15 years and over with a usual place of work or no fixed workplace address'])

# Keep the two leading identifier columns plus the derived indicators.
# The indicators are selected by name (in creation order) rather than with a
# positional `iloc[:, -21:]`, so adding or removing a derived column above
# cannot silently shift the selection.
derived_cols = [
    'pop_density', 'avg_monthly_shelter_rented', 'avg_monthly_shelter_owned', 'avg_value_owned',
    'gini_after_tax',
    'pct_rent', 'pct_visible_minority', 'pct_seniors', 'pct_multifam',
    'pct_vacant', 'pct_before1960',
    'pct_limat', 'pct_licoat',
    'pct_mortgage', 'pct_owner_rent_burden', 'pct_owner_ichn',
    'pct_renter_subsidized', 'pct_renter_rent_burden', 'pct_renter_ichn',
    'pct_noncitizen', 'pct_public_transit',
]
id_cols = list(all_can_census.columns[:2])
subset_cols = all_can_census.loc[:, id_cols + derived_cols]

# omit tracts with NA fields
# Series division by zero gives +/-inf (x / 0) as well as NaN (0 / 0). Treat
# inf as missing too: `isna()` does not flag it, and a single inf would turn
# the whole z-score column in the outlier step into NaN.
subset_cols = subset_cols.replace([np.inf, -np.inf], np.nan).dropna()
subset_cols

In [None]:
# omit tracts with at least one severe outlier: a row is kept only if every
# indicator (all columns after the two identifier columns) has |z-score| < 3.
# Note: this reassigns `subset_cols` from itself, so re-running the cell
# filters the already-filtered table again.
from scipy import stats
subset_cols = subset_cols.loc[
    (np.abs(stats.zscore(subset_cols.iloc[:, 2:])) < 3).all(axis=1)
]
subset_cols.head(n=5)

In [None]:
# Summary statistics for the indicator columns (everything after the two
# leading identifier columns) once outliers have been removed.
subset_cols.iloc[:,2:].describe()

In [None]:
# Average the tract-level indicators per neighbourhood, then attach those
# neighbourhood averages to every stop located in that neighbourhood.
toronto_stops = neighbourhood_stops.merge(
    subset_cols
    .drop(columns="Unnamed: 0")
    .merge(toronto_neighbourhoods_ct, on="GeoUID")
    .groupby("AREA_NA7")
    # numeric_only=True: GeoUID is a string identifier and cannot be averaged;
    # without this flag pandas >= 2.0 raises a TypeError instead of silently
    # dropping the column.
    .mean(numeric_only=True)
    .reset_index(),
    on="AREA_NA7",
    how="inner",
)
toronto_stops.head()

In [None]:
# Summary statistics for the numeric indicator columns of the per-stop table.
# (`iloc[3:]` sliced away the first three *rows*; selecting by dtype targets
# the intended columns without relying on their position.)
toronto_stops.select_dtypes(include="number").describe()

In [None]:
# any lingering data transformations?
# Draw one histogram per numeric indicator. Columns are selected by dtype
# rather than by position (`columns[3:]`), so the first indicator is not
# skipped when only two identifier columns precede the data.
for col in toronto_stops.select_dtypes(include="number").columns:
    fig, ax = plt.subplots()
    toronto_stops[col].hist(ax=ax)
    ax.set_title(col)
    plt.show()
    plt.close(fig)  # release the figure so the loop does not accumulate open figures

In [None]:
# Write the per-stop table to the working directory. With pandas' default
# settings the row index is written as a leading unnamed column.
toronto_stops.to_csv("toronto_stops.csv")