# Building the dataset for PerfectCity.io

PerfectCity.io requires many features for each city. We often need to look at multiple datasets per city to extract the relevant information, and each dataset requires careful treatment to get the right data. This notebook documents the process that took us from having essentially no data to a complete PerfectCity.io dataset.

In [216]:
import pandas as pd
import json

In [217]:
# Cities covered by the dataset and the per-city parameters we collect for them.
cities = [
    'VANCOUVER', 'MONTREAL', 'TORONTO', 'OTTAWA',
    'HAMILTON', 'WINNIPEG', 'EDMONTON', 'CALGARY',
]
parameters = [
    'PARKS', 'TRANSIT_SCORE', 'OUTDOOR', 'POPULATION', 'BUSINESSES',
    'CRIME', 'SCHOOLS', 'UNIVERSITIES', 'UNEMPLOYMENT',
]

# Report how many (city, parameter) values the dataset will hold.
# Single-argument print(...) with %-formatting prints the same text on
# Python 2 and Python 3, unlike the bare `print a, b` statement form.
print('cities x parameters ')
print('%d x %d = %d' % (len(cities), len(parameters), len(cities) * len(parameters)))

cities x parameters 
8 x 9 = 72


In [218]:
# Nested lookup data[city][parameter], with every entry starting at 0.
# dict.fromkeys is called once per city, so each city owns its own inner dict.
data = dict((city, dict.fromkeys(parameters, 0)) for city in cities)

#### Dealing with Accents
Many Canadian cities have accents in their names. We use a regex to match the first 4 characters of the city name, which produced good enough results for our purposes:
```
search_city = city[:4] # select the first four letters of the city name
search_term = r'\b('+search_city+')\w?' # incorporate it into the regex
```

---
# Transit Score Ranking
Obtained from [Walkscore 2014 Ranking](http://blog.walkscore.com/2014/03/best-canadian-cities-for-public-transit/#.VYWKBxNJaV4)

In [219]:
# Transit scores per city, as published in the Walk Score 2014 ranking:
# http://blog.walkscore.com/2014/03/best-canadian-cities-for-public-transit/#.VYWKBxNJaV4
TRANSIT_SCORE = {
    'TORONTO': 78,
    'MONTREAL': 77,
    'VANCOUVER': 74,
    'WINNIPEG': 51,
    'OTTAWA': 49,
    'EDMONTON': 44,
    'CALGARY': 43,
    'HAMILTON': 42,
}

# Store each score divided by 100.0 so it is a float fraction.
for city in TRANSIT_SCORE:
    score = TRANSIT_SCORE[city]
    data[city]['TRANSIT_SCORE'] = score / 100.0

----
# Population Information
[Stats Canada census from 2011](http://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/Table-Tableau.cfm?LANG=Eng&T=205&S=3&RPP=50)

In [220]:
# Load the 2011 census population table.
population_df = pd.read_csv('./datasets/canadian_population_census_2011.CSV')
print(population_df.columns[[1,4]]) # we need to keep columns 1 and 4 and delete everything else

# Select the two columns by *position* with .iloc. A bare population_df[[1, 4]]
# is a label lookup, and since the columns are named with strings it raises
# KeyError on current pandas. .copy() gives us an independent frame so the
# column assignment below does not trigger SettingWithCopyWarning.
population_df = population_df.iloc[:, [1, 4]].copy()

# Upper-case the names so they can be matched against the upper-case `cities` keys.
population_df['Geographic name'] = population_df['Geographic name'].str.upper()
population_df.head() #all good

Index([u'Geographic name', u'Population, 2011'], dtype='object')


Unnamed: 0,Geographic name,"Population, 2011"
0,TORONTO (ONT.),5583064
1,MONTR�AL (QUE.),3824221
2,VANCOUVER (B.C.),2313328
3,OTTAWA - GATINEAU (ONT./QUE.),1236324
4,CALGARY (ALTA.),1214839


In [221]:
# Copy each city's 2011 population into `data`.
for city in cities:
    # Match on the first four letters only, so accented spellings in the
    # dataset still line up with the unaccented keys in `cities`.
    search_term = city[:4]
    # Both literal pieces are raw strings (a non-raw '\w' is an invalid escape
    # sequence) and the group is non-capturing, which avoids the pandas
    # "pattern has match groups" warning in str.contains. The set of matching
    # rows is unchanged.
    city_rows = population_df['Geographic name'].str.contains(r'\b(?:' + search_term + r')\w?', na=False)
    # .tolist()[0] keeps the first matching row as a plain Python number,
    # which json.dumps can serialise later on.
    data[city]['POPULATION'] = population_df.loc[city_rows, 'Population, 2011'].tolist()[0]

-----
# Age Groups
Curated from *[Statistics Canada. Table  051-0056 -  Estimates of population by census metropolitan area, sex and age group for July 1, based on the Standard Geographical Classification (SGC) 2011, annual (persons),  CANSIM (database). (accessed: 2015-06-20)](http://www5.statcan.gc.ca/cansim/a47)*

In [222]:
# Load the age-group table and upper-case the geography names so they can be
# matched against the upper-case `cities` keys.
age_df = pd.read_csv('./datasets/age_groups.csv')
age_df['Geography'] = age_df['Geography'].str.upper()

# Shorten the age labels in one chain:
#   1. drop the last six characters of every label (the trailing "years"
#      wording -- a suffix; presumably raw labels look like "0 to 4 years"),
#   2. turn ' to ' into '-',
#   3. turn the ' years an' left over on the open-ended group into '+'.
age_df['Age'] = (
    age_df['Age']
    .str[:-6]
    .str.replace(' to ', '-')
    .str.replace(' years an', '+')
)

age_df.head()

Unnamed: 0,Geography,Age,2014
0,"MONTR�AL, QUEBEC [24462]",0-4,226900
1,"MONTR�AL, QUEBEC [24462]",5-9,217483
2,"MONTR�AL, QUEBEC [24462]",10-14,199491
3,"MONTR�AL, QUEBEC [24462]",15-19,227351
4,"MONTR�AL, QUEBEC [24462]",20-24,287307


In [223]:
# prepare our breakdowns

# Read the list of age categories from one reference city (they are assumed to
# be the same, in the same order, for every city). The prefix is derived
# explicitly from `cities` instead of relying on a `search_term` loop variable
# left behind by whichever cell happened to run last; cities[-1] is the value
# that variable holds after a top-to-bottom run.
reference_prefix = cities[-1][:4]
reference_rows = age_df['Geography'].str.contains(r'\b(?:' + reference_prefix + r')\w?', na=False)

# Prefix every age label so the categories become distinct keys in `data`.
age_breakdowns = ['AGE_CAT_' + age for age in age_df.loc[reference_rows, 'Age'].tolist()]

# print(...) with a single argument behaves the same on Python 2 and 3.
print(age_breakdowns)

['AGE_CAT_0-4', 'AGE_CAT_5-9', 'AGE_CAT_10-14', 'AGE_CAT_15-19', 'AGE_CAT_20-24', 'AGE_CAT_25-29', 'AGE_CAT_30-34', 'AGE_CAT_35-39', 'AGE_CAT_40-44', 'AGE_CAT_45-49', 'AGE_CAT_50-54', 'AGE_CAT_55-59', 'AGE_CAT_60-64', 'AGE_CAT_65-69', 'AGE_CAT_70-74', 'AGE_CAT_75-79', 'AGE_CAT_80-84', 'AGE_CAT_85-89', 'AGE_CAT_90+']


In [224]:
# Store each city's age distribution as fractions of the city's 2014 total.
for city in cities:
    search_term = city[:4]
    # Raw-string pieces and a non-capturing group: same rows matched, no
    # invalid '\w' escape and no pandas "match groups" warning.
    city_rows = age_df['Geography'].str.contains(r'\b(?:' + search_term + r')\w?', na=False)
    ages = age_df.loc[city_rows, '2014'].tolist()

    # zip() below would silently drop entries on a length mismatch and attach
    # counts to the wrong categories, so fail loudly instead.
    if len(ages) != len(age_breakdowns):
        raise ValueError('%s: expected %d age groups, found %d'
                         % (city, len(age_breakdowns), len(ages)))

    # float() keeps the division below a true division on Python 2 as well.
    total = float(sum(ages))
    for category, count in zip(age_breakdowns, ages):
        data[city][category] = count / total

----
# Parks and Green Spaces
Curated from *[Statistics Canada. Table  153-0148 -  Households and the environment survey, parks and green spaces, Canada, provinces and census metropolitan areas (CMA), every 2 years (percent),  CANSIM (database). (accessed: 2015-06-20)](http://www5.statcan.gc.ca/cansim/a26)*

- "Close to home" is defined as being a 10 minute journey from home

In [225]:
# Load the parks table and upper-case CITY in the same expression so the names
# can be matched against the upper-case `cities` keys.
parks_df = pd.read_csv('./datasets/canadian_parks2013.csv').assign(
    CITY=lambda frame: frame['CITY'].str.upper()
)
parks_df.head()  # quick visual check of the first rows

Unnamed: 0,CITY,PARKS
0,"MONTREAL, QUEBEC [24462]",91
1,"OTTAWA-GATINEAU, ONTARIO/QUEBEC [24505 35505]",91
2,"TORONTO, ONTARIO [35535]",85
3,"HAMILTON, ONTARIO [35537]",86
4,"WINNIPEG, MANITOBA [46602]",88


In [226]:
# Copy each city's PARKS value into `data`, divided by 100.0 to get a fraction.
for city in cities:
    search_term = city[:4]
    # Raw-string pieces and a non-capturing group: same rows matched, no
    # invalid '\w' escape and no pandas "match groups" warning.
    city_rows = parks_df['CITY'].str.contains(r'\b(?:' + search_term + r')\w?', na=False)
    # First matching row, as a plain Python number via tolist().
    data[city]['PARKS'] = parks_df.loc[city_rows, 'PARKS'].tolist()[0] / 100.0

# Outdoor Activities
Curated from [Statistics Canada. Table  153-0153 -  Households and the environment survey, participation in outdoor activities, Canada, provinces and census metropolitan areas (CMA), every 2 years (percent),  CANSIM (database). (accessed: 2015-06-20)](http://www5.statcan.gc.ca/cansim/a47)

In [227]:
# Load the outdoor-activities table and upper-case CITY in the same expression
# so the names can be matched against the upper-case `cities` keys.
outdoor_df = pd.read_csv('./datasets/outdoor_activities2013.csv').assign(
    CITY=lambda frame: frame['CITY'].str.upper()
)
outdoor_df.head()  # quick visual check of the first rows

Unnamed: 0,CITY,OUTDOOR
0,"MONTR�AL, QUEBEC [24462]",66
1,"OTTAWA-GATINEAU, ONTARIO/QUEBEC [24505 35505]",79
2,"TORONTO, ONTARIO [35535]",69
3,"HAMILTON, ONTARIO [35537]",73
4,"WINNIPEG, MANITOBA [46602]",73


In [228]:
# Copy each city's OUTDOOR value into `data`, divided by 100.0 to get a fraction.
for city in cities:
    search_term = city[:4]
    # Raw-string pieces and a non-capturing group: same rows matched, no
    # invalid '\w' escape and no pandas "match groups" warning.
    city_rows = outdoor_df['CITY'].str.contains(r'\b(?:' + search_term + r')\w?', na=False)
    # First matching row, as a plain Python number via tolist().
    data[city]['OUTDOOR'] = outdoor_df.loc[city_rows, 'OUTDOOR'].tolist()[0] / 100.0

-----
# Crime Severity Index

We obtain the crime severity statistic from [Statistics Canada. Table  252-0052 -  Crime severity index and weighted clearance rates, annual (index unless otherwise noted),  CANSIM (database). (accessed: 2015-06-20)](http://www5.statcan.gc.ca/cansim/a26)

In [229]:
# Load the crime-severity table and upper-case CITY in the same expression so
# the names can be matched against the upper-case `cities` keys.
crime_df = pd.read_csv('./datasets/crime_severity2013.csv').assign(
    CITY=lambda frame: frame['CITY'].str.upper()
)
crime_df.head()

Unnamed: 0,CITY,CRIME_SEVERITY_INDEX
0,"MONTR�AL, QUEBEC (28,34)",65.93
1,"OTTAWA-GATINEAU, ONTARIO/QUEBEC (6)",53.28
2,"TORONTO, ONTARIO (25)",47.14
3,"HAMILTON, ONTARIO (25)",55.11
4,"WINNIPEG, MANITOBA (9,10,33)",83.17


In [230]:
# Copy each city's crime severity index into `data`, divided by 100.0.
# NOTE(review): a severity index is not a percentage and may exceed 100 for
# some cities -- confirm that /100.0 is the intended scaling.
for city in cities:
    search_term = city[:4]
    # Raw-string pieces and a non-capturing group: same rows matched, no
    # invalid '\w' escape and no pandas "match groups" warning.
    city_rows = crime_df['CITY'].str.contains(r'\b(?:' + search_term + r')\w?', na=False)
    # First matching row, as a plain Python number via tolist().
    data[city]['CRIME'] = crime_df.loc[city_rows, 'CRIME_SEVERITY_INDEX'].tolist()[0] / 100.0

# Unemployment
Curated from [Statistics Canada. Table  109-5334 -  Unemployment rate, Canada, provinces, health regions (2014 boundaries) and peer groups, annual (percent),  CANSIM (database).](http://www5.statcan.gc.ca/cansim/a26?lang=eng&retrLang=eng&id=1095334&pattern=unemployment&tabMode=dataTable&srchLan=-1&p1=1&p2=-1)

# Final Curated Data set

In [231]:
# Serialise the nested dict to a JSON string; `s` is kept as its own variable
# because it is displayed on its own further down.
s = json.dumps(data)

# Read the JSON back with the outer keys (cities) as the row index, giving one
# row per city and one column per parameter / age category.
df = pd.read_json(path_or_buf=s, orient='index')
df

Unnamed: 0,AGE_CAT_0-4,AGE_CAT_10-14,AGE_CAT_15-19,AGE_CAT_20-24,AGE_CAT_25-29,AGE_CAT_30-34,AGE_CAT_35-39,AGE_CAT_40-44,AGE_CAT_45-49,AGE_CAT_5-9,...,AGE_CAT_90+,BUSINESSES,CRIME,OUTDOOR,PARKS,POPULATION,SCHOOLS,TRANSIT_SCORE,UNEMPLOYMENT,UNIVERSITIES
CALGARY,0.06384,0.053175,0.056785,0.068301,0.085462,0.093434,0.082916,0.077784,0.071268,0.059873,...,0.004525,0,0.604,0.69,0.88,1214839,0,0.43,0,0
EDMONTON,0.061928,0.051831,0.057203,0.075901,0.090961,0.089605,0.075942,0.068441,0.066505,0.057247,...,0.005763,0,0.8449,0.8,0.89,1159869,0,0.44,0,0
HAMILTON,0.050418,0.054018,0.06229,0.072376,0.066994,0.063852,0.062413,0.065816,0.07138,0.053104,...,0.009022,0,0.5511,0.73,0.86,721053,0,0.42,0,0
MONTREAL,0.056343,0.049537,0.056455,0.071343,0.069167,0.075015,0.074128,0.069141,0.071504,0.054005,...,0.007277,0,0.6593,0.66,0.91,3824221,0,0.77,0,0
OTTAWA,0.054458,0.05297,0.061301,0.076179,0.073756,0.070534,0.068428,0.06966,0.072807,0.055031,...,0.00684,0,0.5328,0.79,0.91,1236324,0,0.49,0,0
TORONTO,0.053736,0.054635,0.063571,0.072731,0.073705,0.074372,0.071559,0.073017,0.07585,0.055133,...,0.006661,0,0.4714,0.69,0.85,5583064,0,0.78,0,0
VANCOUVER,0.046733,0.048941,0.0619,0.074837,0.074465,0.076066,0.07018,0.073289,0.075123,0.048129,...,0.007272,0,0.9026,0.75,0.87,2313328,0,0.74,0,0
WINNIPEG,0.054495,0.056325,0.063847,0.074039,0.076011,0.073354,0.068369,0.066727,0.068052,0.054693,...,0.008554,0,0.8317,0.73,0.88,730018,0,0.51,0,0


In [232]:
# Echo the raw JSON string produced by json.dumps(data).
s

'{"TORONTO": {"AGE_CAT_15-19": 0.063570928926087122, "UNEMPLOYMENT": 0, "AGE_CAT_65-69": 0.043824157111519615, "AGE_CAT_60-64": 0.053129567992200435, "CRIME": 0.47139999999999999, "POPULATION": 5583064.0, "OUTDOOR": 0.68999999999999995, "AGE_CAT_90+": 0.0066611358113414683, "AGE_CAT_55-59": 0.065628156104868718, "AGE_CAT_80-84": 0.018273124732897338, "AGE_CAT_30-34": 0.074372279846307399, "AGE_CAT_35-39": 0.071559073696225259, "AGE_CAT_0-4": 0.053736101579266164, "AGE_CAT_50-54": 0.077314289752967599, "AGE_CAT_45-49": 0.07584955985444515, "AGE_CAT_20-24": 0.072730857615043221, "AGE_CAT_10-14": 0.054634590347908851, "AGE_CAT_85-89": 0.0115441192498205, "PARKS": 0.84999999999999998, "BUSINESSES": 0, "AGE_CAT_70-74": 0.03084437137491735, "UNIVERSITIES": 0, "TRANSIT_SCORE": 0.78, "AGE_CAT_75-79": 0.024473209148897803, "AGE_CAT_25-29": 0.073704647041377705, "SCHOOLS": 0, "AGE_CAT_40-44": 0.073016868007855049, "AGE_CAT_5-9": 0.055132961806053246}, "EDMONTON": {"AGE_CAT_15-19": 0.057202869855