In [1]:
import pandas as pd
import folium

In [2]:
# Load the two per-zip-code tables used throughout the notebook:
# the consent summary first, then the stop-frequency counts.
consent_asked, zipcode_frequency = (
    pd.read_csv(csv_path)
    for csv_path in ('data/consent_asked', 'data/zipcode_frequency')
)


In [3]:
# Preview the first five rows of the consent table.
consent_asked.head(n=5)

Unnamed: 0,STOP_LOCATION_ZIP_CODE,TOTAL_STOPPED,ASK_FOR_CONSENT_FLG,PERCENT_ASKED
0,10000,6,0,0.0
1,10001,88,14,15.909091
2,10002,106,10,9.433962
3,10003,80,10,12.5
4,10004,7,1,14.285714


In [4]:
# Preview the first five rows of the stop-frequency table.
zipcode_frequency.head(n=5)

Unnamed: 0,STOP_LOCATION_ZIP_CODE,FREQUENCY
0,10000,6
1,10001,92
2,10002,109
3,10003,82
4,10004,7


The `FREQUENCY` counts differ slightly from the `TOTAL_STOPPED` counts because roughly 3% of the stops counted in `FREQUENCY` (the ones where the record showed neither yes nor no for whether consent was asked) were dropped before `TOTAL_STOPPED` was computed.

For this analysis, I'm going to use the `FREQUENCY` column instead of `TOTAL_STOPPED`, since `FREQUENCY` still includes the stops where "(" was recorded in place of a yes or no in the consent field.

In [5]:
# Join the two tables on zip code; the inner join keeps only zip codes
# that appear in both inputs.
comparing_data = pd.merge(
    consent_asked,
    zipcode_frequency,
    on="STOP_LOCATION_ZIP_CODE",
    how='inner',
)

In [6]:
# Preview the merged table.
comparing_data.head(n=5)

Unnamed: 0,STOP_LOCATION_ZIP_CODE,TOTAL_STOPPED,ASK_FOR_CONSENT_FLG,PERCENT_ASKED,FREQUENCY
0,10000,6,0,0.0,6
1,10001,88,14,15.909091,92
2,10002,106,10,9.433962,109
3,10003,80,10,12.5,82
4,10004,7,1,14.285714,7


In [7]:
# Express each zip code's stop count as a percentage of a fixed denominator.
# NOTE(review): 9544 is presumably the total number of stops in the full
# dataset -- confirm. The per-zip percentages in this notebook sum to ~93%,
# so the denominator is larger than the sum of FREQUENCY in the merged table.
TOTAL_STOPS = 9544
comparing_data['PERCENT_FREQUENCY'] = (
    comparing_data['FREQUENCY'].div(TOTAL_STOPS).mul(100)
)

In [8]:
# Preview the merged table with the new PERCENT_FREQUENCY column.
comparing_data.head(n=5)

Unnamed: 0,STOP_LOCATION_ZIP_CODE,TOTAL_STOPPED,ASK_FOR_CONSENT_FLG,PERCENT_ASKED,FREQUENCY,PERCENT_FREQUENCY
0,10000,6,0,0.0,6,0.062867
1,10001,88,14,15.909091,92,0.963956
2,10002,106,10,9.433962,109,1.142079
3,10003,80,10,12.5,82,0.859179
4,10004,7,1,14.285714,7,0.073345


In [9]:
# Select the zip codes to map. A zip code is kept only if BOTH hold:
#   * consent was asked for in less than 18% of its stops, and
#   * it accounts for more than 1% of the stops (PERCENT_FREQUENCY).
# Both masks are built on comparing_data, so the mask and the frame being
# filtered share the same index instead of relying on implicit alignment
# between two different frames.
rarely_asked = comparing_data['PERCENT_ASKED'] < 18
high_volume = comparing_data['PERCENT_FREQUENCY'] > 1

# .copy() makes the result an independent frame, so later column assignments
# on comparing_percents do not raise SettingWithCopyWarning.
comparing_percents = comparing_data.loc[rarely_asked & high_volume].copy()

In [10]:
# Summary statistics for the per-zip-code share of stops.
comparing_data.loc[:, 'PERCENT_FREQUENCY'].describe()

count    178.000000
mean       0.523301
std        0.439899
min        0.010478
25%        0.159786
50%        0.413873
75%        0.806790
max        2.378458
Name: PERCENT_FREQUENCY, dtype: float64

In [11]:
# Total of the per-zip-code shares; shows how much of the 9544-stop
# denominator the merged zip codes cover.
comparing_data.loc[:, 'PERCENT_FREQUENCY'].sum()

93.14752724224644

In [12]:
# Display the filtered zip codes (PERCENT_ASKED < 18 and PERCENT_FREQUENCY > 1).
comparing_percents

Unnamed: 0,STOP_LOCATION_ZIP_CODE,TOTAL_STOPPED,ASK_FOR_CONSENT_FLG,PERCENT_ASKED,FREQUENCY,PERCENT_FREQUENCY
2,10002,106,10,9.433962,109,1.142079
23,10025,114,17,14.912281,118,1.236379
25,10027,115,14,12.173913,120,1.257334
27,10029,220,25,11.363636,227,2.378458
33,10035,135,20,14.814815,140,1.46689
61,10453,158,25,15.822785,159,1.665968
63,10455,102,15,14.705882,102,1.068734
65,10457,100,15,15.0,106,1.110645
68,10460,102,17,16.666667,104,1.08969
74,10467,105,11,10.47619,108,1.131601


In [13]:
# Convert the zip codes to strings before joining them to the GeoJSON.
# NOTE(review): presumably the GeoJSON `postalCode` property is a string --
# confirm against the file.
# .assign builds a new frame rather than writing a column into a filtered
# slice, which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
comparing_percents = comparing_percents.assign(
    STOP_LOCATION_ZIP_CODE=comparing_percents['STOP_LOCATION_ZIP_CODE'].astype(str)
)

In [14]:
# Base map of NYC, centered on the given latitude/longitude.
# `zoom_start` is the folium.Map argument that sets the initial zoom level;
# the previous `default_zoom_start` keyword is not a folium.Map parameter, so
# the requested zoom was never applied. Lower the value to zoom out.
# NOTE: the name `map` shadows the Python built-in; it is kept because the
# following cells refer to it.
map = folium.Map(location=[40.693943, -73.985880], zoom_start=15)

In [15]:
# Shade each selected zip code by how often consent was asked for.
#
# NOTE(review): the original call plotted a 'WEIGHTED' column, but no cell in
# this notebook creates one on comparing_percents. PERCENT_ASKED is plotted
# instead, and the legend is updated to match. If a frequency-weighted metric
# is intended, add that column to comparing_percents first and name it here.
#
# folium.Choropleth(...).add_to(map) replaces the deprecated Map.choropleth
# method. The trailing semicolon suppresses the layer's repr in the cell output.
folium.Choropleth(
    geo_data="data/nyc-zip-code-tabulation-areas-polygons.geojson",
    data=comparing_percents,
    columns=['STOP_LOCATION_ZIP_CODE', 'PERCENT_ASKED'],
    key_on='feature.properties.postalCode',
    fill_color='YlOrBr',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Percent Asked for Consent',
).add_to(map);



IndexError: list index out of range

In [None]:
# Render the folium map inline as the cell output.
map

So what exactly does this map represent? The shaded zip codes are zip codes where consent was asked for in less than 18% of stops (18% is the 75th percentile of our data). Of the zip codes below that threshold, only those that made up more than 1 percent of the total data are shown. Now I've scaled