In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import cluster
from sklearn.metrics import silhouette_score
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error

import geopandas as gpd
from fiona.crs import from_epsg
import shapely
import urllib.request
import requests
import json
import os



In [2]:
df = pd.read_csv("top20com_clean.csv", index_col=0)
df.head()

Unnamed: 0,zipcode,type,201001,201002,201003,201004,201005,201006,201007,201008,...,201603,201604,201605,201606,201607,201608,201609,201610,201611,201612
0,11436,Street Condition,24.0,16.0,41.0,14.0,5.0,13.0,3.0,3.0,...,19.0,21.0,12.0,11.0,9.0,9.0,6.0,8.0,10.0,15.0
1,11213,Street Condition,32.0,68.0,124.0,88.0,55.0,19.0,31.0,46.0,...,58.0,51.0,24.0,32.0,26.0,55.0,33.0,28.0,22.0,25.0
2,11212,Street Condition,56.0,72.0,151.0,68.0,52.0,33.0,69.0,34.0,...,54.0,40.0,49.0,45.0,45.0,48.0,53.0,31.0,26.0,34.0
3,11225,Street Condition,13.0,32.0,62.0,62.0,39.0,10.0,25.0,13.0,...,44.0,24.0,34.0,23.0,23.0,29.0,30.0,26.0,12.0,20.0
4,11218,Street Condition,127.0,129.0,265.0,146.0,117.0,57.0,84.0,90.0,...,107.0,44.0,57.0,75.0,58.0,84.0,44.0,35.0,25.0,28.0


In [3]:
df.shape

(3497, 86)

In [4]:
for i in range(2,30):
    year = (i-2) // 4 + 2010
    quarter = (i-1) % 4
    if quarter == 0:
        quarter = 4
    print(year, quarter)
    df[str(year) + str(quarter)] = df.iloc[:,(3*i-2):3*i+1].sum(axis=1)

2010 1
2010 2
2010 3
2010 4
2011 1
2011 2
2011 3
2011 4
2012 1
2012 2
2012 3
2012 4
2013 1
2013 2
2013 3
2013 4
2014 1
2014 2
2014 3
2014 4
2015 1
2015 2
2015 3
2015 4
2016 1
2016 2
2016 3
2016 4


In [5]:
df.shape

(3497, 114)

In [6]:
df_quarter = df.drop(df.columns[2:86],axis=1)

In [7]:
df_quarter.head()

Unnamed: 0,zipcode,type,20101,20102,20103,20104,20111,20112,20113,20114,...,20143,20144,20151,20152,20153,20154,20161,20162,20163,20164
0,11436,Street Condition,60.0,19.0,21.0,44.0,69.0,31.0,32.0,28.0,...,22.0,46.0,105.0,46.0,39.0,34.0,52.0,29.0,24.0,94.0
1,11213,Street Condition,267.0,96.0,67.0,145.0,181.0,120.0,120.0,94.0,...,67.0,104.0,304.0,155.0,96.0,84.0,133.0,113.0,83.0,388.0
2,11212,Street Condition,271.0,136.0,68.0,158.0,146.0,180.0,122.0,111.0,...,96.0,223.0,310.0,199.0,130.0,199.0,143.0,138.0,110.0,441.0
3,11225,Street Condition,163.0,48.0,80.0,70.0,148.0,100.0,56.0,60.0,...,34.0,54.0,204.0,177.0,57.0,48.0,102.0,75.0,68.0,231.0
4,11218,Street Condition,528.0,231.0,202.0,292.0,268.0,166.0,117.0,82.0,...,98.0,143.0,382.0,222.0,127.0,118.0,208.0,217.0,104.0,787.0


In [8]:
df_quarter.to_csv("complaintQuarterly.csv")

In [9]:
df_quarter = pd.read_csv("complaintQuarterly.csv", index_col=0)
df_quarter.head(2)

Unnamed: 0,zipcode,type,20101,20102,20103,20104,20111,20112,20113,20114,...,20143,20144,20151,20152,20153,20154,20161,20162,20163,20164
0,11436,Street Condition,60.0,19.0,21.0,44.0,69.0,31.0,32.0,28.0,...,22.0,46.0,105.0,46.0,39.0,34.0,52.0,29.0,24.0,94.0
1,11213,Street Condition,267.0,96.0,67.0,145.0,181.0,120.0,120.0,94.0,...,67.0,104.0,304.0,155.0,96.0,84.0,133.0,113.0,83.0,388.0


In [10]:
df_quarter.fillna(0, inplace=True)

In [11]:
df_quarter['type'].unique()

array(['Street Condition', 'PLUMBING', 'Street Light Condition',
       'Noise - Residential', 'Traffic Signal Condition',
       'General Construction/Plumbing', 'Blocked Driveway', 'ELECTRIC',
       'Damaged Tree', 'Building/Use', 'Graffiti', 'Taxi Complaint',
       'Noise - Commercial', 'Rodent', 'SCRIE', 'Consumer Complaint',
       'Sidewalk Condition', 'Derelict Vehicle', 'Broken Muni Meter',
       'APPLIANCE'], dtype=object)

In [12]:
df_quarter.drop(df_quarter[df_quarter['type']=='Consumer Complaint'].index, inplace=True)

In [13]:
df_quarter.shape

(3312, 30)

In [14]:
change = {'Street Condition':'Streets_Sidewalks', 
          'PLUMBING':'Home',
           'Street Light Condition':'Streets_Sidewalks',
         'Noise - Residential':'Noise',
         'Traffic Signal Condition':'Transportation',
         'General Construction/Plumbing':'Home',
         'Blocked Driveway':'Transportation',
         'ELECTRIC':'Home',
         'Damaged Tree':'Public_Health_Safety',
         'Building/Use':'Home',
         'Graffiti':'Home',
         'Taxi Complaint':'Vehicles_Parking',
         'Noise - Commercial':'Noise',
          'Rodent':'Public_Health_Safety',
          'SCRIE':'Home',
          'Sidewalk Condition':'Streets_Sidewalks',
          'Derelict Vehicle':'Vehicles_Parking',
          'Broken Muni Meter':'Vehicles_Parking',
         'APPLIANCE':'Home'}

In [15]:
df_quarter['general_type'] = df_quarter['type'].map(change)

In [16]:
df_quarter.head()

Unnamed: 0,zipcode,type,20101,20102,20103,20104,20111,20112,20113,20114,...,20144,20151,20152,20153,20154,20161,20162,20163,20164,general_type
0,11436,Street Condition,60.0,19.0,21.0,44.0,69.0,31.0,32.0,28.0,...,46.0,105.0,46.0,39.0,34.0,52.0,29.0,24.0,94.0,Streets_Sidewalks
1,11213,Street Condition,267.0,96.0,67.0,145.0,181.0,120.0,120.0,94.0,...,104.0,304.0,155.0,96.0,84.0,133.0,113.0,83.0,388.0,Streets_Sidewalks
2,11212,Street Condition,271.0,136.0,68.0,158.0,146.0,180.0,122.0,111.0,...,223.0,310.0,199.0,130.0,199.0,143.0,138.0,110.0,441.0,Streets_Sidewalks
3,11225,Street Condition,163.0,48.0,80.0,70.0,148.0,100.0,56.0,60.0,...,54.0,204.0,177.0,57.0,48.0,102.0,75.0,68.0,231.0,Streets_Sidewalks
4,11218,Street Condition,528.0,231.0,202.0,292.0,268.0,166.0,117.0,82.0,...,143.0,382.0,222.0,127.0,118.0,208.0,217.0,104.0,787.0,Streets_Sidewalks


In [17]:
df_agg = df_quarter.iloc[:2,:].drop(['type'], axis=1)
df_agg

Unnamed: 0,zipcode,20101,20102,20103,20104,20111,20112,20113,20114,20121,...,20144,20151,20152,20153,20154,20161,20162,20163,20164,general_type
0,11436,60.0,19.0,21.0,44.0,69.0,31.0,32.0,28.0,23.0,...,46.0,105.0,46.0,39.0,34.0,52.0,29.0,24.0,94.0,Streets_Sidewalks
1,11213,267.0,96.0,67.0,145.0,181.0,120.0,120.0,94.0,82.0,...,104.0,304.0,155.0,96.0,84.0,133.0,113.0,83.0,388.0,Streets_Sidewalks


In [18]:
uniq_gene_type = df_quarter['general_type'].unique()

In [19]:
mapdict = {}
for i,colname in enumerate(df_quarter.columns[2:-1]):
    mapdict[colname] = 'mean'

In [20]:
update_dict = {'general_type':'first'}
update_dict.update(mapdict)
for i, name in enumerate(uniq_gene_type):
    print(df_quarter[df_quarter['general_type']==name]['type'].unique())
    df_temp = df_quarter[df_quarter['general_type']==name]
    
    df_sum = df_temp.groupby(
        by='zipcode', as_index=False).agg(update_dict)
    print(df_sum.shape)
    df_agg = pd.concat([df_agg, df_sum], ignore_index=True)
    print(df_agg.shape)
    

['Street Condition' 'Street Light Condition' 'Sidewalk Condition']
(191, 30)
(193, 30)
['PLUMBING' 'General Construction/Plumbing' 'ELECTRIC' 'Building/Use'
 'Graffiti' 'SCRIE' 'APPLIANCE']
(182, 30)
(375, 30)
['Noise - Residential' 'Noise - Commercial']
(183, 30)
(558, 30)
['Traffic Signal Condition' 'Blocked Driveway']
(178, 30)
(736, 30)
['Damaged Tree' 'Rodent']
(181, 30)
(917, 30)
['Taxi Complaint' 'Derelict Vehicle' 'Broken Muni Meter']
(191, 30)
(1108, 30)


In [21]:
df_agg.head()

Unnamed: 0,20101,20102,20103,20104,20111,20112,20113,20114,20121,20122,...,20151,20152,20153,20154,20161,20162,20163,20164,general_type,zipcode
0,60.0,19.0,21.0,44.0,69.0,31.0,32.0,28.0,23.0,27.0,...,105.0,46.0,39.0,34.0,52.0,29.0,24.0,94.0,Streets_Sidewalks,11436
1,267.0,96.0,67.0,145.0,181.0,120.0,120.0,94.0,82.0,78.0,...,304.0,155.0,96.0,84.0,133.0,113.0,83.0,388.0,Streets_Sidewalks,11213
2,1.0,1.0,3.0,0.0,3.0,0.0,2.0,2.0,3.0,0.0,...,15.0,7.0,6.0,9.0,2.0,3.0,2.0,4.0,Streets_Sidewalks,10000
3,81.0,42.333333,66.666667,57.0,71.0,37.666667,34.0,44.666667,39.333333,44.333333,...,96.666667,74.0,54.0,48.0,72.333333,53.0,42.666667,135.666667,Streets_Sidewalks,10001
4,150.333333,127.333333,131.333333,147.666667,165.0,90.666667,93.333333,66.666667,74.666667,76.666667,...,115.333333,97.666667,94.0,102.0,105.333333,102.333333,69.666667,300.0,Streets_Sidewalks,10002


In [22]:
df_agg.drop([0,1], inplace=True)
df_agg = df_agg.round(2)
df_agg.head()

Unnamed: 0,20101,20102,20103,20104,20111,20112,20113,20114,20121,20122,...,20151,20152,20153,20154,20161,20162,20163,20164,general_type,zipcode
2,1.0,1.0,3.0,0.0,3.0,0.0,2.0,2.0,3.0,0.0,...,15.0,7.0,6.0,9.0,2.0,3.0,2.0,4.0,Streets_Sidewalks,10000
3,81.0,42.33,66.67,57.0,71.0,37.67,34.0,44.67,39.33,44.33,...,96.67,74.0,54.0,48.0,72.33,53.0,42.67,135.67,Streets_Sidewalks,10001
4,150.33,127.33,131.33,147.67,165.0,90.67,93.33,66.67,74.67,76.67,...,115.33,97.67,94.0,102.0,105.33,102.33,69.67,300.0,Streets_Sidewalks,10002
5,102.33,79.33,115.67,116.0,124.67,81.67,60.0,60.0,70.67,72.33,...,172.33,112.33,72.0,49.0,104.67,77.33,72.67,201.33,Streets_Sidewalks,10003
6,34.0,18.0,28.33,29.67,31.67,26.67,29.67,30.0,22.67,18.33,...,37.67,20.67,17.33,26.0,26.33,21.0,18.0,58.33,Streets_Sidewalks,10004


In [23]:
df_agg.shape

(1106, 30)

In [24]:
df_agg['zipcode'] = df_agg['zipcode'].astype('str')

In [25]:
df_agg.to_csv("generalTypeComplaintQuarterly.csv")

In [26]:
#urllib.request.urlretrieve('https://data.cityofnewyork.us/download/i8iw-xf4u/application%2Fzip', "file.gz")
#os.system("mv " + "file.gz " + os.getenv("PUIDATA"))
#os.system("unzip " + os.getenv("PUIDATA") + "/file.gz -d " + os.getenv("PUIDATA") + "/zipcode")
nyc = gpd.read_file(os.getenv("PUIDATA") + "/zipcode" + "/ZIP_CODE_040114.shp")
nyc.drop(['BLDGZIP', 'PO_NAME', 'STATE', 'COUNTY', 'ST_FIPS', 'CTY_FIPS', 'URL', 'SHAPE_AREA', 'SHAPE_LEN'], axis=1, inplace=True)
nyc.head()

Unnamed: 0,ZIPCODE,POPULATION,AREA,geometry
0,11436,18681.0,22699300.0,"POLYGON ((1038098.251871482 188138.3800067157,..."
1,11213,62426.0,29631000.0,"POLYGON ((1001613.712964058 186926.4395172149,..."
2,11212,83866.0,41972100.0,"POLYGON ((1011174.275535807 183696.33770971, 1..."
3,11225,56527.0,23698630.0,"POLYGON ((995908.3654508889 183617.6128015518,..."
4,11218,72280.0,36868800.0,"POLYGON ((991997.1134308875 176307.4958601296,..."


In [27]:
nyc['POPULATION'] = nyc['POPULATION'] / 1000
nyc['AREA'] = nyc['AREA'] / 1e6

In [28]:
nyc = nyc.merge(df_agg, left_on='ZIPCODE', right_on='zipcode')
nyc.drop(['zipcode'], axis=1, inplace=True)
nyc.head()

Unnamed: 0,ZIPCODE,POPULATION,AREA,geometry,20101,20102,20103,20104,20111,20112,...,20144,20151,20152,20153,20154,20161,20162,20163,20164,general_type
0,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",46.0,37.0,33.33,32.33,44.67,35.0,...,41.67,59.33,33.67,37.67,37.67,52.33,46.67,42.0,99.67,Streets_Sidewalks
1,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",31.67,27.33,39.0,22.67,29.5,27.33,...,10.5,14.67,18.67,13.67,10.17,15.5,17.0,15.17,62.17,Home
2,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",19.5,70.0,33.5,5.0,27.5,62.5,...,27.0,118.0,163.0,86.0,20.0,61.5,133.5,59.5,103.5,Noise
3,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",30.5,32.5,26.5,33.0,26.0,25.0,...,35.5,38.5,35.0,37.0,63.0,46.0,59.5,59.5,83.5,Transportation
4,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",29.0,32.0,25.5,11.0,24.5,57.0,...,2.5,10.5,23.0,16.5,6.5,22.5,35.0,16.0,64.0,Public_Health_Safety


In [29]:
nyc.dtypes

ZIPCODE          object
POPULATION      float64
AREA            float64
geometry         object
20101           float64
20102           float64
20103           float64
20104           float64
20111           float64
20112           float64
20113           float64
20114           float64
20121           float64
20122           float64
20123           float64
20124           float64
20131           float64
20132           float64
20133           float64
20134           float64
20141           float64
20142           float64
20143           float64
20144           float64
20151           float64
20152           float64
20153           float64
20154           float64
20161           float64
20162           float64
20163           float64
20164           float64
general_type     object
dtype: object

In [30]:
for i in range(4,32):
    col = nyc.columns[i] + "_p"
    nyc[col] = nyc.iloc[:,i] / nyc.iloc[:,1]

In [31]:
nyc.head()

Unnamed: 0,ZIPCODE,POPULATION,AREA,geometry,20101,20102,20103,20104,20111,20112,...,20143_p,20144_p,20151_p,20152_p,20153_p,20154_p,20161_p,20162_p,20163_p,20164_p
0,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",46.0,37.0,33.33,32.33,44.67,35.0,...,2.087683,2.230609,3.175954,1.802366,2.016487,2.016487,2.801242,2.49826,2.248274,5.335367
1,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",31.67,27.33,39.0,22.67,29.5,27.33,...,0.82972,0.562068,0.78529,0.999411,0.73176,0.544403,0.82972,0.910016,0.812055,3.32798
2,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",19.5,70.0,33.5,5.0,27.5,62.5,...,3.720358,1.445319,6.316578,8.725443,4.603608,1.070606,3.292115,7.146298,3.185054,5.540389
3,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",30.5,32.5,26.5,33.0,26.0,25.0,...,1.927092,1.900327,2.060918,1.873561,1.980622,3.37241,2.462395,3.185054,3.185054,4.469782
4,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",29.0,32.0,25.5,11.0,24.5,57.0,...,1.204432,0.133826,0.562068,1.231197,0.88325,0.347947,1.204432,1.873561,0.856485,3.425941


In [32]:
for i in range(4,32):
    col = nyc.columns[i] + "_a"
    nyc[col] = nyc.iloc[:,i] / nyc.iloc[:,2]

In [33]:
nyc.head()

Unnamed: 0,ZIPCODE,POPULATION,AREA,geometry,20101,20102,20103,20104,20111,20112,...,20143_a,20144_a,20151_a,20152_a,20153_a,20154_a,20161_a,20162_a,20163_a,20164_a
0,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",46.0,37.0,33.33,32.33,44.67,35.0,...,1.718115,1.83574,2.613738,1.483306,1.659523,1.659523,2.305358,2.056011,1.850278,4.390885
1,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",31.67,27.33,39.0,22.67,29.5,27.33,...,0.682841,0.462569,0.646276,0.822492,0.602221,0.448032,0.682841,0.748922,0.668303,2.738852
2,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",19.5,70.0,33.5,5.0,27.5,62.5,...,3.061769,1.189464,5.198399,7.18084,3.788664,0.881085,2.709335,5.88124,2.621227,4.559613
3,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",30.5,32.5,26.5,33.0,26.0,25.0,...,1.585952,1.563925,1.696088,1.541898,1.630007,2.775417,2.026495,2.621227,2.621227,3.678528
4,11436,18.681,22.699295,"POLYGON ((1038098.251871482 188138.3800067157,...",29.0,32.0,25.5,11.0,24.5,57.0,...,0.99122,0.110136,0.462569,1.013247,0.726895,0.286352,0.99122,1.541898,0.704868,2.819471


In [34]:
nyc.to_csv("complaintByPop_Area.csv")