# Import

In [1]:
%matplotlib inline
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
from sklearn import datasets
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder, label_binarize

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve
from scipy.stats import randint as sp_randint


# Data Load

In [2]:
arrest_data = pd.read_csv("arrests_w_census_loc.csv")
arrest_data.PERP_RACE[arrest_data.PERP_RACE.str.contains("WHITE")]=0
arrest_data.PERP_RACE[arrest_data.PERP_RACE != 0]=1
arrest_data = arrest_data.groupby(["PERP_RACE","BlockLocation"]).size().reset_index(name='counts')
blockLocation = arrest_data["BlockLocation"]
blockLat = [float(re.findall(r'[-\d\.]+', bl)[0]) for bl in blockLocation]
blockLon = [float(re.findall(r'[-\d\.]+', bl)[1]) for bl in blockLocation]
arrest_data["blockLat"]=blockLat
arrest_data["blockLon"]=blockLon
arrest_data = arrest_data.drop("BlockLocation", axis=1)
arrest_data = arrest_data.rename(columns={"counts": "Num_Arrests", "PERP_RACE": "Race"})
block_data = pd.read_csv("census_block_loc.csv")
block_data = pd.merge(left=arrest_data, right=block_data,
                      left_on=["blockLat","blockLon"], right_on=["Latitude","Longitude"])
census_data = pd.read_csv("nyc_census_tracts.csv")
tracts = block_data["BlockCode"]
tracts = [int(str(tract)[:-4]) for tract in tracts]
block_data["tracts"]=tracts
block_data = block_data.drop(columns=["Latitude","Longitude","BlockCode","County","blockLat","blockLon"])
data = pd.merge(left=block_data, right=census_data, left_on="tracts", right_on="CensusTract")
data = data.drop("tracts", axis=1)
data = data.dropna()
data.head(n=7)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Race,Num_Arrests,State,CensusTract,County,Borough,TotalPop,Men,Women,Hispanic,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,0,1,NY,36085024402,Richmond,Staten Island,4241,2023,2218,3.7,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
1,0,4,NY,36085024402,Richmond,Staten Island,4241,2023,2218,3.7,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
2,0,1,NY,36085024402,Richmond,Staten Island,4241,2023,2218,3.7,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
3,0,1,NY,36085024402,Richmond,Staten Island,4241,2023,2218,3.7,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
4,0,2,NY,36085024402,Richmond,Staten Island,4241,2023,2218,3.7,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
5,1,1,NY,36085024402,Richmond,Staten Island,4241,2023,2218,3.7,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
6,0,3,NY,36085024401,Richmond,Staten Island,6408,2755,3653,8.3,...,0.1,0.0,3.9,44.6,2703,70.2,29.3,0.6,0.0,6.6


In [3]:
data.shape

(10524, 39)

In [4]:
data_train = data.sample(frac=0.7,random_state=200) #meant to randomly grab 70% of data by row for test
data_test  = data.drop(data_train.index)

In [5]:
cols = data.shape[1] #num columns

X = data_train.iloc[:,0:cols-1] # iloc slicing function , split the data into X and Y
y = data_train.iloc[:,cols-1:cols]

cat_cols = ['Race','State','CensusTract','County','Borough']
num_cols = ['TotalPop','Men','Women','Hispanic','White','Black','Native','Asian','Citizen','Income','IncomeErr','IncomePerCap','IncomePerCapErr','Poverty','ChildPoverty','Professional','Service','Office','Construction','Production','Drive','Carpool','Transit','Walk','OtherTransp','WorkAtHome','MeanCommute','Employed','PrivateWork','PublicWork','SelfEmployed','FamilyWork','Unemployment']
features = cat_cols + num_cols

encoder = LabelEncoder()

y = encoder.fit_transform(data['State'])
y = encoder.fit_transform(data['County'])
y = encoder.fit_transform(data['Borough'])
y = encoder.fit_transform(data['Race'])
y = encoder.fit_transform(data['CensusTract'])


X = data.filter(features)

for var in cat_cols:
    X[var] = encoder.fit_transform(X[var])

X = X.replace('Kings',1)
X = X.replace('Queens',2)
X = X.replace('Bronx',3)
X = X.replace('New York',4)
X = X.replace('Brooklyn',5)
X = X.replace('Manhattan',6)
X = X.replace('NaN',0)

X.head(5)


Unnamed: 0,Race,State,CensusTract,County,Borough,TotalPop,Men,Women,Hispanic,White,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
1,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
2,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
3,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
4,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3


In [7]:
Xt = data_test.iloc[:,0:cols-1] # iloc slicing function , split the data into X and Y
yt = data_test.iloc[:,cols-1:cols]

cat_cols = ['Race','State','CensusTract','County','Borough']
num_cols = ['TotalPop','Men','Women','Hispanic','White','Black','Native','Asian','Citizen','Income','IncomeErr','IncomePerCap','IncomePerCapErr','Poverty','ChildPoverty','Professional','Service','Office','Construction','Production','Drive','Carpool','Transit','Walk','OtherTransp','WorkAtHome','MeanCommute','Employed','PrivateWork','PublicWork','SelfEmployed','FamilyWork','Unemployment']
features = cat_cols + num_cols

encoder = LabelEncoder()

yt = encoder.fit_transform(data['State'])
yt = encoder.fit_transform(data['County'])
yt = encoder.fit_transform(data['Borough'])
yt = encoder.fit_transform(data['Race'])
yt = encoder.fit_transform(data['CensusTract'])


Xt = data.filter(features)

for var in cat_cols:
    Xt[var] = encoder.fit_transform(Xt[var])

Xt = Xt.replace('Kings',1)
Xt = Xt.replace('Queens',2)
Xt = Xt.replace('Bronx',3)
Xt = Xt.replace('New York',4)
Xt = Xt.replace('Brooklyn',5)
Xt = Xt.replace('Manhattan',6)


Xt.head(5)


Unnamed: 0,Race,State,CensusTract,County,Borough,TotalPop,Men,Women,Hispanic,White,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
1,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
2,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
3,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3
4,0,0,2038,4,4,4241,2023,2218,3.7,84.5,...,1.1,0.6,4.0,44.3,2046,75.2,21.2,3.6,0.0,8.3


In [8]:
Xt.columns

Index(['Race', 'State', 'CensusTract', 'County', 'Borough', 'TotalPop', 'Men',
       'Women', 'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Citizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')

In [12]:
# Fit regression models - testing different kernels
svr_rbf = SVR(kernel='rbf', gamma=0.1, epsilon=.1)
svr_lin = SVR(kernel='linear', gamma='auto')
svr_poly = SVR(kernel='poly', gamma='auto', degree=3, epsilon=.1,
               coef0=1)

# fit model
svr_rbf.fit(X.values,y)

# test model
y_pred = svr_rbf.predict(Xt.values[:500, :])
mse = mean_squared_error(yt[:500], y_pred)
print('Report: \n')
print(mse)

Report: 

633959.5180597845


In [None]:
def model_fit(X,y):
    clf = SVR(gamma='scale', C=1.0, epsilon=0.2)
    clf.fit(X,y)
    print('================'+str(my_kernel)+'==================')
    return clf

In [None]:
def cross_val(clf, X, y ):
    CV = KFold(n_splits=50, random_state=None, shuffle=False)
    y_pred = clf.predict(X)
    mse = mean_squared_error(y, y_pred)
    print('Report: \n')
    print(mse)

## don't run this one until ready 

In [None]:
kernals = ['linear', 'poly', 'rbf', 'sigmoid']
CV = 10
#for kernal in kernals:
clf = model_fit(X.values, y)
#cross_val()

In [None]:
cross_val(clf,w.values[:500, :],z[:500])

In [None]:
X['Non-White'] = 100 - X['White']

In [None]:
X = X.drop(columns=['Hispanic','Black','Native','Asian'])