In [1]:
import math

from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import scipy
import statsmodels.formula.api as smf
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action='ignore', module='scipy', message='^internal gelsd')

In [2]:
df = pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls')

In [3]:
# Delete first three rows
df = df.drop([0,1,2], axis=0)

# Make first row the column headers
df = df.reset_index(drop=True)
df.columns = df.iloc[0]
df = df.drop([0], axis=0)
df = df.reset_index(drop=True)

# Rename all column headers
df.columns = ['City', 'Population', 'Violent Crime', 'Murder and Nonnegligent Manslaughter', 'Rape (revised definition)', 'Rape (legacy definition)', 'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary', 'Larceny-Theft', 'Motor Vehicle Theft', 'Arson']

# Remove entire Unnamed: 4 or 'Rape revised defintion'
df = df.drop('Rape (revised definition)', axis=1)

# Remove null objects
df = df.drop([348, 349, 350], axis=0)
df = df.drop('Arson', axis=1)
df = df.dropna(how='all')

# Convert all columns from object to integer values
df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']] = df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']].astype(int)

# Remove New York and buffalo from data because it skews the data. Although the data is correct
df[df['City'] == 'New York']
df = df[(df['City']!='New York')&(df['City']!='Buffalo')]

# Change name of Property Crime so that it's one word
df['Propertycrime'] = df['Property Crime']
df = df.drop(columns=['Property Crime'])

df['Murder'] = df['Murder and Nonnegligent Manslaughter']
df = df.drop(columns=['Murder and Nonnegligent Manslaughter'])

df['Aggravated_Assault'] = df['Aggravated Assault']
df = df.drop(columns=['Aggravated Assault'])

In [4]:
corre = df.corr()
corre

Unnamed: 0,Population,Violent Crime,Rape (legacy definition),Robbery,Burglary,Larceny-Theft,Motor Vehicle Theft,Propertycrime,Murder,Aggravated_Assault
Population,1.0,0.796,0.737,0.778,0.767,0.871,0.804,0.857,0.666,0.799
Violent Crime,0.796,1.0,0.922,0.988,0.945,0.875,0.973,0.912,0.918,0.994
Rape (legacy definition),0.737,0.922,1.0,0.893,0.95,0.885,0.917,0.918,0.899,0.915
Robbery,0.778,0.988,0.893,1.0,0.932,0.853,0.975,0.893,0.935,0.965
Burglary,0.767,0.945,0.95,0.932,1.0,0.929,0.959,0.964,0.919,0.931
Larceny-Theft,0.871,0.875,0.885,0.853,0.929,1.0,0.89,0.994,0.813,0.871
Motor Vehicle Theft,0.804,0.973,0.917,0.975,0.959,0.89,1.0,0.928,0.928,0.954
Propertycrime,0.857,0.912,0.918,0.893,0.964,0.994,0.928,1.0,0.858,0.905
Murder,0.666,0.918,0.899,0.935,0.919,0.813,0.928,0.858,1.0,0.882
Aggravated_Assault,0.799,0.994,0.915,0.965,0.931,0.871,0.954,0.905,0.882,1.0


In [5]:
# Create Features: Population^2, Murder binary, Robbery, binary
# More specifically, create binary categories for murder and robery
df['Population^2'] = df['Population']**2
df['Murder'] = np.where(df['Murder']>0, '1', '0')
df['Robbery_binary'] = np.where(df['Robbery']>0, '1', '0')
df['Aggravated_Assault'] = np.where(df['Aggravated_Assault']>0, '1', '0')

#Convert new columns to int
df[['Population^2']] = df[['Population^2']].astype(int)
df[['Murder']] = df[['Murder']].astype(int)
df[['Robbery_binary']] = df[['Robbery_binary']].astype(int)
df[['Aggravated_Assault']] = df[['Aggravated_Assault']].astype(int)

In [6]:
# Run Model
regr = linear_model.LinearRegression()
y = df['Propertycrime'].values.reshape(-1, 1)
x = df[['Population', 'Murder', 'Robbery_binary', 'Aggravated_Assault']]
regr.fit(x,y)

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(x,y))


Coefficients: 
 [[ 2.84659728e-02  1.13708502e+02 -1.76824841e+01 -3.04457339e+01]]

Intercept: 
 [-63.70744697]

R-squared:
 0.7370742803991965


  linalg.lstsq(X, y)


In [7]:
# Cross Validation test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=20)
print('With 30% holdout: ' + str(regr.fit(x_train, y_train,).score(x_test, y_test)))
print('Testing on sample:' + str(regr.fit(x,y).score(x,y)))

With 30% holdout: 0.7407459517354967
Testing on sample:0.7370742803991965


In [8]:
# Cross-validation with mutliple folds
from sklearn.model_selection import cross_val_score

cross_val_score(regr, x, y, cv=5)

array([0.81387988, 0.61795213, 0.5965922 , 0.69434235, 0.61012193])

In [9]:
# Test for significance in parameters
linear_formula = 'Propertycrime ~ Population+Murder+Robbery_binary+Aggravated_Assault'

# Fit the model to our data using formula
lm = smf.ols(formula=linear_formula, data=df).fit()

In [10]:
lm.params

Intercept            -63.707
Population             0.028
Murder               113.709
Robbery_binary       -17.682
Aggravated_Assault   -30.446
dtype: float64

In [11]:
lm.pvalues

Intercept            0.220
Population           0.000
Murder               0.137
Robbery_binary       0.740
Aggravated_Assault   0.630
dtype: float64

In [12]:
# Create new Multivariable linear regression on the following model
# Propertycrime = Robbery_binary + Burglary + Larcenytheft + Motorvehicletheft
# Propertycrime = Robbery + Burglary + Larcenytheft + Motorvehicletheft
# Propertycrime = Robbery_binary + Burglary_binary + Larcenytheft_binary + Motorvehicletheft_binary


In [13]:
# Create PCA on the follwing variables: Burglary, Violent Crime, Aggravated_Assault, Larceny-Theft, Motor-Vehicle Theft, Robbery

#Standardize Data
features = ['Burglary', 'Violent Crime', 'Aggravated_Assault', 'Larceny-Theft']
x = df.loc[:, features].values

scaler = StandardScaler()

scaler.fit(x)

pd.DataFrame(data=x, columns=features).head()

# PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

df = pd.concat([principalDf, df], axis=1)
df.head()



Unnamed: 0,principal component 1,principal component 2,City,Population,Violent Crime,Rape (legacy definition),Robbery,Burglary,Larceny-Theft,Motor Vehicle Theft,Propertycrime,Murder,Aggravated_Assault,Population^2,Robbery_binary
0,-274.609,32.066,Adams Village,1861.0,0.0,0.0,0.0,2.0,10.0,0.0,12.0,0.0,0.0,3463321.0,0.0
1,-264.341,31.148,Addison Town and Village,2577.0,3.0,0.0,0.0,3.0,20.0,1.0,24.0,0.0,1.0,6640929.0,0.0
2,-269.562,31.81,Akron Village,2846.0,3.0,0.0,0.0,1.0,15.0,0.0,16.0,0.0,1.0,8099716.0,0.0
3,3113.889,-170.734,Albany,97956.0,791.0,30.0,227.0,705.0,3243.0,142.0,4090.0,1.0,1.0,9595377936.0,1.0
4,-110.596,21.715,Albion Village,6388.0,23.0,3.0,4.0,53.0,165.0,5.0,223.0,0.0,1.0,40806544.0,1.0
