In [1]:
import math

from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import scipy
import statsmodels.formula.api as smf
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action='ignore', module='scipy', message='^internal gelsd')

In [2]:
# Read data using pandas
df = pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls')

# Delete first three rows
df = df.drop([0,1,2], axis=0)

# Make first row the column headers
df = df.reset_index(drop=True)
df.columns = df.iloc[0]
df = df.drop([0], axis=0)
df = df.reset_index(drop=True)

# Rename all column headers
df.columns = ['City', 'Population', 'Violent Crime', 'Murder and Nonnegligent Manslaughter', 'Rape (revised definition)', 'Rape (legacy definition)', 'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary', 'Larceny-Theft', 'Motor Vehicle Theft', 'Arson']

# Remove entire Unnamed: 4 or 'Rape revised defintion'
df = df.drop('Rape (revised definition)', axis=1)

# Remove null objects
df = df.drop([348, 349, 350], axis=0)
df = df.drop('Arson', axis=1)
df = df.dropna(how='all')

# Convert all columns from object to integer values
df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']] = df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']].astype(int)

# Remove New York and buffalo from data because it skews the data. Although the data is correct
df[df['City'] == 'New York']
df = df[(df['City']!='New York')&(df['City']!='Buffalo')]

# Change name of Property Crime so that it's one word
df['Propertycrime'] = df['Property Crime']
df = df.drop(columns=['Property Crime'])

df['Murder'] = df['Murder and Nonnegligent Manslaughter']
df = df.drop(columns=['Murder and Nonnegligent Manslaughter'])

df['Aggravated_Assault'] = df['Aggravated Assault']
df = df.drop(columns=['Aggravated Assault'])

In [3]:
df.head()

Unnamed: 0,City,Population,Violent Crime,Rape (legacy definition),Robbery,Burglary,Larceny-Theft,Motor Vehicle Theft,Propertycrime,Murder,Aggravated_Assault
0,Adams Village,1861,0,0,0,2,10,0,12,0,0
1,Addison Town and Village,2577,3,0,0,3,20,1,24,0,3
2,Akron Village,2846,3,0,0,1,15,0,16,0,3
3,Albany,97956,791,30,227,705,3243,142,4090,8,526
4,Albion Village,6388,23,3,4,53,165,5,223,0,16


In [4]:
# Create Features: Population^2, Murder binary, Robbery, binary
# More specifically, create binary categories for murder and robery
df['Population^2'] = df['Population']**2
df['Murder'] = np.where(df['Murder']>0, '1', '0')
df['Robbery_binary'] = np.where(df['Robbery']>0, '1', '0')
#df['Aggravated_Assault'] = np.where(df['Aggravated_Assault']>0, '1', '0')

#Convert new columns to int
df[['Population^2']] = df[['Population^2']].astype(int)
df[['Murder']] = df[['Murder']].astype(int)
df[['Robbery_binary']] = df[['Robbery_binary']].astype(int)
df[['Aggravated_Assault']] = df[['Aggravated_Assault']].astype(int)

In [5]:
# Run Model
regr = linear_model.LinearRegression()
y = df['Propertycrime'].values.reshape(-1, 1)
x = df[['Population','Population^2', 'Murder', 'Robbery_binary']]
regr.fit(x,y)

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(x,y))


Coefficients: 
 [[1.74484226e-02 7.28298619e-08 1.84108347e+02 6.20590854e+01]]

Intercept: 
 [-35.45330011]

R-squared:
 0.7572767492306005


  linalg.lstsq(X, y)


In [6]:
# Cross Validation test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=20)
print('With 30% holdout: ' + str(regr.fit(x_train, y_train,).score(x_test, y_test)))
print('Testing on sample:' + str(regr.fit(x,y).score(x,y)))

With 30% holdout: 0.7686916795435182
Testing on sample:0.7572767492306005


In [7]:
# Cross-validation with mutliple folds
from sklearn.model_selection import cross_val_score

cross_val_score(regr, x, y, cv=5)

array([0.81409112, 0.71966237, 0.74405329, 0.61140936, 0.21822869])

In [8]:
# Test for significance in parameters
linear_formula = 'Propertycrime ~ Population+Population^2+Murder+Robbery_binary'

# Fit the model to our data using formula
lm = smf.ols(formula=linear_formula, data=df).fit()

In [9]:
lm.params

Intercept        -80.074
Population         8.359
Population ^ 2    -8.330
Murder           114.402
Robbery_binary   -27.541
dtype: float64

In [10]:
lm.pvalues

Intercept        0.025
Population       0.456
Population ^ 2   0.458
Murder           0.135
Robbery_binary   0.578
dtype: float64

In [11]:
# Create PCA on the follwing variables: Burglary, Violent Crime, Aggravated_Assault, Larceny-Theft, Motor-Vehicle Theft, Robbery

# Standardize Data
features = ['Burglary', 'Violent Crime', 'Aggravated_Assault', 'Larceny-Theft']
x = df.loc[:, features].values

scaler = StandardScaler()

scaler.fit(x)

pd.DataFrame(data=x, columns=features).head()

# PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

# Create Merged Data Frame
principalDf = pd.DataFrame(data = principalComponents, columns = ['principalcomponent_1', 'principalcomponent_2'])
principalDf['City'] = df['City']
merged_inner = pd.merge(left=df, right=principalDf, left_on='City', right_on='City')
df = merged_inner



In [12]:
# Run Model
regr = linear_model.LinearRegression()
y = df['Propertycrime'].values.reshape(-1, 1)
x = df[['Population','Population^2', 'Murder', 'Robbery_binary', 'principalcomponent_1', 'principalcomponent_2']]
regr.fit(x,y)

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(x,y))


Coefficients: 
 [[1.04847039e-02 1.70042096e-07 1.90643236e+02 9.79970918e+01
  1.52002213e-02 6.42797921e-02]]

Intercept: 
 [-8.23669611]

R-squared:
 0.8831188362844727


In [13]:
# Cross Validation test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=50)
print('With 30% holdout: ' + str(regr.fit(x_train, y_train,).score(x_test, y_test)))
print('Testing on sample:' + str(regr.fit(x,y).score(x,y)))

With 30% holdout: 0.604523093919447
Testing on sample:0.8831188362844727
