In [1]:
import math

from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
import scipy
import statsmodels.formula.api as smf
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action='ignore', module='scipy', message='^internal gelsd')

In [2]:
df = pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls')

In [3]:
# Delete first three rows
df = df.drop([0,1,2], axis=0)

# Make first row the column headers
df = df.reset_index(drop=True)
df.columns = df.iloc[0]
df = df.drop([0], axis=0)
df = df.reset_index(drop=True)

# Rename all column headers
df.columns = ['City', 'Population', 'Violent Crime', 'Murder and Nonnegligent Manslaughter', 'Rape (revised definition)', 'Rape (legacy definition)', 'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary', 'Larceny-Theft', 'Motor Vehicle Theft', 'Arson']

# Remove entire Unnamed: 4 or 'Rape revised defintion'
df = df.drop('Rape (revised definition)', axis=1)

# Remove null objects
df = df.drop([348, 349, 350], axis=0)
df = df.drop('Arson', axis=1)
df = df.dropna(how='all')

# Convert all columns from object to integer values
df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']] = df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']].astype(int)

# Remove New York and buffalo from data because it skews the data. Although the data is correct
df[df['City'] == 'New York']
df = df[df['City']!='New York']

# Change name of Property Crime so that it's one word
df['Propertycrime'] = df['Property Crime']
df = df.drop(columns=['Property Crime'])

In [4]:
# Create Features: Population^2, Murder binary, Robbery, binary
# More specifically, create binary categories for murder and robery
df['Population^2'] = df['Population']**2
df['Murder'] = np.where(df['Murder and Nonnegligent Manslaughter']>0, '1', '0')
df['Robbery_binary'] = np.where(df['Robbery']>0, '1', '0')
df

Unnamed: 0,City,Population,Violent Crime,Murder and Nonnegligent Manslaughter,Rape (legacy definition),Robbery,Aggravated Assault,Burglary,Larceny-Theft,Motor Vehicle Theft,Propertycrime,Population^2,Murder,Robbery_binary
0,Adams Village,1861,0,0,0,0,0,2,10,0,12,3463321,0,0
1,Addison Town and Village,2577,3,0,0,0,3,3,20,1,24,6640929,0,0
2,Akron Village,2846,3,0,0,0,3,1,15,0,16,8099716,0,0
3,Albany,97956,791,8,30,227,526,705,3243,142,4090,9595377936,1,1
4,Albion Village,6388,23,0,3,4,16,53,165,5,223,40806544,0,1
5,Alfred Village,4089,5,0,0,3,2,10,36,0,46,16719921,0,1
6,Allegany Village,1781,3,0,0,0,3,0,10,0,10,3171961,0,0
7,Amherst Town,118296,107,1,7,31,68,204,1882,32,2118,13993943616,1,1
8,Amityville Village,9519,9,0,2,4,3,16,188,6,210,90611361,0,1
9,Amsterdam,18182,30,0,0,12,18,99,291,15,405,330585124,0,1


In [5]:
#Convert new columns to int
df[['Population^2', 'Murder', 'Robbery_binary']] = df[['Population^2', 'Murder', 'Robbery_binary']].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 347 entries, 0 to 347
Data columns (total 14 columns):
City                                    347 non-null object
Population                              347 non-null int64
Violent Crime                           347 non-null int64
Murder and Nonnegligent Manslaughter    347 non-null int64
Rape (legacy definition)                347 non-null int64
Robbery                                 347 non-null int64
Aggravated Assault                      347 non-null int64
Burglary                                347 non-null int64
Larceny-Theft                           347 non-null int64
Motor Vehicle Theft                     347 non-null int64
Propertycrime                           347 non-null int64
Population^2                            347 non-null int64
Murder                                  347 non-null int64
Robbery_binary                          347 non-null int64
dtypes: int64(13), object(1)
memory usage: 40.7+ KB


In [6]:
regr = linear_model.LinearRegression()
y = df['Propertycrime'].values.reshape(-1, 1)
x = df[['Population^2', 'Murder', 'Robbery_binary']]
regr.fit(x,y)

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(x,y))


Coefficients: 
 [[1.70507593e-07 3.71055955e+02 2.13004398e+02]]

Intercept: 
 [38.65134045]

R-squared:
 0.8282588218885816


  linalg.lstsq(X, y)


In [7]:
# Cross Validation test
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=20)
print('With 30% holdout: ' + str(regr.fit(x_train, y_train,).score(x_test, y_test)))
print('Testing on sample:' + str(regr.fit(x,y).score(x,y)))

With 30% holdout: 0.6753155287671836
Testing on sample:0.8282588218885816


In [8]:
#Inconsistent holdout

In [9]:
# Cross-validation with mutliple folds
from sklearn.model_selection import cross_val_score

cross_val_score(regr, x, y, cv=5)

array([0.93622919, 0.61992638, 0.73406894, 0.87707096, 0.27939938])

In [10]:
#Inconsistent cross validation, must look into features and measure significance
df.head()

Unnamed: 0,City,Population,Violent Crime,Murder and Nonnegligent Manslaughter,Rape (legacy definition),Robbery,Aggravated Assault,Burglary,Larceny-Theft,Motor Vehicle Theft,Propertycrime,Population^2,Murder,Robbery_binary
0,Adams Village,1861,0,0,0,0,0,2,10,0,12,3463321,0,0
1,Addison Town and Village,2577,3,0,0,0,3,3,20,1,24,6640929,0,0
2,Akron Village,2846,3,0,0,0,3,1,15,0,16,8099716,0,0
3,Albany,97956,791,8,30,227,526,705,3243,142,4090,9595377936,1,1
4,Albion Village,6388,23,0,3,4,16,53,165,5,223,40806544,0,1


In [11]:
# Test for significance in parameters
linear_formula = 'Propertycrime ~ Population+Population^2+Murder+Robbery_binary'

# Fit the model to our data using formula
lm = smf.ols(formula=linear_formula, data=df).fit()

In [12]:
lm.params

Intercept        -108.768
Population         -1.666
Population ^ 2      1.700
Murder             14.084
Robbery_binary    -94.093
dtype: float64

In [13]:
lm.pvalues

Intercept        0.008
Population       0.897
Population ^ 2   0.895
Murder           0.872
Robbery_binary   0.097
dtype: float64

In [14]:
df.head()

Unnamed: 0,City,Population,Violent Crime,Murder and Nonnegligent Manslaughter,Rape (legacy definition),Robbery,Aggravated Assault,Burglary,Larceny-Theft,Motor Vehicle Theft,Propertycrime,Population^2,Murder,Robbery_binary
0,Adams Village,1861,0,0,0,0,0,2,10,0,12,3463321,0,0
1,Addison Town and Village,2577,3,0,0,0,3,3,20,1,24,6640929,0,0
2,Akron Village,2846,3,0,0,0,3,1,15,0,16,8099716,0,0
3,Albany,97956,791,8,30,227,526,705,3243,142,4090,9595377936,1,1
4,Albion Village,6388,23,0,3,4,16,53,165,5,223,40806544,0,1


In [15]:
# Create new Multivariable linear regression on the following model
# Propertycrime = Robbery_binary + Burglary + Larcenytheft + Motorvehicletheft
# Propertycrime = Robbery + Burglary + Larcenytheft + Motorvehicletheft
# Propertycrime = Robbery_binary + Burglary_binary + Larcenytheft_binary + Motorvehicletheft_binary


In [16]:
# Change name of variables to one word
df['Larcenytheft'] = df['Larceny-Theft']
df = df.drop(columns=['Larceny-Theft'])

df['Motorvehicletheft'] = df['Motor Vehicle Theft']
df = df.drop(columns=['Motor Vehicle Theft'])

In [17]:
df.head()

Unnamed: 0,City,Population,Violent Crime,Murder and Nonnegligent Manslaughter,Rape (legacy definition),Robbery,Aggravated Assault,Burglary,Propertycrime,Population^2,Murder,Robbery_binary,Larcenytheft,Motorvehicletheft
0,Adams Village,1861,0,0,0,0,0,2,12,3463321,0,0,10,0
1,Addison Town and Village,2577,3,0,0,0,3,3,24,6640929,0,0,20,1
2,Akron Village,2846,3,0,0,0,3,1,16,8099716,0,0,15,0
3,Albany,97956,791,8,30,227,526,705,4090,9595377936,1,1,3243,142
4,Albion Village,6388,23,0,3,4,16,53,223,40806544,0,1,165,5


In [18]:
# Propertycrime = Robbery_binary + Burglary + Larcenytheft + Motorvehicletheft
regr = linear_model.LinearRegression()
a = df['Propertycrime'].values.reshape(-1, 1)
b = df[['Robbery_binary', 'Burglary', 'Larcenytheft', 'Motorvehicletheft']]
regr.fit(a,b)

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(a,b))


Coefficients: 
 [[1.26560102e-04]
 [2.50146661e-01]
 [6.87755598e-01]
 [6.20977407e-02]]

Intercept: 
 [  0.5506028  -24.32170443  33.6910277   -9.36932327]

R-squared:
 0.9843468966715793


In [19]:
# Test for significance in parameters
linear_formula = 'Propertycrime ~ Robbery_binary+Burglary+Larcenytheft+Motorvehicletheft'

# Fit the model to our data using formula
lm = smf.ols(formula=linear_formula, data=df).fit()

In [20]:
lm.params

Intercept            0.000
Robbery_binary      -0.000
Burglary             1.000
Larcenytheft         1.000
Motorvehicletheft    1.000
dtype: float64

In [21]:
lm.pvalues

Intercept           0.000
Robbery_binary      0.000
Burglary            0.000
Larcenytheft        0.000
Motorvehicletheft   0.000
dtype: float64