In [1]:
import math

from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import scipy
import statsmodels.formula.api as smf
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action='ignore', module='scipy', message='^internal gelsd')

In [2]:
# Read data using pandas
df = pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls')

# Delete first three rows
df = df.drop([0,1,2], axis=0)

# Make first row the column headers
df = df.reset_index(drop=True)
df.columns = df.iloc[0]
df = df.drop([0], axis=0)
df = df.reset_index(drop=True)

# Rename all column headers
df.columns = ['City', 'Population', 'Violent Crime', 'Murder and Nonnegligent Manslaughter', 'Rape (revised definition)', 'Rape (legacy definition)', 'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary', 'Larceny-Theft', 'Motor Vehicle Theft', 'Arson']

# Remove entire Unnamed: 4 or 'Rape revised defintion'
df = df.drop('Rape (revised definition)', axis=1)

# Remove null objects
df = df.drop([348, 349, 350], axis=0)
df = df.drop('Arson', axis=1)
df = df.dropna(how='all')

# Convert all columns from object to integer values
df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']] = df[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']].astype(int)

# Remove New York and buffalo from data because it skews the data. Although the data is correct
df[df['City'] == 'New York']
df = df[(df['City']!='New York')&(df['City']!='Buffalo')]

# Change name of Property Crime so that it's one word
df['Propertycrime'] = df['Property Crime']
df = df.drop(columns=['Property Crime'])

df['Murder'] = df['Murder and Nonnegligent Manslaughter']
df = df.drop(columns=['Murder and Nonnegligent Manslaughter'])

df['Aggravated_Assault'] = df['Aggravated Assault']
df = df.drop(columns=['Aggravated Assault'])

In [3]:
df.head()

Unnamed: 0,City,Population,Violent Crime,Rape (legacy definition),Robbery,Burglary,Larceny-Theft,Motor Vehicle Theft,Propertycrime,Murder,Aggravated_Assault
0,Adams Village,1861,0,0,0,2,10,0,12,0,0
1,Addison Town and Village,2577,3,0,0,3,20,1,24,0,3
2,Akron Village,2846,3,0,0,1,15,0,16,0,3
3,Albany,97956,791,30,227,705,3243,142,4090,8,526
4,Albion Village,6388,23,3,4,53,165,5,223,0,16


In [4]:
# Create Features: Population^2, Murder binary, Robbery, binary
# More specifically, create binary categories for murder and robery
df['Population^2'] = df['Population']**2
df['Murder'] = np.where(df['Murder']>0, '1', '0')
df['Robbery_binary'] = np.where(df['Robbery']>0, '1', '0')
#df['Aggravated_Assault'] = np.where(df['Aggravated_Assault']>0, '1', '0')

#Convert new columns to int
df[['Population^2']] = df[['Population^2']].astype(int)
df[['Murder']] = df[['Murder']].astype(int)
df[['Robbery_binary']] = df[['Robbery_binary']].astype(int)
df[['Aggravated_Assault']] = df[['Aggravated_Assault']].astype(int)

In [5]:
# Run Model
regr = linear_model.LinearRegression()
y = df['Propertycrime'].values.reshape(-1, 1)
x = df[['Population','Population^2', 'Murder', 'Robbery_binary']]
regr.fit(x,y)

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(x,y))


Coefficients: 
 [[1.74484226e-02 7.28298619e-08 1.84108347e+02 6.20590854e+01]]

Intercept: 
 [-35.45330011]

R-squared:
 0.7572767492306005


  linalg.lstsq(X, y)


In [6]:
# Cross Validation test

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=30)
print('With 30% holdout: ' + str(regr.fit(x_train, y_train,).score(x_test, y_test)))
print('Testing on sample:' + str(regr.fit(x,y).score(x,y)))

With 30% holdout: 0.7503983756164383
Testing on sample:0.7572767492306005


In [7]:
# Cross-validation with mutliple folds
from sklearn.model_selection import cross_val_score

cross_val_score(regr, x, y, cv=5)

array([0.81409112, 0.71966237, 0.74405329, 0.61140936, 0.21822869])

In [8]:
# Test for significance in parameters
linear_formula = 'Propertycrime ~ Population+Population^2+Murder+Robbery_binary'

# Fit the model to our data using formula
lm = smf.ols(formula=linear_formula, data=df).fit()

In [9]:
lm.params

Intercept        -80.074
Population         8.359
Population ^ 2    -8.330
Murder           114.402
Robbery_binary   -27.541
dtype: float64

In [10]:
lm.pvalues

Intercept        0.025
Population       0.456
Population ^ 2   0.458
Murder           0.135
Robbery_binary   0.578
dtype: float64

In [11]:
df.corr()

Unnamed: 0,Population,Violent Crime,Rape (legacy definition),Robbery,Burglary,Larceny-Theft,Motor Vehicle Theft,Propertycrime,Murder,Aggravated_Assault,Population^2,Robbery_binary
Population,1.0,0.796,0.737,0.778,0.767,0.871,0.804,0.857,0.505,0.799,0.881,0.369
Violent Crime,0.796,1.0,0.922,0.988,0.945,0.875,0.973,0.912,0.453,0.994,0.869,0.198
Rape (legacy definition),0.737,0.922,1.0,0.893,0.95,0.885,0.917,0.918,0.442,0.915,0.776,0.225
Robbery,0.778,0.988,0.893,1.0,0.932,0.853,0.975,0.893,0.417,0.965,0.878,0.181
Burglary,0.767,0.945,0.95,0.932,1.0,0.929,0.959,0.964,0.43,0.931,0.793,0.231
Larceny-Theft,0.871,0.875,0.885,0.853,0.929,1.0,0.89,0.994,0.472,0.871,0.799,0.335
Motor Vehicle Theft,0.804,0.973,0.917,0.975,0.959,0.89,1.0,0.928,0.407,0.954,0.875,0.198
Propertycrime,0.857,0.912,0.918,0.893,0.964,0.994,0.928,1.0,0.466,0.905,0.816,0.308
Murder,0.505,0.453,0.442,0.417,0.43,0.472,0.407,0.466,1.0,0.469,0.355,0.308
Aggravated_Assault,0.799,0.994,0.915,0.965,0.931,0.871,0.954,0.905,0.469,1.0,0.852,0.206


In [12]:
# Create PCA on the follwing variables: Burglary, Violent Crime, Aggravated_Assault, Larceny-Theft, Motor-Vehicle Theft, Robbery

# Standardize Data
features = ['Burglary', 'Violent Crime', 'Aggravated_Assault', 'Larceny-Theft', 'Rape (legacy definition)', 'Motor Vehicle Theft', 'Propertycrime']
x = df.loc[:, features].values

scaler = StandardScaler()

scaler.fit(x)

pd.DataFrame(data=x, columns=features).head()

# PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

# Create Merged Data Frame
principalDf = pd.DataFrame(data = principalComponents, columns = ['principalcomponent_1', 'principalcomponent_2'])
principalDf['City'] = df['City']
merged_inner = pd.merge(left=df, right=principalDf, left_on='City', right_on='City')
df = merged_inner



In [13]:
# Run Model
regr = linear_model.LinearRegression()
y = df['Propertycrime'].values.reshape(-1, 1)
x = df[['Population','Population^2', 'Murder', 'Robbery_binary', 'principalcomponent_1', 'principalcomponent_2']]
regr.fit(x,y)

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(x,y))


Coefficients: 
 [[1.04836093e-02 1.70035566e-07 1.90494576e+02 9.79508108e+01
  9.14137204e-03 5.01724606e-02]]

Intercept: 
 [-8.18165976]

R-squared:
 0.8830907101191076


In [14]:
# Cross-validation with mutliple folds
from sklearn.model_selection import cross_val_score

cross_val_score(regr, x, y, cv=5)

array([0.78243918, 0.73993286, 0.7103486 , 0.91106629, 0.8731827 ])

In [15]:
# Test for significance in parameters
linear_formula = 'Propertycrime ~ Population+Population^2+Murder+Robbery_binary+principalcomponent_1+principalcomponent_2'

# Fit the model to our data using formula
lm = smf.ols(formula=linear_formula, data=df).fit()

In [16]:
lm.params

Intercept              -101.575
Population               11.409
Population ^ 2          -11.376
Murder                   57.454
Robbery_binary          -78.243
principalcomponent_1      0.001
principalcomponent_2      0.151
dtype: float64

In [17]:
lm.pvalues

Intercept              0.001
Population             0.248
Population ^ 2         0.249
Murder                 0.393
Robbery_binary         0.075
principalcomponent_1   0.963
principalcomponent_2   0.442
dtype: float64

In [26]:
# Process Second Dataset on Illinois

df1 = pd.read_excel('table_8_offenses_known_to_law_enforcement_illinois_by_city_2013.xls')

In [27]:
# Delete first three rows
df1 = df1.drop([0,1,2], axis=0)

# Make first row the column headers
df1 = df1.reset_index(drop=True)
df1.columns = df1.iloc[0]
df1 = df1.drop([0], axis=0)
df1 = df1.reset_index(drop=True)

# Rename all column headers
df1.columns = ['City', 'Population', 'Violent Crime', 'Murder and Nonnegligent Manslaughter', 'Rape (revised definition)', 'Rape (legacy definition)', 'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary', 'Larceny-Theft', 'Motor Vehicle Theft', 'Arson']

# Remove entire Unnamed: 4 or 'Rape revised defintion'
df1 = df1.drop('Rape (revised definition)', axis=1)

# Remove null objects
df1 = df1.drop([77, 398, 506, 507, 508, 509], axis=0)
df1 = df1.drop('Arson', axis=1)
df1 = df1.dropna(how='all')

# Convert all columns from object to integer values
df1[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']] = df1[['Population', 'Violent Crime',
       'Murder and Nonnegligent Manslaughter', 'Rape (legacy definition)',
       'Robbery', 'Aggravated Assault', 'Property Crime', 'Burglary',
       'Larceny-Theft', 'Motor Vehicle Theft']].astype(int)

# Change name of Property Crime so that it's one word
df1['Propertycrime'] = df1['Property Crime']
df1 = df1.drop(columns=['Property Crime'])

df1['Murder'] = df1['Murder and Nonnegligent Manslaughter']
df1 = df1.drop(columns=['Murder and Nonnegligent Manslaughter'])

df1['Aggravated_Assault'] = df1['Aggravated Assault']
df1 = df1.drop(columns=['Aggravated Assault'])

In [28]:
# Create Features: Population^2, Murder binary, Robbery, binary
# More specifically, create binary categories for murder and robery
df1['Population^2'] = df1['Population']**2
df1['Murder'] = np.where(df1['Murder']>0, '1', '0')
df1['Robbery_binary'] = np.where(df1['Robbery']>0, '1', '0')
#df1['Aggravated_Assault'] = np.where(df1['Aggravated_Assault']>0, '1', '0')

#Convert new columns to int
df1[['Population^2']] = df1[['Population^2']].astype(int)
df1[['Murder']] = df1[['Murder']].astype(int)
df1[['Robbery_binary']] = df1[['Robbery_binary']].astype(int)
df1[['Aggravated_Assault']] = df1[['Aggravated_Assault']].astype(int)

In [29]:
# Create PCA on the follwing variables: Burglary, Violent Crime, Aggravated_Assault, Larceny-Theft, Motor-Vehicle Theft, Robbery

# Standardize Data
features = ['Burglary', 'Violent Crime', 'Aggravated_Assault', 'Larceny-Theft', 'Rape (legacy definition)', 'Motor Vehicle Theft', 'Propertycrime']
x = df1.loc[:, features].values

scaler = StandardScaler()

scaler.fit(x)

pd.DataFrame(data=x, columns=features).head()

# PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

# Create Merged Data Frame
principalDf1 = pd.DataFrame(data = principalComponents, columns = ['principalcomponent_1', 'principalcomponent_2'])
principalDf1['City'] = df1['City']
merged_inner = pd.merge(left=df1, right=principalDf1, left_on='City', right_on='City')
df1 = merged_inner



In [32]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 502 entries, 0 to 501
Data columns (total 15 columns):
City                        502 non-null object
Population                  502 non-null int64
Violent Crime               502 non-null int64
Rape (legacy definition)    502 non-null int64
Robbery                     502 non-null int64
Burglary                    502 non-null int64
Larceny-Theft               502 non-null int64
Motor Vehicle Theft         502 non-null int64
Propertycrime               502 non-null int64
Murder                      502 non-null int64
Aggravated_Assault          502 non-null int64
Population^2                502 non-null int64
Robbery_binary              502 non-null int64
principalcomponent_1        502 non-null float64
principalcomponent_2        502 non-null float64
dtypes: float64(2), int64(12), object(1)
memory usage: 62.8+ KB


In [30]:
# Run Model
x = df1['Propertycrime'].values.reshape(-1, 1)
y = df1[['Population','Population^2', 'Murder', 'Robbery_binary', 'principalcomponent_1', 'principalcomponent_2']]

# Inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:\n', regr.score(x, y))


Coefficients: 
 [[1.04836093e-02 1.70035566e-07 1.90494576e+02 9.79508108e+01
  9.14137204e-03 5.01724606e-02]]

Intercept: 
 [-8.18165976]


ValueError: shapes (502,1) and (6,1) not aligned: 1 (dim 1) != 6 (dim 0)

In [None]:
# Cross-validation with mutliple folds
from sklearn.model_selection import cross_val_score

cross_val_score(regr, x, y, cv=5)

In [None]:
# Test for significance in parameters
linear_formula = 'Propertycrime ~ Population+Population^2+Murder+Robbery_binary+principalcomponent_1+principalcomponent_2'

# Fit the model to our data using formula
lm = smf.ols(formula=linear_formula, data=df1).fit()

In [None]:
lm.params

In [None]:
lm.pvalues