In [39]:
import math
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
import statsmodels.formula.api as smf

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)


dat = pd.read_csv('crime.csv')
df_or = pd.DataFrame(dat)


# this is a good command to make all column names upper/lower case
# df.columns = map(str.lower, df.columns), not needed here as I am renaming columns manually

#Remove empty columns of data here (NaN)
df_or = df_or.dropna(subset = ['Population', 'City'])
df_or = df_or.drop('Unnamed: 13', axis=1)
df_or = df_or.drop(df_or.columns[4], axis = 1) #'Rape (revised definition)1'

#Remove commas from the numbers in all data, standardize all the numbers for the future
df_or = df_or.replace(',', '', regex=True)

#Renaming columns for consistency and ease of typing
df = df_or.rename({'Robbery':'robbery',
               df_or.columns[0]:'city',
               df_or.columns[1]:'population', 
               df_or.columns[2]:'violent crime',
               df_or.columns[3]:'murder',
               df_or.columns[4]:'rape',
               df_or.columns[6]:'assault',
               df_or.columns[7]:'property crime',
               df_or.columns[8]:'burglary',
               df_or.columns[9]:'larceny',
               df_or.columns[10]:'motor vehicle theft',
               df_or.columns[11]:'arson'}, 
               axis='columns')

#This is 1 line of code that changes all columns to numeric (except the columns I exclude)
#This also fills NAN values to 0, this removes decimals from the data and allows us to 
#perform math functions on the data since it is not strings anymore

df.loc[:, df.columns != 'city'] = df.loc[:, df.columns != 'city'].apply(pd.to_numeric, 
                                        errors='coerce').fillna(0, downcast='infer')

In [40]:
from scipy import stats

z = np.abs(stats.zscore(df.loc[:, df.columns != 'city']))
# get indices of outliers
outliers= list(np.where(z > 3)[0])
# drop them
df= df.drop(outliers)

In [41]:
df.head()

Unnamed: 0,city,population,violent crime,murder,rape,robbery,assault,property crime,burglary,larceny,motor vehicle theft,arson
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0
3,Albany,97956,791,8,30,227,526,4090,705,3243,142,0
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,0


In [42]:
df['crime'] = df['violent crime'] + df['property crime']
df['populationsq'] = np.sqrt(df.population)
df['population2'] = (df.population)**2
df['population3'] = df.population**3


In [43]:
df.head()

Unnamed: 0,city,population,violent crime,murder,rape,robbery,assault,property crime,burglary,larceny,motor vehicle theft,arson,crime,populationsq,population2,population3
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0,12,43.139,3463321,6445240381
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0,27,50.764,6640929,17113674033
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0,19,53.348,8099716,23051791736
3,Albany,97956,791,8,30,227,526,4090,705,3243,142,0,4881,312.979,9595377936,939924841098816
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,0,246,79.925,40806544,260672203072


In [44]:
#Lets test for murder, can we predict if a murder happens based on other crimes.
df.murder.describe()

count   344.000
mean      0.291
std       1.131
min       0.000
25%       0.000
50%       0.000
75%       0.000
max      10.000
Name: murder, dtype: float64

In [45]:
#Lets convert murder to a binary outcome 1 being a murder happened, 0 = no murder in that city
df.murder[df.murder >= 1] = 1
df.murder[df.murder < 1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,city,population,violent crime,murder,rape,robbery,assault,property crime,burglary,larceny,motor vehicle theft,arson,crime,populationsq,population2,population3
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0,12,43.139,3463321,6445240381
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0,27,50.764,6640929,17113674033
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0,19,53.348,8099716,23051791736
3,Albany,97956,791,1,30,227,526,4090,705,3243,142,0,4881,312.979,9595377936,939924841098816
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,0,246,79.925,40806544,260672203072


In [46]:
#Can our models predict if murder will occur in a city based on other crimes/factors
#Which model works best, LOGISTIC, RIDGE, LASSO regression

In [49]:
df = np.round(df, decimals=2)
df.head()

Unnamed: 0,city,population,violent crime,murder,rape,robbery,assault,property crime,burglary,larceny,motor vehicle theft,arson,crime,populationsq,population2,population3
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0,12,43.14,3463321,6445240381
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0,27,50.76,6640929,17113674033
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0,19,53.35,8099716,23051791736
3,Albany,97956,791,1,30,227,526,4090,705,3243,142,0,4881,312.98,9595377936,939924841098816
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,0,246,79.92,40806544,260672203072


In [204]:
#Load in modelling 
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

X = df.drop(['city', 'murder', 'crime'], 1)
Y = df['murder']


rfc = ensemble.RandomForestRegressor()

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=.4)

Y_test_size = Y_test.size
Y_train_size = Y_train.size

rfc.fit(X_train, Y_train)

accu_train = rfc.score(X_train, Y_train)
accu_test = rfc.score(X_test, Y_test)

print("Accuracy on Train: ", accu_train)
print("Accuracy on Test: ", accu_test)


Accuracy on Train:  0.8910193548387098
Accuracy on Test:  0.12557603686635965


In [170]:
# Declare a logistic regression classifier.
# Parameter regularization coefficient C described above.
lr = LogisticRegression(C=1e9)

# Fit the model.
fit = lr.fit(X, Y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

print('\n Accuracy by murder status')
print(pd.crosstab(pred_y_sklearn, Y))

print('\n Logistic Regression accuracy')
print(lr.score(X_train, Y_train))

Coefficients
[[-1.23702422e-24  1.51923965e-27  3.86050651e-30  6.69456111e-28
   7.81484396e-28 -1.62180538e-26 -1.11175218e-27 -1.50961634e-26
  -1.01382286e-29 -7.29232720e-30 -1.39599333e-26 -9.60121139e-21
   5.79334608e-16]]
[-1.84419043e-28]

 Accuracy by murder status
murder    0   1
row_0          
1       299  45

 Logistic Regression accuracy
0.13106796116504854


In [138]:
from sklearn import linear_model
from sklearn import preprocessing

In [171]:
#Remove some annoying warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

ridgeregr = linear_model.Ridge(alpha=50, fit_intercept=False) 
ridgeregr.fit(X_train, Y_train)

ridgetrain_accu = ridgeregr.score(X_train, Y_train)
ridgetest_accu = ridgeregr.score(X_test, Y_test)

print("Ridge Accuracy on Train: ", ridgetrain_accu)
print("Ridge Accuracy on Test: ", ridgetest_accu)

ridgefit = ridgeregr.fit(X, Y)
print("Ridge regression coefficient values:", ridgefit.coef_)

Ridge Accuracy on Train:  0.478271171180145
Ridge Accuracy on Test:  0.3306194669740383
Ridge regression coefficient values: [ 1.60293020e-05  8.86250599e-02 -7.30592525e-02 -8.63782290e-02
 -8.94867566e-02 -8.49205302e-04  1.28359955e-03  7.65696332e-04
 -2.89850118e-03  1.40413355e-02 -7.24730417e-04 -1.23774677e-10
  3.12427260e-16]


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number7.391851e-31
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number7.257791e-31
  overwrite_a=True).T


In [186]:
# Define the training and test sizes (Lasso)
trainsize = int(df.shape[0] / 2)
df_test = df.iloc[trainsize:, :].copy()
df_train = df.iloc[:trainsize, :].copy()

Y_train = df_train['murder'].values.reshape(-1, 1)
X_train = df_train.drop(['city', 'murder', 'crime'], 1)

In [191]:
lass = linear_model.Lasso(alpha=.001)
lassfit = lass.fit(X_train, Y_train)


lassotrain_accu = lass.score(X_train, Y_train)
lassotest_accu = lass.score(X_test, Y_test)

print("Lasso Accuracy on Train: ", lassotrain_accu)
print("Lasso Accuracy on Test: ", lassotest_accu)

lassofit = lass.fit(X, Y)
print("Lasso regression coefficient values:", lassofit.coef_)

Lasso Accuracy on Train:  0.3584505904175648
Lasso Accuracy on Test:  0.3601618547597921
Lasso regression coefficient values: [ 2.07014714e-05  1.29022168e-02  4.65316608e-03 -9.08878530e-03
 -1.29767189e-02 -2.32233805e-05  3.03170022e-04 -6.93571462e-05
 -4.45969010e-03  9.64398527e-03 -1.43678226e-03 -1.51089835e-10
  3.20185137e-16]


In [202]:
print("Lets take a look at all of our different models:")

print('\n\nLogistic Regression accuracy')
print(lr.score(X_train, Y_train))

print("\n\nRidge Accuracy on Train: ", ridgetrain_accu)
print("Ridge Accuracy on Test: ", ridgetest_accu)

ridgefit = ridgeregr.fit(X, Y)
print("Ridge regression coefficient values:", ridgefit.coef_)

print("\n\nLasso Accuracy on Train: ", lassotrain_accu)
print("Lasso Accuracy on Test: ", lassotest_accu)

lassofit = lass.fit(X, Y)
print("Lasso regression coefficient values:", lassofit.coef_)

Lets take a look at all of our different models:


Logistic Regression accuracy
0.10465116279069768


Ridge Accuracy on Train:  0.478271171180145
Ridge Accuracy on Test:  0.3306194669740383
Ridge regression coefficient values: [ 1.60293020e-05  8.86250599e-02 -7.30592525e-02 -8.63782290e-02
 -8.94867566e-02 -8.49205302e-04  1.28359955e-03  7.65696332e-04
 -2.89850118e-03  1.40413355e-02 -7.24730417e-04 -1.23774677e-10
  3.12427260e-16]


Lasso Accuracy on Train:  0.3584505904175648
Lasso Accuracy on Test:  0.3601618547597921
Lasso regression coefficient values: [ 2.07014714e-05  1.29022168e-02  4.65316608e-03 -9.08878530e-03
 -1.29767189e-02 -2.32233805e-05  3.03170022e-04 -6.93571462e-05
 -4.45969010e-03  9.64398527e-03 -1.43678226e-03 -1.51089835e-10
  3.20185137e-16]


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number7.257791e-31
  overwrite_a=True).T
