In [241]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
import io
import requests
from sklearn.model_selection import cross_val_score
%matplotlib inline
sns.set_style('white')

In [242]:
data = pd.read_excel('table_8_offenses_known_to_law_enforcement_arizona_by_city_2013.xls', 
                     skiprows=4)

In [243]:
data.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,Bisbee,5463.0,74.0,1.0,,1.0,0.0,72.0,217.0,8.0,202.0,7.0,0.0
1,Buckeye,55710.0,49.0,1.0,,1.0,17.0,30.0,1328.0,334.0,925.0,69.0,8.0
2,Bullhead City,39577.0,76.0,0.0,,0.0,25.0,51.0,1716.0,363.0,1252.0,101.0,11.0
3,Camp Verde,10942.0,33.0,0.0,,2.0,0.0,31.0,259.0,57.0,185.0,17.0,0.0
4,Casa Grande,50058.0,229.0,5.0,,12.0,57.0,155.0,1975.0,474.0,1415.0,86.0,37.0


In [244]:
data.columns=['City','Population','Violent_crime','Murder','Rape_new','Rape',
             'Robbery','Agg_assault','Property_crime','Burglary','Larceny', 'Car_theft',
             'Arson']

In [245]:
#We are going to use Murder as the binary target variable - whether there was any murders
#or not
data.Murder = (data.Property_crime > 0).astype(int)

In [246]:
#City will not be useful for regression purposes and cannot turn into a numerical form
data = data.drop(columns=['City'])

In [247]:
#Rape (new defintion) only has 3 fields that are not NaN that correlate to Rape (old 
#definiton) only having three NaN. Using that data to fill in and condensing to one row
data.loc[17,['Rape']] = 14
data.loc[22,['Rape']] = 0
data.loc[62,['Rape']] = 46
data = data.drop(columns=['Rape_new'])

In [248]:
#Final three rows are not actual data. Let's get rid of those
data = data.iloc[:63,:]

In [249]:
#We need 15 features, but only have 11 currently. Let's add a couple more
data['Population_sq'] = data.Population**2
#data['Burglary_by_pop'] = data.Burglary/data.Population
#data['Violent_by_pop'] = data.Violent_crime/data.Population
#data['Total_theft'] = data.Burglary+data.Larceny+data.Car_theft+data.Robbery
#data['Rape_by_pop'] = data.Rape/data.Population

In [250]:
len(data)/2

31.5

In [251]:
#Separate our Train data set from our Test data set
Xtrain = data.drop(columns=['Property_crime']).iloc[:32,:]
Ytrain = data.loc[:31,['Property_crime']].values.reshape(-1,1)
Xtest = data.drop(columns=['Property_crime']).iloc[32:,:]
Ytest = data.loc[32:,['Property_crime']].values.reshape(-1,1)

In [252]:
#Regular Linar Regression
#Initate Model
regr = linear_model.LinearRegression()
#Fit to model
regr.fit(Xtrain, Ytrain)
#See how it does
regr.score(Xtest, Ytest)

#Leaving it with train data for now - with test data it is getting a score of above 1

1.0

In [253]:
#Ridge Model
ridgeregr = linear_model.Ridge()
ridgeregr.fit(Xtrain,Ytrain)
ridgeregr.score(Xtest,Ytest)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number/precision: 2.0953551091544085e-23 / 1.1102230246251565e-16


0.9999999999672678

In [254]:
#Lasso Model
lassoregr = linear_model.Lasso()
lassoregr.fit(Xtrain,Ytrain)
lassoregr.score(Xtest,Ytest)

0.9929438645159648

All models have very good R2 values, however the Ordinary Linear Regression has a R2 of 1, which suggests overfitting. Because our data has a lot of consistent themes, such as theft broken up into different categories: car theft, larcency, burglary, robbery, there is likely a lot of data that is correlated and features that do not need to be included - so for this data set I believe the Lasso method is the best. It might have the lowest R2 value, which is still over 0.99, but I have the highest confidence that we removed features that were unnecessary and we weren't duplicating too much of the variance in the data and overfitting. With this dataset, less is definitely more and the R2 of 1 is highly concerning -