In [1]:
import math
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Display preferences: draw matplotlib figures inline in the notebook and print
# pandas floats with three decimal places. Values smaller than 0.0005 in
# magnitude will therefore display as 0.000.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Ignore warnings issued from modules whose name starts with "scipy" and whose
# message starts with "internal gelsd" (both arguments are regular expressions).
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

Find a data set and build a KNN Regression and an OLS regression. Compare the two. How similar are they? Do they miss in different ways?

At the end in a markdown cell write a few paragraphs to describe the models' behaviors and why you favor one model or the other. Try to determine whether there is a situation where you would change your mind, or whether one is unambiguously better than the other. Lastly, try to note what it is about the data that causes the better model to outperform the weaker model.

## Dataset

This analysis uses the New York state crime data from Unit 02: offenses known to law enforcement, by city, for 2013.

In [7]:
# Location of the CSV; the adjacent string pieces concatenate to one URL.
DATA_URL = (
    'https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/'
    'New_York_offenses/'
    'NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv'
)

# Short snake_case names applied to the thirteen columns, in file order.
COLUMN_NAMES = [
    'city',
    'population',
    'violent_crime',
    'murder',
    'rape_def_1',
    'rape_def_2',
    'robbery',
    'aggravated_assault',
    'property_crime',
    'burglary',
    'larceny_theft',
    'motor_vehicle_theft',
    'arson',
]

# The first four lines of the file are skipped before the header row is read.
raw_data = pd.read_csv(DATA_URL, skiprows=list(range(4)))
raw_data.columns = COLUMN_NAMES

In [22]:
# Build the modelling frame `df` from `raw_data`: keep the columns of interest,
# drop rows that have no population, convert every count column to integers,
# and add the squared-population feature.
KEEP_COLUMNS = [
    'city',
    'population',
    'violent_crime',
    'murder',
    'rape_def_2',
    'robbery',
    'aggravated_assault',
    'property_crime',
]
# Every kept column except the city name is converted to an integer count.
COUNT_COLUMNS = [name for name in KEEP_COLUMNS if name != 'city']


def _to_int(column):
    """Return `column` as int64, removing thousands separators from text values.

    Parameters
    ----------
    column : pd.Series
        Either an already-numeric column or text values such as '1,234'.

    Returns
    -------
    pd.Series
        The same values as 64-bit integers.

    Raises
    ------
    ValueError
        If a value cannot be converted to an integer (e.g. a missing value).
    """
    if not pd.api.types.is_numeric_dtype(column):
        column = column.astype(str).str.replace(',', '', regex=False)
    # Explicit int64: plain `int` can be 32-bit on some platforms, which would
    # overflow when the population is squared below.
    return column.astype('int64')


# `.copy()` makes df an independent frame, so the column assignments below do
# not write into a slice of raw_data (avoids SettingWithCopyWarning).
df = (
    raw_data[KEEP_COLUMNS]
    .dropna(axis=0, subset=['population'])
    .copy()
)

for name in COUNT_COLUMNS:
    df[name] = _to_int(df[name])

# Squared population, stored as a new column.
df['population_sq'] = df['population'] ** 2

# Sanity check: smallest murder count after cleaning.
df['murder'].min()

0

In [16]:
# Second frame in which the individual crime counts become 0/1 indicators
# (1 = at least one offence recorded). The aggregate columns violent_crime and
# property_crime keep their original counts.
BINARY_COLUMNS = ['murder', 'rape_def_2', 'robbery', 'aggravated_assault']

# Copy first: a plain `df2 = df` only creates a second name for the same
# object, so binarizing df2 would also overwrite the counts stored in df.
df2 = df.copy()
df2[BINARY_COLUMNS] = (df2[BINARY_COLUMNS] > 0).astype('int64')

# Optional, currently disabled: also exclude population outliers beyond
# 3 standard deviations of the mean.
#df2 = df2[np.abs(df2.population - df2.population.mean()) <= (3 * df2.population.std())]

## KNN Regression

In [49]:
# Fit two 10-nearest-neighbour regressors on the same data: one that averages
# the neighbours equally and one that weights them by inverse distance.
knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn_w = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
X = pd.DataFrame(df2[['murder', 'rape_def_2', 'population_sq']])
Y = df2[['property_crime']]
knn.fit(X, Y)
knn_w.fit(X, Y)
# NOTE(review): the features are not rescaled. murder and rape_def_2 are 0/1
# flags while population_sq is a squared population, so neighbour distances are
# presumably dominated by population_sq -- consider standardizing the features.

# Prediction grid: 100 evenly spaced values in [0, 1) for each feature,
# stacked column-wise into a (100, 3) array.
T_mur = np.arange(0, 1, .01)[:, np.newaxis]
T_rape = np.arange(0, 1, .01)[:, np.newaxis]
T_popsq = np.arange(0, 1, .01)[:, np.newaxis]
T_concat = np.c_[T_mur, T_rape, T_popsq]
# NOTE(review): the population_sq grid also spans [0, 1), which looks far below
# the squared populations the models were fitted on -- confirm this is intended.

# Predict from a frame that carries the training column names; passing the bare
# array to models fitted on a DataFrame triggers a feature-name warning in
# recent scikit-learn versions. The predicted values are unchanged.
T_frame = pd.DataFrame(T_concat, columns=X.columns)
Y_ = knn.predict(T_frame)
Y_w = knn_w.predict(T_frame)

# 5-fold cross-validation. With no `scoring` argument cross_val_score uses the
# estimator's own score method, which for regressors is R^2 (not accuracy).
score = cross_val_score(knn, X, Y, cv=5)
print(score)
print("Unweighted R^2: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
score_w = cross_val_score(knn_w, X, Y, cv=5)
print(score_w)
# This summary belongs to the distance-weighted model, so it is labelled as such.
print("Weighted R^2: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

[0.46048    0.70524944 0.68100566 0.03460983 0.54812179]
Unweighted Accuracy: 0.49 (+/- 0.49)
[0.55572838 0.68023505 0.64122587 0.03454364 0.34655852]
Unweighted Accuracy: 0.45 (+/- 0.48)


## OLS Regression

In [50]:
# Fit ordinary least squares on the same X and Y used for the KNN models.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

# 5-fold cross-validation. With no `scoring` argument cross_val_score uses the
# estimator's own score method, which for regressors is R^2 (not accuracy);
# R^2 can be arbitrarily negative on a fold the model predicts badly.
# NOTE(review): an integer `cv` gives contiguous, unshuffled folds for
# regressors, so one extreme row can dominate a single fold's score.
score_ols = cross_val_score(regr, X, Y, cv=5)
print(score_ols)
print("OLS R^2: %0.2f (+/- %0.2f)" % (score_ols.mean(), score_ols.std() * 2))

[ 2.00590045e-01 -5.65262233e-01 -3.77319209e-01 -6.09316317e+03
  3.35499989e-01]
OLS Accuracy: -1218.71 (+/- 4874.45)


In [56]:
# Estimate the same three-predictor specification with the statsmodels formula
# API, which exposes p-values and confidence intervals for each coefficient.
linear_formula = 'property_crime ~ murder+rape_def_2+population_sq'
ols_model = smf.ols(formula=linear_formula, data=df2)
lm = ols_model.fit()

# Print, in order: coefficients, p-values, R-squared, confidence intervals.
for report in (lm.params, lm.pvalues, lm.rsquared, lm.conf_int()):
    print(report)

Intercept         70.463
murder          1107.813
rape_def_2       388.419
population_sq      0.000
dtype: float64
Intercept       0.268
murder          0.000
rape_def_2      0.000
population_sq   0.000
dtype: float64
0.9861887237802198
                    0        1
Intercept     -54.349  195.275
murder        809.246 1406.380
rape_def_2    179.042  597.796
population_sq   0.000    0.000
