In [1]:
import math
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import neighbors
from sklearn.model_selection import cross_val_score
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

Find a data set and build a KNN Regression and an OLS regression. Compare the two. How similar are they? Do they miss in different ways?

At the end in a markdown cell write a few paragraphs to describe the models' behaviors and why you favor one model or the other. Try to determine whether there is a situation where you would change your mind, or whether one is unambiguously better than the other. Lastly, try to note what it is about the data that causes the better model to outperform the weaker model.

## Dataset

Using the 2013 New York State "Offenses Known to Law Enforcement, by City" crime data from Unit 02.

In [2]:
# Source CSV (hosted on GitHub) and the snake_case names given to its columns, in file order.
DATA_URL = (
    'https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/'
    'New_York_offenses/'
    'NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv'
)
COLUMN_NAMES = [
    'city',
    'population',
    'violent_crime',
    'murder',
    'rape_def_1',
    'rape_def_2',
    'robbery',
    'aggravated_assault',
    'property_crime',
    'burglary',
    'larceny_theft',
    'motor_vehicle_theft',
    'arson',
]

# The first four lines of the file are skipped so the fifth line is read as the header row;
# that header is then replaced with the shorter names above.
raw_data = pd.read_csv(DATA_URL, skiprows=range(4))
raw_data.columns = COLUMN_NAMES

In [3]:
#clean first df, leave aggregate numbers and outliers
#this will probably not have very good results
#
#Rows without a population value are dropped, and .copy() gives df its own data so the
#column assignments below modify df itself rather than a slice of raw_data (assigning
#into a slice is what triggers pandas' SettingWithCopyWarning).
df = raw_data[['city',
               'population',
               'violent_crime',
               'murder',
               'rape_def_2',
               'robbery',
               'aggravated_assault',
               'property_crime']].dropna(axis=0, subset=['population']).copy()

#population is read as text with thousands separators: strip the commas, cast to int
df['population'] = df['population'].str.replace(',', '').astype(int)

#create population squared variable
df['population_sq'] = df['population'] ** 2

#cleaning: the remaining comma-formatted count columns get the same treatment
for col in ['violent_crime',
            'rape_def_2',
            'robbery',
            'aggravated_assault',
            'property_crime']:
    df[col] = df[col].str.replace(',', '').astype(int)

#murder has no separators to strip; only the integer cast is needed
df['murder'] = df['murder'].astype(int)

In [4]:
#second df with additional cleaning should yield better results
#
#Work on an independent copy: a plain `df2 = df` only creates a second name for the same
#object, so every change below would also overwrite df (the "outliers kept" frame).
#Because df2 is rebuilt from df each time, this cell can safely be re-run.
df2 = df.copy()

#handling outliers: values at or above (median + 2 standard deviations) of their column
#are replaced with NaN. Each threshold is computed once per column, from the unfiltered
#values, instead of once per row.
for col in ['population', 'murder', 'rape_def_2', 'robbery', 'property_crime']:
    upper_bound = df2[col].median() + 2 * df2[col].std()
    df2[col] = df2[col].where(df2[col] < upper_bound)

In [5]:
#recode the crime counts as 0/1 indicators (1 when the count is above zero)
#dropna() only skips missing entries during the recode; the assignment re-aligns on the
#index, so those rows remain in df2 with NaN in the recoded column
for crime in ['murder', 'rape_def_2', 'robbery']:
    observed = df2[crime].dropna()
    df2[crime] = (observed > 0).astype('int64')

In [6]:
#The outlier step marks values as NaN and the 0/1 recode leaves those NaNs in place,
#so the affected rows still have to be removed here before casting back to int.
int_columns = ['population', 'murder', 'rape_def_2', 'robbery']

#property_crime is the regression target and was NaN-masked too; including it guarantees
#the estimators never receive a missing target. One dropna over all columns removes the
#same rows as dropping column by column, and .copy() makes df2 an independent frame
#(reassigning columns on the dropna result is what raised SettingWithCopyWarning).
df2 = df2.dropna(axis=0, subset=int_columns + ['property_crime']).copy()

#NaN forced these columns to float; with the NaN rows gone they can be ints again
df2 = df2.astype({col: int for col in int_columns})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


## KNN Regression

In [7]:
#instantiate and fit knn and weighted knn
#(neighbors and cross_val_score come from the notebook's import cell)
#NOTE(review): the features are unscaled, so population (thousands) dominates the
#distance over the 0/1 indicators -- consider standardising before KNN.
knn = neighbors.KNeighborsRegressor(n_neighbors=10)
knn_w = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
X = df2[['murder','rape_def_2','population']]
Y = df2[['property_crime']]
knn.fit(X,Y)
knn_w.fit(X,Y)

#set prediction lines (100 points per feature)
T_mur = np.arange(0,1,.01)[:, np.newaxis]
T_rape = np.arange(0,1,.01)[:, np.newaxis]
#The third model feature is raw population (not population squared, despite the name,
#which is kept in case later cells refer to it). Sweep the observed population range:
#a 0-1 sweep sits below every real town, so all points would share the same neighbours.
T_popsq = np.linspace(X['population'].min(),
                      X['population'].max(),
                      len(T_mur))[:, np.newaxis]
T_concat = np.c_[T_mur,T_rape,T_popsq]

#predictions (wrapped in a frame with the training column names so the estimators are
#queried with the same feature labels they were fitted on)
T_frame = pd.DataFrame(T_concat, columns=X.columns)
Y_ = knn.predict(T_frame)
Y_w = knn_w.predict(T_frame)

#5-fold cross validation; for a regressor the default score is R-squared, not accuracy
score = cross_val_score(knn,X,Y,cv=5)
print(score)
print("Unweighted R-squared: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))
score_w = cross_val_score(knn_w,X,Y,cv=5)
print(score_w)
print("Weighted R-squared: %0.2f (+/- %0.2f)" % (score_w.mean(), score_w.std() * 2))

[0.79348795 0.72020588 0.66293921 0.43685177 0.58827965]
Unweighted Accuracy: 0.64 (+/- 0.24)
[0.77144043 0.6729762  0.60792131 0.47524185 0.59311915]
Weighted Accuracy: 0.62 (+/- 0.19)


## Linear/OLS Regression

In [8]:
#fit an ordinary least squares model of property_crime on the three predictors
X = df2[['murder', 'rape_def_2', 'population']]
Y = df2[['property_crime']]
regr = linear_model.LinearRegression().fit(X, Y)

#in-sample summary: fitted coefficients, intercept and R-squared on the training data
for label, value in (('Coefficients: \n', regr.coef_),
                     ('Intercept: \n', regr.intercept_),
                     ('R-squared: \n', regr.score(X, Y))):
    print(label, value)

#scores from 5-fold cross validation
score_regr = cross_val_score(regr, X, Y, cv=5)
print('Cross validation: \n', score_regr)

Coefficients: 
 [[1.76375222e+02 8.66344749e+01 2.17209498e-02]]
Intercept: 
 [-55.32817199]
R-squared: 
 0.7116312129411614
Cross validation: 
 [0.79649585 0.73717039 0.69282601 0.50199927 0.68048552]


In [17]:
#fit the same linear specification with statsmodels, which also reports p-values
linear_formula = 'property_crime ~ murder+rape_def_2+population'
lm = smf.ols(linear_formula, data=df2).fit()

#print each summary statistic under its own heading
for heading, values in (('Coefficients:\n', lm.params),
                        ('\nP-values:\n', lm.pvalues),
                        ('\nR-squared:\n', lm.rsquared)):
    print(heading, values)

Coefficients:
 Intercept    -55.328
murder       176.375
rape_def_2    86.634
population     0.022
dtype: float64

P-values:
 Intercept    0.022
murder       0.004
rape_def_2   0.035
population   0.000
dtype: float64

R-squared:
 0.7116312129411613
