# Regression Model Comparison
----

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from IPython.display import display
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
%matplotlib inline

In [3]:
# Location of the raw offenses table (relative to the notebook).
CRIME_CSV = './data/table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.csv'

# Read the comma-separated file into the working frame and report its size.
df = pd.read_csv(CRIME_CSV, sep=',')
print(f'Shape: {df.shape}')

# Preview the first rows.
df.head()

Shape: (351, 14)


Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3,Unnamed: 13
0,Adams Village,1861,0,0.0,,0,0,0,12,2,10,0,0.0,
1,Addison Town and Village,2577,3,0.0,,0,0,3,24,3,20,1,0.0,
2,Akron Village,2846,3,0.0,,0,0,3,16,1,15,0,0.0,
3,Albany,97956,791,8.0,,30,227,526,4090,705,3243,142,,
4,Albion Village,6388,23,0.0,,3,4,16,223,53,165,5,,


In [150]:
# List the column labels of the working frame.
# NOTE(review): the saved output already contains derived columns such as
# 'MurderCat' and 'BurglaryLog', which are only created by later cells, so
# this cell was last executed out of order. Re-run top to bottom to refresh.
df.columns

Index(['City', 'Population', 'Violent\r\ncrime', 'Murder',
       'Rape\r\n(revised\r\ndefinition)1', 'Rape\r\n(legacy\r\ndefinition)2',
       'Robbery', 'Aggravated\r\nassault', 'Property\r\ncrime', 'Burglary',
       'Larceny-\r\ntheft', 'Motor\r\nvehicle\r\ntheft', 'Arson3',
       'Unnamed: 13', 'MurderCat', 'BurglaryLog', 'ViolentCrimeLog',
       'AggravatedAssultLog', 'LarTheftLog', 'MotorVTheftLog',
       'PopulationLog'],
      dtype='object')

### Rename Murder column

In [5]:
# Give the multi-line murder header a short, usable name.
murder_header = 'Murder and\r\nnonnegligent\r\nmanslaughter'
df = df.rename(columns={murder_header: 'Murder'})

In [7]:
# Binary target: 1 when at least one murder was recorded, otherwise 0.
df['MurderCat'] = (df['Murder'] > 0).astype(int)

In [18]:
# SELECTED FEATURES
# -----------------
# Violent crime
# Aggravated assault
# Burglary
# Larceny- theft
# Motor vehicle theft
# Population
# Robbery
# (Property crime is converted as well because a log feature is built from it.)

# Count columns that may contain thousands separators (e.g. "1,234").
count_columns = [
    'Burglary',
    'Violent\r\ncrime',
    'Aggravated\r\nassault',
    'Larceny-\r\ntheft',
    'Motor\r\nvehicle\r\ntheft',
    'Population',
    'Robbery',
    'Property\r\ncrime',
]

for column in count_columns:
    # Only strip commas from columns that are still text. The .str accessor
    # raises on numeric data, so without this guard the cell fails when it is
    # re-run after a first successful conversion.
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].str.replace(',', '')
    df[column] = df[column].astype(float)

## Feature Creation

In [214]:
# Log features: log1p (log(1 + x)) keeps zero counts finite.
df['BurglaryLog'] = np.log1p(df['Burglary'])
df['ViolentCrimeLog'] = np.log1p(df['Violent\r\ncrime'])
df['AggravatedAssultLog'] = np.log1p(df['Aggravated\r\nassault'])
df['LarTheftLog'] = np.log1p(df['Larceny-\r\ntheft'])
df['MotorVTheftLog'] = np.log1p(df['Motor\r\nvehicle\r\ntheft'])
df['PopulationLog'] = np.log1p(df['Population'])
# Square root of violent crimes per resident.
df['ViolentCrimePop'] = np.sqrt(df['Violent\r\ncrime'] / df['Population'])
df['RobberyLog'] = np.log1p(df['Robbery'])
df['PropertyCrimeLog'] = np.log1p(df['Property\r\ncrime'])

# Remove the 3 nan rows (rows whose PopulationLog is missing).
# .copy() makes the filtered frame independent of the original, so assigning
# columns to it later (including re-running this cell) does not raise
# SettingWithCopyWarning.
df = df[df['PopulationLog'].notnull()].copy()

## Logistic Regression Model

In [233]:
# Predictor columns used by every model in this notebook.
feature_cols = [
    'BurglaryLog',
    'ViolentCrimeLog',
    'AggravatedAssultLog',
    'LarTheftLog',
    'MotorVTheftLog',
    'PopulationLog',
    'ViolentCrimePop',
    'RobberyLog',
    'PropertyCrimeLog',
]

# Design matrix and binary target.
X = df[feature_cols]
y = df['MurderCat']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [234]:
# Add an explicit constant column for the intercept term.
# .assign returns a new frame instead of writing into the slices produced by
# train_test_split, which avoids pandas' SettingWithCopyWarning and keeps the
# cell safe to re-run.
X_train = X_train.assign(intercept=1)
X_test = X_test.assign(intercept=1)

# Declare and fit the model.
logit = sm.Logit(y_train, X_train)
result = logit.fit()

# Lots of information about the model and its coefficients, but the
# accuracy rate for predictions is missing.
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.202929
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:              MurderCat   No. Observations:                  261
Model:                          Logit   Df Residuals:                      252
Method:                           MLE   Df Model:                            8
Date:                Tue, 14 Aug 2018   Pseudo R-squ.:                  0.4851
Time:                        21:17:41   Log-Likelihood:                -52.964
converged:                       True   LL-Null:                       -102.86
                                        LLR p-value:                 4.702e-18
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
BurglaryLog             0.4628      1.529      0.303      0.762      -2.533       3.459
Vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [235]:
# Calculate accuracy. First, get the model's predicted probability for each
# test row (the target is MurderCat).
pred_statsmod = result.predict(X_test)

# Code the prediction as 1 unless the probability is below .5.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# Confusion table: rows are the actual class, columns the predicted class.
table = pd.crosstab(y_test, pred_y_statsmod)

# Compute accuracy from the label vectors rather than from fixed table cells:
# table.iloc[1, 1] does not exist when the model predicts only one class.
accuracy = np.mean(pred_y_statsmod == y_test.values)

# NOTE(review): the printed label says 'admission status', but the rows of
# the table are the MurderCat classes.
print('\n Accuracy by admission status')
print(table)
print('\n Percentage accuracy')
print(accuracy)


 Accuracy by admission status
col_0       0  1
MurderCat       
0          69  4
1           5  9

 Percentage accuracy
0.896551724137931


## Ridge Regression

In [236]:
# Candidate regularisation strengths: 0.1, 0.2, ..., 0.9.
lams = np.arange(0.1, 1, .1)

# R^2 on the training data for a ridge model fitted at each strength.
r2s_ridge = [
    linear_model.Ridge(alpha=strength, fit_intercept=True)
    .fit(X_train, y_train)
    .score(X_train, y_train)
    for strength in lams
]

print(lams)
print('\n-----\n')
print(r2s_ridge)

[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]

-----

[0.3949953060454783, 0.3949903048232367, 0.3949836036064982, 0.3949758593160162, 0.39496744665221983, 0.3949585885045592, 0.39494942270204947, 0.39494003771319564, 0.39493049245559053]


In [237]:
# Position of the highest training R^2 in the sweep.
np.argmax(r2s_ridge)

0

In [245]:
# Fit the ridge model at alpha=0.1 and report its R^2 on the training data.
ridgeregr = linear_model.Ridge(alpha=0.1, fit_intercept=True).fit(X_train, y_train)
ridge_train_score = ridgeregr.score(X_train, y_train)
print(f'Ridge Regression Train Score: {ridge_train_score}')

Ridge Regression Train Score: 0.3949953060454783


### Ridge Test set

In [246]:
# Ridge is a linear regressor, so these are continuous scores for the 0/1
# target rather than probabilities.
preds = ridgeregr.predict(X_test)

# Code the prediction as 1 unless the score is below .5.
pred_y = np.where(preds < .5, 0, 1)

# Confusion table: rows are the actual class, columns the predicted class.
table = pd.crosstab(y_test, pred_y)

# Compute accuracy from the label vectors rather than from fixed table cells:
# table.iloc[1, 1] does not exist when the model predicts only one class.
accuracy = np.mean(pred_y == y_test.values)

# NOTE(review): the printed label says 'admission status', but the rows of
# the table are the MurderCat classes.
print('\n Accuracy by admission status')
print(table)
print('\n Percentage accuracy')
print(accuracy)


 Accuracy by admission status
col_0       0  1
MurderCat       
0          71  2
1           9  5

 Percentage accuracy
0.8735632183908046


The ridge regression model is slightly less accurate on the test set than the plain logistic regression model (87.4% vs. 89.7%).

## Lasso Regression

In [240]:
# Candidate regularisation strengths: 0.1, 0.2, ..., 0.9.
lams = np.arange(0.1, 1, .1)

# R^2 on the training data for a lasso model fitted at each strength.
r2s_las = [
    linear_model.Lasso(alpha=strength)
    .fit(X_train, y_train)
    .score(X_train, y_train)
    for strength in lams
]

print(lams)
print('\n-----\n')
print(r2s_las)

[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]

-----

[0.34251927296676765, 0.2222590602833382, 0.06616259658655887, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


### Lasso Test Set

In [241]:
# Fit the lasso model at alpha=0.1 and display its R^2 on the training data.
lass = linear_model.Lasso(alpha=0.1).fit(X_train, y_train)
lass.score(X_train, y_train)

0.34251927296676765

In [242]:
# Lasso is a linear regressor, so these are continuous scores for the 0/1
# target rather than probabilities.
preds = lass.predict(X_test)

# Code the prediction as 1 unless the score is below .5.
pred_y = np.where(preds < .5, 0, 1)

# Confusion table: rows are the actual class, columns the predicted class.
table = pd.crosstab(y_test, pred_y)

# Compute accuracy from the label vectors rather than from fixed table cells:
# table.iloc[1, 1] does not exist when the model predicts only one class.
accuracy = np.mean(pred_y == y_test.values)

# NOTE(review): the printed label says 'admission status', but the rows of
# the table are the MurderCat classes.
print('\n Accuracy by admission status')
print(table)
print('\n Percentage accuracy')
print(accuracy)


 Accuracy by admission status
col_0       0  1
MurderCat       
0          73  0
1          12  2

 Percentage accuracy
0.8620689655172413


Lasso regression seems to be slightly less accurate than both plain logistic regression and ridge regression.

## Summary

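Overall, it appears that the vanilla logistic regression model performed the best.  It had the highest R² value (note that statsmodels reports a pseudo-R² for the logit model, which is not directly comparable to the ordinary R² reported for the ridge and lasso models) as well as the highest test accuracy.  The ridge regression model was a close second and the lasso regression model performed the worst.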

This intuitively makes sense considering that regularization is intended to reduce overfitting and increase the generalizability of the model.  The logistic regression model didn’t appear to be overfitting.  The coefficients weren’t growing very large and R² wasn’t suspiciously high.

Adding Lasso to a model like this might not be the best idea because we are not dealing with a very large feature set.  If one of the few features is effectively shut off by L1 regularization, we can expect the model to potentially not perform as well.

If I were to add more features, I suspect the L2 (ridge) model would end up performing better than plain logistic regression.  The two are currently very close in accuracy, and as the feature set grows, so does the complexity of the model and therefore the need for regularization.