In [1]:
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std

%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

In [3]:
df2 = pd.read_csv('../Datasets/2014 NY Offenses.csv')
df2.columns = ['city', 'population', 'violent crime', 'murder', 'rape (new definition)', 'rape (old definition)', 'robbery',
              'aggrevated assault', 'property_crime', 'burglary', 'larceny-theft', 'motor vehicle theft', 'arson', 'NaN']
df2['robbery'] = df2['robbery'].replace('[^0-9]+', '', regex=True)
df2['population'] = df2['population'].replace('[^0-9]+', '', regex=True)
df2['property_crime'] = df2['property_crime'].replace('[^0-9]+', '', regex=True)
df2 = df2[['population', 'robbery', 'murder', 'property_crime']].dropna()
df2 = df2.astype('int64')

In [5]:
df2['population_sq'] = (df2['population'] * df2['population']).apply(lambda x: '%.f' % x)
df2['murder'] = np.where(df2['murder'] > 0, 1, 0)
df2['robbery'] = np.where(df2['robbery'] > 0, 1, 0)

In [10]:
df2.head(3)

Unnamed: 0,population,robbery,murder,property_crime,population_sq
0,1851,0,0,11,3426201
1,2568,1,0,49,6594624
2,820,0,0,1,672400


## I am attempting to validate a regression model for prediction. To do this, I want to achieve a model with a consistent R^2 and only statistically significant parameters across multiple samples. Based on the validation test, I will create a revised model and test both old and new models on a new holdout.

In [23]:
linear_formula = 'property_crime ~ population+population_sq+murder+robbery'
lm=smf.ols(formula=linear_formula, data=df2).fit()

In [24]:
print('R^2 Value: ',lm.rsquared)
print('P Valuesn:\n', lm.pvalues)

R^2 Value:  0.9999999954627793
P Valuesn:
 Intercept                     0.061
population_sq[T.104755225]    0.058
population_sq[T.10497600]     0.068
population_sq[T.105555076]    0.347
population_sq[T.1061847396]   0.313
population_sq[T.1063738225]   0.136
population_sq[T.107584]       0.067
population_sq[T.10837264]     0.096
population_sq[T.108910096]    0.045
population_sq[T.109244304]    0.034
population_sq[T.11042329]     0.088
population_sq[T.11108889]     0.119
population_sq[T.1129833769]   0.040
population_sq[T.113827561]    0.106
population_sq[T.1153145764]   0.108
population_sq[T.11539609]     0.063
population_sq[T.116316225]    0.031
population_sq[T.117310561]    0.048
population_sq[T.1175056]      0.079
population_sq[T.11806096]     0.093
population_sq[T.11819844]     0.074
population_sq[T.1187698369]   0.034
population_sq[T.11909401]     0.086
population_sq[T.11978521]     0.102
population_sq[T.1200830409]   0.743
population_sq[T.120450625]    0.186
population_sq[T.12078

Looking at the p values, it looks like Robbery has a miniscule effect on property crime.

In [27]:
linear_formula = 'property_crime ~ population+murder'
lm=smf.ols(formula=linear_formula, data=df2).fit()
print(lm.pvalues)

Intercept    0.979
population   0.000
murder       0.000
dtype: float64


In [28]:
lm.conf_int()

Unnamed: 0,0,1
Intercept,-62.494,64.171
population,0.016,0.016
murder,491.741,836.532


In [None]:
prstd, iv_l, iv_u = wls_prediction_std(lm)

plt.figure()
plt.plot(iv_u[0:15], 'o', color='green')
plt.plot(iv_l[0:15], 'o', color='green')
plt.plot(lm.fittedvalues[0:15], 'o', color='black'))
plt.title('black: predicted, ')