In [69]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn import linear_model
%matplotlib inline

sns.set_style('white')

In [70]:
df = pd.read_excel('/Users/cappsjr/Downloads/table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls')
df.columns = ('City', 'Population', 'Violent Crime', 'Murder and nonnegligent manslaughter',
                     'Rape (revised definition)', 'Rape (Legacy definition)', 'Robbery', 'Aggravated assault',
                     'Property crime', 'Burglary', 'Larceny-theft', 'Motor vehicle theft', 'Arson3')

df = df.drop([0, 1, 2, 3, 352, 353, 354]).fillna(0)\

# locate the outlier in the data and remove it

col = df['Population']
col[np.abs(col) > 0.6 * 1e7]
df = df.drop(220)

df.head()

Unnamed: 0,City,Population,Violent Crime,Murder and nonnegligent manslaughter,Rape (revised definition),Rape (Legacy definition),Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson3
4,Adams Village,1861,0,0,0,0,0,0,12,2,10,0,0
5,Addison Town and Village,2577,3,0,0,0,0,3,24,3,20,1,0
6,Akron Village,2846,3,0,0,0,0,3,16,1,15,0,0
7,Albany,97956,791,8,0,30,227,526,4090,705,3243,142,0
8,Albion Village,6388,23,0,0,3,4,16,223,53,165,5,0


In [71]:
# Make a variable where murder is a categorical variable
df['Murder'] = np.where(df['Murder and nonnegligent manslaughter'] > 0, 1, 0)

#Rename the property crime column so it will work in linear_formula
df.rename(columns={'Property crime': 'PC'}, inplace=True)

# Define your input variables and target variable
X = df[['Population', 'Murder', 'Robbery']]
Y = df['PC']

#Instantiate our model
regr = linear_model.LinearRegression()

#Fit our model to our data
regr.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [72]:
# inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercepts: \n', regr.intercept_)


Coefficients: 
 [  1.38974683e-02   7.07518996e+01   6.83135957e+00]

Intercepts: 
 32.148745431


In [73]:
# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+Murder+Robbery'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [74]:
# lets look at the coefficients of our model 
lm.params

Intercept     32.148745
Population     0.013897
Murder        70.751900
Robbery        6.831360
dtype: float64

In [75]:
# lets look at the p-values as a significance test for each of the coefficients
lm.pvalues

Intercept     1.127810e-01
Population    1.222015e-30
Murder        1.866948e-01
Robbery       2.057668e-73
dtype: float64

In [76]:
# finally, lets look at the r-squared
lm.rsquared

0.91844748068543458

In [78]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.9621205 ,  0.77490031,  0.36071997,  0.95133092,  0.72256112])

In [79]:
from sklearn import metrics

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.75 (+/- 0.44)


In [80]:
# The murder feature has a p-value of greater than .05, suggesting it is not statistically significant.
# Lets drop it and see how it affects the R-squared value

In [81]:
# Define your input variables and target variable
X = df[['Population', 'Robbery']]
Y = df['PC']

#Instantiate our model
regr = linear_model.LinearRegression()

#Fit our model to our data
regr.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [82]:
# inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercepts: \n', regr.intercept_)


Coefficients: 
 [ 0.01442253  6.80655259]

Intercepts: 
 34.0007927396


In [83]:
# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+Robbery'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [84]:
lm.params

Intercept     34.000793
Population     0.014423
Robbery        6.806553
dtype: float64

In [85]:
lm.pvalues

Intercept     9.314900e-02
Population    3.623796e-36
Robbery       3.036447e-73
dtype: float64

In [86]:
lm.rsquared

0.91803127676064289

In [87]:
scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.96189183,  0.79164292,  0.38233467,  0.95121006,  0.72206906])

In [88]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.76 (+/- 0.42)


In [89]:
# We were able to drop the Murder feature and still maintain a very high r-squared value
# Lets play with some other feature combinations

In [90]:
# Lets rename murder and non-negligent... to MNNMS to use in linear model

df.rename(columns={'Murder and nonnegligent manslaughter': 'MNNMS'}, inplace=True)

# Define your input variables and target variable
X = df[['Population', 'Robbery', 'MNNMS']]
Y = df['PC']

#Instantiate our model
regr = linear_model.LinearRegression()

#Fit our model to our data
regr.fit(X, Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [91]:
# inspect the results
print('\nCoefficients: \n', regr.coef_)
print('\nIntercepts: \n', regr.intercept_)


Coefficients: 
 [  1.58354338e-02   2.02472175e+00   1.18437047e+02]

Intercepts: 
 25.217934543


In [92]:
# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+Robbery+MNNMS'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [93]:
lm.params

Intercept      25.217935
Population      0.015835
Robbery         2.024722
MNNMS         118.437047
dtype: float64

In [94]:
lm.pvalues

Intercept     1.771582e-01
Population    1.100236e-45
Robbery       2.481759e-03
MNNMS         4.808789e-14
dtype: float64

In [95]:
lm.rsquared

0.93055862063860428

In [96]:
scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.96179664,  0.73225909,  0.41193347,  0.92773808,  0.73402756])

In [97]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.75 (+/- 0.39)


In [98]:
# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+MNNMS'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [99]:
lm.params

Intercept      12.966134
Population      0.017278
MNNMS         160.436198
dtype: float64

In [100]:
lm.pvalues

Intercept     4.821380e-01
Population    8.710103e-62
MNNMS         1.182654e-83
dtype: float64

In [101]:
lm.rsquared

0.92867764540006936

In [102]:
scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.96179664,  0.73225909,  0.41193347,  0.92773808,  0.73402756])

In [103]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.75 (+/- 0.39)


In [104]:
# Lets add population-squared as a feature and see how it affects the model

df['pop2'] = df['Population'] ** 2

# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+MNNMS+pop2'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [105]:
lm.params

Intercept    -1.769946e+01
Population    2.035450e-02
MNNMS         1.820292e+02
pop2         -3.195291e-08
dtype: float64

In [106]:
lm.pvalues

Intercept     3.772854e-01
Population    6.742459e-48
MNNMS         4.631042e-65
pop2          3.682898e-04
dtype: float64

In [107]:
lm.rsquared

0.93127115965328766

In [108]:
scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.96179664,  0.73225909,  0.41193347,  0.92773808,  0.73402756])

In [109]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.75 (+/- 0.39)


In [110]:
df.rename(columns={'Violent Crime': 'ViolentCrime'}, inplace=True)


# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+MNNMS+pop2+ViolentCrime'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [111]:
lm.params

Intercept      -2.236997e+01
Population      1.885364e-02
MNNMS           9.409931e+01
pop2           -6.852805e-08
ViolentCrime    2.301955e+00
dtype: float64

In [112]:
lm.pvalues

Intercept       2.146514e-01
Population      3.620165e-49
MNNMS           2.204856e-13
pop2            1.787047e-13
ViolentCrime    7.164746e-18
dtype: float64

In [113]:
lm.rsquared

0.94468831959917088

In [114]:
scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.96179664,  0.73225909,  0.41193347,  0.92773808,  0.73402756])

In [115]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.75 (+/- 0.39)


In [116]:
# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+pop2+ViolentCrime'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [117]:
lm.params

Intercept      -1.894113e+01
Population      1.718707e-02
pop2           -6.588602e-08
ViolentCrime    3.815374e+00
dtype: float64

In [118]:
lm.pvalues

Intercept       3.304435e-01
Population      1.508707e-39
pop2            3.828264e-11
ViolentCrime    1.647278e-69
dtype: float64

In [119]:
lm.rsquared

0.93524830711620155

In [120]:
scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.96179664,  0.73225909,  0.41193347,  0.92773808,  0.73402756])

In [121]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.75 (+/- 0.39)


In [122]:
# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+ViolentCrime'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [123]:
scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.96179664,  0.73225909,  0.41193347,  0.92773808,  0.73402756])

In [124]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.75 (+/- 0.39)


In [140]:
df['vc2'] = df['ViolentCrime'] ** 2

# Write out the model formula
# Your dependent variable on the right; independent variables on the left
# Use a ~ to an '=' from the functional form
linear_formula = 'PC ~ Population+pop2+ViolentCrime+vc2+MNNMS'

# fit our model to the data using the formula
lm = smf.ols(formula = linear_formula, data = df).fit()

In [141]:
scores = cross_val_score(regr, X, Y, cv=5)
scores

array([ 0.96179664,  0.73225909,  0.41193347,  0.92773808,  0.73402756])

In [142]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.75 (+/- 0.39)
