In [44]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load data

In [180]:
data = pd.read_csv("data/drop_first_dataset.csv")
features, targets = data.drop(["FELONY", "MISDEMEANOR", "VIOLATION", "FELONY_perc", "MISDEMEANOR_perc", "VIOLATION_perc"], 
                              axis=1), data["FELONY"]
data.isnull().any().sum()

0

In [158]:
endog.shape

(5921, 129)

In [167]:
endog, exog = np.asarray(features), np.asarray(targets)
exog = sm.add_constant(exog)

# Linear Model

## OLS

In [168]:
model = sm.OLS(endog, exog)
results = model.fit()
print(results.summary()) # Very poor perf

ValueError: shapes (5921,128) and (5921,128) not aligned: 128 (dim 1) != 5921 (dim 0)

## Lasso

Since we have too many similar features, let's use a lasso regularization

In [160]:
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import cross_val_score

In [181]:
ls = LassoCV()
ls.fit(features, targets)
best_alpha = ls.alpha_



In [182]:
# Useful columns
print("{} active features out of {}".format((ls.coef_ != 0).sum(), ls.coef_.shape[0]))
lasso_features = features.columns[ls.coef_ != 0]
sorted(lasso_features)

44 active features out of 121


['$10,000 to $14,999',
 '$100,000 to $124,999',
 '$125,000 to $149,999',
 '$20,000 to $24,999',
 '$200,000 or more',
 '$45,000 to $49,999',
 '$50,000 to $59,999',
 '$60,000 to $74,999',
 '$75,000 to $99,999',
 '11th grade',
 '6th grade',
 'Asian alone',
 "Associate's degree",
 "Bachelor's degree",
 'Black or African American alone',
 'Built 1939 or earlier',
 'Built 1940 to 1949',
 'Built 1950 to 1959',
 'Built 1960 to 1969',
 'Built 1970 to 1979',
 'Built 1990 to 1999',
 'Built 2000 to 2009',
 'Built 2010 to 2013',
 'Doctorate degree',
 'Family households: - 4-person household',
 'Female:',
 'Female: - 18 and 19 years',
 'Female: - 22 to 24 years',
 'Female: - 25 to 29 years',
 'Female: - 30 to 34 years',
 'Female: - 55 to 59 years',
 'Female: - 85 years and over',
 'GED or alternative credential',
 'In labor force: - Civilian labor force: - Employed',
 'Male: - 22 to 24 years',
 'Male: - 30 to 34 years',
 'Male: - 35 to 39 years',
 "Master's degree",
 'Nonfamily households:',
 'Nonfa

In [183]:
# Now let's look at the validation metrics of our best model

ls = Lasso(alpha=best_alpha)
ls_scores = cross_val_score(ls, features, targets, cv=10, scoring='r2')
print("Mean cross validated r2: {}".format((ls_scores).mean()))

Mean cross validated r2: 0.09662218933417434


# Random Forest

In [119]:
from sklearn.ensemble import RandomForestRegressor

In [178]:
rf = RandomForestRegressor()

rf_scores = cross_val_score(rf, features, targets, cv=10, scoring='r2')
print("Mean cross validated r2: {}".format((rf_scores).mean()))

Mean cross validated r2: -0.0742062516186158


In [147]:
rf = RandomForestRegressor()

rf_scores = cross_val_score(rf, features[lasso_features], targets, cv=10, scoring='r2')
print("Mean cross validated r2: {}".format((rf_scores).mean()))

Mean cross validated r2: 0.011025327949007967


In [148]:
rf.fit(features[lasso_features], targets)
dict(zip(lasso_features, rf.feature_importances_))

{'$10,000 to $14,999': 0.014411253837939775,
 '$100,000 to $124,999': 0.029947269958150214,
 '$15,000 to $19,999': 0.01751076513237506,
 '$150,000 to $199,999': 0.020324629867536984,
 '$200,000 or more': 0.026062860469638054,
 '$45,000 to $49,999': 0.014546343299135072,
 '$50,000 to $59,999': 0.016099168031067698,
 '$60,000 to $74,999': 0.013663617546569778,
 '$75,000 to $99,999': 0.015466940118361392,
 '11th grade': 0.012898119078055487,
 '12th grade, no diploma': 0.0073384269584358935,
 '6th grade': 0.01832624983018382,
 'Asian alone': 0.033466890947226414,
 "Associate's degree": 0.013828636455734239,
 "Bachelor's degree": 0.03967261521306699,
 'Built 1939 or earlier': 0.03832897795429022,
 'Built 1940 to 1949': 0.01901446871473163,
 'Built 1950 to 1959': 0.01845710495432919,
 'Built 1970 to 1979': 0.025944298530512278,
 'Built 1990 to 1999': 0.03564824439097982,
 'Built 2000 to 2009': 0.09356459782960215,
 'Built 2010 to 2013': 0.08825532382610822,
 'Doctorate degree': 0.01234166549