In [44]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load data

In [50]:
# Read the misdemeanor CSV and discard its "Unnamed: 0" column
# (presumably an index column that was saved along with the file).
data = (
    pd.read_csv("data/full_misdemeanor.csv")
    .drop(columns=["Unnamed: 0"])
)
# Number of columns that contain at least one missing value.
data.isna().any().sum()

0

In [91]:
# Split the frame into the response column and the remaining predictor columns.
targets = data["CMPLNT_NUM"]
features = data.drop("CMPLNT_NUM", axis=1)

# Plain-array copies of the same split for statsmodels, with a constant
# (intercept) column added to the predictors.
endog = np.asarray(targets)
exog = sm.add_constant(np.asarray(features))

# Linear Model

## OLS

In [45]:
# Ordinary least squares of endog on exog (exog already carries the constant).
model = sm.OLS(endog=endog, exog=exog)
results = model.fit()
# Very poor perf: see the R-squared in the printed summary.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.080
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     4.270
Date:                Sat, 21 Apr 2018   Prob (F-statistic):           4.24e-49
Time:                        10:28:57   Log-Likelihood:                -37005.
No. Observations:                6281   AIC:                         7.426e+04
Df Residuals:                    6155   BIC:                         7.511e+04
Df Model:                         125                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         78.5199      6.212     12.640      0.0

## Lasso

Since we have too many similar features, let's use Lasso regularization to keep only a small subset of them.

In [80]:
from sklearn.linear_model import LassoCV, Lasso
from sklearn.model_selection import cross_val_score

In [78]:
# Let LassoCV choose the regularization strength through its built-in
# cross-validation, then remember the selected value for later cells.
ls = LassoCV().fit(features, targets)
best_alpha = ls.alpha_

In [79]:
# Columns useful
# First number: coefficients shrunk to exactly zero; second: total coefficients.
print((ls.coef_ == 0).sum(), ls.coef_.shape[0])
# Look the names up in `features` (the frame the model was fitted on) so the
# mask lines up with ls.coef_ wherever the target column sits inside `data`.
features.columns[ls.coef_ != 0]

147 157


Index(['Male:',
       'Median household income in the past 12 months (in 2016 inflation-adjusted dollars)',
       'Total - Built 2000 to 2009', 'Total - Black or African American alone',
       'Total - Some other race alone',
       'Income in the past 12 months below poverty level: - In family households: - In married couple families:',
       'Income in the past 12 months below poverty level: - In family households: - In other families: - Female householder, no husband present: - All relatives',
       'Income in the past 12 months at or above poverty level: - In family households:',
       'Income in the past 12 months at or above poverty level: - In family households: - In married couple families:',
       'Not enrolled in school'],
      dtype='object')

In [87]:
# Score a plain Lasso at the alpha picked by LassoCV using 10-fold cross-validated R^2.
# NOTE(review): best_alpha was tuned on these same features/targets, so this
# estimate may be optimistic.

ls = Lasso(alpha=best_alpha)
ls_scores = cross_val_score(estimator=ls, X=features, y=targets, scoring='r2', cv=10)
mean_r2 = ls_scores.mean()
print("Mean cross validated r2: {}".format(mean_r2))

Mean cross validated r2: 0.13650926698852536


# Random Forest

In [88]:
from sklearn.ensemble import RandomForestRegressor

In [92]:
# Fit a random forest on the full feature table.
# The seed is fixed so the fitted forest is the same on every
# Restart-and-Run-All; without it each run yields a different model.
rf = RandomForestRegressor(random_state=0)
rf.fit(features, targets)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)