In [1]:
%load_ext autoreload
%autoreload 2

## Objective

Manually remove some features, then build a simple model that predicts the number of 311 issues, in order to determine which factors affect that number.

In [2]:
import os
import sys

# Make the sibling `preprocessing` directory importable (presumably where
# `transform_for_num_issues_pred`, imported in the next cell, lives — confirm).
# The path is resolved to an absolute one so a later change of working directory
# cannot break imports, and the membership check keeps re-running this cell from
# piling up duplicate sys.path entries.
PREPROCESSING_DIR = os.path.abspath(os.path.join("..", "preprocessing"))
if PREPROCESSING_DIR not in sys.path:
    sys.path.append(PREPROCESSING_DIR)

In [3]:
from transform_for_num_issues_pred import main as transform_dataset

In [4]:
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline

# Wide, short default figure size for every plot in this notebook.
rcParams.update({'figure.figsize': (20, 5)})
# Hide DeprecationWarning noise raised by the libraries used below.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
# Seaborn look: white background with grid lines, "poster"-sized elements.
sns.set_style(style="whitegrid")
sns.set_context(context="poster")

from helper_functions import dummify_cols_and_baselines, make_alphas

In [5]:
# Load the pickled input frame and show its dimensions.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
raw_pickle_path = '../data/data_from_remove_from_dataset.pkl'
df_orig = pd.read_pickle(raw_pickle_path)
df_orig.shape

(516406, 40)

In [6]:
# Run the project transform (`main` from transform_for_num_issues_pred) on the raw
# frame and show the resulting dimensions.
# NOTE(review): the SettingWithCopyWarning recorded in this cell's output is raised
# inside the transform (at `df_subset['NUM_ISSUES'] = 1`), not by this cell; it
# should be fixed in that module.
df_transformed = transform_dataset(df_orig)
df_transformed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_subset['NUM_ISSUES'] = 1


(822, 221)

## Choosing columns

In [7]:
# Hand-picked columns; NUM_ISSUES is the value the model later predicts.
cols_orig_dataset = [
    'NUM_ISSUES',
    'SubmittedPhoto',
    'Property_Type',
    'Source',
    'is_description',
]

# Every transformed column whose name contains the (case-sensitive) substring 'TYPE'.
cols_type = [name for name in df_transformed.columns if 'TYPE' in name]

# Census columns, laid out by theme; the order is unchanged.
cols_census = [
    # race
    'race_white', 'race_black', 'race_asian', 'race_hispanic', 'race_other',
    # poverty and income
    'poverty_pop_below_poverty_level',
    'earned_income_per_capita',
    'poverty_pop_w_public_assistance',
    'poverty_pop_w_food_stamps',
    'poverty_pop_w_ssi',
    # each remaining measure is followed by its *_std_dev companion
    'school', 'school_std_dev',
    'housing', 'housing_std_dev',
    'bedroom', 'bedroom_std_dev',
    'value', 'value_std_dev',
    'rent', 'rent_std_dev',
    'income', 'income_std_dev',
]

In [28]:
# Keep the hand-picked, TYPE and census columns, in that order.
# (To model on the hand-picked columns alone, select `cols_orig_dataset` only.)
selected_cols = cols_orig_dataset + cols_type + cols_census
df_col_subset = df_transformed[selected_cols]
df_col_subset.shape

(822, 218)

In [29]:
# Drop rows with any missing value first, then drop exact duplicate rows.
df_complete_rows = df_col_subset.dropna()
df = df_complete_rows.drop_duplicates()
df.shape

(550, 218)

## Dummify

In [30]:
# Columns stored with object dtype are the categorical ones that need dummy encoding.
is_object_col = (df.dtypes == object).values
cols_to_dummify = df.columns[is_object_col]
cols_to_dummify

Index([u'Property_Type', u'Source', u'school', u'housing'], dtype='object')

In [31]:
# Encode the object-typed columns with the project helper, which returns the encoded
# frame and `baseline_cols`. Judging by the log it prints, it picks one level per
# column as the baseline (presumably left out of the dummies — confirm in
# helper_functions).
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

other is baseline 0 4
Constituent Call is baseline 1 4
8_6th_grade is baseline 2 4
rent is baseline 3 4


In [32]:
# Dimensions after dummy encoding (the recorded output shows 218 columns growing to 228).
df_dummified.shape

(550, 228)

## Removing columns

A simple way to choose a subset of columns is by issue category. There are 100+ categories, and some of them contain only a few issues. We can look at the distribution of issues per category and keep the top X% of categories.

In [None]:
# Count issues per TYPE, then summarise how those per-category counts are distributed.
issues_per_type = df_orig.groupby('TYPE')[['CASE_ENQUIRY_ID']].count()
issues_per_type.describe()

25% of categories contain <50 issues. Let's remove these from our dummified dataset and run the model.

In [None]:
# Per-TYPE issue counts as a flat two-column frame (TYPE, CASE_ENQUIRY_ID).
aa = (
    df_orig
    .groupby('TYPE')[['CASE_ENQUIRY_ID']]
    .count()
    .reset_index()
)
aa.head()

In [None]:
# Categories with fewer issues than this are blacklisted.
# NOTE(review): the surrounding narrative talks about dropping categories with <50
# issues, yet the cut-off applied here is 30000 — confirm which one is intended.
MIN_ISSUES_PER_TYPE = 30000

# Dummy-column names ('TYPE_' + category) for every category below the cut-off.
rare_types = aa.loc[aa.CASE_ENQUIRY_ID < MIN_ISSUES_PER_TYPE, 'TYPE']
col_blacklist = ['TYPE_' + issue_type for issue_type in rare_types]

## Implementing the blacklist

In [None]:
# Dimensions before the blacklist is applied, for comparison with the filtered frame.
df_dummified.shape

In [None]:
# Keep every dummified column that is not on the blacklist, preserving column order.
kept_cols = [name for name in df_dummified.columns if name not in col_blacklist]
df_w_blacklist = df_dummified[kept_cols]
df_w_blacklist.shape

## <<<

In [33]:
# Bypass the category blacklist: `df_w_blacklist` becomes an alias of the full
# dummified frame (no copy is made), so everything downstream uses every column.
# NOTE(review): this overrides the filtered `df_w_blacklist` built in the cell above;
# remove this cell to re-enable the blacklist.
df_w_blacklist = df_dummified

In [34]:
# Dimensions of the frame that goes into the model.
df_w_blacklist.shape

(550, 228)

In [24]:
# Preview the first rows of the modelling frame.
df_w_blacklist.head()

Unnamed: 0,NUM_ISSUES,SubmittedPhoto,is_description,Property_Type_Address,Property_Type_Intersection,Source_Citizens Connect App
0,741,0.03125,0.170608,1,0,0
1,1808,0.103293,0.462575,1,0,0
2,798,0.072438,0.19523,1,0,0
3,1146,0.055594,0.220987,1,0,0
4,864,0.044759,0.342756,1,0,0


## >>>

In [None]:
# List `is_description` from largest to smallest to eyeball the extreme values.
df_w_blacklist.is_description.sort_values(ascending=False)

In [None]:
# Summary statistics for the modelling frame's columns.
df_w_blacklist.describe()

## Running model

In [14]:
# Modelling imports.
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `train_test_split` and `ShuffleSplit` are provided by `sklearn.model_selection`,
# which this notebook already depends on for GridSearchCV.
# NOTE: `model_selection.ShuffleSplit` takes `n_splits` and no sample count, unlike the
# old `cross_validation.ShuffleSplit(n, n_iter=...)` — adjust any call sites accordingly.
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler




In [35]:
# Separate predictors from the NUM_ISSUES target, then hold out 20% of the rows for
# testing. The fixed random_state keeps the split reproducible across runs.
features = df_w_blacklist.drop('NUM_ISSUES', axis=1)
target = df_w_blacklist['NUM_ISSUES']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=300
)

In [16]:
# Standardise every feature, then fit an L1-regularised linear model.
# make_pipeline names each step after its lower-cased class ('standardscaler',
# 'lassocv'), which is what a 'lassocv__alphas' grid key addresses.
# NOTE(review): LassoCV runs its own internal cross-validation over `alphas`; if it is
# only ever given one alpha per grid candidate, a plain Lasso would do the same job
# with less work — worth considering.
scaler_step = StandardScaler()
lasso_step = LassoCV()
pipe = make_pipeline(scaler_step, lasso_step)

In [None]:
# Mean-squared-error scorer; greater_is_better=False marks it as a loss, so
# scikit-learn negates it and "larger is better" still holds during model selection.
# NOTE(review): the GridSearchCV in the next cell is built without `scoring=`, so this
# scorer is currently unused and the reported scores are the estimator's default metric.
mse = make_scorer(mean_squared_error, greater_is_better=False)

In [36]:
# Candidate regularisation strengths come from the project helper; the recorded
# cv_results_ show 16 candidates from 1e-10 to 1e5, each wrapped in a one-element list.
# (For a quick single-alpha run, use [[100]] as the grid value instead.)
params = dict(lassocv__alphas=make_alphas(-10, 5))

# 10-fold grid search on all available cores; `fit` returns the search object itself.
model = GridSearchCV(estimator=pipe, param_grid=params, cv=10, n_jobs=-1).fit(X_train, y_train)



In [37]:
# One column per alpha candidate; show only the first five summary rows.
cv_summary = pd.DataFrame(model.cv_results_).transpose()
cv_summary.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
mean_fit_time,1.50855,1.85892,2.30719,2.1436,2.51253,2.63935,2.01132,1.6038,1.36686,1.24422,1.1037,0.653458,0.446119,0.355198,0.256216,0.245985
mean_score_time,0.0181295,0.0216121,0.0379518,0.0181613,0.0264389,0.0164363,0.013209,0.0207019,0.0163116,0.014388,0.0208447,0.0208607,0.0139162,0.00981028,0.00789199,0.00695519
mean_test_score,-0.574627,-0.574627,-0.574627,-0.574627,-0.574625,-0.5746,-0.574352,-0.571861,-0.548227,-0.339038,0.230292,0.442314,0.403193,-0.0531497,-0.067226,-0.067226
mean_train_score,0.985443,0.985443,0.985443,0.985443,0.985443,0.985443,0.985443,0.985443,0.985446,0.985393,0.983075,0.967541,0.911828,0.426002,0,0
param_lassocv__alphas,[1e-10],[1e-09],[1e-08],[1e-07],[1e-06],[1e-05],[0.0001],[0.001],[0.01],[0.1],[1.0],[10.0],[100.0],[1000.0],[10000.0],[100000.0]


In [38]:
# Score the fitted grid search on the held-out split. No `scoring` was passed to
# GridSearchCV, so this is presumably the regressor's default R^2 — confirm.
model.score(X_test, y_test)

0.55862806412569355

## Conclusion

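Even with Lasso regularization, the model overfits the training set and generalizes poorly to unseen data: at the best alpha the training $R^2$ is about 0.97, versus about 0.44 in cross-validation and about 0.56 on the test set. We are still in a high-variance situation. I could use dimensionality reduction to mitigate this, but that would reduce the interpretability of my model.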

Since improving the model's performance would cost interpretability either way, I will try Random Forests next, in the hope of reducing the variance.

## For future work

- Incorporate the time-series aspect of the data