In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Objective

What happens to performance when we add the features from the original dataset?

In [1]:
import os
import sys

# Location of the project's preprocessing modules, relative to the notebook's
# working directory (the expression evaluates to "../preprocessing").
PREPROCESSING_DIR = os.path.join(os.path.dirname('.'), "../preprocessing")

# Append only once, so re-running this cell does not keep growing sys.path
# with duplicate entries.
if PREPROCESSING_DIR not in sys.path:
    sys.path.append(PREPROCESSING_DIR)

In [2]:
from transform_for_num_issues_pred import main as transform_dataset

In [3]:
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline

# Silence DeprecationWarning messages for the rest of the notebook.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

# Plot appearance. The order of these three statements is kept deliberately:
# the figure size is set last so the seaborn calls cannot override it.
sns.set_style("whitegrid")
sns.set_context("poster")
rcParams['figure.figsize'] = (20, 5)

from helper_functions import dummify_cols_and_baselines, make_alphas

In [4]:
# Pickled DataFrame read from the project's data directory.
# NOTE(review): unpickling executes arbitrary code, so this file must come
# from a trusted source.
orig_pickle_path = '../data/data_from_remove_from_dataset.pkl'
df_orig = pd.read_pickle(orig_pickle_path)
df_orig.shape

(516406, 40)

In [5]:
# Run the preprocessing module's `main` (imported above as transform_dataset)
# on the raw frame. The SettingWithCopyWarning in this cell's output is raised
# inside that call (at `df_subset['NUM_ISSUES'] = 1`), not by notebook code.
df_transformed = transform_dataset(df_orig)
df_transformed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_subset['NUM_ISSUES'] = 1


(822, 221)

## Choosing columns

In [7]:
# Target, block-group identifier and features taken from the original dataset.
cols_orig_dataset = [
    'NUM_ISSUES',
    'tract_and_block_group',
    'SubmittedPhoto',
    'Property_Type',
    'Source',
    'is_description',
]

# Every transformed column whose name contains 'TYPE'.
cols_type = [
    col
    for col in df_transformed.columns
    if 'TYPE' in col
]

# Census features, grouped by theme; the overall order is unchanged.
cols_census = (
    ['race_white', 'race_black', 'race_asian', 'race_hispanic', 'race_other']
    + ['poverty_pop_below_poverty_level',
       'earned_income_per_capita',
       'poverty_pop_w_public_assistance',
       'poverty_pop_w_food_stamps',
       'poverty_pop_w_ssi']
    + ['school', 'school_std_dev']
    + ['housing', 'housing_std_dev']
    + ['bedroom', 'bedroom_std_dev']
    + ['value', 'value_std_dev']
    + ['rent', 'rent_std_dev']
    + ['income', 'income_std_dev']
)

In [8]:
# Features used in this run: original-dataset columns, TYPE columns and
# census columns, in that order.
selected_cols = cols_orig_dataset + cols_type + cols_census
df_col_subset = df_transformed[selected_cols]

# Alternative selections tried during exploration (kept for reference):
# df_col_subset = df_transformed[cols_orig_dataset + cols_census]
# df_col_subset = df_transformed[cols_census + ['NUM_ISSUES']]
# df_col_subset = df_transformed[cols_census + ['NUM_ISSUES', 'tract_and_block_group'] + cols_type]

df_col_subset.shape

(822, 219)

In [9]:
# Keep only rows with no missing values, then remove exact duplicate rows.
rows_without_nulls = df_col_subset.dropna()
df = rows_without_nulls.drop_duplicates()
df.shape

(550, 219)

## Removing outliers

0303003 is the block group containing City Hall, which issues are assigned to when they don't have a location. Ideally, I would find out which issues truly took place in that block group and filter accordingly.

In [16]:
# Block group that location-less issues get assigned to (City Hall).
CITY_HALL_BLOCK_GROUP = '0303003'

# This cell rebinds `df` and drops the identifier column it filters on, so an
# unguarded second run would fail because the column no longer exists. The
# guard makes the cell safe to re-run: once the column is gone, the outlier
# has already been removed and nothing is left to do.
if 'tract_and_block_group' in df.columns:
    df = (
        df[df.tract_and_block_group != CITY_HALL_BLOCK_GROUP]
        .drop('tract_and_block_group', axis=1)
    )

## Dummify

In [17]:
# Columns stored with object dtype are the ones that get dummified below.
is_object_dtype = df.dtypes == object
cols_to_dummify = df.dtypes[is_object_dtype].index
cols_to_dummify

Index([u'Property_Type', u'Source', u'school', u'housing'], dtype='object')

In [18]:
# Dummify the object-dtype columns with the project helper, which returns the
# new frame plus the baseline columns.
# NOTE(review): presumably each baseline is the category left out for that
# column (see the "... is baseline" messages it prints) -- confirm in
# helper_functions.
df_dummified, baseline_cols = dummify_cols_and_baselines(df, cols_to_dummify)

Intersection is baseline 0 4
Constituent Call is baseline 1 4
8_6th_grade is baseline 2 4
rent is baseline 3 4


In [19]:
# (rows, columns) of the frame after dummification.
df_dummified.shape

(549, 227)

In [20]:
# df_dummified = df_dummified.drop('race_other', axis=1)

In [21]:
# Show a single row to check the columns that will go into the model.
df_dummified.head(1)

Unnamed: 0,NUM_ISSUES,SubmittedPhoto,is_description,TYPE_ADA,TYPE_Abandoned Bicycle,TYPE_Abandoned Building,TYPE_Abandoned Vehicles,TYPE_Alert Boston,TYPE_Animal Found,TYPE_Animal Generic Request,...,school_11_9th_grade,school_13_11th_grade,school_14_12th_grade_no_diploma,school_15_hs_diploma,school_18_some_college_no_degree,school_19_associates,school_20_bachelors,school_21_masters,school_22_professional_school,housing_own
0,741,0.03125,0.170608,0.0,0.0,0.0,0.01098,0.0,0.0,0.000845,...,0,0,0,1,0,0,0,0,0,1


## Running model

In [22]:
# train_test_split and ShuffleSplit are imported from sklearn.model_selection,
# the same module this cell already uses for GridSearchCV. The former
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
# NOTE: model_selection.ShuffleSplit takes `n_splits` and yields indices via
# `.split(X)`, unlike the old class; it is not used in the visible cells.
from sklearn.model_selection import GridSearchCV, ShuffleSplit, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer




In [35]:
# Separate the predictors from the NUM_ISSUES target, then hold out 20% of
# the rows for testing, using a fixed seed so the split is reproducible.
predictors = df_dummified.drop('NUM_ISSUES', axis=1)
target = df_dummified.NUM_ISSUES

X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=500
)

In [31]:
# Standardize the features, then fit a Lasso model. make_pipeline names each
# step after its lowercased class name, which is why the grid-search
# parameters below are addressed as 'lassocv__...'.
pipe = make_pipeline(StandardScaler(), LassoCV())

In [32]:
# Scorer wrapping mean_squared_error; greater_is_better=False marks it as a
# loss, so the scorer reports the negated MSE.
# NOTE(review): this scorer is not passed as `scoring` in any visible cell.
mse = make_scorer(mean_squared_error, greater_is_better=False)

In [36]:
# Candidate values for LassoCV's `alphas` argument. Each grid point is a
# one-element list, so the comparison between alphas is done by GridSearchCV.
# The earlier `params` built from make_alphas(-10, 5) was a dead assignment,
# overwritten before use, and has been removed; restore it here to search a
# wider grid.
params = {'lassocv__alphas': [[30], [100], [300]]}

# No `scoring` is passed, so candidates are ranked with the pipeline's default
# score (R^2 for regressors). return_train_score is set explicitly because the
# results table displayed next includes `mean_train_score`, and newer
# scikit-learn versions no longer compute it by default.
model = GridSearchCV(
    pipe,
    param_grid=params,
    n_jobs=-1,
    cv=10,
    return_train_score=True,
)
model.fit(X_train, y_train);

In [37]:
# Transpose the results so there is one column per grid candidate, and show
# only the first five rows of that table.
pd.DataFrame(model.cv_results_).T.iloc[:5]

Unnamed: 0,0,1,2
mean_fit_time,0.359371,0.39047,0.177733
mean_score_time,0.0182771,0.0214608,0.00888653
mean_test_score,-0.0154139,0.317651,-0.00617079
mean_train_score,0.697868,0.48251,0
param_lassocv__alphas,[30],[100],[300]


In [38]:
# Score the fitted grid search on the held-out test split.
# NOTE(review): with no `scoring` given to GridSearchCV this should be the
# best estimator's default score (R^2) -- confirm against the sklearn docs.
model.score(X_test, y_test)

0.35469983587882137

## Conclusion

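Adding the features from the original dataset improves our model's performance, but the model is still overfitting: for the best alpha (100), the mean cross-validated training score (0.48) remains well above the mean validation score (0.32).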