In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Objective

Let's run a Linear Regression model with only one feature, as our baseline.

In [2]:
import os, sys
# os.path.dirname('.') evaluates to '', so this appends the relative path
# "../preprocessing" to sys.path; it is resolved against the kernel's current
# working directory.
sys.path.append(os.path.join(os.path.dirname('.'), "../preprocessing"))

# Project-local module (presumably located in ../preprocessing — confirm);
# its `main` function is used below under the name transform_dataset.
from transform_for_num_issues_pred import main as transform_dataset

In [3]:
from __future__ import division
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline

# Hide DeprecationWarning messages in cell output.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Seaborn look: white background with a grid, "poster"-scaled fonts and lines.
sns.set_style("whitegrid")
sns.set_context("poster")
# Default matplotlib figure size as (width, height) in inches.
rcParams['figure.figsize'] = 20, 5

from helper_functions import dummify_cols_and_baselines, make_alphas, adjusted_r2, transform_school

In [4]:
# Load the pickled DataFrame this notebook starts from.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df_orig = pd.read_pickle('../data/data_from_remove_from_dataset.pkl')
# Display (rows, columns) of the loaded frame.
df_orig.shape

(516406, 40)

In [5]:
# Run the project helper transform_school over the frame.
# NOTE(review): this rebinds df_orig to the helper's result, so re-running the
# cell feeds already-transformed data back in — confirm transform_school
# tolerates that, or restart the kernel before re-running.
df_orig = transform_school(df_orig)
df_orig.shape

  df.school = df.school.str.extract(r'(\d\d?)').astype(int)


(516406, 40)

In [6]:
# Show the first row transposed, so the columns are listed vertically.
df_orig.head(1).T

Unnamed: 0,905425
CASE_ENQUIRY_ID,101001983786
OPEN_DT,2017-01-07 10:51:37
CLOSED_DT,2017-01-07 11:46:43
TYPE,Request for Snow Plowing
SubmittedPhoto,True
LOCATION_ZIPCODE,2124
Property_Type,Address
LATITUDE,42.2809
LONGITUDE,-71.068
Source,Citizens Connect App


In [7]:
# Run the project's transform_dataset helper (imported earlier as `main` from
# transform_for_num_issues_pred) on the frame; the rest of the notebook works
# on its result.
df_transformed = transform_dataset(df_orig)
df_transformed.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_subset['NUM_ISSUES'] = 1


(534, 28)

## Dropping more outliers

Let's drop a few more outliers to see if that improves the model. If so, the next question is: when do we stop removing outliers?

In [9]:
# tract_and_block_group IDs to remove as additional outliers.
more_outliers = [
    '0107021',
    '0701011',
    '0806013',  # Dudley
]

In [10]:
# Keep only the rows whose tract_and_block_group is not in more_outliers.
is_extra_outlier = df_transformed['tract_and_block_group'].isin(more_outliers)
df_transformed = df_transformed[~is_extra_outlier]
df_transformed.shape

(531, 28)

In [11]:
# Show the first row of the transformed frame, transposed for readability.
df_transformed.head(1).T

Unnamed: 0,0
tract_and_block_group,1004002
race_white,0.242399
race_black,0.514358
race_asian,0.035473
race_hispanic,0.0675676
race_other,0.140203
poverty_pop_below_poverty_level,0
earned_income_per_capita,34340
poverty_pop_w_public_assistance,0.0597484
poverty_pop_w_food_stamps,0.138365


## Choosing columns

Income performs better than house value.

In [12]:
# Column used as the regression target further down in this notebook.
cols_orig_dataset = ['NUM_ISSUES_PER_1000_POP']

# Census features fed to the baseline model.
# Other candidate columns that were considered but are not used here:
# poverty_pop_below_poverty_level, school, school_std_dev, housing,
# housing_std_dev, income_std_dev.
cols_census = [
    'income',
    'race_white',
    'race_black',
    'race_asian',
    'race_hispanic',
    'race_other',
]

In [13]:
# Keep only the target column plus the chosen census feature columns.
df_col_subset = df_transformed[cols_orig_dataset + cols_census]
df_col_subset.shape

(531, 7)

## Dummify

In [14]:
# Columns stored with object dtype are the ones to dummy-encode.
col_dtypes = df_col_subset.dtypes
cols_to_dummify = col_dtypes[col_dtypes == object].index
cols_to_dummify

Index([], dtype='object')

In [15]:
# Pass the frame and the columns to encode to the project helper, which
# returns two values: the resulting frame and baseline_cols (exact meaning of
# baseline_cols is defined in helper_functions — TODO confirm).
df_dummified, baseline_cols = dummify_cols_and_baselines(df_col_subset, cols_to_dummify)

In [16]:
# Check the frame's shape after the dummify step.
df_dummified.shape

(531, 7)

In [None]:
# df_dummified = df_dummified.drop('race_other', axis=1)

## Running model

In [17]:
# Splitters and search utilities all come from sklearn.model_selection; the
# older sklearn.cross_validation module is deprecated and was removed in
# scikit-learn 0.20.
# NOTE: model_selection.ShuffleSplit takes n_splits as its first argument
# (the old cross_validation.ShuffleSplit took the number of samples).
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer




In [18]:
# Separate the predictors from the target, then hold out 20% of the rows for
# testing (fixed random_state for a repeatable split).
features = df_dummified.drop('NUM_ISSUES_PER_1000_POP', axis=1)
target = df_dummified.NUM_ISSUES_PER_1000_POP
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=500
)

In [19]:
# Pipeline: standardize the features, then fit an ordinary least-squares
# linear regression.
# Alternative kept for reference (Lasso with built-in CV):
# pipe = make_pipeline(StandardScaler(), LassoCV())
pipe = make_pipeline(StandardScaler(), LinearRegression())

In [20]:
# greater_is_better=False makes make_scorer negate the MSE, so that a larger
# score still means a better model.
mse = make_scorer(mean_squared_error, greater_is_better=False)
# Extra keyword arguments given to make_scorer are forwarded to the metric, so
# adjusted_r2 is called with num_features set to the number of feature columns.
adjusted_r2_scorer = make_scorer(adjusted_r2, num_features=X_test.shape[1])

In [24]:
# The current pipeline ends in LinearRegression, so there is nothing to tune:
# the grid is empty and GridSearchCV just runs the 5-fold cross-validation
# with the adjusted R^2 scorer.
# When the pipeline uses LassoCV instead, search over alphas, e.g.:
#   params = {'lassocv__alphas': make_alphas(-5, 3)}
#   params = {'lassocv__alphas': [[0.1], [0.3], [1]]}
params = {}
model = GridSearchCV(pipe, param_grid=params, n_jobs=-1, cv=5, scoring=adjusted_r2_scorer)
# Trailing semicolon suppresses the fitted estimator's repr in the output.
model.fit(X_train, y_train);

In [25]:
# Show the first five entries of the cross-validation results, one per row.
pd.DataFrame(model.cv_results_).T.iloc[:5]

Unnamed: 0,0
mean_fit_time,0.028716
mean_score_time,0.00154057
mean_test_score,0.105035
mean_train_score,0.20029
params,{}


In [None]:
# print(...) with a single argument prints the same thing under Python 2 and
# is also valid Python 3, unlike the bare print statement.
print(model.best_score_)
print(model.best_params_)

In [None]:
# GridSearchCV.score applies the `scoring` passed at construction
# (adjusted_r2_scorer), here evaluated on the held-out test set.
model.score(X_test, y_test)

In [None]:
# Predict the target for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Plain (unadjusted) R^2 of the predictions on the test set.
r2_score(y_test, y_pred)

In [None]:
# Root mean squared error on the test set (square root of the MSE), in the
# same units as the target.
mean_squared_error(y_test, y_pred)**0.5

## Conclusion

Our baseline model achieves an adjusted $R^2$ of $0.13$.