In [77]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [78]:
# with help from http://planspace.org/20150423-forward_selection_with_statsmodels/
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < (best_new_score * 1):
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
    
    model = smf.ols(formula, data).fit()
    return model

In [79]:
response=pd.read_pickle('data/ZipcodeMVPSnew.pkl')[['RegionName','2000_agg','2010_agg']]
response=response.rename(columns={'RegionName': 'zipcode'})
response['2010_agg_adj'] = response['2010_agg']/1.26
response['pct_delta'] = (response['2010_agg_adj']/response['2000_agg']) - 1
response = response.drop(['2010_agg','2000_agg', '2010_agg_adj'], 1)

## Testing a few different response variables:
### - house price delta (above)
### - house/rental ratio delta (below)
### - house/rental ratio 2015 (below)

In [80]:
ratio_response = pd.read_pickle('data/housing_rental_ratio.pkl')
ratio_response = ratio_response.rename(columns = {'RegionName': 'zipcode'})

ratio_delta_response = ratio_response[['zipcode','ratio_pct_delta']]
ratio_2015_response = ratio_response[['zipcode','ratio_2015']]

ratio_delta_response['zipcode'] = ratio_delta_response['zipcode'].astype('str')
ratio_2015_response['zipcode'] = ratio_2015_response['zipcode'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [81]:
ratio_delta_response.head(3)

Unnamed: 0,zipcode,ratio_pct_delta
0,11368,0.984804
1,79936,1.001354
2,60629,1.073514


In [82]:
ratio_2015_response.head(3)

Unnamed: 0,zipcode,ratio_2015
0,11368,197.083754
1,79936,108.09686
2,60629,96.111037


In [83]:
response.head(3)

Unnamed: 0,zipcode,pct_delta
1,79936,0.173843
2,60629,0.085907
3,90650,0.426587


In [84]:
# training and test sets that were normalized, and training and test sets that were normalized and transformed ('_t')
train_set = pd.read_pickle('data/train_all_features_norm.pkl')
itrain_df = pd.DataFrame(train_set.index).rename(columns={0: 'ind'})

train_set_t = pd.read_pickle('data/train_all_features_norm_and_transform.pkl')
itrain_t_df = pd.DataFrame(train_set_t.index).rename(columns={0: 'ind'})

test_set = pd.read_pickle('data/test_all_features_norm.pkl')
itest_df = pd.DataFrame(test_set.index).rename(columns={0: 'ind'})

test_set_t = pd.read_pickle('data/test_all_features_norm_and_transform.pkl')
itest_t_df = pd.DataFrame(test_set_t.index).rename(columns={0: 'ind'})

features = pd.concat([train_set, test_set])
features_t = pd.concat([train_set_t, test_set_t])

print features.shape, train_set.shape, test_set.shape
features.head(10)

(27653, 64) (22122, 64) (5531, 64)


Unnamed: 0,zipcode,prison,jail,A001,A002,A003,A004,A005,A006,A007,A008,A009,A010,A011,A012,A013,A014,A015,A016,A017,A018,A019,A020,A021,A022,A023,A024,A025,A026,A027,A028,A029,A030,A031,A032,A033,A034,A035,A036,A037,A038,A039,A040,A041,A042,A043,A044,A045,A046,A047,A048,A049,A050,A051,A052,A053,A054,A055,A056,n_establishments,paid_employees,first_quarter_payroll_1000,annual_payroll_1000,median_income
0,1001,0,0,0.393159,-0.844788,-0.576017,-0.174118,-0.117701,0.107539,-0.016376,0.170177,0.30242,1.862902,-0.661973,-0.790923,-0.635659,-0.198994,-0.050199,-0.043662,-0.090009,-0.044475,-0.020593,1.102067,0.72583,-0.772259,-0.455802,-0.057597,-0.171337,0.315046,0.117725,0.383984,0.672488,2.09787,0.5377,-0.396109,-0.176487,-0.012918,-0.132979,-0.367899,-0.890846,-0.31713,-0.457076,-0.174062,-0.299229,1.122282,0.268374,0.452518,1.600822,1.382658,-0.747222,0.588576,0.758788,-0.797671,-0.140852,-0.530444,-0.652569,-0.228583,0.486598,-1.127662,0.459736,0.742016,0.24498,0.334447,0.334018
1,1002,0,0,1.245532,-1.599994,1.060064,3.597249,-1.353027,-2.046088,-1.219645,-0.868623,-0.806043,-0.218702,-0.400512,-1.502592,0.492105,2.820575,-1.110719,-1.885773,-1.384025,-0.915197,-0.790664,-0.376806,0.464421,-1.508369,1.426019,3.888342,-1.261798,-1.604638,-0.859521,-0.715226,-0.731682,-0.143021,-0.22304,-0.195494,-0.162262,1.524147,-0.132979,-0.065958,-1.612656,-1.10605,-0.983398,-0.12217,0.166579,0.116185,-0.327456,-0.507041,0.520319,0.090974,-0.843631,-0.630502,0.751994,-0.790133,-0.264675,-0.530444,-0.166848,-0.330651,0.973388,-0.031431,0.763299,1.633192,0.335312,0.417431,0.110293
3,1005,0,0,-0.418261,-0.321953,0.815872,-0.337381,-0.276076,0.681839,0.556609,-0.311218,-0.667485,0.07867,0.165987,-0.29823,1.099363,-0.272642,-0.387638,0.416865,0.479358,0.001353,-0.469801,-0.202821,-0.101965,-0.262645,0.416261,-0.352621,-0.107192,0.778418,0.552056,-0.565334,-0.865412,0.193112,0.6677,-0.433724,-0.176487,-0.316536,-0.132979,-0.466359,0.196325,0.156222,0.244687,-0.139468,-0.066325,-0.223221,-0.142543,0.185974,-0.233994,0.090974,0.291852,-0.171839,0.602543,-0.624302,-0.471047,0.240861,0.226354,0.230722,-0.461362,-0.442518,-0.416119,-0.399458,-0.271116,-0.314831,0.984848
4,1007,0,0,0.246007,0.171836,0.254233,-0.435338,0.008999,1.327927,0.757154,-0.361891,-0.736764,-0.553246,-0.247993,0.194464,0.231852,-0.34629,-0.194816,0.704695,0.582879,-0.365267,-0.790664,-0.463799,0.311932,0.133722,0.232669,-0.463255,0.341821,1.57277,0.823513,-0.365477,-0.664817,-0.535177,0.54733,-0.421186,-0.190712,-0.012918,-0.003289,-0.413847,0.321082,-0.685293,-0.457076,-0.104873,0.195692,-0.502019,-0.65619,-0.507041,-0.172834,-0.339587,0.53823,-0.835693,0.779167,-0.820284,-0.677419,-0.221922,0.249484,0.154171,0.191188,-0.481669,-0.062791,-0.286955,-0.233152,-0.263373,1.043897
6,1009,0,0,-0.719368,-0.496231,-0.087635,0.201386,0.83255,0.574158,-0.102324,-0.108525,-1.256356,0.487556,0.492813,-0.133999,0.492105,0.169246,0.962115,-0.043662,-0.245291,0.23049,-0.983182,0.145149,-0.428727,-0.828883,-0.685293,0.200548,0.341821,1.043202,0.009142,-0.365477,-1.333469,0.585268,0.658071,-0.477609,-0.190712,-0.335512,-0.132979,-0.453231,-0.56113,-0.001562,-0.457076,0.759997,0.894404,0.528321,0.658746,-0.027262,0.234903,0.029466,-0.286602,-0.618432,0.548198,-0.564,-0.388498,-0.838966,-0.375014,-0.177549,-0.754192,-0.227187,-0.617665,-0.509473,-0.307238,-0.358285,-0.233389
9,1012,0,0,-0.724921,-1.193344,-0.160892,-0.435338,-0.592827,0.466476,1.817176,0.727583,-0.563567,0.190184,0.144198,-1.283617,0.05835,-0.125346,-0.725076,-0.331492,2.083938,0.642937,-0.469801,0.667104,-0.080181,-0.998755,-0.409904,-0.795156,-0.171337,1.307986,1.257844,0.733733,-0.531086,-0.086999,0.720663,-0.490147,-0.176487,-0.259608,-0.132979,-0.48605,-0.222503,0.839952,2.525414,-0.398929,-0.677697,0.128307,0.206737,-0.773585,-0.009739,-0.831657,-0.393723,-0.654642,-0.124331,0.182238,-0.759968,-0.838966,-0.398144,-0.407202,-0.753849,0.203475,-0.622642,-0.51583,-0.312899,-0.364498,-0.092024
11,1020,0,0,1.288498,-0.670509,-0.405083,0.217712,-0.054351,-0.07193,0.298766,0.068831,0.129222,0.599071,-0.313358,-0.571948,-0.375406,0.095598,-0.050199,-0.101228,0.220555,-0.044475,-0.020593,0.232141,0.377284,-0.715635,-0.364006,0.384938,0.021097,-0.015934,0.33489,0.184128,0.338162,0.697313,0.364367,-0.345955,-0.176487,-0.126775,-0.003289,0.039066,-0.783911,0.208817,-0.106195,0.517833,0.457709,0.976822,0.473833,0.132665,1.193085,0.921343,-0.58654,0.274754,0.758788,-0.797671,-0.347224,-0.530444,-0.58318,-0.2541,1.540822,-0.834029,0.668746,0.529588,0.136917,0.195893,-0.082116
12,1022,0,0,-0.600674,-1.367623,0.400747,1.278919,-0.751202,-1.723044,-0.446115,0.423543,0.544896,0.376042,-0.57482,-1.338361,0.795734,1.053023,-0.67687,-1.713075,-1.125222,0.093008,-0.213111,0.232141,0.638693,-1.22525,-0.088618,1.306886,-0.556205,-1.207462,0.33489,0.683769,1.274275,0.361179,-0.155632,0.299773,-0.148037,-0.145751,-0.003289,0.130961,-2.289909,-1.10605,-0.983398,-0.312442,-0.473906,2.867799,0.925842,0.719062,3.88415,2.059254,-1.711311,0.492015,0.419127,-0.420783,-0.801242,-0.530444,-1.624009,-0.892024,-0.60494,0.692864,-0.498231,-0.166851,-0.0996,-0.114735,-0.017228
13,1026,0,0,-0.705138,-0.873834,-0.600436,-0.859821,-0.276076,1.148458,1.903124,0.550226,-0.182533,0.115841,0.013468,-0.517205,-0.635659,-0.616333,-0.917898,0.877393,1.773374,0.505455,0.30027,0.232141,0.050523,-1.22525,-0.455802,-0.979546,0.72669,1.109398,1.746467,0.533877,-0.664817,0.081068,0.744737,-0.490147,-0.204937,-0.373465,-0.132979,-0.48605,-0.890846,-0.474914,-0.106195,-0.623795,-0.066325,1.025309,1.316213,0.3459,0.418385,-0.001289,-0.800782,-0.509801,-0.008846,0.054096,-0.512322,-0.144791,-0.745087,-0.534787,-0.724617,-1.656202,-0.615177,-0.510578,-0.309675,-0.360434,0.413503
14,1027,0,0,0.455005,-0.699556,-0.698113,-0.059834,0.610825,0.538264,0.642557,0.170177,-0.425009,0.301699,-0.487666,-0.73618,-0.765786,-0.125346,0.335444,0.186601,0.324076,0.138835,-0.533974,0.232141,0.551557,-0.602388,-0.547598,0.089914,0.790835,0.778418,0.823513,0.184128,-0.263625,0.305157,0.55696,-0.446263,-0.176487,0.006058,-0.132979,-0.354771,-0.890846,-0.580103,-0.632516,-0.104873,-0.066325,0.807119,0.309466,-0.133879,1.050377,0.460027,-0.757934,-0.28047,0.684062,-0.714756,-0.347224,-0.684705,-0.629439,-0.356168,0.616248,-1.04936,0.265654,0.056358,-0.068505,-0.067752,0.179472


In [85]:
merged = response.merge(features, how='inner', on=['zipcode'])
merged_t = response.merge(features_t, how='inner', on=['zipcode'])

merged = merged.astype('float')
merged_t = merged_t.astype('float')

merged = merged.rename(columns={'pct_delta' : 'response'})
merged_t = merged_t.rename(columns={'pct_delta' : 'response'})

merged_t.shape, merged.shape

((11602, 65), (11602, 65))

In [86]:
merged_ratio_delta = ratio_delta_response.merge(features_t, how='inner', on=['zipcode'])
merged_ratio_2015 = ratio_2015_response.merge(features_t, how='inner', on=['zipcode'])

merged_ratio_delta = merged_ratio_delta.astype('float')
merged_ratio_2015 = merged_ratio_2015.astype('float')

merged_ratio_delta = merged_ratio_delta.rename(columns={'ratio_pct_delta':'response'})
merged_ratio_2015 = merged_ratio_2015.rename(columns={'ratio_2015':'response'})

merged_ratio_delta.shape, merged_ratio_2015.shape, features_t.shape

((10306, 65), (10306, 65), (27653, 64))

In [87]:
# house price delta response (_t ones are better - only use them)
merged_train = merged.merge(itrain_df, how='inner', right_on='ind', left_index=True)
merged_test = merged.merge(itest_df, how='inner', right_on='ind', left_index=True)
merged_t_train = merged_t.merge(itrain_t_df, how='inner', right_on='ind', left_index=True)
merged_t_test = merged_t.merge(itest_t_df, how='inner', right_on='ind', left_index=True)

# house/rental ratio delta response and house/rental ratio 2015 response
ratio_delta_train = merged_ratio_delta.merge(itrain_t_df, how='inner', right_on='ind', left_index=True)
ratio_delta_test = merged_ratio_delta.merge(itest_t_df, how='inner', right_on='ind', left_index=True)

ratio_2015_train = merged_ratio_2015.merge(itrain_t_df, how='inner', right_on='ind', left_index=True)
ratio_2015_test = merged_ratio_2015.merge(itest_t_df, how='inner', right_on='ind', left_index=True)

In [88]:
merged_t_train.shape, merged_t_train.shape, ratio_2015_test.shape, ratio_2015_train.shape, ratio_delta_test.shape, ratio_delta_train.shape

((9271, 66), (9271, 66), (2098, 66), (8208, 66), (2098, 66), (8208, 66))

In [89]:
import time
# house delta response
start_time = time.time()
bestmodel = forward_selected(merged_train.drop('zipcode',1), 'response')

print '%s seconds to train model'%(time.time() - start_time)

90.3473331928 seconds to train model


In [110]:
bestmodel.summary()

0,1,2,3
Dep. Variable:,response,R-squared:,0.39
Model:,OLS,Adj. R-squared:,0.387
Method:,Least Squares,F-statistic:,120.3
Date:,"Tue, 08 Dec 2015",Prob (F-statistic):,0.0
Time:,21:14:49,Log-Likelihood:,2154.7
No. Observations:,9271,AIC:,-4209.0
Df Residuals:,9221,BIC:,-3853.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.5131,0.567,-0.905,0.366,-1.625 0.599
A002,-0.0550,0.008,-6.773,0.000,-0.071 -0.039
A054,-0.0208,0.011,-1.906,0.057,-0.042 0.001
A003,-0.0848,0.016,-5.374,0.000,-0.116 -0.054
median_income,0.0532,0.004,13.315,0.000,0.045 0.061
A050,16.3152,12.800,1.275,0.202,-8.776 41.407
A052,-0.0617,0.004,-16.821,0.000,-0.069 -0.055
A024,-0.1257,0.077,-1.637,0.102,-0.276 0.025
A051,-0.0435,0.003,-12.660,0.000,-0.050 -0.037

0,1,2,3
Omnibus:,825.606,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2508.037
Skew:,0.467,Prob(JB):,0.0
Kurtosis:,5.37,Cond. No.,530000000.0


In [107]:
import time
start_time = time.time()
# house/rental ratio delta response
bestmodel_ratio_delta = forward_selected(ratio_delta_train.drop('zipcode',1), 'response')
# house/rental ratio response
bestmodel_ratio_2015 = forward_selected(ratio_2015_train.drop('zipcode',1), 'response')

print '%s seconds to train 2 models'%(time.time() - start_time)

120.388909817 seconds to train 2 models


In [108]:
bestmodel_ratio_delta.summary()

0,1,2,3
Dep. Variable:,response,R-squared:,0.285
Model:,OLS,Adj. R-squared:,0.281
Method:,Least Squares,F-statistic:,80.11
Date:,"Tue, 08 Dec 2015",Prob (F-statistic):,0.0
Time:,21:14:49,Log-Likelihood:,4941.6
No. Observations:,7686,AIC:,-9805.0
Df Residuals:,7647,BIC:,-9534.0
Df Model:,38,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.1131,0.012,94.749,0.000,1.090 1.136
A036,0.0092,0.003,3.616,0.000,0.004 0.014
A056,0.0472,0.004,10.536,0.000,0.038 0.056
A052,0.0279,0.002,12.791,0.000,0.024 0.032
A034,0.0125,0.003,4.916,0.000,0.008 0.017
A039,0.0569,0.006,9.368,0.000,0.045 0.069
A002,-0.0567,0.006,-10.170,0.000,-0.068 -0.046
A032,-0.0229,0.003,-7.270,0.000,-0.029 -0.017
ind,-1.126e-05,1.58e-06,-7.132,0.000,-1.44e-05 -8.16e-06

0,1,2,3
Omnibus:,956.661,Durbin-Watson:,1.972
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2006.335
Skew:,0.769,Prob(JB):,0.0
Kurtosis:,4.975,Cond. No.,97900.0


In [109]:
bestmodel_ratio_2015.summary()

0,1,2,3
Dep. Variable:,response,R-squared:,0.639
Model:,OLS,Adj. R-squared:,0.637
Method:,Least Squares,F-statistic:,351.9
Date:,"Tue, 08 Dec 2015",Prob (F-statistic):,0.0
Time:,21:14:49,Log-Likelihood:,-38445.0
No. Observations:,8208,AIC:,76970.0
Df Residuals:,8166,BIC:,77270.0
Df Model:,41,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,115.6556,3.048,37.944,0.000,109.681 121.631
A034,5.4867,0.493,11.124,0.000,4.520 6.454
A041,-8.0052,0.812,-9.862,0.000,-9.596 -6.414
median_income,10.8612,0.600,18.112,0.000,9.686 12.037
A036,3.7798,0.502,7.527,0.000,2.795 4.764
A018,30.6848,9.065,3.385,0.001,12.915 48.455
A032,-18.3619,0.628,-29.257,0.000,-19.592 -17.132
A031,-18.2574,0.791,-23.071,0.000,-19.809 -16.706
A037,-40.0220,2.277,-17.579,0.000,-44.485 -35.559

0,1,2,3
Omnibus:,1552.9,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6981.279
Skew:,0.858,Prob(JB):,0.0
Kurtosis:,7.18,Cond. No.,417000.0


In [113]:
import time
start_time = time.time()
bestmodel_t = forward_selected(merged_t_train.drop('zipcode',1), 'response')

print '%s seconds to train model'%(time.time() - start_time)

69.0808401108 seconds to train model


In [114]:
bestmodel_t.summary()

0,1,2,3
Dep. Variable:,response,R-squared:,0.372
Model:,OLS,Adj. R-squared:,0.369
Method:,Least Squares,F-statistic:,121.5
Date:,"Tue, 08 Dec 2015",Prob (F-statistic):,0.0
Time:,21:21:22,Log-Likelihood:,2020.2
No. Observations:,9271,AIC:,-3948.0
Df Residuals:,9225,BIC:,-3620.0
Df Model:,45,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1053,0.019,5.431,0.000,0.067 0.143
A002,-0.0688,0.009,-7.814,0.000,-0.086 -0.052
A036,0.0493,0.004,13.665,0.000,0.042 0.056
A052,-0.0674,0.004,-17.275,0.000,-0.075 -0.060
median_income,0.0475,0.004,11.838,0.000,0.040 0.055
A049,-0.0785,0.008,-9.323,0.000,-0.095 -0.062
A035,0.0163,0.002,9.038,0.000,0.013 0.020
A025,0.0136,0.006,2.403,0.016,0.003 0.025
A055,-0.0600,0.014,-4.416,0.000,-0.087 -0.033

0,1,2,3
Omnibus:,907.412,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2983.088
Skew:,0.492,Prob(JB):,0.0
Kurtosis:,5.599,Cond. No.,1440000.0


## get features from forward selection to use in sklearn regressions

In [143]:
# house price delta response
best_features = bestmodel.params.index.values.tolist() + ['response', 'zipcode']
best_features_t = bestmodel_t.params.index.values.tolist() + ['response', 'zipcode']

# house/rental ratio delta response
best_features_ratio_delta = bestmodel_ratio_delta.params.index.values.tolist() + ['response', 'zipcode']
# house/rental ratio 2015 response
best_features_ratio_2015 = bestmodel_ratio_2015.params.index.values.tolist() + ['response', 'zipcode']

In [144]:
best_features

['Intercept',
 'A002',
 'A054',
 'A003',
 'median_income',
 'A050',
 'A052',
 'A024',
 'A051',
 'zipcode',
 'A036',
 'A035',
 'A031',
 'ind',
 'A032',
 'A044',
 'A029',
 'A040',
 'annual_payroll_1000',
 'n_establishments',
 'paid_employees',
 'A041',
 'A019',
 'A037',
 'A042',
 'A048',
 'A047',
 'A046',
 'A034',
 'A009',
 'A045',
 'A020',
 'A055',
 'A011',
 'A028',
 'A056',
 'A016',
 'A033',
 'A014',
 'A004',
 'first_quarter_payroll_1000',
 'A043',
 'jail',
 'A038',
 'A049',
 'A013',
 'A008',
 'A025',
 'A039',
 'prison',
 'response',
 'zipcode']

In [145]:
type(best_features)

list

In [146]:
len(merged_train.columns), len(best_features), len(best_features_t)

(66, 52, 48)

In [158]:
# house response
bad_features_t = [el for el in merged_t_train.columns if (el not in best_features_t)]
merged_train_t_best = merged_t_train.drop(bad_features_t + ['ind'], 1)
merged_test_t_best = merged_t_test.drop(bad_features_t + ['ind'], 1)

In [159]:
len(best_features_t), len(merged_t_train.columns)

(48, 66)

In [163]:
# house/rental ratio response
bad_features_ratio_delta = [el for el in ratio_delta_train.columns if (el not in best_features_ratio_delta)]
bad_features_ratio_2015 = [el for el in ratio_2015_train.columns if (el not in best_features_ratio_2015)]

ratio_delta_train_best = ratio_delta_train.drop(bad_features_ratio_delta + ['ind'], 1).dropna()
ratio_delta_test_best = ratio_delta_test.drop(bad_features_ratio_delta + ['ind'], 1).dropna()

ratio_2015_train_best = ratio_2015_train.drop(bad_features_ratio_2015 + ['ind'], 1).dropna()
ratio_2015_test_best = ratio_2015_test.drop(bad_features_ratio_2015 + ['ind'], 1).dropna()

In [164]:
ratio_delta_train_best.shape, ratio_delta_test_best.shape, ratio_2015_train_best.shape, ratio_2015_test_best.shape

((7686, 39), (1970, 39), (8208, 42), (2098, 42))

In [165]:
ratio_delta_train_best.shape, ratio_delta_test_best.shape, ratio_2015_train_best.shape, ratio_2015_test_best.shape

((7686, 39), (1970, 39), (8208, 42), (2098, 42))

In [166]:
merged_train_t_best.head(2)

Unnamed: 0,zipcode,response,prison,jail,A002,A003,A006,A008,A009,A010,A017,A019,A024,A025,A026,A028,A029,A030,A031,A033,A034,A035,A036,A037,A040,A041,A042,A043,A044,A045,A046,A047,A048,A049,A050,A051,A052,A053,A054,A055,A056,n_establishments,paid_employees,first_quarter_payroll_1000,annual_payroll_1000,median_income
0,79936,0.173843,0,0,1.043228,1.353093,0.251114,-1.274009,-1.290995,-0.924961,-0.918179,-1.239872,0.495572,1.047414,0.646026,-1.164903,-1.199738,-1.299125,0.008071,-0.119587,-0.020352,-0.003289,2.706301,1.256761,1.590272,1.826019,-1.362656,-1.087653,-1.200055,-1.212563,-1.354481,2.230742,-1.402987,0.860686,-1.67346,-0.729692,-0.376183,1.637257,1.481053,1.805892,1.064799,1.825619,1.803127,1.14239,1.422776,-0.054825
1,60629,0.085907,0,0,1.711295,1.035645,-0.574443,-1.426029,-1.325635,-0.813446,-1.332265,-1.239872,0.864351,1.175703,-0.479306,-1.314796,-1.266604,-0.929836,-1.99489,-0.105362,-0.396648,-0.132979,2.470766,0.909223,2.040004,1.796906,-0.938398,-0.758919,-0.507041,-0.825213,-0.862411,2.359287,-0.871903,0.290055,-0.022089,1.099626,0.240861,2.909382,3.165172,1.779076,1.573763,1.350804,1.178124,0.356887,0.430216,-0.457911


## Linear Regression with Lasso Regularization (normalized and transformed features)

In [167]:
from sklearn.linear_model import LassoCV
# HOUSE RESPONSE
start_time = time.time()

X_train = merged_train_t_best.drop(['zipcode','response'], 1).values
Y_train = merged_train_t_best['response'].values
X_test = merged_test_t_best.drop(['zipcode','response'], 1).values
Y_test = merged_test_t_best['response'].values

alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
alphas_temp = [0.001, 0.01]
clf_lasso_model = LassoCV(alphas = alphas).fit(X_train, Y_train)
best_alpha_lasso = clf_lasso_model.alpha_

print '----- %s seconds to run ----- \n---- best alpha = %s ----'%(time.time() - start_time, best_alpha_lasso)

----- 0.184986829758 seconds to run ----- 
---- best alpha = 0.001 ----


In [168]:
# calculate accuracy
start_time = time.time()

training_accuracy = clf_lasso_model.score(X_train, Y_train)
test_accuracy = clf_lasso_model.score(X_test, Y_test)
print '#### based on the best Linear Regression with Lasso Regularization ####'
print "R-squared on training data: %0.2f" % (training_accuracy)
print "R-squared on test data:     %0.2f" % (test_accuracy)
print '----- %s seconds to run -----'%(time.time() - start_time)

#### based on the best Linear Regression with Lasso Regularization ####
R-squared on training data: 0.36
R-squared on test data:     0.33
----- 0.00258708000183 seconds to run -----


In [169]:
# HOUSE/RENTAL RATIO DELTA RESPONSE
start_time = time.time()

X_train = ratio_delta_train_best.drop(['response','zipcode'], 1).values
Y_train = ratio_delta_train_best['response'].values
X_test = ratio_delta_test_best.drop(['response','zipcode'], 1).values
Y_test = ratio_delta_test_best['response'].values

alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
alphas_temp = [0.001, 0.01]
lasso_model_ratio_delta = LassoCV(alphas = alphas).fit(X_train, Y_train)
best_alpha_lasso = lasso_model_ratio_delta.alpha_

print '----- %s seconds to run ----- \n---- best alpha = %s ----'%(time.time() - start_time, best_alpha_lasso)

----- 0.0450799465179 seconds to run ----- 
---- best alpha = 0.001 ----


In [170]:
# calculate accuracy
start_time = time.time()

training_accuracy = lasso_model_ratio_delta.score(X_train, Y_train)
test_accuracy = lasso_model_ratio_delta.score(X_test, Y_test)
print '#### based on the best Linear Regression with Lasso Regularization ####'
print "R-squared on training data: %0.2f" % (training_accuracy)
print "R-squared on test data:     %0.2f" % (test_accuracy)
print '----- %s seconds to run -----'%(time.time() - start_time)

#### based on the best Linear Regression with Lasso Regularization ####
R-squared on training data: 0.26
R-squared on test data:     0.26
----- 0.00198602676392 seconds to run -----


In [171]:
# HOUSE/RENTAL RATIO 2015 RESPONSE
start_time = time.time()

X_train = ratio_2015_train_best.drop(['response','zipcode'], 1).values
Y_train = ratio_2015_train_best['response'].values
X_test = ratio_2015_test_best.drop(['response','zipcode'], 1).values
Y_test = ratio_2015_test_best['response'].values

alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
alphas_temp = [0.001, 0.01]
lasso_model_ratio_2015 = LassoCV(alphas = alphas).fit(X_train, Y_train)
best_alpha_lasso = lasso_model_ratio_2015.alpha_

print '----- %s seconds to run ----- \n---- best alpha = %s ----'%(time.time() - start_time, best_alpha_lasso)

----- 0.391252040863 seconds to run ----- 
---- best alpha = 0.1 ----


In [172]:
# calculate accuracy
start_time = time.time()

training_accuracy = lasso_model_ratio_2015.score(X_train, Y_train)
test_accuracy = lasso_model_ratio_2015.score(X_test, Y_test)
print '#### based on the best Linear Regression with Lasso Regularization ####'
print "R-squared on training data: %0.2f" % (training_accuracy)
print "R-squared on test data:     %0.2f" % (test_accuracy)
print '----- %s seconds to run -----'%(time.time() - start_time)

#### based on the best Linear Regression with Lasso Regularization ####
R-squared on training data: 0.63
R-squared on test data:     0.63
----- 0.00206398963928 seconds to run -----


## Linear Regression with Ridge Regularization (normalized and transformed features)

In [175]:
from sklearn.linear_model import RidgeCV

start_time = time.time()
X_train = merged_train_t_best.drop(['zipcode','response'], 1).values
Y_train = merged_train_t_best['response'].values
X_test = merged_test_t_best.drop(['zipcode','response'], 1).values
Y_test = merged_test_t_best['response'].values

alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
clf_ridge_model = RidgeCV(alphas = alphas).fit(X_train, Y_train)
best_alpha_ridge = clf_ridge_model.alpha_

print '----- %s seconds to run ----- \n---- best alpha = %s ----'%(time.time() - start_time, best_alpha_ridge)

----- 0.0724050998688 seconds to run ----- 
---- best alpha = 0.1 ----


In [176]:
training_accuracy = clf_ridge_model.score(X_train, Y_train)
test_accuracy = clf_ridge_model.score(X_test, Y_test)
print '#### based on the best Linear Regression with Ridge Regularization ####'
print "R-squared on training data: %0.2f" % (training_accuracy)
print "R-squared on test data:     %0.2f" % (test_accuracy)

#### based on the best Linear Regression with Ridge Regularization ####
R-squared on training data: 0.37
R-squared on test data:     0.33


In [177]:
# HOUSE/RENTAL RATIO DELTA RESPONSE
start_time = time.time()

X_train = ratio_delta_train_best.drop(['response','zipcode'], 1).values
Y_train = ratio_delta_train_best['response'].values
X_test = ratio_delta_test_best.drop(['response','zipcode'], 1).values
Y_test = ratio_delta_test_best['response'].values

alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
ridge_model_ratio_delta = RidgeCV(alphas = alphas).fit(X_train, Y_train)
best_alpha_ridge = ridge_model_ratio_delta.alpha_

print '----- %s seconds to run ----- \n---- best alpha = %s ----'%(time.time() - start_time, best_alpha_ridge)

----- 0.0404560565948 seconds to run ----- 
---- best alpha = 1.0 ----


In [178]:
training_accuracy = ridge_model_ratio_delta.score(X_train, Y_train)
test_accuracy = ridge_model_ratio_delta.score(X_test, Y_test)
print '#### based on the best Linear Regression with Ridge Regularization ####'
print "R-squared on training data: %0.2f" % (training_accuracy)
print "R-squared on test data:     %0.2f" % (test_accuracy)

#### based on the best Linear Regression with Ridge Regularization ####
R-squared on training data: 0.28
R-squared on test data:     0.26


In [179]:
# HOUSE/RENTAL RATIO 2015 RESPONSE
start_time = time.time()

X_train = ratio_2015_train_best.drop(['response','zipcode'], 1).values
Y_train = ratio_2015_train_best['response'].values
X_test = ratio_2015_test_best.drop(['response','zipcode'], 1).values
Y_test = ratio_2015_test_best['response'].values

alphas = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
ridge_model_ratio_2015 = RidgeCV(alphas = alphas).fit(X_train, Y_train)
best_alpha_ridge = ridge_model_ratio_2015.alpha_

print '----- %s seconds to run ----- \n---- best alpha = %s ----'%(time.time() - start_time, best_alpha_ridge)

----- 0.0534300804138 seconds to run ----- 
---- best alpha = 1.0 ----


In [180]:
training_accuracy = ridge_model_ratio_2015.score(X_train, Y_train)
test_accuracy = ridge_model_ratio_2015.score(X_test, Y_test)
print '#### based on the best Linear Regression with Ridge Regularization ####'
print "R-squared on training data: %0.2f" % (training_accuracy)
print "R-squared on test data:     %0.2f" % (test_accuracy)

#### based on the best Linear Regression with Ridge Regularization ####
R-squared on training data: 0.64
R-squared on test data:     0.63


In [None]:
from sklearn.svm import LinearSVC

clfsvm=LinearSVC(loss="hinge")
Cs=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
Xmatrix=dftouse[lcols].values
Yresp=dftouse['RESP'].values

Xmatrix_train=Xmatrix[mask]
Xmatrix_test=Xmatrix[~mask]
Yresp_train=Yresp[mask]
Yresp_test=Yresp[~mask]

#your code here
from sklearn.grid_search import GridSearchCV
import time

start_time = time.time()

n_folds = 5
n_jobs = 4
gs = GridSearchCV(clfsvm, param_grid={"C": Cs}, cv=n_folds, n_jobs=n_jobs)
gs.fit(Xmatrix_train, Yresp_train)

C_best = gs.best_params_['C']
print 'The best c-value is %s'%(C_best)

print '----- %s seconds to run -----'%(time.time() - start_time)

In [None]:
##### In case we want to do SVM regression/other types of models later

In [None]:
#calculate the accuracy here
#your code here
start_time = time.time()

best = LinearSVC(C=C_best, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
      penalty='l2', random_state=None, tol=0.0001, verbose=0)
best.fit(Xmatrix_train, Yresp_train)

training_accuracy = best.score(Xmatrix_train, Yresp_train)
test_accuracy = best.score(Xmatrix_test, Yresp_test)
print '#### based on the best LinearSVC ####'
print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)
print '----- %s seconds to run -----'%(time.time() - start_time)