In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
# Display configuration: widen pandas' text output and raise the column cap so
# that wide frames are shown rather than truncated.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Use the HTML table representation for DataFrames in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Global seaborn look: white background with grid lines, "poster" scaling.
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
# with help from http://planspace.org/20150423-forward_selection_with_statsmodels/
import statsmodels.formula.api as smf

def _formula_term(name):
    """Return `name` spelled so that it can be used inside a patsy formula.

    Plain Python identifiers are returned unchanged, so formulas (and the
    coefficient labels of the fitted model) look exactly as before.  Any other
    name -- e.g. a column called '2000_agg', or a Python keyword -- is wrapped
    in patsy's ``Q(...)`` quoting helper so the column can still be referenced.
    """
    import keyword
    import re

    if re.match(r'[A-Za-z_][A-Za-z0-9_]*\Z', name) and not keyword.iskeyword(name):
        return name
    return 'Q({!r})'.format(name)


def forward_selected(data, response):
    """Linear model designed by forward selection.

    Starting from an intercept-only model, repeatedly add the single remaining
    column that yields the highest adjusted R-squared, and stop as soon as no
    remaining column strictly improves on the current score.  Every column of
    `data` other than `response` is treated as a candidate predictor, and one
    OLS model is fitted per remaining candidate in every round.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared

    Raises:
    -------
    KeyError: if `response` is not a column of `data`
    """
    if response not in data.columns:
        raise KeyError(
            "response column {!r} not found in data.columns".format(response))

    remaining = set(data.columns)
    remaining.remove(response)
    lhs = _formula_term(response)

    def build_formula(predictors):
        # The explicit trailing "1" keeps the intercept; with no predictors
        # this degrades to the intercept-only model "<response> ~ 1".
        rhs = ' + '.join([_formula_term(p) for p in predictors] + ['1'])
        return "{} ~ {}".format(lhs, rhs)

    selected = []
    current_score = 0.0  # adjusted R-squared of the intercept-only model
    while remaining:
        scored = []
        for candidate in remaining:
            formula = build_formula(selected + [candidate])
            score = smf.ols(formula, data).fit().rsquared_adj
            # NaN compares unequal to itself; such scores cannot be ranked.
            if score == score:
                scored.append((score, candidate))
        if not scored:
            break
        # Ties on score are broken by the candidate name (tuple comparison).
        best_new_score, best_candidate = max(scored)
        # Stop on "no strict improvement".  An exact tie used to leave both
        # `remaining` and `selected` untouched and repeat the round forever.
        if best_new_score <= current_score:
            break
        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    return smf.ols(build_formula(selected), data).fit()

In [7]:
# response=pd.read_pickle('data/ZipcodeMVPSnew.pkl')[['RegionName','pct_delta']]
# response['scaled_pct'] = response.pct_delta - 1
# response=response.rename(columns={'RegionName': 'zipcode'})

### Superseded by the next cell, which divides the 2010 values by 1.26 to adjust for inflation before computing the change.

In [17]:
# Build the per-zipcode response table from the 2000 and 2010 aggregates.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
# Presumably the cumulative 2000 -> 2010 inflation factor -- TODO confirm source.
INFLATION_FACTOR = 1.26

response = pd.read_pickle('data/ZipcodeMVPSnew.pkl')[['RegionName', '2000_agg', '2010_agg']]
response = response.rename(columns={'RegionName': 'zipcode'})
# Express the 2010 aggregate in (approximately) year-2000 terms.
response['2010_agg_adj'] = response['2010_agg'] / INFLATION_FACTOR
# NOTE(review): this is the adjusted 2010/2000 ratio divided by 100 (e.g.
# 0.0117 for a ratio of 1.17), not a percentage change -- confirm the
# intended scaling.
response['pct_delta'] = (response['2010_agg_adj'] / response['2000_agg']) / 100
# Pass the axis by keyword: the positional form was deprecated and later
# removed from pandas.
response = response.drop('2010_agg', axis=1)

In [18]:
# Preview the first three rows of the response table.
response.head(3)

Unnamed: 0,zipcode,2000_agg,2010_agg_adj,pct_delta
1,79936,55.666667,65.343915,0.011738
2,60629,98.666667,107.142857,0.010859
3,90650,133.333333,190.21164,0.014266


In [18]:
# Load the per-zipcode feature table and show its size and first rows.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
features = pd.read_pickle('data/all_features_norm_not_transformed.pkl')
# Function-call form of print: same output on Python 2, and valid on Python 3
# (the bare `print x` statement is a syntax error there).
print(features.shape)
features.head(10)

(27653, 64)


Unnamed: 0,A001,A002,A003,A004,A005,A006,A007,A008,A009,A010,A011,A012,A013,A014,A015,A016,A017,A018,A019,A020,A021,A022,A023,A024,A025,A026,A027,A028,A029,A030,A031,A032,A033,A034,A035,A036,A037,A038,A039,A040,A041,A042,A043,A044,A045,A046,A047,A048,A049,A050,A051,A052,A053,A054,A055,A056,n_establishments,paid_employees,first_quarter_payroll_1000,annual_payroll_1000,prison,jail,median_income,zipcode
0,0.403822,-0.839925,-0.5772,-0.168204,-0.11491,0.097627,-0.02271,0.164482,0.297415,1.855374,-0.654254,-0.783044,-0.641046,-0.195054,-0.049217,-0.050336,-0.098035,-0.050738,-0.022365,1.082934,0.719438,-0.767238,-0.457822,-0.048472,-0.167051,0.28846,0.114242,0.384865,0.639734,2.118527,0.535081,-0.394929,-0.177156,-0.011208,-0.13138,-0.366799,-0.888653,-0.313466,-0.455594,-0.169235,-0.295995,1.116199,0.259922,0.434214,1.593216,1.397376,-0.74305,0.584263,0.759381,-0.798737,-0.134993,-0.549322,-0.651977,-0.223378,0.495932,-1.124265,0.463735,0.737979,0.231006,0.318245,-0.172158,-0.323992,0.332304,1001
1,1.26186,-1.594272,1.084305,3.638466,-1.346952,-2.015987,-1.228271,-0.871073,-0.804575,-0.223704,-0.396719,-1.489846,0.510741,2.836251,-1.107246,-1.880773,-1.383069,-0.897923,-0.802669,-0.377333,0.461899,-1.503218,1.457233,3.95887,-1.254782,-1.507307,-0.860163,-0.72209,-0.705649,-0.149133,-0.22858,-0.192467,-0.162902,1.520559,-0.13138,-0.062326,-1.609325,-1.104775,-0.981585,-0.116915,0.173738,0.114255,-0.327087,-0.496995,0.519232,0.092856,-0.83947,-0.632176,0.7526,-0.791203,-0.255063,-0.549322,-0.163923,-0.324466,0.984798,-0.027929,0.766968,1.62078,0.31638,0.397244,-0.172158,-0.323992,0.10992,1002
2,-0.043154,-3.538166,8.003111,6.934284,-3.589901,-4.975048,-4.270877,-2.815896,-2.216499,-1.486001,0.311503,-3.392777,6.71267,5.621108,-2.742381,-4.054417,-3.798934,-2.458525,-1.973126,-1.322211,-0.246334,-3.371474,7.949734,7.254629,-3.622197,-4.355765,-4.054045,-2.835367,-2.179163,-1.396346,-0.272079,-0.122871,-0.19141,1.766398,-0.13138,-0.247657,-2.48125,-2.529132,-2.121233,-1.878365,-1.705193,-0.706614,1.535146,-1.686872,-2.722984,-2.143463,-1.235864,-3.293888,1.064512,-1.137736,-0.975481,-1.112115,1.602369,1.267669,-0.808171,1.557843,-0.61001,-0.496545,-0.289793,-0.339523,-0.172158,-0.323992,-2.281652,1003
3,-0.412991,-0.317685,0.836319,-0.332995,-0.272864,0.661258,0.551366,-0.31541,-0.666826,0.073308,0.161274,-0.293719,1.130934,-0.268988,-0.385863,0.407274,0.46738,-0.00615,-0.477543,-0.205536,-0.096103,-0.257713,0.429643,-0.348087,-0.103067,0.721921,0.547311,-0.571142,-0.83378,0.191016,0.66558,-0.432891,-0.177156,-0.313779,-0.13138,-0.466084,0.196804,0.16132,0.245728,-0.134355,-0.061129,-0.22375,-0.144912,0.175545,-0.230531,0.092856,0.296146,-0.174505,0.603425,-0.625471,-0.455179,0.254667,0.231169,0.231517,-0.45607,-0.439055,-0.411169,-0.392766,-0.256765,-0.299852,-0.172158,-0.323992,0.979231,1005
4,0.255692,0.175542,0.265952,-0.431869,0.011454,1.295342,0.752293,-0.365925,-0.7357,-0.557841,-0.24649,0.195607,0.244944,-0.342922,-0.193494,0.693279,0.570183,-0.362859,-0.802669,-0.463231,0.311668,0.138583,0.242808,-0.460442,0.344822,1.464998,0.817979,-0.369877,-0.641583,-0.545973,0.544747,-0.420237,-0.19141,-0.011208,-0.005316,-0.413132,0.321364,-0.682743,-0.455594,-0.099475,0.203096,-0.501397,-0.650953,-0.496995,-0.169739,-0.341983,0.542554,-0.836923,0.779723,-0.821337,-0.655295,-0.227727,0.25441,0.155701,0.199262,-0.47821,-0.058225,-0.281321,-0.220885,-0.250865,-0.172158,-0.323992,1.037927,1007
5,-0.679626,-0.723872,-0.40361,-0.332995,-0.462409,0.731712,1.039331,1.124264,-0.425766,-0.409335,0.397348,-0.674305,-0.020853,-0.293633,-0.289678,0.064067,1.238401,1.242332,-0.412517,0.138056,-0.33218,-0.710624,-0.738074,-0.310635,-0.550956,1.217306,0.655578,0.83771,-0.449385,-0.659356,0.767079,-0.489833,-0.177156,-0.3516,-0.13138,-0.499179,0.437028,-0.629989,-1.244581,-0.762198,-0.883161,-0.59797,0.178955,0.072077,-1.182932,-0.838943,-0.164528,-0.234725,0.101655,-0.068005,-0.975481,0.254667,-0.001238,-0.24865,-0.707842,-0.439055,-0.597583,-0.468889,-0.293891,-0.34562,-0.172158,-0.323992,1.172509,1008
6,-0.716099,-0.491765,-0.081228,0.210815,0.832815,0.555577,-0.108822,-0.11335,-1.252258,0.481698,0.483193,-0.13061,0.510741,0.174617,0.960719,-0.050336,-0.252239,0.216793,-0.997746,0.138056,-0.418026,-0.823852,-0.691365,0.21369,0.344822,0.969614,0.005975,-0.369877,-1.282241,0.587857,0.655913,-0.47718,-0.19141,-0.33269,-0.13138,-0.452846,-0.559457,0.003058,-0.455594,0.77253,0.907695,0.52469,0.644513,-0.03139,0.235538,0.030737,-0.282375,-0.620132,0.54918,-0.565204,-0.375132,-0.870918,-0.373089,-0.172834,-0.750149,-0.223703,-0.612496,-0.501747,-0.290904,-0.341219,-0.172158,-0.323992,-0.231702,1009
7,-0.515704,-0.636832,0.067563,-0.662576,-0.241273,1.506704,0.981924,0.214996,-0.322454,-0.26083,-0.053338,-0.674305,0.156345,-0.490791,-0.241586,0.979285,0.878591,0.261382,-0.282466,-0.377333,0.118513,-0.484169,-0.037444,-0.760056,-0.167051,1.588844,0.980379,0.133284,-0.321254,-0.149133,0.689746,-0.458199,-0.162902,-0.313779,-0.005316,-0.439608,0.187907,-0.36622,-0.017268,-0.779639,-0.795086,-0.259965,-0.387812,-0.134858,-0.048156,0.248156,-0.068108,-0.078154,0.345759,-0.339204,-0.815388,0.174268,0.045243,-0.046474,-0.533087,-1.261307,-0.463365,-0.473681,-0.280465,-0.325452,-0.172158,-0.323992,1.164463,1010
8,-0.672149,-0.781899,0.067563,-0.514265,-0.494,1.260115,1.355073,0.012937,-0.150268,-0.149451,0.354426,-0.946152,0.244944,-0.219699,-0.241586,1.093687,1.032795,0.216793,0.107686,-0.205536,-0.289257,-0.484169,-0.17757,-0.797508,-0.742909,1.031537,1.467582,-0.218929,-0.385319,-0.092441,0.757412,-0.508814,-0.205663,-0.257047,-0.13138,-0.525655,0.009963,0.68886,0.508723,-0.256436,-0.295995,0.005611,0.23968,-0.03139,-0.230531,-0.714703,-0.335942,-0.343121,0.047409,-0.007738,-0.655295,0.495864,-0.140682,-0.097018,-0.695236,-0.32159,-0.607525,-0.513111,-0.297643,-0.348746,-0.172158,-0.323992,0.171474,1011
9,-0.721689,-1.188085,-0.155624,-0.431869,-0.588772,0.449896,1.814335,0.720145,-0.563514,0.184687,0.139813,-1.272369,0.067746,-0.12112,-0.722508,-0.336341,2.060823,0.618091,-0.477543,0.653444,-0.074641,-0.993693,-0.411113,-0.797508,-0.167051,1.217306,1.251047,0.737078,-0.513451,-0.092441,0.718746,-0.489833,-0.177156,-0.257047,-0.13138,-0.485941,-0.221364,0.847121,2.525023,-0.395956,-0.677653,0.126327,0.199197,-0.755664,-0.007628,-0.838943,-0.389509,-0.656263,-0.122108,0.180594,-0.735341,-0.870918,-0.39633,-0.400282,-0.749804,0.207001,-0.617467,-0.508045,-0.296255,-0.347133,-0.172158,-0.323992,-0.091184,1012


In [21]:
# Join the response table to the features on zipcode, cast every column to
# float, and expose the modelling target under the name 'response'.
# NOTE(review): 'scaled_pct' is only created by the commented-out response
# cell; the active response cell produces 'pct_delta' but no 'scaled_pct', so
# on a fresh run this rename appears to be a no-op and no 'response' column
# would exist -- confirm which column is meant to be the target.
merged = (
    pd.merge(response, features, on=['zipcode'])
    .astype('float')
    .rename(columns={'scaled_pct': 'response'})
)
merged.head(10)

Unnamed: 0,zipcode,pct_delta,response,A001,A002,A003,A004,A005,A006,A007,A008,A009,A010,A011,A012,A013,A014,A015,A016,A017,A018,A019,A020,A021,A022,A023,A024,A025,A026,A027,A028,A029,A030,A031,A032,A033,A034,A035,A036,A037,A038,A039,A040,A041,A042,A043,A044,A045,A046,A047,A048,A049,A050,A051,A052,A053,A054,A055,A056,n_establishments,paid_employees,first_quarter_payroll_1000,annual_payroll_1000,prison,jail,median_income
0,79936,1.479042,0.479042,6.994015,1.045942,1.381888,0.375606,0.611679,0.238535,-0.711602,-1.275192,-1.286695,-0.929105,-0.41818,0.956779,1.263832,0.223907,0.143151,-0.221939,-0.920457,-1.210043,-1.257847,-0.892721,0.48336,1.044405,1.270398,0.513305,1.048648,0.598075,-0.372961,-1.174935,-1.154109,-0.886122,0.003418,-0.350641,-0.120142,-0.219225,-0.005316,5.126962,1.255569,0.477844,0.421058,1.609655,1.84716,-1.358482,-1.076028,-1.169534,-1.203196,-1.366963,2.235265,-1.402988,0.861091,-0.911737,-0.575249,-0.388525,1.648851,1.469845,5.325187,1.068408,3.563179,4.055737,1.079163,1.354307,-0.172158,-0.323992,-0.054208
1,60629,1.368243,0.368243,7.191755,1.713249,1.059507,0.705188,1.180314,-0.571684,-1.37179,-1.426737,-1.321132,-0.817726,-0.096261,1.609212,0.865137,0.494999,0.912627,-0.507945,-1.331668,-1.29922,-1.257847,-0.720925,0.161436,1.667157,1.130272,0.887823,1.176617,-0.454616,-1.184964,-1.325883,-1.218175,-0.716048,-2.007232,0.946383,-0.105888,-0.313779,-0.13138,3.862735,0.908579,2.060462,1.473041,2.063098,1.817802,-0.935975,-0.752161,-0.496995,-0.818182,-0.870003,2.363825,-0.873054,0.291514,-0.278938,0.745518,0.254667,2.927088,3.137795,5.06219,1.577421,1.283957,0.677465,0.336771,0.409415,-0.172158,-0.323992,-0.454877
2,90650,1.7975,0.7975,6.60713,0.581728,0.836319,0.573355,0.832815,-0.184188,-0.941233,-1.022618,-0.804575,-0.483588,-0.053338,0.630562,0.820837,0.44571,0.575981,-0.164738,-0.971858,-0.9871,-0.867695,-0.463231,0.118513,0.53488,0.709894,0.62566,0.92068,-0.145001,-0.751896,-0.873038,-0.641583,-0.43259,-1.601235,-0.230429,-0.048874,1.860952,0.372878,4.054685,1.362335,2.060462,1.034714,1.365494,0.584754,-1.599914,-1.440378,-0.807397,-1.243723,-0.807883,2.020998,0.078417,0.833968,-0.881603,-0.455179,-0.388525,3.066532,2.783988,4.040425,2.282208,1.892909,1.825187,0.682492,0.808127,-0.172158,-0.323992,0.360311
3,90201,1.753846,0.753846,6.308773,1.742262,1.406687,0.754625,1.148723,-0.39555,-1.630125,-1.553024,-1.390007,-1.003358,0.032507,1.663582,1.219533,0.544289,0.864535,-0.27914,-1.485872,-1.432986,-1.322873,-0.892721,0.032667,1.667157,1.363815,0.887823,1.176617,-0.392693,-1.455632,-1.476832,-1.282241,-0.942814,-1.528736,-0.445545,-0.063127,-0.294868,-0.005316,5.696195,1.744914,3.643081,3.401675,2.010777,2.19946,-2.070707,-1.622553,-1.1178,-1.851639,-1.429083,3.456588,-1.150065,0.833968,-0.881603,-0.134993,-0.790519,3.833475,3.289427,3.497682,3.241502,1.460428,1.325463,0.438963,0.529263,-0.172158,3.086496,-0.589723
4,90280,1.704663,0.704663,5.827838,1.307062,1.158701,0.738146,1.02236,-0.360323,-1.314383,-1.249935,-1.217821,-0.891979,-0.160645,1.228626,0.909436,0.494999,0.720258,-0.27914,-1.383069,-1.210043,-1.192822,-0.806823,0.225821,1.27086,1.223689,0.925275,1.112633,-0.33077,-1.076697,-1.174935,-1.090044,-0.829431,-1.548069,-0.451872,-0.077381,-0.257047,-0.005316,5.689576,1.66484,2.588002,2.262027,1.644535,1.436144,-1.937919,-1.602311,-1.014333,-1.628736,-1.211663,2.974487,-0.656263,0.813626,-0.859003,-0.21504,-0.549322,3.577828,3.137795,3.362989,2.693334,1.671698,1.559717,0.730275,0.94919,-0.172158,-0.323992,-0.435488
5,60618,1.607803,0.607803,5.666292,0.668768,-0.75079,1.199561,2.854628,-0.113734,-1.37179,-1.224678,-1.080072,-0.780599,0.053968,0.521823,-0.729645,0.840026,2.307301,0.121268,-1.228865,-1.165454,-1.127796,-0.720925,0.011205,0.704722,-0.644656,1.412148,2.712237,-0.33077,-1.293232,-1.124619,-0.897846,-0.716048,-0.808575,-0.287371,-0.105888,0.593935,-0.13138,2.485985,-0.924242,0.741613,-0.017268,0.266767,0.115021,0.403974,0.320647,-0.755664,0.357121,-0.870003,0.017599,-1.3789,0.35254,-0.346738,0.625448,-0.066929,0.184688,0.863317,6.020063,0.167846,3.933522,1.904459,1.004821,1.248194,-0.172158,-0.323992,0.245821
6,91331,1.819527,0.819527,6.477166,1.249035,1.059507,0.804063,0.990769,-0.289869,-1.343087,-1.325707,-1.114509,-0.743473,0.139813,1.174257,0.953736,0.642868,0.864535,-0.164738,-1.280267,-1.254632,-1.127796,-0.720925,-0.074641,1.214246,1.036855,0.850371,0.856696,-0.33077,-1.184964,-1.225251,-0.961912,-0.716048,-1.8429,-0.300025,-0.077381,0.423738,-0.005316,5.226246,1.647045,2.95728,2.086697,1.592215,1.083845,-1.937919,-1.602311,-0.962599,-1.649,-1.149543,3.060194,-0.162462,0.691574,-0.723404,-0.455179,-0.147328,4.856065,4.123403,3.297369,3.574319,1.425631,1.007556,0.444411,0.55907,-0.172158,-0.323992,-0.090261
7,8701,1.990385,0.990385,5.719326,5.165836,0.191556,0.919416,0.043045,-2.755752,-2.749574,-1.451995,-0.425766,0.184687,-0.031877,4.925749,0.112046,0.667513,0.287428,-2.166779,-2.462498,-1.477575,-0.672619,-0.03374,0.097052,4.894145,0.242808,1.075082,-0.231035,-2.498075,-2.538304,-1.275567,-0.193122,0.247708,0.085584,-0.10389,-0.162902,-0.257047,-0.13138,0.559859,0.321364,-0.524482,-0.630924,-0.291316,-0.38407,-0.15132,-1.055786,0.123811,0.82319,1.863276,1.51747,0.656527,0.535618,-0.550137,-0.09497,0.174268,2.834126,3.769595,3.73892,3.437277,3.990689,3.253988,1.438445,1.818302,-0.172158,-0.323992,-0.521267
8,92335,1.594937,0.594937,5.897781,1.887329,1.53068,0.787583,0.674861,-0.430776,-1.601421,-1.603539,-1.321132,-0.966231,0.032507,1.82669,1.352431,0.618223,0.431705,-0.393543,-1.434471,-1.432986,-1.257847,-0.892721,0.032667,1.893612,1.410524,0.925275,0.856696,-0.392693,-1.509766,-1.527148,-1.218175,-0.886122,-1.639901,-0.205121,-0.03462,-0.105761,0.372878,4.822488,1.406821,2.69351,2.437358,1.470134,1.582936,-1.636129,-1.359411,-0.859131,-1.38557,-0.994243,3.124474,-0.873054,0.65089,-0.678204,0.265239,-0.066929,3.879956,3.592691,3.37784,2.967418,2.432267,2.84791,1.644998,2.045278,-0.172158,-0.323992,-0.306709
9,90011,1.681818,0.681818,6.49135,2.177462,1.456284,0.935895,1.306678,-0.571684,-1.945867,-1.780341,-1.527755,-1.114737,0.204197,2.044168,1.308132,0.790736,1.056903,-0.393543,-1.79428,-1.611341,-1.452923,-0.978619,-0.139026,2.120067,1.363815,0.962726,1.240601,-0.578462,-1.780434,-1.728412,-1.410373,-1.056197,-2.355229,0.085919,-0.03462,-0.294868,-0.13138,5.33215,1.602559,3.854097,3.577006,2.342139,2.551759,-1.913776,-1.521344,-0.910865,-1.689528,-1.429083,3.735136,-1.258461,0.664451,-0.69327,0.105146,-0.388525,5.01875,4.426666,3.257134,3.985445,1.102514,0.501261,0.058735,0.08716,-0.172158,-0.323992,-0.957461


In [None]:
#minimerge = merged[0:50]
# (Disabled above: a 50-row slice of `merged`, presumably for quick trial runs.)
# Fit the forward-selected OLS model; every column of `merged` except
# 'response' is offered as a candidate predictor.
# NOTE(review): the displayed `merged` frame also contains 'zipcode' and
# 'pct_delta'; confirm these are meant to be candidate predictors, since
# 'pct_delta' looks directly related to the response.
bestmodel = forward_selected(merged, 'response')

In [11]:
# Show the regression summary table of the selected model.
bestmodel.summary()

0,1,2,3
Dep. Variable:,response,R-squared:,0.356
Model:,OLS,Adj. R-squared:,0.354
Method:,Least Squares,F-statistic:,159.7
Date:,"Sun, 06 Dec 2015",Prob (F-statistic):,0.0
Time:,22:50:30,Log-Likelihood:,-177.5
No. Observations:,11602,AIC:,437.0
Df Residuals:,11561,BIC:,738.7
Df Model:,40,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.4691,0.004,130.389,0.000,0.462 0.476
A002,-0.0629,0.010,-6.366,0.000,-0.082 -0.044
A054,0.0174,0.012,1.447,0.148,-0.006 0.041
A003,-0.0822,0.009,-9.125,0.000,-0.100 -0.065
median_income,0.0769,0.004,17.849,0.000,0.068 0.085
A050,0.1111,0.005,24.278,0.000,0.102 0.120
A052,-0.0779,0.004,-19.507,0.000,-0.086 -0.070
A024,-0.1281,0.087,-1.479,0.139,-0.298 0.042
A051,-0.0577,0.004,-13.904,0.000,-0.066 -0.050

0,1,2,3
Omnibus:,954.193,Durbin-Watson:,1.976
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3004.072
Skew:,0.418,Prob(JB):,0.0
Kurtosis:,5.349,Cond. No.,355.0
