In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
# with help from http://planspace.org/20150423-forward_selection_with_statsmodels/
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < (best_new_score * 1):
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response, ' + '.join(selected))
    
    model = smf.ols(formula, data).fit()
    return model

In [3]:
importedResponse=pd.read_pickle('data/ZipcodeMVPSnew.pkl')
response=importedResponse[['RegionName','City','State','2000_agg','2010_agg','pct_delta']]
response['scaled_pct'] = response.pct_delta - 1
response=response.rename(columns={'RegionName': 'Zip_Code'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [4]:
response.head(10)

Unnamed: 0,Zip_Code,City,State,2000_agg,2010_agg,pct_delta,scaled_pct
1,79936,El Paso,TX,55.666667,82.333333,1.479042,0.479042
2,60629,Chicago,IL,98.666667,135.0,1.368243,0.368243
3,90650,Norwalk,CA,133.333333,239.666667,1.7975,0.7975
7,90201,Bell,CA,130.0,228.0,1.753846,0.753846
8,90280,South Gate,CA,128.666667,219.333333,1.704663,0.704663
11,60618,Chicago,IL,162.333333,261.0,1.607803,0.607803
13,91331,Los Angeles,CA,112.666667,205.0,1.819527,0.819527
14,8701,Lakewood Township,NJ,69.333333,138.0,1.990385,0.990385
15,92335,Fontana,CA,79.0,126.0,1.594937,0.594937
18,90011,Los Angeles,CA,110.0,185.0,1.681818,0.681818


In [5]:
cdd=pd.read_pickle('data/cleaned_demographic_features.pkl')
cdd=cdd.rename(columns={'GEO.id2' : 'Zip_Code'})
print cdd.shape
cdd.head(10)

(33120, 59)


Unnamed: 0,GEO.id,Zip_Code,GEO.display-label,A001,A002,A003,A004,A005,A006,A007,A008,A009,A010,A011,A012,A013,A014,A015,A016,A017,A018,A019,A020,A021,A022,A023,A024,A025,A026,A027,A028,A029,A030,A031,A032,A033,A034,A035,A036,A037,A038,A039,A040,A041,A042,A043,A044,A045,A046,A047,A048,A049,A050,A051,A052,A053,A054,A055,A056
1,8600000US00601,601,ZCTA5 00601,18570,12.8,16.2,13.9,11.7,13.3,13.1,10.2,5.7,3.1,48.9,6.6,8.5,6.9,5.4,6.3,6.3,4.9,2.7,1.3,51.1,6.3,7.7,7.0,6.3,6.9,6.8,5.3,2.9,1.8,93.1,3.1,0.4,0.0,0.0,99.5,77.8,5.1,2.6,20.3,9.6,20.3,10.2,3.3,10.1,5.5,39.5,29.0,84.3,15.7,2.9,0.5,2.84,3.27,7744,2.87
2,8600000US00602,602,ZCTA5 00602,41520,11.5,15.2,13.5,12.9,14.6,14.0,10.1,5.7,2.3,49.1,5.9,7.7,6.7,6.4,7.0,6.8,4.9,2.6,1.1,50.9,5.6,7.5,6.8,6.5,7.6,7.2,5.2,3.1,1.3,86.7,5.3,0.3,0.1,0.0,99.4,77.6,5.1,2.1,17.4,8.0,20.5,10.5,3.0,10.0,4.9,37.6,25.9,83.0,17.0,2.6,1.3,2.76,3.2,18073,2.62
3,8600000US00603,603,ZCTA5 00603,54689,12.2,14.2,13.0,13.0,12.7,12.6,12.0,7.0,3.3,48.6,6.4,7.3,6.5,6.4,6.0,5.8,5.6,3.2,1.4,51.4,5.8,7.0,6.5,6.6,6.6,6.7,6.4,3.7,2.1,82.9,7.6,0.3,0.2,0.0,98.5,71.1,4.7,1.7,22.0,10.9,26.4,12.7,4.1,13.8,6.8,34.0,31.1,82.5,17.5,3.6,1.5,2.53,3.06,25653,2.51
4,8600000US00606,606,ZCTA5 00606,6615,12.8,15.1,14.7,12.1,13.1,13.4,10.5,5.5,2.9,49.4,6.6,7.5,7.1,6.1,6.5,6.3,5.2,2.6,1.4,50.6,6.2,7.5,7.6,6.0,6.6,7.0,5.4,2.9,1.5,88.9,4.7,0.4,0.0,0.1,99.4,74.6,6.2,2.7,18.4,8.1,22.8,12.7,4.5,10.1,4.7,37.5,28.4,83.6,16.4,1.7,0.8,2.75,3.24,2877,2.83
5,8600000US00610,610,ZCTA5 00610,29016,11.8,15.1,12.6,13.4,13.9,12.6,11.2,6.7,2.8,48.6,6.2,7.7,6.4,6.6,6.5,5.9,5.1,3.2,1.1,51.4,5.6,7.5,6.2,6.9,7.4,6.7,6.1,3.5,1.7,82.0,7.2,0.3,0.1,0.0,99.2,74.4,4.9,1.9,19.9,9.4,22.8,11.0,3.4,11.9,5.8,36.5,29.4,85.9,14.1,1.9,0.8,2.67,3.14,12618,2.66
6,8600000US00612,612,ZCTA5 00612,67010,11.4,14.3,12.5,12.6,13.1,12.2,11.8,7.7,4.2,47.4,6.0,7.3,6.2,6.0,6.2,5.5,5.2,3.4,1.7,52.6,5.5,7.0,6.4,6.7,6.9,6.6,6.7,4.4,2.5,84.3,6.2,0.4,0.1,0.0,99.1,71.7,4.8,1.8,22.4,10.2,25.6,10.7,3.8,14.8,7.8,33.8,33.7,84.0,16.0,3.1,2.2,2.55,3.05,30992,2.54
7,8600000US00616,616,ZCTA5 00616,11017,11.8,14.9,11.8,13.3,14.5,11.4,11.9,6.6,3.7,48.3,6.1,7.5,5.9,6.1,6.9,5.2,5.5,3.2,1.6,51.7,5.7,7.3,5.9,7.2,7.6,6.2,6.4,3.4,2.1,84.8,7.1,0.3,0.0,0.0,99.5,73.3,4.4,1.5,20.5,9.1,24.3,12.0,5.0,12.2,6.2,36.0,31.6,84.6,15.4,2.3,1.0,2.65,3.16,4896,2.53
8,8600000US00617,617,ZCTA5 00617,24597,13.4,15.1,13.0,15.0,12.7,10.9,10.6,6.1,3.2,48.1,6.8,7.8,6.3,7.0,5.9,5.1,5.0,2.8,1.3,51.9,6.5,7.4,6.7,8.0,6.9,5.7,5.5,3.3,1.8,80.6,7.7,0.3,0.1,0.0,99.4,74.7,5.2,2.2,23.6,11.5,22.8,11.4,3.9,11.4,5.5,38.9,29.0,85.7,14.3,1.9,2.5,2.71,3.18,10594,2.8
9,8600000US00622,622,ZCTA5 00622,7853,9.6,13.0,11.0,11.1,12.5,13.1,14.6,9.7,5.3,48.2,4.9,6.5,5.2,5.3,6.4,6.1,7.1,4.6,2.1,51.8,4.7,6.5,5.8,5.8,6.2,7.1,7.6,5.0,3.2,85.8,5.0,0.2,0.0,0.0,98.2,69.1,5.5,2.6,15.5,5.9,26.9,13.4,4.2,13.4,7.3,28.9,37.8,36.6,63.4,3.7,4.9,2.45,2.97,8714,2.51
10,8600000US00623,623,ZCTA5 00623,43061,12.3,14.4,11.3,14.1,13.3,11.3,11.7,7.5,4.0,47.8,6.4,7.3,5.6,6.6,6.4,5.2,5.4,3.5,1.6,52.2,6.1,7.1,5.7,7.6,6.9,6.1,6.3,4.0,2.3,83.8,5.5,0.3,0.1,0.0,99.1,71.6,4.9,2.2,19.3,9.4,25.0,11.5,4.0,13.5,7.3,35.7,32.6,77.6,22.4,1.8,2.7,2.58,3.08,21426,2.61


In [6]:
merged=pd.merge(response, cdd, on=['Zip_Code'])
merged=merged.drop(['Zip_Code','City','State','2000_agg','2010_agg','GEO.id','GEO.display-label','pct_delta'], 1)
merged=merged.astype('float')
merged=merged.rename(columns={'scaled_pct' : 'response'})
merged.head(10)

Unnamed: 0,response,A001,A002,A003,A004,A005,A006,A007,A008,A009,A010,A011,A012,A013,A014,A015,A016,A017,A018,A019,A020,A021,A022,A023,A024,A025,A026,A027,A028,A029,A030,A031,A032,A033,A034,A035,A036,A037,A038,A039,A040,A041,A042,A043,A044,A045,A046,A047,A048,A049,A050,A051,A052,A053,A054,A055,A056
0,0.479042,111086,15.8,19.0,14.0,13.6,14.8,12.5,6.1,2.7,1.5,47.9,8.0,9.8,7.0,6.2,6.7,5.7,2.8,1.1,0.5,52.1,7.8,9.2,7.0,7.4,8.0,6.8,3.3,1.6,0.9,82.6,2.5,0.6,1.0,0.1,86.3,82.0,5.7,2.9,20.0,12.1,14.6,7.1,1.0,7.5,2.5,52.4,15.7,97.0,3.0,1.0,0.9,3.22,3.58,35523,2.95
1,0.368243,113916,18.1,17.7,16.0,15.4,12.5,10.2,5.5,2.6,1.8,49.4,9.2,8.9,8.1,7.8,6.2,4.9,2.6,1.1,0.7,50.6,8.9,8.9,8.0,7.6,6.3,5.3,3.0,1.5,1.2,41.0,23.0,0.7,0.5,0.0,67.2,78.1,8.7,4.1,22.6,12.0,18.1,8.7,2.3,9.4,4.1,53.6,20.1,88.6,11.4,4.3,1.7,3.77,4.24,34000,3.21
2,0.7975,105549,14.2,16.8,15.2,14.3,13.6,11.7,7.1,4.1,2.7,49.6,7.4,8.8,7.9,7.1,6.8,5.6,3.3,1.7,1.0,50.4,6.9,8.0,7.3,7.2,6.8,6.1,3.9,2.4,1.7,49.4,4.4,1.1,12.0,0.4,70.1,83.2,8.7,3.6,18.6,7.8,12.6,5.3,1.7,7.3,4.3,50.4,28.0,96.6,3.4,1.3,0.9,3.83,4.1,28083,3.57
3,0.753846,101279,18.2,19.1,16.3,15.3,13.0,9.3,5.0,2.4,1.3,50.0,9.3,9.7,8.3,7.7,6.6,4.6,2.3,1.0,0.5,50.0,8.9,9.4,8.0,7.6,6.4,4.8,2.7,1.4,0.8,50.9,1.0,1.0,0.6,0.1,94.9,87.5,11.7,6.3,22.3,13.3,8.7,4.4,1.1,4.3,2.3,63.8,17.8,96.6,3.4,2.1,0.4,4.16,4.3,24940,4.06
4,0.704663,94396,16.7,18.1,16.2,14.9,13.1,10.4,6.2,2.9,1.6,49.1,8.5,9.0,8.1,7.4,6.6,4.8,2.8,1.2,0.6,50.9,8.2,9.1,8.1,7.5,6.5,5.5,3.3,1.7,1.0,50.5,0.9,0.9,0.8,0.1,94.8,86.6,9.7,5.0,20.2,10.7,9.8,4.5,1.3,5.4,3.0,59.3,21.9,96.3,3.7,1.9,0.7,4.05,4.24,24160,3.78
5,0.607803,92084,14.5,10.4,19.0,20.7,13.8,10.2,6.3,3.3,1.9,50.1,7.2,5.3,9.5,10.7,7.3,5.1,2.9,1.3,0.7,49.9,7.2,5.1,9.4,10.0,6.5,5.1,3.4,2.0,1.2,65.8,3.5,0.7,5.3,0.0,46.4,57.5,6.2,2.4,12.3,6.2,29.2,14.0,1.8,15.2,4.1,31.7,15.9,89.5,10.5,4.0,1.3,2.59,3.34,39547,2.49
6,0.819527,103689,16.5,17.7,16.6,14.8,13.3,10.3,5.9,3.2,2.0,50.5,8.4,9.1,8.7,7.7,6.8,5.0,2.7,1.3,0.7,49.5,8.1,8.7,7.9,7.1,6.5,5.3,3.2,1.9,1.2,44.4,3.3,0.9,4.4,0.1,87.8,86.4,10.4,4.8,19.9,9.5,9.8,4.5,1.4,5.3,3.2,60.1,26.0,94.5,5.5,1.3,1.2,4.6,4.63,23780,4.23
7,0.990385,92843,30.0,14.2,17.3,11.8,6.3,5.4,5.4,5.2,4.5,49.7,15.3,7.2,8.8,6.5,3.3,2.7,2.2,2.0,1.5,50.3,14.6,7.0,8.5,5.4,3.0,2.8,3.1,3.1,2.9,84.3,6.4,0.3,0.8,0.0,17.3,71.5,3.8,1.7,9.1,4.5,24.6,7.2,3.5,17.5,12.9,45.7,32.8,92.2,7.8,2.2,1.6,3.73,4.49,26337,4.16
8,0.594937,95397,18.7,19.6,16.5,13.8,12.9,9.4,4.8,2.6,1.4,50.0,9.6,10.0,8.6,6.8,6.4,4.7,2.3,1.1,0.5,50.0,9.3,9.5,8.1,7.1,6.4,4.7,2.6,1.5,0.9,48.6,4.8,1.2,1.6,0.4,81.7,83.7,9.9,5.2,19.2,11.2,12.3,5.7,1.6,6.6,3.7,60.7,20.1,93.9,6.1,3.1,1.3,4.18,4.42,24246,3.92
9,0.681818,103892,19.7,19.3,17.4,15.8,12.5,8.2,4.1,2.0,1.0,50.8,10.0,9.9,9.3,8.1,6.4,4.0,1.9,0.8,0.4,49.2,9.7,9.4,8.2,7.7,6.1,4.2,2.2,1.2,0.6,33.8,9.4,1.2,0.6,0.0,89.4,85.9,12.1,6.5,24.2,14.5,10.0,4.9,1.5,5.1,2.3,66.4,16.9,94.1,5.9,2.7,0.9,4.67,4.75,23547,4.44


In [7]:
#minimerge = merged[0:50]
bestmodel = forward_selected(merged, 'response')

In [8]:
bestmodel.summary()

0,1,2,3
Dep. Variable:,response,R-squared:,0.334
Model:,OLS,Adj. R-squared:,0.332
Method:,Least Squares,F-statistic:,151.8
Date:,"Sun, 06 Dec 2015",Prob (F-statistic):,0.0
Time:,17:53:38,Log-Likelihood:,-373.2
No. Observations:,11842,AIC:,826.4
Df Residuals:,11802,BIC:,1122.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,122.3462,112.001,1.092,0.275,-97.194 341.887
A002,-0.0259,0.004,-6.390,0.000,-0.034 -0.018
A054,-0.0142,0.031,-0.458,0.647,-0.075 0.047
A003,-0.0301,0.004,-7.245,0.000,-0.038 -0.022
A052,-0.0610,0.003,-19.068,0.000,-0.067 -0.055
A050,0.0080,0.000,23.645,0.000,0.007 0.009
A051,-0.0241,0.002,-14.395,0.000,-0.027 -0.021
A029,0.0408,0.034,1.208,0.227,-0.025 0.107
A011,-1.1918,1.120,-1.064,0.287,-3.387 1.004

0,1,2,3
Omnibus:,1043.428,Durbin-Watson:,1.978
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3110.221
Skew:,0.469,Prob(JB):,0.0
Kurtosis:,5.329,Cond. No.,1290000000.0
