In [6]:
import numpy as np
import pandas as pd

First, load the variables and coefficients from the stepwise model fitted in R into a nicely formatted DataFrame for use in the app.

In [7]:
# Load the coefficient export, give its two columns display-friendly names,
# order rows by absolute coefficient size (largest first), and round to
# two decimals.
coef_column = 'Coefficient ($)'
coefs_table = (
    pd.read_csv('stepwise_R_coefs.csv', index_col=False)
    .set_axis(['Feature', coef_column], axis=1)
    .sort_values(by=coef_column, key=abs, ascending=False)
    .assign(**{coef_column: lambda frame: frame[coef_column].round(2)})
)

coefs_table

Unnamed: 0,Feature,Coefficient ($)
0,(Intercept),-880307.3
26,ExterQual_Ex,44916.26
2,Foundation_Wood,-42227.76
18,Functional_Maj,-25757.75
23,Neighborhood.Cluster.Label_3,24570.46
24,KitchenQual_Ex,24389.63
22,OverallCondBinary,-19385.77
25,BsmtQual_Ex,18549.78
4,BsmtQual_None,14860.8
19,BldgType_TwnhsE,-14543.12


In [8]:
# Features in the order given by our R model's feature importance
# (standardized MLR in R). The 1-based list position becomes the ranking.
ranked_features = [
    '(Intercept)', 'GrLivArea', 'TotalBsmtSF', 'YearBuilt',
    'OverallQual', 'ExterQual_Ex', 'BsmtUnfSF', 'RemodelYrsAftBuilt',
    'KitchenQual_Ex', 'BsmtQual_Ex', 'GarageArea', 'Neighborhood.Cluster.Label_3',
    'BldgType_TwnhsE', 'LotArea', 'BsmtExposure', 'BedroomAbvGr', 'OverallCondBinary',
    'Functional_Maj', 'ExterQual_Gd', 'Functional_Min', 'Fireplaces', 'BsmtFinGdLvng',
    'SchD_S', 'Dist_From_UoI', 'BldgType_Twnhs', 'Condition1_Artery', 'BsmtQual_None',
    'Exterior1st_HdbdOther', 'Exterior1st_BrkFace', 'GarageFinish_RFn', 'Foundation_Wood',
    'Exterior1st_AsbShng',
]
fir_ = pd.DataFrame({
    'Feature': ranked_features,
    'Importance Ranking': range(1, len(ranked_features) + 1),
})
fir_

Unnamed: 0,Feature,Importance Ranking
0,(Intercept),1
1,GrLivArea,2
2,TotalBsmtSF,3
3,YearBuilt,4
4,OverallQual,5
5,ExterQual_Ex,6
6,BsmtUnfSF,7
7,RemodelYrsAftBuilt,8
8,KitchenQual_Ex,9
9,BsmtQual_Ex,10


In [9]:
# Left join: keep every coefficient row and attach its importance ranking.
stepwise_added = coefs_table.merge(fir_, on='Feature', how='left')

In [10]:
# Already ran once; left commented out so re-running the notebook does not overwrite the exported CSV.
# stepwise_added.to_csv('stepwise_coefs_dash.csv', index=False)

Here I will refit the stepwise model we built in R using Python. I want a pickle file of the fitted model to use in my Dash application. (I should also check that the coefficients match the R results.)

In [11]:
from sklearn.linear_model import LinearRegression

In [23]:
# Column names to feed the Python model: every ranked feature except the
# intercept (row 0), which LinearRegression fits itself by default.
stepwise_feats = fir_.copy()
# R wrote the cluster dummy with dots; the training CSV header uses spaces.
# Map by name instead of assigning through a slice at a hard-coded index
# label: that is chained assignment, and whether it also alters
# `stepwise_feats` depends on the pandas version.
step_coefs = stepwise_feats['Feature'].iloc[1:].replace(
    {'Neighborhood.Cluster.Label_3': 'Neighborhood Cluster Label_3'}
)


1                        GrLivArea
2                      TotalBsmtSF
3                        YearBuilt
4                      OverallQual
5                     ExterQual_Ex
6                        BsmtUnfSF
7               RemodelYrsAftBuilt
8                   KitchenQual_Ex
9                      BsmtQual_Ex
10                      GarageArea
11    Neighborhood Cluster Label_3
12                 BldgType_TwnhsE
13                         LotArea
14                    BsmtExposure
15                    BedroomAbvGr
16               OverallCondBinary
17                  Functional_Maj
18                    ExterQual_Gd
19                  Functional_Min
20                      Fireplaces
21                   BsmtFinGdLvng
22                          SchD_S
23                   Dist_From_UoI
24                  BldgType_Twnhs
25               Condition1_Artery
26                   BsmtQual_None
27           Exterior1st_HdbdOther
28             Exterior1st_BrkFace
29                Ga

In [20]:
# Training data. index_col=False tells pandas not to use the first CSV
# column as the index.
train_csv = 'RowFiltered_dummied_data_TRAIN.csv'
data = pd.read_csv(train_csv, index_col=False)
data

Unnamed: 0,SalePrice,Lat_Long,Dist_From_UoI,GrLivArea,1stFlrSF,2ndFlrSF,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,...,WoodDeckBinary,HasPorch,FenceBinary,Fireplaces,NoCentralAir,SchD_S,Neighborhood Cluster Label_2,Neighborhood Cluster Label_3,Neighborhood Cluster Label_4,Neighborhood Cluster Label_5
0,179200,"[42.036345280662346, -93.6878802695534]",3.587525,1294,1294,0,6,3,2,0,...,1,1,1,0,0,0,0,0,0,0
1,115000,"[42.038378, -93.61261990196078]",3.084359,1540,1040,500,7,4,1,0,...,0,0,0,0,0,0,0,0,0,0
2,110000,"[42.04854121553709, -93.6265682965729]",2.937397,892,892,0,5,3,1,0,...,0,0,1,0,0,0,0,0,0,0
3,131750,"[42.018681900000004, -93.6663829759606]",1.868841,960,960,0,4,2,1,0,...,1,1,0,0,0,0,0,0,0,0
4,190000,"[42.055141, -93.633802]",3.337009,1969,1161,808,8,3,2,1,...,0,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1444,195000,"[42.0624755, -93.6414793]",4.005867,1455,1455,0,6,2,2,0,...,1,1,0,1,0,1,0,0,0,0
1445,146500,"[42.02730468048847, -93.60702960866344]",3.257420,1521,780,741,8,4,1,0,...,0,1,1,0,0,0,0,0,0,0
1446,211500,"[42.04299352441229, -93.64930425966442]",1.833583,1226,1226,0,4,1,1,0,...,1,1,0,2,0,0,1,0,0,0
1447,117000,"[42.021659183673464, -93.68313334693877]",3.079854,875,875,0,5,3,1,0,...,1,0,0,0,0,0,0,0,0,0


In [25]:
# Design matrix: the stepwise feature columns, in the order of `step_coefs`.
feature_columns = step_coefs.tolist()
X_vals = data[feature_columns]
# Regression target.
y_vals = data['SalePrice']

In [29]:
# Fit ordinary least squares on the stepwise features, then list each
# feature beside its fitted coefficient for comparison with the R output.
ols = LinearRegression().fit(X_vals, y_vals)
step_coefs.to_frame(name='Features').assign(Coefficient=ols.coef_)
# Awesome, the coefficients are the same! glad it worked. 

Unnamed: 0,Features,Coefficient
1,GrLivArea,57.688329
2,TotalBsmtSF,39.072894
3,YearBuilt,447.277788
4,OverallQual,8292.669227
5,ExterQual_Ex,44916.256047
6,BsmtUnfSF,-16.582442
7,RemodelYrsAftBuilt,216.47177
8,KitchenQual_Ex,24389.628049
9,BsmtQual_Ex,18549.781618
10,GarageArea,21.722193


In [36]:
import pickle

# Serialize the fitted estimator to disk for the Dash application to load.
model_path = 'Dash/stepwise_model.pickle'
with open(model_path, 'wb') as model_file:
    pickle.dump(ols, model_file)

In [33]:
# Show the class of the fitted estimator object.
type(ols)

sklearn.linear_model._base.LinearRegression

In [43]:
# Sanity check: predict the sale price for the training row at position 1.
# Selecting with a list (`iloc[[1]]`) yields a one-row DataFrame, so the
# column names seen at fit time travel with the input instead of being
# stripped by a NumPy round-trip and reshape.
ols.predict(X_vals.iloc[[1]])[0]

114363.02134551026

In [44]:
# Feature values of the training row at position 1 (the row predicted above).
X_vals.iloc[1]


GrLivArea                       1540.000000
TotalBsmtSF                     1190.000000
YearBuilt                       1955.000000
OverallQual                        4.000000
ExterQual_Ex                       0.000000
BsmtUnfSF                       1040.000000
RemodelYrsAftBuilt                 0.000000
KitchenQual_Ex                     0.000000
BsmtQual_Ex                        0.000000
GarageArea                       352.000000
Neighborhood Cluster Label_3       0.000000
BldgType_TwnhsE                    0.000000
LotArea                         7150.000000
BsmtExposure                       0.000000
BedroomAbvGr                       4.000000
OverallCondBinary                  1.000000
Functional_Maj                     0.000000
ExterQual_Gd                       0.000000
Functional_Min                     0.000000
Fireplaces                         0.000000
BsmtFinGdLvng                      0.000000
SchD_S                             0.000000
Dist_From_UoI                   