In [12]:
import numpy as np
import pandas as pd
import pickle
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

## Build Model

In [2]:
# Load the training data; header=0 takes column names from the first row and
# index_col=0 uses the first CSV column as the row index.
df_train = pd.read_csv("data/train_clean.csv", header=0, index_col=0)
# Target first, then every remaining column as a predictor.
y = df_train["SalePrice"]
X = df_train.drop(columns=["SalePrice"])

In [3]:
# sm model
# Ordinary least squares through statsmodels, used here for its coefficient
# table. add_constant supplies the intercept column for the design matrix.
X_sm = sm.add_constant(X)
model_sm = sm.OLS(endog=y, exog=X_sm).fit()
print(model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     64.87
Date:                Wed, 07 Nov 2018   Prob (F-statistic):               0.00
Time:                        14:08:26   Log-Likelihood:                -16567.
No. Observations:                1460   AIC:                         3.365e+04
Df Residuals:                    1200   BIC:                         3.503e+04
Df Model:                         259                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -2.782e+

## K-Fold Cross Validation

In [4]:
# Subset significant model columns
# Hand-picked predictors, kept as an alternative to fitting on every column.
# NOTE(review): this list previously contained "FullBath" twice, which would
# put a duplicated (perfectly collinear) column into the design matrix; the
# second entry may have been meant to be a different feature - confirm.
columns_significant = [
    "LotArea", "OverallQual", "OverallCond", "YearBuilt", "YearRemodAdd",
    "MasVnrArea", "BsmtFinSF1", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF",
    "GrLivArea", "FullBath", "Fireplaces", "GarageArea",
    "WoodDeckSF", "ScreenPorch", "PoolArea", "Street_Pave", "Neighborhood_Edwards",
    "Neighborhood_Mitchel", "Neighborhood_NAmes", "Neighborhood_NWAmes",
    "Neighborhood_NoRidge", "Neighborhood_NridgHt", "Neighborhood_StoneBr",
    "Condition2_PosA", "Condition2_PosN", "RoofStyle_Shed", "RoofMatl_ClyTile",
    "RoofMatl_CompShg", "RoofMatl_Membran", "RoofMatl_Metal", "RoofMatl_Tar&Grv",
    "RoofMatl_WdShake", "GarageQual_Ex", "GarageCond_Ex",
    "PoolQC_Fa", "PoolQC_Gd", "PoolQC_NA"
]

# Explicit switch instead of silently overwriting the list above.
# True reproduces the current behaviour: every column except the target.
USE_ALL_COLUMNS = True

if USE_ALL_COLUMNS:
    # Derived from df_train rather than X so that re-running this cell gives
    # the same answer even after X has been narrowed to a subset.
    columns_model_only = df_train.columns.drop("SalePrice")  # all columns
else:
    columns_model_only = columns_significant

X = df_train[columns_model_only]
y = df_train["SalePrice"]

In [28]:
# 10-fold cross-validation of a plain LinearRegression on the selected columns.
# For each fold: fit on the training rows, predict both partitions, and record
# R2 / RMSE / MAE for train and test so the train-vs-test gap is visible per fold.
X_cv = X.values
y_cv = y.values

# KFold without shuffling: each test fold is a contiguous range of rows, which
# is what the "test_idx" column reports.
kf = KFold(n_splits=10)

# Column order of the result table. The fourth column holds the *train* MAE
# (it used to be labelled "MAE test" a second time).
cv_columns = [
    "test_idx", "R2 train", "RMSE train", "MAE train",
    "R2 test", "RMSE test", "MAE test"]

fold_rows = []
for train_index, test_index in kf.split(X_cv):
    X_train, X_test = X_cv[train_index], X_cv[test_index]
    y_train, y_test = y_cv[train_index], y_cv[test_index]

    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Floor predictions at the lowest training-set prediction. The floor is
    # computed once per fold and applied vectorised (same values as before,
    # without re-scanning the training predictions for every element).
    # NOTE(review): the original comment said "remove <0 prices", but this
    # floor only removes negative prices if the lowest training prediction is
    # itself non-negative - confirm whether a floor of 0 (or the lowest
    # observed price) was intended.
    pred_floor = y_train_pred.min()
    y_train_pred = np.maximum(y_train_pred, pred_floor)
    y_test_pred = np.maximum(y_test_pred, pred_floor)

    # TODO: use log price for comparison between expensive and cheap houses
    # (requires strictly positive predictions).

    r2_train = r2_score(y_train, y_train_pred)
    rmse_train = mean_squared_error(y_train, y_train_pred) ** 0.5
    mae_train = mean_absolute_error(y_train, y_train_pred)

    r2_test = r2_score(y_test, y_test_pred)
    rmse_test = mean_squared_error(y_test, y_test_pred) ** 0.5
    mae_test = mean_absolute_error(y_test, y_test_pred)

    fold_rows.append([
        "{} - {}".format(min(test_index), max(test_index)),
        r2_train, rmse_train, mae_train,
        r2_test, rmse_test, mae_test])

# Build the table once instead of growing a DataFrame row by row.
cv_results = pd.DataFrame(fold_rows, columns=cv_columns)
cv_results

Unnamed: 0,test_idx,R2 train,RMSE train,MAE test,R2 test,RMSE test,MAE test.1
0,0 - 145,0.93434,20638.266545,13222.747845,0.868064,24739.144372,17301.587462
1,146 - 291,0.934855,20253.996752,12912.648005,0.800307,35531.553171,19637.351941
2,292 - 437,0.932891,20576.894498,13363.748455,0.911108,23626.409143,16978.004049
3,438 - 583,0.942594,18933.426923,12873.837822,0.746547,41702.880402,19322.323388
4,584 - 729,0.933983,19861.882565,12813.482838,0.90099,30272.024738,19397.237704
5,730 - 875,0.943923,18873.501557,12886.165063,0.717987,40771.870299,20231.028475
6,876 - 1021,0.935041,20435.976522,13220.029998,0.888555,23940.831603,16134.374573
7,1022 - 1167,0.934673,20519.078456,13385.664472,0.897066,22660.088628,15010.687232
8,1168 - 1313,0.934392,20007.567905,12805.014597,0.411297,68992.699066,24795.234028
9,1314 - 1459,0.936445,20202.824415,13028.936212,-0.581023,91007.173942,22780.432165


In [7]:
# save output
# Refit on all rows of X / y (no hold-out) and persist both the fitted model
# and the column selection it was trained on, so the two can be loaded together.
model = linear_model.LinearRegression()
model.fit(X, y)

# Context managers make sure each file is flushed and closed once the dump
# finishes; the bare open(...) calls left both handles open.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("columns_model_only.pkl", "wb") as columns_file:
    pickle.dump(columns_model_only, columns_file)

In [8]:
# sm model - final summary
# Same statsmodels OLS fit as earlier in the notebook, run on the X that the
# saved model was trained on, to print its coefficient table.
X_sm = sm.add_constant(X)
model_sm = sm.OLS(endog=y, exog=X_sm).fit()
print(model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.933
Model:                            OLS   Adj. R-squared:                  0.919
Method:                 Least Squares   F-statistic:                     64.87
Date:                Wed, 07 Nov 2018   Prob (F-statistic):               0.00
Time:                        14:09:14   Log-Likelihood:                -16567.
No. Observations:                1460   AIC:                         3.365e+04
Df Residuals:                    1200   BIC:                         3.503e+04
Df Model:                         259                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -2.782e+