In [2]:
import numpy as np
import pandas as pd
import pickle
import statsmodels.api as sm
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

## Build Model

In [13]:
df_train = pd.read_csv("data/train_clean.csv", header=0, index_col=0)
columns_model_only = pickle.load(open("columns_model_only.pkl", "rb"))
X = df_train[columns_model_only]
y = df_train["SalePrice"]

# K-Fold Cross Validation

In [19]:
X_cv = X.values
y_cv = y.values

kf = KFold(n_splits=5)
kf.get_n_splits(X_cv)
cv_results = pd.DataFrame(columns=[
    "test_idx", "R2 train", "RMSE train", "MAE test", 
    "R2 test", "RMSE test", "MAE test"])
for train_index, test_index in kf.split(X_cv):
    X_train, X_test = X_cv[train_index], X_cv[test_index]
    y_train, y_test = y_cv[train_index], y_cv[test_index]
    estimator = MLPRegressor(hidden_layer_sizes=(50, 20), solver="adam", activation="relu", 
                         random_state=2, max_iter=3000)
    estimator.fit(X_train, y_train)
    y_train_pred = estimator.predict(X_train)
    y_test_pred = estimator.predict(X_test)
    # remove <0 prices
    npvec_zero = np.vectorize(lambda x: max(min(y_train_pred), x))
    y_train_pred = npvec_zero(y_train_pred)
    y_test_pred = npvec_zero(y_test_pred)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mse = mean_squared_error(y_train, y_train_pred)
    rmse_train = mse ** (0.5)
    r2_train = r2_score(y_train, y_train_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    rmse_test = mse ** (0.5)
    r2_test = r2_score(y_test, y_test_pred)
    cv_results.loc[len(cv_results)] = ["{} - {}".format(min(test_index), max(test_index)), 
                  r2_train, rmse_train, mae_train,
                  r2_test, rmse_test, mae_test]
    print(cv_results)
cv_results

  test_idx  R2 train    RMSE train      MAE test   R2 test     RMSE test  \
0  0 - 291  0.866907  29412.979195  18977.421772  0.878618  25914.508516   

       MAE test  
0  18846.504179  
    test_idx  R2 train    RMSE train      MAE test   R2 test     RMSE test  \
0    0 - 291  0.866907  29412.979195  18977.421772  0.878618  25914.508516   
1  292 - 583  0.865911  28924.874828  18992.379292  0.838658  32559.516236   

       MAE test  
0  18846.504179  
1  21042.767972  
    test_idx  R2 train    RMSE train      MAE test   R2 test     RMSE test  \
0    0 - 291  0.866907  29412.979195  18977.421772  0.878618  25914.508516   
1  292 - 583  0.865911  28924.874828  18992.379292  0.838658  32559.516236   
2  584 - 875  0.861677  28772.237848  18768.702358  0.864734  32015.951647   

       MAE test  
0  18846.504179  
1  21042.767972  
2  20462.013796  
     test_idx  R2 train    RMSE train      MAE test   R2 test     RMSE test  \
0     0 - 291  0.866907  29412.979195  18977.421772  0.878

Unnamed: 0,test_idx,R2 train,RMSE train,MAE test,R2 test,RMSE test,MAE test.1
0,0 - 291,0.866907,29412.979195,18977.421772,0.878618,25914.508516,18846.504179
1,292 - 583,0.865911,28924.874828,18992.379292,0.838658,32559.516236,21042.767972
2,584 - 875,0.861677,28772.237848,18768.702358,0.864734,32015.951647,20462.013796
3,876 - 1167,0.866801,29644.81725,19142.027248,0.862753,26368.027019,18960.675673
4,1168 - 1459,0.878511,27459.627859,18576.49816,0.793808,37183.217768,20880.144872


In [21]:
# save output
estimator.fit(X_train, y_train)
pickle.dump(estimator, open("estimator_nn.pkl", "wb"))

In [22]:
# final summary
y_pred = estimator.predict(X)
mae_final = mean_absolute_error(y, y_pred)
mse_final = mean_squared_error(y, y_pred)
rmse_final = mse ** (0.5)
r2_final = r2_score(y, y_pred)
print("R2 {}, RMSE {}, MAE {}".format(r2_final, rmse_final, mae_final))

R2 0.860508521168799, RMSE 37183.21776792265, MAE 19037.22750226403
