In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import SGDRegressor
from sklearn.feature_selection import RFE

df = pd.read_csv("./data/df_final.csv")
print(df.head(5))

X, y = df.drop(columns=['SalePrice']), df['SalePrice']

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)

   MSSubClass_120  MSSubClass_160  MSSubClass_180  MSSubClass_190  \
0               0               0               0               0   
1               0               0               0               0   
2               0               0               0               0   
3               0               0               0               0   
4               0               0               0               0   

   MSSubClass_20  MSSubClass_30  MSSubClass_40  MSSubClass_45  MSSubClass_50  \
0              0              0              0              0              0   
1              0              0              0              0              0   
2              0              0              0              0              0   
3              1              0              0              0              0   
4              0              0              0              0              0   

   MSSubClass_60  ...  GarageCars  GarageArea  WoodDeckSF  OpenPorchSF  \
0              1  ...    0.339

## Grid Search for Training Hyperparameters

In [2]:
param_grid = {
    'alpha': np.arange(0.001, 0.002, 0.0002),
    'eta0': np.arange(0.001, 0.003, 0.001),
    'max_iter': range(50, 150, 50),
    'penalty' : ['l1', 'l2']  # Lasso = l1, Ridge = l2
}

gs = model_selection.GridSearchCV(SGDRegressor(early_stopping=True, random_state=0, learning_rate='constant'), 
                                  param_grid=param_grid, scoring='neg_root_mean_squared_error', refit=True, 
                                  return_train_score=True, cv=5)
gs.fit(X, y)
print('Optimal parameters:', gs.best_params_)
print('neg_root_mean_squared_error with above optimal parameters:', gs.best_score_)

Optimal parameters: {'alpha': 0.0018000000000000004, 'eta0': 0.002, 'max_iter': 50, 'penalty': 'l2'}
neg_root_mean_squared_error with above optimal parameters: -21550.664584041842


#### Predict with Optimal Parameters

In [3]:
print('r2_error(train):', metrics.r2_score(y_train, gs.predict(X_train)))
print('r2_error(test):', metrics.r2_score(y_test, gs.predict(X_test)))

print('\nroot_mean_squared_error(train):', metrics.mean_squared_error(y_train, gs.predict(X_train), squared=False))
print('root_mean_squared_error(test):', metrics.mean_squared_error(y_test, gs.predict(X_test), squared=False))

print('\nmax_error(train):', metrics.max_error(y_train, gs.predict(X_train)))
print('max_error(test):', metrics.max_error(y_test, gs.predict(X_test)))

r2_error(train): 0.9226755606072808
r2_error(test): 0.9213946398744604

root_mean_squared_error(train): 19396.50752496842
root_mean_squared_error(test): 18836.37680755738

max_error(train): 147459.27472448925
max_error(test): 78052.09125596564


## Training (SGDRegressor)

In [4]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor

sgdr = SGDRegressor(early_stopping=True, random_state=0, learning_rate='constant', 
                    alpha=0.0018, eta0=0.002, max_iter=50, penalty='l2')
sgdr.fit(X_train, y_train)

print(sgdr.get_params())
print('\n\nr2_error(train):', sgdr.score(X_train, y_train))
print('r2_error(test):', sgdr.score(X_test, y_test))

print('\nroot_mean_squared_error(train):', metrics.mean_squared_error(y_train, sgdr.predict(X_train), squared=False))
print('root_mean_squared_error(test):', metrics.mean_squared_error(y_test, sgdr.predict(X_test), squared=False))

print('\nmax_error(train):', metrics.max_error(y_train, sgdr.predict(X_train)))
print('max_error(test):', metrics.max_error(y_test, sgdr.predict(X_test)))

print('\nActual number of iteration:', sgdr.n_iter_)

{'alpha': 0.0018, 'average': False, 'early_stopping': True, 'epsilon': 0.1, 'eta0': 0.002, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'constant', 'loss': 'squared_loss', 'max_iter': 50, 'n_iter_no_change': 5, 'penalty': 'l2', 'power_t': 0.25, 'random_state': 0, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


r2_error(train): 0.9314606137041556
r2_error(test): 0.9093709639638278

root_mean_squared_error(train): 18261.44984986743
root_mean_squared_error(test): 20225.765565459587

max_error(train): 153623.3777612658
max_error(test): 83418.71910363773

Actual number of iteration: 13


## Cross Validation

In [5]:
scoring= ['r2', 'neg_root_mean_squared_error', 'max_error']

scores = model_selection.cross_validate(sgdr, X, y, scoring=scoring, 
                                        cv=10, return_train_score=True)

print('Mean of r2(train):', np.mean(scores['train_r2']))
print('Mean of r2(test):', np.mean(scores['test_r2']))

print('Mean of RMSE(train):', -(np.mean(scores['train_neg_root_mean_squared_error'])))
print('Mean of RMSE(test):', -(np.mean(scores['test_neg_root_mean_squared_error'])))

print('Mean of max_error(train):', np.mean(scores['train_max_error']))
print('Mean of max_error(test):', np.mean(scores['test_max_error']))

Mean of r2(train): 0.8441815584879716
Mean of r2(test): 0.9013776846383786
Mean of RMSE(train): 25518.316436921665
Mean of RMSE(test): 21436.881854746374
Mean of max_error(train): -497906.6516500757
Mean of max_error(test): -105066.59966579238


## Residual Plots

In [None]:
import plotly.express as px

y_train_pred = sgdr.predict(X_train)
y_test_pred = sgdr.predict(X_test)


res_train_df = y_train.to_frame()
res_train_df['prediction'] = y_train_pred
res_train_df['Set'] = 'train'

res_test_df = y_test.to_frame()
res_test_df['prediction'] = y_test_pred
res_test_df['Set'] = 'test'


result = pd.concat([res_train_df, res_test_df])
result['residual'] = result['prediction'] - result['SalePrice']

fig = px.scatter(
    result, x='prediction', y='residual',
    marginal_y='violin',
    color='Set', trendline='ols',
    title="Residual Plot"
)
fig.show()