## Refetching transformed data

In [1]:
import pandas as pd

# Re-load the transformed train / test CSV files.
train_data = pd.read_csv('./data/aligned_encoded_train_data.csv')

# The test CSV carries a SalePrice column as well; it is discarded right after loading.
test_data = pd.read_csv('./data/aligned_encoded_test_data.csv').drop(columns=['SalePrice'])

train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,1,0,0,0,0,3,...,0,0,1,8,856,4,0,2003,2003,2008
1,1262,0,0,3,1,0,0,0,0,3,...,0,0,1,6,1262,4,298,1976,1976,2007
2,920,866,0,3,1,0,0,0,0,3,...,0,0,1,6,920,4,0,2001,2002,2008
3,961,756,0,3,1,0,0,0,0,4,...,0,0,1,7,756,4,0,1915,1970,2006
4,1145,1053,0,4,1,0,0,0,0,3,...,0,0,1,9,1145,4,192,2000,2000,2008


## Feature scaling and normalization

References:

- https://towardsdatascience.com/the-ultimate-guide-to-data-cleaning-3969843991d4#d078
- https://www.codecademy.com/articles/normalization
- https://machinelearningmastery.com/feature-selection-machine-learning-python/
- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

In [2]:
# Column groups and the treatment each group receives.
#
# space (normalize and scale):
#     LotArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF,
#     GrLivArea, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea,
#     BedroomAbvGr, KitchenAbvGr, LotFrontage (*), LotDepth (*), MasVnrArea
#
# quantity (scale):
#     BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, TotRmsAbvGrd, Fireplaces, GarageCars
#
# grade (scale):
#     LotShape, LandSlope, Utilities, ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure,
#     HeatingQC, CentralAir, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond,
#     BsmtFinType1, BsmtFinType2, OverallQual, OverallCond
#
# time (scale):
#     YearBuilt, YearRemodAdd, YrSold, GarageYrBlt
#
# currency (normalize and scale):
#     MiscVal, NeighborhoodMeanPrice, GarageTypeMeanPrice
#
# one-hot-encoded-binary (keep):
#     MSSubClass, MSZoning, LandContour, Street, LotConfig, BldgType, HouseStyle, RoofStyle, RoofMatl, MoSold,
#     MasVnrType, Foundation, Heating, Electrical, PavedDrive, Condition1, Condition2, Exterior1st, Exterior2nd

# Columns that are mean-normalized first and min-max scaled afterwards.
columns_to_normalize = [
    # currency
    'MiscVal', 'NeighborhoodMeanPrice', 'GarageTypeMeanPrice',
    # space
    'LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'BedroomAbvGr', 'KitchenAbvGr', 'LotFrontage', 'LotDepth', 'MasVnrArea',
]

# Every normalized column is also scaled; the remaining groups are scaled only.
columns_to_scale = columns_to_normalize + [
    # quantity
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
    # grade
    'LotShape', 'LandSlope', 'Utilities', 'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'CentralAir',
    'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual',
    'GarageCond', 'BsmtFinType1', 'BsmtFinType2', 'OverallQual', 'OverallCond',
    # time
    'YearBuilt', 'YearRemodAdd', 'YrSold', 'GarageYrBlt',
]

In [3]:
# mean normalization: (x - mean) / std
normalized_train_data = train_data.copy()
normalized_test_data = test_data.copy()

# The statistics are estimated on the training split only and reused for the
# test split, so both splits are expressed on the same scale. Normalizing the
# test split with its own mean/std would shift it relative to the data the
# models are fitted on.
train_mean = train_data[columns_to_normalize].mean()
train_std = train_data[columns_to_normalize].std()

normalized_train_data[columns_to_normalize] = (train_data[columns_to_normalize] - train_mean) / train_std
normalized_test_data[columns_to_normalize] = (test_data[columns_to_normalize] - train_mean) / train_std

normalized_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,-0.793162,1.161454,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,8,-0.459145,4,-0.751918,2003,2003,2008
1,0.257052,-0.794891,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,6,0.466305,4,1.625638,1976,1976,2007
2,-0.627611,1.188943,-0.116299,0.163723,1,0,0,0,0,3,...,-0.270116,0,1,6,-0.313261,4,-0.751918,2001,2002,2008
3,-0.521555,0.936955,-0.116299,0.163723,1,0,0,0,0,4,...,-0.270116,0,1,7,-0.687089,4,-0.751918,1915,1970,2006
4,-0.045596,1.617323,-0.116299,1.389547,1,0,0,0,0,3,...,-0.270116,0,1,9,0.199611,4,0.77993,2000,2000,2008


In [4]:
# min max scaling: scaled = (x - min) / (max - min)
scaled_train_data = normalized_train_data.copy()
scaled_test_data = normalized_test_data.copy()

# Capture the training minimum and range BEFORE the training columns are
# overwritten. Reading them back from scaled_train_data after it has been
# scaled yields min == 0 and max == 1, which would leave the test split
# effectively unscaled.
train_min = normalized_train_data[columns_to_scale].min()
train_range = normalized_train_data[columns_to_scale].max() - train_min

scaled_train_data[columns_to_scale] = (normalized_train_data[columns_to_scale] - train_min) / train_range
scaled_test_data[columns_to_scale] = (normalized_test_data[columns_to_scale] - train_min) / train_range

scaled_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,0.11978,0.413559,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.5,0.140098,1.0,0.0,0.949275,0.883333,0.5
1,0.212942,0.0,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.333333,0.206547,1.0,0.347725,0.753623,0.433333,0.25
2,0.134465,0.41937,0.0,0.375,1,0,0,0,0,0.75,...,0.0,0,1,0.333333,0.150573,1.0,0.0,0.934783,0.866667,0.5
3,0.143873,0.366102,0.0,0.375,1,0,0,0,0,1.0,...,0.0,0,1,0.416667,0.123732,1.0,0.0,0.311594,0.333333,0.0
4,0.186095,0.509927,0.0,0.5,1,0,0,0,0,0.75,...,0.0,0,1,0.583333,0.187398,1.0,0.224037,0.927536,0.833333,0.5


## Feature Selection

References:

- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
- https://seaborn.pydata.org/generated/seaborn.pairplot.html

### Feature correlation

- Drop highly correlated features

In [5]:
# Work on deep copies so the scaled frames are left untouched by the drops below.
reduced_test_data = scaled_test_data.copy(deep=True)
reduced_train_data = scaled_train_data.copy(deep=True)

def run_corr_analysis():
    """Report features that are strongly correlated with another feature.

    Reads the module-level ``reduced_train_data`` frame, ignoring the ``Id``
    and ``SalePrice`` columns.

    Returns:
        A Series indexed by feature name holding, for each feature, its largest
        absolute pairwise correlation, restricted to values above 0.8 and
        sorted in descending order.
    """
    candidates = reduced_train_data.drop(columns=['Id', 'SalePrice'])
    abs_corr = candidates.corr().abs()

    # Entries equal to exactly 1 (which covers the self-correlations on the
    # diagonal) are zeroed so they do not dominate the per-column maximum.
    abs_corr = abs_corr.mask(abs_corr == 1, 0)

    strongest = abs_corr.max().sort_values(ascending=False)
    return strongest[strongest > 0.8]

display(run_corr_analysis())

# One member of each highly correlated pair reported above is removed.
# FireplaceQu / Fireplaces are deliberately not listed here: both are still
# needed to build FireplaceRating in the "Feature aggregation" cell.
correlated_feats_to_drop = [
    'MSSubClass_190',
    'Exterior2nd_VinylSd',
    'Exterior2nd_CmentBd',
    'Exterior2nd_MetalSd',
    'GarageQual',
    'MSSubClass_80',
    'MSSubClass_50',
    'RoofStyle_Hip',
    'MSSubClass_45',
    'Exterior2nd_HdBoard',
    'GarageArea',
    'Exterior2nd_Wd Sdng',
    'PavedDrive_N',
    'Exterior2nd_AsbShng',
    'RoofMatl_Tar&Grv',
    'GrLivArea',
    'Electrical_FuseA',
    'TotalBsmtSF',
    '2ndFlrSF',
    'MSZoning_RM',
    'MasVnrType_None',
]

# The same columns are removed from both splits so they stay aligned.
reduced_train_data = reduced_train_data.drop(columns=correlated_feats_to_drop)
reduced_test_data = reduced_test_data.drop(columns=correlated_feats_to_drop)

# Re-run the analysis to show what remains above the threshold.
run_corr_analysis()

MSSubClass_190         0.983395
BldgType_2fmCon        0.983395
Exterior2nd_VinylSd    0.977525
Exterior1st_VinylSd    0.977525
Exterior1st_CemntBd    0.974171
Exterior2nd_CmentBd    0.974171
Exterior2nd_MetalSd    0.973065
Exterior1st_MetalSd    0.973065
GarageQual             0.959172
GarageCond             0.959172
MSSubClass_80          0.942259
HouseStyle_SLvl        0.942259
HouseStyle_1.5Fin      0.940871
MSSubClass_50          0.940871
RoofStyle_Gable        0.933462
RoofStyle_Hip          0.933462
HouseStyle_1.5Unf      0.925181
MSSubClass_45          0.925181
Exterior2nd_HdBoard    0.883271
Exterior1st_HdBoard    0.883271
GarageArea             0.882475
GarageCars             0.882475
FireplaceQu            0.863241
Fireplaces             0.863241
Exterior1st_Wd Sdng    0.859244
Exterior2nd_Wd Sdng    0.859244
Electrical_SBrkr       0.857283
Electrical_FuseA       0.857283
PavedDrive_Y           0.856491
PavedDrive_N           0.856491
Exterior1st_AsbShng    0.847917
Exterior

FireplaceQu    0.863241
Fireplaces     0.863241
dtype: float64

### Target correlation

- Drop features that are only weakly correlated with the target (`SalePrice`)

In [6]:
def run_target_corr_analysis():
    """Report columns that are only weakly correlated with ``SalePrice``.

    Reads the module-level ``reduced_train_data`` frame, ignoring ``Id``.

    Returns:
        A Series indexed by column name with the absolute correlation to
        ``SalePrice``, restricted to values below 0.2 and sorted ascending.
    """
    without_id = reduced_train_data.drop(columns=['Id'])
    abs_target_corr = without_id.corr()['SalePrice'].abs()
    ranked = abs_target_corr.sort_values()
    return ranked[ranked < 0.2]

lowly_correlated_feats = run_target_corr_analysis()

# Shape before the drop, followed by the columns about to be removed.
display(reduced_train_data.shape, lowly_correlated_feats)

# Remove the weakly correlated columns from both splits.
reduced_train_data = reduced_train_data.drop(lowly_correlated_feats.index, axis=1)
reduced_test_data = reduced_test_data.drop(lowly_correlated_feats.index, axis=1)

# Re-run the analysis on the reduced frame, then show the shape after the drop.
display(run_target_corr_analysis(), reduced_train_data.shape)

(1460, 185)

RoofMatl_Metal         0.000304
RoofStyle_Mansard      0.000308
BsmtFinType2           0.000651
Foundation_Wood        0.002711
Condition2_RRAe        0.002993
BldgType_TwnhsE        0.003804
LotConfig_Corner       0.004145
MoSold_10              0.004354
Condition1_RRNe        0.004584
Condition1_RRAn        0.005893
MoSold_1               0.005980
LotConfig_FR2          0.006859
RoofMatl_ClyTile       0.006897
MoSold_2               0.007355
MoSold_3               0.008219
BsmtFinSF2             0.011378
Foundation_Stone       0.012103
MoSold_8               0.014185
Utilities              0.014314
MoSold_12              0.014465
RoofMatl_Roll          0.014479
Condition2_RRAn        0.014510
MSSubClass_75          0.015265
MSSubClass_40          0.016366
RoofStyle_Flat         0.016433
Exterior2nd_Stone      0.016754
BsmtHalfBath           0.016844
Exterior1st_Plywood    0.017719
LotConfig_FR3          0.018186
ExterCond              0.018899
                         ...   
Exterior

Series([], Name: SalePrice, dtype: float64)

(1460, 53)

### Feature aggregation

In [7]:
# Combine fireplace count and fireplace quality into a single feature on both splits.
reduced_train_data['FireplaceRating'] = reduced_train_data['Fireplaces'].mul(reduced_train_data['FireplaceQu'])
reduced_test_data['FireplaceRating'] = reduced_test_data['Fireplaces'].mul(reduced_test_data['FireplaceQu'])

### Feature variance

- Drop features with no variance

In [8]:
# Deep copies: the variance-based drops below must not touch the reduced frames.
reduced_var_test_data = reduced_test_data.copy(deep=True)
reduced_var_train_data = reduced_train_data.copy(deep=True)

def run_var_analysis():
    """Report features whose variance is exactly zero.

    Reads the module-level ``reduced_var_train_data`` frame, ignoring the
    ``Id`` and ``SalePrice`` columns.

    Returns:
        A Series indexed by feature name containing only the zero variances.
    """
    variances = (
        reduced_var_train_data
        .drop(columns=['Id', 'SalePrice'])
        .var()
        .sort_values()
    )
    return variances[variances == 0]

low_variance_feats = run_var_analysis()

# Shape before the drop, followed by the zero-variance columns about to be removed.
display(reduced_var_train_data.shape, low_variance_feats)

# Remove the zero-variance columns from both splits.
reduced_var_train_data = reduced_var_train_data.drop(low_variance_feats.index, axis=1)
reduced_var_test_data = reduced_var_test_data.drop(low_variance_feats.index, axis=1)

# Re-run the analysis on the reduced frame, then show the shape after the drop.
display(run_var_analysis(), reduced_var_train_data.shape)

(1460, 181)

MSSubClass_150       0.0
Exterior1st_Other    0.0
dtype: float64

Series([], dtype: float64)

(1460, 179)

### Feature importance

- Drop features with low importance

In [40]:
from sklearn.ensemble import RandomForestRegressor

# Forest used only to rank features by importance.
rfr = RandomForestRegressor(
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
)

# Deep copies: the importance-based drops below must not touch the earlier frames.
feature_selected_test_data = reduced_var_test_data.copy(deep=True)
feature_selected_train_data = reduced_var_train_data.copy(deep=True)

def run_importance_analysis():
    """List features whose importance, as reported by ``rfr``, is below 0.1.

    Fits the module-level ``rfr`` estimator on ``feature_selected_train_data``
    (all columns except ``Id`` and ``SalePrice``) against ``SalePrice``.

    Returns:
        The column labels whose ``feature_importances_`` entry is below 0.1.
    """
    predictors = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
    target = feature_selected_train_data['SalePrice']

    importances = rfr.fit(predictors, target).feature_importances_
    is_weak = importances < 0.1
    return predictors.columns[is_weak]

low_importance_feats = run_importance_analysis()

# Shape before the drop, followed by the low-importance columns about to be removed.
display(feature_selected_train_data.shape, low_importance_feats)

# Remove the low-importance columns from both splits.
feature_selected_train_data = feature_selected_train_data.drop(low_importance_feats, axis=1)
feature_selected_test_data = feature_selected_test_data.drop(low_importance_feats, axis=1)

# Refit on the surviving columns, then show the shape after the drop.
display(run_importance_analysis(), feature_selected_train_data.shape)

(1460, 179)

Index([u'1stFlrSF', u'3SsnPorch', u'BedroomAbvGr', u'BldgType_1Fam',
       u'BldgType_2fmCon', u'BldgType_Duplex', u'BldgType_Twnhs',
       u'BldgType_TwnhsE', u'BsmtCond', u'BsmtExposure',
       ...
       u'ScreenPorch', u'Street_Grvl', u'Street_Pave', u'TotRmsAbvGrd',
       u'Utilities', u'WoodDeckSF', u'YearBuilt', u'YearRemodAdd', u'YrSold',
       u'FireplaceRating'],
      dtype='object', length=175)

Index([], dtype='object')

(1460, 4)

In [41]:
# Preview the first five rows of the columns that survived feature selection.
feature_selected_train_data.head(n=5)

Unnamed: 0,Id,NeighborhoodMeanPrice,OverallQual,SalePrice
0,1,0.419865,0.666667,208500
1,2,0.592251,0.555556,181500
2,3,0.419865,0.666667,223500
3,4,0.473342,0.666667,140000
4,5,1.0,0.777778,250000


## Regression Model Selection

### Random Forest Regressor

- It did pretty well! Wow!
- It overfitted the training data, which is not a good sign

In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

rfr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
rfr_train_y = feature_selected_train_data['SalePrice']

rfr = RandomForestRegressor(n_jobs=-1, random_state=42)

# random_state is omitted on purpose: it has no effect while shuffle=False, and
# recent scikit-learn releases raise a ValueError when both are given.
kf = KFold(n_splits=2, shuffle=False)

for train_index, test_index in kf.split(rfr_train_X):
    X_train, X_test = rfr_train_X.iloc[train_index], rfr_train_X.iloc[test_index]
    y_train, y_test = rfr_train_y.iloc[train_index], rfr_train_y.iloc[test_index]

    rfr_predictor = rfr.fit(X_train, y_train)
    y_train_predicted = rfr_predictor.predict(X_train)
    y_test_predicted = rfr_predictor.predict(X_test)

    # Clamp negative predictions to zero before computing the log-based error.
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0

    print('[Random Forest] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[Random Forest] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[Random Forest] Current fold train performance', 0.030058665363321534)
('[Random Forest] Current fold test performance', 0.04632713105196127)
('[Random Forest] Current fold train performance', 0.03536572832532612)
('[Random Forest] Current fold test performance', 0.03902620322071313)


### Linear Regression

- It performed much better than the Naive Linear Regression Predictor
- Feature engineering made that improvement happen
- It still hasn't generalized well

In [43]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

lr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
lr_train_y = feature_selected_train_data['SalePrice']

linear_regression = LinearRegression(n_jobs=-1)

# random_state is omitted on purpose: it has no effect while shuffle=False, and
# recent scikit-learn releases raise a ValueError when both are given.
kf = KFold(n_splits=2, shuffle=False)

for train_index, test_index in kf.split(lr_train_X):
    X_train, X_test = lr_train_X.iloc[train_index], lr_train_X.iloc[test_index]
    y_train, y_test = lr_train_y.iloc[train_index], lr_train_y.iloc[test_index]

    lr_predictor = linear_regression.fit(X_train, y_train)
    y_train_predicted = lr_predictor.predict(X_train)
    y_test_predicted = lr_predictor.predict(X_test)

    # Clamp negative predictions to zero before computing the log-based error.
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0

    print('[Linear Regression] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[Linear Regression] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[Linear Regression] Current fold train performance', 0.045523847863209795)
('[Linear Regression] Current fold test performance', 0.07485829009819378)
('[Linear Regression] Current fold train performance', 0.3720154305116624)
('[Linear Regression] Current fold test performance', 0.051325831826506095)


### Stochastic Gradient Descent Regressor

- It has performed pretty well!
- Also, it has generalized well!
- RandomForest has performed better.
- RandomForest has generalized better on the submission test_data.

In [44]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

sgdr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
sgdr_train_y = feature_selected_train_data['SalePrice']

sgdr = SGDRegressor(random_state=42, max_iter=100)

# random_state is omitted on purpose: it has no effect while shuffle=False, and
# recent scikit-learn releases raise a ValueError when both are given.
kf = KFold(n_splits=2, shuffle=False)

for train_index, test_index in kf.split(sgdr_train_X):
    X_train, X_test = sgdr_train_X.iloc[train_index], sgdr_train_X.iloc[test_index]
    y_train, y_test = sgdr_train_y.iloc[train_index], sgdr_train_y.iloc[test_index]

    sgdr_predictor = sgdr.fit(X_train, y_train)
    y_train_predicted = sgdr_predictor.predict(X_train)
    y_test_predicted = sgdr_predictor.predict(X_test)

    # Clamp negative predictions to zero before computing the log-based error.
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0

    print('[SGD Regressor] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[SGD Regressor] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[SGD Regressor] Current fold train performance', 0.04318651679086718)
('[SGD Regressor] Current fold test performance', 0.04881263014116553)
('[SGD Regressor] Current fold train performance', 0.0485676673910088)
('[SGD Regressor] Current fold test performance', 0.04316583135180444)


### Tuning Random Forest

In [13]:
# NOTE: this Random Forest grid search is disabled (every line is commented out).
# NOTE(review): `make_scorer(custom_scorer)` below does not pass
# greater_is_better=False, although custom_scorer returns an error value
# (mean squared log error). With the default, a higher value is treated as a
# better score, so the search would favour the parameters with the largest
# error. The "Tuning SGD Regressor" cell passes greater_is_better=False.
# Confirm and fix before re-enabling this cell.

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_log_error, make_scorer
# from sklearn.model_selection import KFold, GridSearchCV

# def custom_scorer(y, y_pred):
#     y_pred[y_pred < 0] = 0
#     return mean_squared_log_error(y, y_pred)

# gs_scorer = make_scorer(custom_scorer)

# gs_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
# gs_train_y = feature_selected_train_data['SalePrice']

# rfr = RandomForestRegressor(n_jobs=-1, random_state=42)

# rfr_gs_params = {
#     'n_estimators': [5, 10, 25, 50, 100],
#     'max_depth': [2, 3, 4, 5, 8, 10],
#     'min_samples_split': [2, 3, 5],
#     'min_samples_leaf': [1, 2, 3, 5],
#     'max_leaf_nodes': [6, 8, 10, None]
# }

# gs = GridSearchCV(rfr, rfr_gs_params, n_jobs=-1, cv=3, scoring=gs_scorer)

# rfr_gs_predictor = gs.fit(gs_train_X, gs_train_y)

# rfr_gs_predictor.best_params_

{'max_depth': 2,
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 5}

In [14]:
# NOTE: disabled cell (every line is commented out). It re-evaluates a Random
# Forest built with explicit hyperparameters (max_depth=2, n_estimators=5,
# min_samples_leaf=1, min_samples_split=2) using a 2-fold loop.

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_log_error
# from sklearn.model_selection import KFold

# rfr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
# rfr_train_y = feature_selected_train_data['SalePrice']

# # max_depth=10, n_estimators=10
# # max_depth=2, n_estimators=100
# # max_depth=2, n_estimators=100, min_samples_leaf= 3, min_samples_split=3

# rfr = RandomForestRegressor(n_jobs=-1, random_state=42, max_depth=2, n_estimators=5, min_samples_leaf= 1, min_samples_split=2)

# kf = KFold(n_splits=2, random_state=42, shuffle=False)

# for kf_chunks in kf.split(rfr_train_X):
#     train_index = kf_chunks[0]
#     test_index = kf_chunks[1]
    
#     X_train = rfr_train_X.iloc[train_index]
#     X_test = rfr_train_X.iloc[test_index]
#     y_train = rfr_train_y.iloc[train_index]
#     y_test = rfr_train_y.iloc[test_index]
    
#     rfr_predictor = rfr.fit(X_train, y_train)
#     y_train_predicted = rfr_predictor.predict(X_train)
#     y_test_predicted = rfr_predictor.predict(X_test)
    
#     y_train_predicted[y_train_predicted < 0] = 0
#     y_test_predicted[y_test_predicted < 0] = 0
    
#     print('[Random Forest] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
#     print('[Random Forest] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[Random Forest] Current fold train performance', 0.07483793962437083)
('[Random Forest] Current fold test performance', 0.08518006322946116)
('[Random Forest] Current fold train performance', 0.0862887548227771)
('[Random Forest] Current fold test performance', 0.08464514415771207)


### Tuning SGD Regressor

In [45]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold, GridSearchCV

def custom_scorer(y, y_pred):
    """Mean squared log error with negative predictions clamped to zero.

    Args:
        y: Ground-truth target values.
        y_pred: Predicted values; may contain negatives. Presumably a NumPy
            array, as boolean-mask assignment is used on its copy. The
            caller's array is left unmodified.

    Returns:
        The value of ``mean_squared_log_error(y, clamped_predictions)``.
    """
    # Clamp on a copy so the caller's prediction array is not mutated.
    clamped = y_pred.copy()
    clamped[clamped < 0] = 0
    return mean_squared_log_error(y, clamped)

# custom_scorer returns an error, so lower is better.
gs_scorer = make_scorer(custom_scorer, greater_is_better=False)

gs_train_y = feature_selected_train_data['SalePrice']
gs_train_X = feature_selected_train_data.drop(['Id', 'SalePrice'], axis=1)

sgd = SGDRegressor(random_state=42)

# Hyperparameter grid explored by the search.
sgd_gs_params = dict(
    max_iter=[1000, 500, 2000],
    alpha=[0.0001, 0.00001, 0.001],
    tol=[0.001, 0.01, 0.0001],
    learning_rate=['constant', 'optimal', 'invscaling', 'adaptive'],
    eta0=[0.01, 0.1, 0.001],
)

gs = GridSearchCV(
    sgd,
    sgd_gs_params,
    scoring=gs_scorer,
    cv=2,
    n_jobs=-1,
)

sgd_gs_predictor = gs.fit(gs_train_X, gs_train_y)

sgd_gs_predictor.best_params_

{'alpha': 1e-05,
 'eta0': 0.001,
 'learning_rate': 'invscaling',
 'max_iter': 2000,
 'tol': 0.001}

In [46]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

sgdr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
sgdr_train_y = feature_selected_train_data['SalePrice']

# Hyperparameters copied from the grid-search result shown above.
sgdr = SGDRegressor(random_state=42, max_iter=2000, alpha=1e-05, eta0=0.001, tol=0.001, learning_rate='invscaling')

# random_state is omitted on purpose: it has no effect while shuffle=False, and
# recent scikit-learn releases raise a ValueError when both are given.
kf = KFold(n_splits=2, shuffle=False)

for train_index, test_index in kf.split(sgdr_train_X):
    X_train, X_test = sgdr_train_X.iloc[train_index], sgdr_train_X.iloc[test_index]
    y_train, y_test = sgdr_train_y.iloc[train_index], sgdr_train_y.iloc[test_index]

    sgdr_predictor = sgdr.fit(X_train, y_train)
    y_train_predicted = sgdr_predictor.predict(X_train)
    y_test_predicted = sgdr_predictor.predict(X_test)

    # Clamp negative predictions to zero before computing the log-based error.
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0

    print('[SGD Regressor] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[SGD Regressor] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[SGD Regressor] Current fold train performance', 0.04318339674048233)
('[SGD Regressor] Current fold test performance', 0.048847239348347574)
('[SGD Regressor] Current fold train performance', 0.04869773385035724)
('[SGD Regressor] Current fold test performance', 0.04327340216410617)


## Results for submission

In [47]:
submit_test_X = feature_selected_test_data.drop(columns=['Id'])

# Refit the SGD regressor on the full training set before predicting.
sgdr_predictor.fit(sgdr_train_X, sgdr_train_y)
results = sgdr_predictor.predict(submit_test_X)

# Apply the same clamp used in the evaluation cells: negative predictions are
# set to zero so the submission matches what was measured during validation.
results[results < 0] = 0

# Column order is stated explicitly so it does not depend on dict ordering.
kaggle_sb_df = pd.DataFrame(
    {'Id': feature_selected_test_data['Id'], 'SalePrice': results},
    columns=['Id', 'SalePrice'],
)

kaggle_sb_df.to_csv('data/kaggle_submission_04.csv', index=False)