## Refetching transformed data

In [1]:
import pandas as pd

train_data = pd.read_csv('./data/aligned_encoded_train_data.csv')
test_data = pd.read_csv('./data/aligned_encoded_test_data.csv')

test_data.drop(columns=['SalePrice'], inplace=True)

train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,1,0,0,0,0,3,...,0,0,1,8,856,4,0,2003,2003,2008
1,1262,0,0,3,1,0,0,0,0,3,...,0,0,1,6,1262,4,298,1976,1976,2007
2,920,866,0,3,1,0,0,0,0,3,...,0,0,1,6,920,4,0,2001,2002,2008
3,961,756,0,3,1,0,0,0,0,4,...,0,0,1,7,756,4,0,1915,1970,2006
4,1145,1053,0,4,1,0,0,0,0,3,...,0,0,1,9,1145,4,192,2000,2000,2008


## Removing outliers

In [2]:
import numpy as np

outliers_data = train_data.copy(deep=True).drop(train_data.index)
outlier_column = 'Is outlier in more than one feature?'

features_to_check_for_outliers = [
    'SalePrice',
    'MiscVal',
    'NeighborhoodMeanPrice',
    'GarageTypeMeanPrice',
    'LotArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'LotFrontage',
    'LotDepth',
    'MasVnrArea'
]

# For each feature find the data points with extreme high or low values
for feature in features_to_check_for_outliers:
    
    # TODO: Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(train_data[feature], 25)
    
    # TODO: Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(train_data[feature], 75)
    
    # TODO: Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = (Q3 - Q1) * 1.5
    
    # Display the outliers
    print("Data points considered outliers for the feature '{}':".format(feature))
    feature_outliers = train_data[~((train_data[feature] >= Q1 - step) & (train_data[feature] <= Q3 + step))]
    # display(feature_outliers)
    display(feature)
    display(feature_outliers.shape)
    
    outliers_data = outliers_data.append(feature_outliers)

# OPTIONAL: Create Duplicated feature column
outliers_data[outlier_column] = outliers_data.duplicated(keep=False)
# OPTIONAL: Drop duplicated rows
outliers_data_deduplicated = outliers_data.sort_index().drop_duplicates()
# OPTIONAL: Filter rows with Duplicated == True
outliers_in_more_than_one_feature = outliers_data_deduplicated[outliers_data_deduplicated[outlier_column]]

# Display rows with more than one feature considered outlier
print("Data points considered outliers for more than one feature:")
# display(outliers_in_more_than_one_feature)

# OPTIONAL: Select the indices for data points you wish to remove
# Remove data points where more than one feature is considered outlier
outliers  = outliers_in_more_than_one_feature.index

# Remove the outliers, if any were specified
good_data = train_data.drop(train_data.index[outliers]).reset_index(drop = True)

display(outliers)
display(good_data.shape)

Data points considered outliers for the feature 'SalePrice':


'SalePrice'

(61, 201)

Data points considered outliers for the feature 'MiscVal':


'MiscVal'

(52, 201)

Data points considered outliers for the feature 'NeighborhoodMeanPrice':


'NeighborhoodMeanPrice'

(143, 201)

Data points considered outliers for the feature 'GarageTypeMeanPrice':


'GarageTypeMeanPrice'

(0, 201)

Data points considered outliers for the feature 'LotArea':


'LotArea'

(69, 201)

Data points considered outliers for the feature 'BsmtFinSF1':


'BsmtFinSF1'

(7, 201)

Data points considered outliers for the feature 'BsmtFinSF2':


'BsmtFinSF2'

(167, 201)

Data points considered outliers for the feature 'BsmtUnfSF':


'BsmtUnfSF'

(29, 201)

Data points considered outliers for the feature 'TotalBsmtSF':


'TotalBsmtSF'

(61, 201)

Data points considered outliers for the feature '1stFlrSF':


'1stFlrSF'

(20, 201)

Data points considered outliers for the feature '2ndFlrSF':


'2ndFlrSF'

(2, 201)

Data points considered outliers for the feature 'GrLivArea':


'GrLivArea'

(31, 201)

Data points considered outliers for the feature 'GarageArea':


'GarageArea'

(21, 201)

Data points considered outliers for the feature 'WoodDeckSF':


'WoodDeckSF'

(32, 201)

Data points considered outliers for the feature 'OpenPorchSF':


'OpenPorchSF'

(77, 201)

Data points considered outliers for the feature 'EnclosedPorch':


'EnclosedPorch'

(208, 201)

Data points considered outliers for the feature '3SsnPorch':


'3SsnPorch'

(24, 201)

Data points considered outliers for the feature 'ScreenPorch':


'ScreenPorch'

(116, 201)

Data points considered outliers for the feature 'PoolArea':


'PoolArea'

(7, 201)

Data points considered outliers for the feature 'BedroomAbvGr':


'BedroomAbvGr'

(35, 201)

Data points considered outliers for the feature 'KitchenAbvGr':


'KitchenAbvGr'

(68, 201)

Data points considered outliers for the feature 'LotFrontage':


'LotFrontage'

(121, 201)

Data points considered outliers for the feature 'LotDepth':


'LotDepth'

(228, 201)

Data points considered outliers for the feature 'MasVnrArea':


'MasVnrArea'

(98, 201)

Data points considered outliers for more than one feature:


Int64Index([   5,    7,    8,   11,   14,   17,   25,   28,   39,   45,
            ...
            1426, 1430, 1437, 1438, 1439, 1445, 1446, 1449, 1458, 1459],
           dtype='int64', length=432)

(1028, 201)

## Feature scaling and normalization

References:

- https://towardsdatascience.com/the-ultimate-guide-to-data-cleaning-3969843991d4#d078
- https://www.codecademy.com/articles/normalization
- https://machinelearningmastery.com/feature-selection-machine-learning-python/
- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

In [3]:
# space (normalize and scale):
#     LotArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, 
#     GrLivArea, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea,
#     BedroomAbvGr, KitchenAbvGr, LotFrontage (*), LotDepth (*), MasVnrArea, 
#
# quantity (scale):
#     BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, TotRmsAbvGrd, Fireplaces, GarageCars
#
# grade (scale):
#     LotShape, LandSlope, Utilities, ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure,
#     HeatingQC, CentralAir, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond
#     BsmtFinType1, BsmtFinType2, OverallQual, OverallCond, 
#
# time (scale):
#     YearBuilt, YearRemodAdd, YrSold, GarageYrBlt,
#
# currency (normalize and scale):
#     MiscVal, NeighborhoodMeanPrice, GarageTypeMeanPrice
#
# one-hot-encoded-binary (keep):
#     MSSubClass, MSZoning, LandContour, Street, LotConfig, BldgType, HouseStyle, RoofStyle, RoofMatl, MoSold,
#     MasVnrType, Foundation, Heating, Electrical, PavedDrive, Condition1, Condition2, Exterior1st, Exterior2nd

columns_to_normalize = [
#     'MiscVal',
#     'NeighborhoodMeanPrice',
#     'GarageTypeMeanPrice',
#     'LotArea',
#     'BsmtFinSF1',
#     'BsmtFinSF2',
    'BsmtUnfSF',
#     'TotalBsmtSF',
#     '1stFlrSF',
#     '2ndFlrSF',
#     'GrLivArea',
#     'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'BedroomAbvGr',
    'KitchenAbvGr',
#     'LotFrontage',
#     'LotDepth',
    'MasVnrArea'
]

columns_to_scale = [
#     'MiscVal',
#     'NeighborhoodMeanPrice',
#     'GarageTypeMeanPrice',
#     'LotArea',
#     'BsmtFinSF1',
#     'BsmtFinSF2',
    'BsmtUnfSF',
#     'TotalBsmtSF',
#     '1stFlrSF',
#     '2ndFlrSF',
#     'GrLivArea',
#     'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'BedroomAbvGr',
    'KitchenAbvGr',
#     'LotFrontage',
#     'LotDepth',
    'MasVnrArea',
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'LotShape',
    'LandSlope',
    'Utilities',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'Functional',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'BsmtFinType1',
    'BsmtFinType2',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'YrSold',
    'GarageYrBlt'
]

In [4]:
# mean normalization
normalized_train_data = good_data.copy()
normalized_test_data = test_data.copy()

normalized_train_data[columns_to_normalize] = (normalized_train_data[columns_to_normalize] - normalized_train_data[columns_to_normalize].mean()) / normalized_train_data[columns_to_normalize].std()
normalized_test_data[columns_to_normalize] = (normalized_test_data[columns_to_normalize] - normalized_test_data[columns_to_normalize].mean()) / normalized_test_data[columns_to_normalize].std()
normalized_train_data.fillna(0, inplace=True)
normalized_test_data.fillna(0, inplace=True)

normalized_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,-0.097769,0.281436,1,0,0,0,0,3,...,-0.208611,0,1,8,856,4,-0.784357,2003,2003,2008
1,1262,0,-0.097769,0.281436,1,0,0,0,0,3,...,-0.208611,0,1,6,1262,4,1.968847,1976,1976,2007
2,920,866,-0.097769,0.281436,1,0,0,0,0,3,...,-0.208611,0,1,6,920,4,-0.784357,2001,2002,2008
3,961,756,-0.097769,0.281436,1,0,0,0,0,4,...,-0.208611,0,1,7,756,4,-0.784357,1915,1970,2006
4,1145,1053,-0.097769,1.720817,1,0,0,0,0,3,...,-0.208611,0,1,9,1145,4,0.989519,2000,2000,2008


In [5]:
# min max scaling
# normalized_df=(df-df.min())/(df.max()-df.min())
scaled_train_data = normalized_train_data.copy()
scaled_test_data = normalized_test_data.copy()

scaled_train_data[columns_to_scale] = (scaled_train_data[columns_to_scale] - scaled_train_data[columns_to_scale].min()) / (scaled_train_data[columns_to_scale].max() - scaled_train_data[columns_to_scale].min())
scaled_test_data[columns_to_scale] = (scaled_test_data[columns_to_scale] - scaled_train_data[columns_to_scale].min()) / (scaled_train_data[columns_to_scale].max() - scaled_train_data[columns_to_scale].min())
scaled_train_data.fillna(0, inplace=True)
scaled_test_data.fillna(0, inplace=True)

scaled_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0.0,0.6,1,0,0,0,0,0.75,...,0.0,0,1,0.666667,856,0.0,0.0,0.956204,0.883333,0.5
1,1262,0,0.0,0.6,1,0,0,0,0,0.75,...,0.0,0,1,0.444444,1262,0.0,0.55597,0.759124,0.433333,0.25
2,920,866,0.0,0.6,1,0,0,0,0,0.75,...,0.0,0,1,0.444444,920,0.0,0.0,0.941606,0.866667,0.5
3,961,756,0.0,0.6,1,0,0,0,0,1.0,...,0.0,0,1,0.555556,756,0.0,0.0,0.313869,0.333333,0.0
4,1145,1053,0.0,0.8,1,0,0,0,0,0.75,...,0.0,0,1,0.777778,1145,0.0,0.358209,0.934307,0.833333,0.5


## Feature Selection

References:

- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
- https://seaborn.pydata.org/generated/seaborn.pairplot.html

### Feature variance

- Drop features with no variance

In [6]:
reduced_var_train_data = scaled_train_data.copy()
reduced_var_test_data = scaled_test_data.copy()

def run_var_analysis():
    features = reduced_var_train_data.drop(columns=['Id', 'SalePrice'])

    var_feats = features.var()
    var_feats_ordered = var_feats.sort_values(ascending=True)
    return var_feats_ordered[var_feats_ordered < 0.1]

low_variance_feats = run_var_analysis()
display(reduced_var_train_data.shape)
display(low_variance_feats)

reduced_var_train_data.drop(columns=low_variance_feats.index, inplace=True)
reduced_var_test_data.drop(columns=low_variance_feats.index, inplace=True)

display(run_var_analysis())
display(reduced_var_train_data.shape)

(1028, 201)

Exterior1st_AsphShn    0.000000
RoofMatl_Membran       0.000000
RoofMatl_Metal         0.000000
RoofMatl_Roll          0.000000
Heating_Wall           0.000000
Exterior2nd_Other      0.000000
Exterior2nd_CBlock     0.000000
Exterior1st_Stone      0.000000
Exterior1st_Other      0.000000
RoofStyle_Shed         0.000000
Exterior1st_ImStucc    0.000000
Exterior1st_CBlock     0.000000
Condition2_RRAe        0.000000
Condition2_PosN        0.000000
Condition2_PosA        0.000000
MSSubClass_150         0.000000
RoofMatl_ClyTile       0.000000
Utilities              0.000000
Exterior2nd_Stone      0.000973
Heating_Floor          0.000973
PoolArea               0.000973
Foundation_Wood        0.000973
Condition2_RRAn        0.000973
RoofMatl_WdShngl       0.000973
RoofMatl_WdShake       0.000973
Electrical_Mix         0.000973
Electrical_FuseP       0.000973
Heating_OthW           0.001944
Street_Pave            0.001944
Street_Grvl            0.001944
                         ...   
YearBuil

Series([], dtype: float64)

(1028, 47)

### Feature correlation

- Drop highly correlated features

In [7]:
reduced_train_data = reduced_var_train_data.copy()
reduced_test_data = reduced_var_test_data.copy()

def run_corr_analysis():
    features = reduced_train_data.drop(columns=['Id', 'SalePrice'])

    corr = features.corr().abs()
    corr[corr == 1] = 0
    corr_cols = corr.max().sort_values(ascending=False)
    return corr_cols[corr_cols > 0.8]

display(reduced_train_data.shape)

corr_results = run_corr_analysis()
corr_result_index = 0
for corr_result in corr_results.index:
    # display(corr_result)
    
    if (corr_result_index % 2 == 1):
        # display(corr_result)
        reduced_train_data.drop(columns=[corr_result], inplace=True)
        reduced_test_data.drop(columns=[corr_result], inplace=True)
    
    corr_result_index = corr_result_index + 1

display(reduced_train_data.shape)

(1028, 47)

(1028, 38)

### Target correlation

- Drop lowly target-correlated features

In [8]:
def run_target_corr_analysis():
    features = reduced_train_data.drop(columns=['Id'])

    corr = features.corr().abs()
    target_corr = corr['SalePrice'].sort_values(ascending=True)
    return target_corr[target_corr < 0.2]

lowly_correlated_feats = run_target_corr_analysis()
display(reduced_train_data.shape)
display(lowly_correlated_feats)

reduced_train_data.drop(columns=lowly_correlated_feats.index, inplace=True)
reduced_test_data.drop(columns=lowly_correlated_feats.index, inplace=True)

display(run_target_corr_analysis())
display(reduced_train_data.shape)

(1028, 38)

MiscVal                0.008387
MSSubClass_20          0.011899
YrSold                 0.013726
MoSold_5               0.014451
LotConfig_Inside       0.016162
BsmtFinSF2             0.021231
MoSold_6               0.032318
LotConfig_Corner       0.037046
BldgType_1Fam          0.038686
MoSold_7               0.039482
RoofStyle_Hip          0.060995
LowQualFinSF           0.075244
LotDepth               0.085522
HouseStyle_1Story      0.104140
Exterior1st_HdBoard    0.106123
Condition1_Norm        0.153754
MSZoning_RL            0.199919
Name: SalePrice, dtype: float64

Series([], Name: SalePrice, dtype: float64)

(1028, 21)

### Feature aggregation

In [9]:
# reduced_train_data['FireplaceRating'] = reduced_train_data['Fireplaces'] * reduced_train_data['FireplaceQu']
# reduced_test_data['FireplaceRating'] = reduced_test_data['Fireplaces'] * reduced_test_data['FireplaceQu']

### Feature importance

- Drop lowly important features

In [10]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=50)

feature_selected_train_data = reduced_train_data.copy()
feature_selected_test_data = reduced_test_data.copy()

def run_importance_analysis():
    features = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
    
    rfr_importance_analyser = rfr.fit(features, feature_selected_train_data['SalePrice'])
    return features.columns[rfr_importance_analyser.feature_importances_ < 0.05]

low_importance_feats = run_importance_analysis()
display(feature_selected_train_data.shape)
display(low_importance_feats)

feature_selected_train_data.drop(columns=low_importance_feats, inplace=True)
feature_selected_test_data.drop(columns=low_importance_feats, inplace=True)

display(run_importance_analysis())
display(feature_selected_train_data.shape)

(1028, 21)

Index([u'BsmtFinType1', u'Exterior1st_MetalSd', u'Exterior1st_VinylSd',
       u'Exterior2nd_Wd Sdng', u'FireplaceQu', u'Foundation_PConc',
       u'GarageTypeMeanPrice', u'HouseStyle_2Story', u'LotArea',
       u'LotFrontage', u'MSSubClass_60', u'MSZoning_RM', u'MasVnrType_None',
       u'YearRemodAdd'],
      dtype='object')

Index([], dtype='object')

(1028, 7)

In [11]:
feature_selected_train_data.describe()

Unnamed: 0,1stFlrSF,BsmtFinSF1,GarageArea,GrLivArea,Id,NeighborhoodMeanPrice,SalePrice
count,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0
mean,1090.909533,414.585603,451.845331,1405.243191,732.459144,174643.252918,168071.782101
std,304.68879,393.970246,191.697934,406.980093,421.466399,47803.092373,56130.459219
min,334.0,0.0,0.0,334.0,1.0,98576.0,35311.0
25%,864.0,0.0,312.0,1094.75,367.75,136793.0,129500.0
50%,1040.0,384.0,461.5,1382.0,739.5,186556.0,158000.0
75%,1276.25,690.5,564.0,1668.0,1099.25,197966.0,197600.0
max,2136.0,1619.0,1053.0,2730.0,1458.0,335295.0,370878.0


## Regression Model Selection

### Random Forest Regressor

- It did pretty well! Wow!
- It overfitted the training data, which is not a good sign

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

rfr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
rfr_train_y = feature_selected_train_data['SalePrice']

rfr = RandomForestRegressor(n_jobs=-1, random_state=42)

kf = KFold(n_splits=2, random_state=42, shuffle=False)

for kf_chunks in kf.split(rfr_train_X):
    train_index = kf_chunks[0]
    test_index = kf_chunks[1]
    
    X_train = rfr_train_X.iloc[train_index]
    X_test = rfr_train_X.iloc[test_index]
    y_train = rfr_train_y.iloc[train_index]
    y_test = rfr_train_y.iloc[test_index]
    
    rfr_predictor = rfr.fit(X_train, y_train)
    y_train_predicted = rfr_predictor.predict(X_train)
    y_test_predicted = rfr_predictor.predict(X_test)
    
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0
    
    print('[Random Forest] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[Random Forest] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))



('[Random Forest] Current fold train performance', 0.004254810992312083)
('[Random Forest] Current fold test performance', 0.026745082263124047)
('[Random Forest] Current fold train performance', 0.0049320784223269715)
('[Random Forest] Current fold test performance', 0.024173048626232544)


### Linear Regression

- It performed much better than the Naive Linear Regression Predictor
- Feature engineering made that improvement happen
- It still haven't generalized well

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

lr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
lr_train_y = feature_selected_train_data['SalePrice']

linear_regression = LinearRegression(n_jobs=-1)

kf = KFold(n_splits=2, random_state=42, shuffle=False)

for kf_chunks in kf.split(lr_train_X):
    train_index = kf_chunks[0]
    test_index = kf_chunks[1]
    
    X_train = lr_train_X.iloc[train_index]
    X_test = lr_train_X.iloc[test_index]
    y_train = lr_train_y.iloc[train_index]
    y_test = lr_train_y.iloc[test_index]
    
    lr_predictor = linear_regression.fit(X_train, y_train)
    y_train_predicted = lr_predictor.predict(X_train)
    y_test_predicted = lr_predictor.predict(X_test)
    
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0
    
    print('[Linear Regression] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[Linear Regression] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[Linear Regression] Current fold train performance', 0.022001772750309314)
('[Linear Regression] Current fold test performance', 0.02605606354135643)
('[Linear Regression] Current fold train performance', 0.026381716118579698)
('[Linear Regression] Current fold test performance', 0.02423829587811468)


### Stochastic Gradient Descent Regressor

- It has performed pretty good!
- Also, it have generalized well!
- RandomForest have performed better.
- RandomForest have generalized better with the submission test_data.

In [14]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

sgdr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
sgdr_train_y = feature_selected_train_data['SalePrice']

sgdr = SGDRegressor(random_state=42, max_iter=100)

kf = KFold(n_splits=2, random_state=42, shuffle=False)

for kf_chunks in kf.split(sgdr_train_X):
    train_index = kf_chunks[0]
    test_index = kf_chunks[1]
    
    X_train = sgdr_train_X.iloc[train_index]
    X_test = sgdr_train_X.iloc[test_index]
    y_train = sgdr_train_y.iloc[train_index]
    y_test = sgdr_train_y.iloc[test_index]
    
    sgdr_predictor = sgdr.fit(X_train, y_train)
    y_train_predicted = sgdr_predictor.predict(X_train)
    y_test_predicted = sgdr_predictor.predict(X_test)
    
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0
    
    print('[SGD Regressor] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[SGD Regressor] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[SGD Regressor] Current fold train performance', 143.52029558305387)
('[SGD Regressor] Current fold test performance', 143.5776718397384)
('[SGD Regressor] Current fold train performance', 143.5776718397384)
('[SGD Regressor] Current fold test performance', 143.52029558305387)




### Tuning Random Forest

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold, GridSearchCV

def custom_scorer(y, y_pred):
    y_pred[y_pred < 0] = 0
    return mean_squared_log_error(y, y_pred)

gs_scorer = make_scorer(custom_scorer, greater_is_better=False)

gs_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
gs_train_y = feature_selected_train_data['SalePrice']

rfr = RandomForestRegressor(n_jobs=-1, random_state=42)

rfr_gs_params = {
    'n_estimators': [5, 50, 100],
    'max_depth': [2, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 2, 3, 5],
    'max_leaf_nodes': [None]
}

gs = GridSearchCV(rfr, rfr_gs_params, n_jobs=-1, cv=3, scoring=gs_scorer)

rfr_gs_predictor = gs.fit(gs_train_X, gs_train_y)

rfr_gs_predictor.best_params_

{'max_depth': 10,
 'max_leaf_nodes': None,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'n_estimators': 100}

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

rfr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
rfr_train_y = feature_selected_train_data['SalePrice']

# max_depth=10, n_estimators=10
# max_depth=2, n_estimators=100
# max_depth=2, n_estimators=100, min_samples_leaf= 3, min_samples_split=3

rfr = RandomForestRegressor(n_jobs=-1, random_state=42, max_depth=10, n_estimators=100, min_samples_leaf=3, min_samples_split=2)

kf = KFold(n_splits=2, random_state=42, shuffle=False)

for kf_chunks in kf.split(rfr_train_X):
    train_index = kf_chunks[0]
    test_index = kf_chunks[1]
    
    X_train = rfr_train_X.iloc[train_index]
    X_test = rfr_train_X.iloc[test_index]
    y_train = rfr_train_y.iloc[train_index]
    y_test = rfr_train_y.iloc[test_index]
    
    rfr_predictor = rfr.fit(X_train, y_train)
    y_train_predicted = rfr_predictor.predict(X_train)
    y_test_predicted = rfr_predictor.predict(X_test)
    
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0
    
    print('[Random Forest] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[Random Forest] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[Random Forest] Current fold train performance', 0.009034906728874866)
('[Random Forest] Current fold test performance', 0.024546851023777964)
('[Random Forest] Current fold train performance', 0.011039039658232446)
('[Random Forest] Current fold test performance', 0.021884807118621754)


### Tuning SGD Regressor

In [17]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold, GridSearchCV

def custom_scorer(y, y_pred):
    y_pred[y_pred < 0] = 0
    return mean_squared_log_error(y, y_pred)

gs_scorer = make_scorer(custom_scorer, greater_is_better=False)

gs_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
gs_train_y = feature_selected_train_data['SalePrice']

sgd = SGDRegressor(random_state=42)

sgd_gs_params = {
    'max_iter': [1000, 500, 2000],
    'alpha': [0.0001, 0.00001, 0.001],
    'tol': [0.001, 0.01, 0.0001],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': [0.01, 0.1, 0.001]
}

gs = GridSearchCV(sgd, sgd_gs_params, n_jobs=-1, cv=2, scoring=gs_scorer)

sgd_gs_predictor = gs.fit(gs_train_X, gs_train_y)

sgd_gs_predictor.best_params_

{'alpha': 0.0001,
 'eta0': 0.01,
 'learning_rate': 'optimal',
 'max_iter': 1000,
 'tol': 0.001}

In [18]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import KFold

sgdr_train_X = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
sgdr_train_y = feature_selected_train_data['SalePrice']

sgdr = SGDRegressor(random_state=42, max_iter=1000, alpha=0.0001, eta0=0.1, tol=0.001, learning_rate='constant')

kf = KFold(n_splits=2, random_state=42, shuffle=False)

for kf_chunks in kf.split(sgdr_train_X):
    train_index = kf_chunks[0]
    test_index = kf_chunks[1]
    
    X_train = sgdr_train_X.iloc[train_index]
    X_test = sgdr_train_X.iloc[test_index]
    y_train = sgdr_train_y.iloc[train_index]
    y_test = sgdr_train_y.iloc[test_index]
    
    sgdr_predictor = sgdr.fit(X_train, y_train)
    y_train_predicted = sgdr_predictor.predict(X_train)
    y_test_predicted = sgdr_predictor.predict(X_test)
    
    y_train_predicted[y_train_predicted < 0] = 0
    y_test_predicted[y_test_predicted < 0] = 0
    
    print('[SGD Regressor] Current fold train performance', mean_squared_log_error(y_train, y_train_predicted))
    print('[SGD Regressor] Current fold test performance', mean_squared_log_error(y_test, y_test_predicted))

('[SGD Regressor] Current fold train performance', 143.52029558305387)
('[SGD Regressor] Current fold test performance', 143.5776718397384)
('[SGD Regressor] Current fold train performance', 1379.9462682183603)
('[SGD Regressor] Current fold test performance', 1380.7153202545874)


## Results for submission

In [19]:
submit_test_X = feature_selected_test_data.drop(columns=['Id'])

# sgdr_predictor.fit(sgdr_train_X, sgdr_train_y)
# results = sgdr_predictor.predict(submit_test_X)
rfr_predictor.fit(rfr_train_X, rfr_train_y)
results = rfr_predictor.predict(submit_test_X)

kaggle_sb_df = pd.DataFrame({ 'SalePrice': results, 'Id': feature_selected_test_data['Id'] })

kaggle_sb_df.to_csv('data/kaggle_submission_09.csv', index=False)