## Refetching transformed data

In [1]:
import pandas as pd

train_data = pd.read_csv('./data/aligned_encoded_train_data.csv')
test_data = pd.read_csv('./data/aligned_encoded_test_data.csv')

test_data.drop(columns=['SalePrice'], inplace=True)

train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,ScreenPorch,Street_Grvl,Street_Pave,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,1,0,0,0,0,3,...,0,0,1,8,856,4,0,2003,2003,2008
1,1262,0,0,3,1,0,0,0,0,3,...,0,0,1,6,1262,4,298,1976,1976,2007
2,920,866,0,3,1,0,0,0,0,3,...,0,0,1,6,920,4,0,2001,2002,2008
3,961,756,0,3,1,0,0,0,0,4,...,0,0,1,7,756,4,0,1915,1970,2006
4,1145,1053,0,4,1,0,0,0,0,3,...,0,0,1,9,1145,4,192,2000,2000,2008


## Removing outliers

In [2]:
import numpy as np

outliers_data = train_data.copy(deep=True).drop(train_data.index)
outlier_column = 'Is outlier in more than one feature?'

features_to_check_for_outliers = [
    'SalePrice',
    'MiscVal',
    'NeighborhoodMeanPrice',
    'GarageTypeMeanPrice',
    'LotArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    '1stFlrSF',
    '2ndFlrSF',
    'GrLivArea',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'BedroomAbvGr',
    'KitchenAbvGr',
    'LotFrontage',
    'LotDepth',
    'MasVnrArea'
]

# For each feature find the data points with extreme high or low values
for feature in features_to_check_for_outliers:
    
    # TODO: Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(train_data[feature], 25)
    
    # TODO: Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(train_data[feature], 75)
    
    # TODO: Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = (Q3 - Q1) * 1.5
    
    # Display the outliers
    print("Data points considered outliers for the feature '{}':".format(feature))
    feature_outliers = train_data[~((train_data[feature] >= Q1 - step) & (train_data[feature] <= Q3 + step))]
    # display(feature_outliers)
    display(feature)
    display(feature_outliers.shape)
    
    outliers_data = outliers_data.append(feature_outliers)

# OPTIONAL: Create Duplicated feature column
outliers_data[outlier_column] = outliers_data.duplicated(keep=False)
# OPTIONAL: Drop duplicated rows
outliers_data_deduplicated = outliers_data.sort_index().drop_duplicates()
# OPTIONAL: Filter rows with Duplicated == True
outliers_in_more_than_one_feature = outliers_data_deduplicated[outliers_data_deduplicated[outlier_column]]

# Display rows with more than one feature considered outlier
# print("Data points considered outliers for more than one feature:")
# display(outliers_in_more_than_one_feature)

# OPTIONAL: Select the indices for data points you wish to remove
# Remove data points where more than one feature is considered outlier
outliers  = outliers_in_more_than_one_feature.index

# Remove the outliers, if any were specified
good_data = train_data.drop(train_data.index[outliers]).reset_index(drop = True)


print("Final data shape:")
display(good_data.shape)

Data points considered outliers for the feature 'SalePrice':


'SalePrice'

(61, 201)

Data points considered outliers for the feature 'MiscVal':


'MiscVal'

(52, 201)

Data points considered outliers for the feature 'NeighborhoodMeanPrice':


'NeighborhoodMeanPrice'

(143, 201)

Data points considered outliers for the feature 'GarageTypeMeanPrice':


'GarageTypeMeanPrice'

(0, 201)

Data points considered outliers for the feature 'LotArea':


'LotArea'

(69, 201)

Data points considered outliers for the feature 'BsmtFinSF1':


'BsmtFinSF1'

(7, 201)

Data points considered outliers for the feature 'BsmtFinSF2':


'BsmtFinSF2'

(167, 201)

Data points considered outliers for the feature 'BsmtUnfSF':


'BsmtUnfSF'

(29, 201)

Data points considered outliers for the feature 'TotalBsmtSF':


'TotalBsmtSF'

(61, 201)

Data points considered outliers for the feature '1stFlrSF':


'1stFlrSF'

(20, 201)

Data points considered outliers for the feature '2ndFlrSF':


'2ndFlrSF'

(2, 201)

Data points considered outliers for the feature 'GrLivArea':


'GrLivArea'

(31, 201)

Data points considered outliers for the feature 'GarageArea':


'GarageArea'

(21, 201)

Data points considered outliers for the feature 'WoodDeckSF':


'WoodDeckSF'

(32, 201)

Data points considered outliers for the feature 'OpenPorchSF':


'OpenPorchSF'

(77, 201)

Data points considered outliers for the feature 'EnclosedPorch':


'EnclosedPorch'

(208, 201)

Data points considered outliers for the feature '3SsnPorch':


'3SsnPorch'

(24, 201)

Data points considered outliers for the feature 'ScreenPorch':


'ScreenPorch'

(116, 201)

Data points considered outliers for the feature 'PoolArea':


'PoolArea'

(7, 201)

Data points considered outliers for the feature 'BedroomAbvGr':


'BedroomAbvGr'

(35, 201)

Data points considered outliers for the feature 'KitchenAbvGr':


'KitchenAbvGr'

(68, 201)

Data points considered outliers for the feature 'LotFrontage':


'LotFrontage'

(121, 201)

Data points considered outliers for the feature 'LotDepth':


'LotDepth'

(228, 201)

Data points considered outliers for the feature 'MasVnrArea':


'MasVnrArea'

(98, 201)

Final data shape:


(1028, 201)

## Feature aggregation

References:

- https://www.kaggle.com/shaygu/house-prices-begginer-top-7

In [3]:
# space:
#     LotArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, 
#     GrLivArea, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea,
#     BedroomAbvGr, KitchenAbvGr, LotFrontage (*), LotDepth (*), MasVnrArea, 

# create: FloorSF, HouseSF, AbvGrSF, PorchSF
good_data['FloorSF'] = good_data['1stFlrSF'] + good_data['2ndFlrSF']
test_data['FloorSF'] = test_data['1stFlrSF'] + test_data['2ndFlrSF']
good_data['HouseSF'] = good_data['FloorSF'] + (good_data['TotalBsmtSF'] * .75)
test_data['HouseSF'] = test_data['FloorSF'] + (test_data['TotalBsmtSF'] * .75)
good_data['AbvGrSF'] = good_data['BedroomAbvGr'] + good_data['KitchenAbvGr']
test_data['AbvGrSF'] = test_data['BedroomAbvGr'] + test_data['KitchenAbvGr']
good_data['PorchSF'] = good_data['OpenPorchSF'] + good_data['EnclosedPorch'] + good_data['3SsnPorch'] + good_data['ScreenPorch'] + good_data['WoodDeckSF']
test_data['PorchSF'] = test_data['OpenPorchSF'] + test_data['EnclosedPorch'] + test_data['3SsnPorch'] + test_data['ScreenPorch'] + test_data['WoodDeckSF']

In [4]:
# quantity:
#     BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, TotRmsAbvGrd, Fireplaces, GarageCars

# create: BsmtBathrooms, AbvGrBathrooms, TotalBathrooms
good_data['BsmtBathrooms'] = good_data['BsmtFullBath'] + (good_data['BsmtHalfBath'] * 0.3)
test_data['BsmtBathrooms'] = test_data['BsmtFullBath'] + (test_data['BsmtHalfBath'] * 0.3)
good_data['AbvGrBathrooms'] = good_data['FullBath'] + (good_data['HalfBath'] * 0.3)
test_data['AbvGrBathrooms'] = test_data['FullBath'] + (test_data['HalfBath'] * 0.3)
good_data['TotalBathrooms'] = good_data['BsmtBathrooms'] + good_data['AbvGrBathrooms']
test_data['TotalBathrooms'] = test_data['BsmtBathrooms'] + test_data['AbvGrBathrooms']

In [5]:
# grade:
#     LotShape, LandSlope, Utilities, ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure,
#     HeatingQC, CentralAir, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond
#     BsmtFinType1, BsmtFinType2, OverallQual, OverallCond, 

# create: ExternalRating, BsmtRating, ClimateRating, GarageRating, OverallRating, FunctionalRating
good_data['ExternalRating'] = (good_data['ExterQual'] + good_data['ExterCond']) / 2
test_data['ExternalRating'] = (test_data['ExterQual'] + test_data['ExterCond']) / 2
good_data['BsmtRating'] = (good_data['BsmtQual'] + good_data['BsmtCond'] + good_data['BsmtExposure']) / 3
test_data['BsmtRating'] = (test_data['BsmtQual'] + test_data['BsmtCond'] + test_data['BsmtExposure']) / 3
good_data['ClimateRating'] = (good_data['HeatingQC'] + good_data['CentralAir'] + good_data['FireplaceQu']) / 3
test_data['ClimateRating'] = (test_data['HeatingQC'] + test_data['CentralAir'] + test_data['FireplaceQu']) / 3
good_data['GarageRating'] = (good_data['GarageFinish'] + good_data['GarageQual'] + good_data['GarageCond']) / 3
test_data['GarageRating'] = (test_data['GarageFinish'] + test_data['GarageQual'] + test_data['GarageCond']) / 3
good_data['OverallRating'] = (good_data['OverallQual'] + good_data['OverallCond']) / 2
test_data['OverallRating'] = (test_data['OverallQual'] + test_data['OverallCond']) / 2
good_data['FunctionalRating'] = (good_data['KitchenQual'] + good_data['Functional']) / 2
test_data['FunctionalRating'] = (test_data['KitchenQual'] + test_data['Functional']) / 2

In [6]:
# time:
#     YearBuilt, YearRemodAdd, YrSold, GarageYrBlt,

# create: LotAge, GarageAgeWhenSold, RemodAgeWhenSold, LotAgeWhenSold
good_data['LotAge'] = 2019 - good_data['YearBuilt']
test_data['LotAge'] = 2019 - test_data['YearBuilt']
good_data['GarageAgeWhenSold'] = good_data['YrSold'] - good_data['GarageYrBlt']
test_data['GarageAgeWhenSold'] = test_data['YrSold'] - test_data['GarageYrBlt']
good_data['RemodAgeWhenSold'] = good_data['YrSold'] - good_data['YearRemodAdd']
test_data['RemodAgeWhenSold'] = test_data['YrSold'] - test_data['YearRemodAdd']
good_data['LotAgeWhenSold'] = good_data['YrSold'] - good_data['YearBuilt']
test_data['LotAgeWhenSold'] = test_data['YrSold'] - test_data['YearBuilt']

In [7]:
# currency:
#     MiscVal, NeighborhoodMeanPrice, GarageTypeMeanPrice

# create: NetBuildingMeanPrice
good_data['NetBuildingMeanPrice'] = good_data['NeighborhoodMeanPrice'] - good_data['GarageTypeMeanPrice'] - good_data['MiscVal']
test_data['NetBuildingMeanPrice'] = test_data['NeighborhoodMeanPrice'] - test_data['GarageTypeMeanPrice'] - test_data['MiscVal']

In [8]:
# cross:
#     LotArea + NeighborhoodMeanPrice
#     Fireplaces + FireplaceRating
#     NeighborhoodMeanPrice + LotAge

# create: AreaMeanPrice, SFMeanPrice, WeightedFireplaceRating, WeightedNeighborhoodMeanPrice
good_data['AreaMeanPrice'] = good_data['NeighborhoodMeanPrice'] / good_data['LotArea']
test_data['AreaMeanPrice'] = test_data['NeighborhoodMeanPrice'] / test_data['LotArea']
good_data['SFMeanPrice'] = good_data['NeighborhoodMeanPrice'] / good_data['HouseSF']
test_data['SFMeanPrice'] = test_data['NeighborhoodMeanPrice'] / test_data['HouseSF']
good_data['WeightedFireplaceRating'] = good_data['FireplaceQu'] * (1. + (good_data['Fireplaces'] / 4.))
test_data['WeightedFireplaceRating'] = test_data['FireplaceQu'] * (1. + (test_data['Fireplaces'] / 4.))
good_data['WeightedNeighborhoodMeanPrice'] = good_data['NeighborhoodMeanPrice'] * (1. + (.1 / good_data['LotAge']))
test_data['WeightedNeighborhoodMeanPrice'] = test_data['NeighborhoodMeanPrice'] * (1. + (.1 / test_data['LotAge']))

## Feature scaling and normalization

References:

- https://towardsdatascience.com/the-ultimate-guide-to-data-cleaning-3969843991d4#d078
- https://www.codecademy.com/articles/normalization
- https://machinelearningmastery.com/feature-selection-machine-learning-python/
- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e

In [9]:
# created:
#     space (?): FloorSF, HouseSF, AbvGrSF, PorchSF
#     quantity (scale): BsmtBathrooms, AbvGrBathrooms, TotalBathrooms
#     grade (scale): ExternalRating, BsmtRating, ClimateRating, GarageRating, OverallRating, FunctionalRating, WeightedFireplaceRating,
#     time (scale): LotAge, GarageAgeWhenSold, RemodAgeWhenSold, LotAgeWhenSold
#     currency (?): NetBuildingMeanPrice, AreaMeanPrice, SFMeanPrice,  WeightedNeighborhoodMeanPrice

# space (?):
#     LotArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, 1stFlrSF, 2ndFlrSF, 
#     GrLivArea, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, 3SsnPorch, ScreenPorch, PoolArea,
#     BedroomAbvGr, KitchenAbvGr, LotFrontage (*), LotDepth (*), MasVnrArea, 
#
# quantity (scale):
#     BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, TotRmsAbvGrd, Fireplaces, GarageCars
#
# grade (scale):
#     LotShape, LandSlope, Utilities, ExterQual, ExterCond, BsmtQual, BsmtCond, BsmtExposure,
#     HeatingQC, CentralAir, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond
#     BsmtFinType1, BsmtFinType2, OverallQual, OverallCond, 
#
# time (scale):
#     YearBuilt, YearRemodAdd, YrSold, GarageYrBlt,
#
# currency (?):
#     MiscVal, NeighborhoodMeanPrice, GarageTypeMeanPrice
#
# one-hot-encoded-binary (keep):
#     MSSubClass, MSZoning, LandContour, Street, LotConfig, BldgType, HouseStyle, RoofStyle, RoofMatl, MoSold,
#     MasVnrType, Foundation, Heating, Electrical, PavedDrive, Condition1, Condition2, Exterior1st, Exterior2nd

columns_to_normalize = [
#     'MiscVal',
#     'NeighborhoodMeanPrice',
#     'GarageTypeMeanPrice',
#     'LotArea',
#     'BsmtFinSF1',
#     'BsmtFinSF2',
#     'BsmtUnfSF',
#     'TotalBsmtSF',
#     '1stFlrSF',
#     '2ndFlrSF',
#     'GrLivArea',
#     'GarageArea',
#     'WoodDeckSF',
#     'OpenPorchSF',
#     'EnclosedPorch',
#     '3SsnPorch',
#     'ScreenPorch',
#     'PoolArea',
#     'BedroomAbvGr',
#     'KitchenAbvGr',
#     'LotFrontage',
#     'LotDepth',
#     'MasVnrArea'
]

columns_to_scale = [
    'BsmtFullBath',
    'BsmtHalfBath',
    'FullBath',
    'HalfBath',
    'TotRmsAbvGrd',
    'Fireplaces',
    'GarageCars',
    'LotShape',
    'LandSlope',
    'Utilities',
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'Functional',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'BsmtFinType1',
    'BsmtFinType2',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
    'YrSold',
    'GarageYrBlt',
    
    'BsmtBathrooms',
    'AbvGrBathrooms',
    'TotalBathrooms',
    
    'ExternalRating',
    'BsmtRating',
    'ClimateRating',
    'GarageRating',
    'OverallRating',
    'FunctionalRating',
    
    'WeightedFireplaceRating',
    
    'LotAge',
    'GarageAgeWhenSold',
    'RemodAgeWhenSold',
    'LotAgeWhenSold'
]

In [10]:
# mean normalization
normalized_train_data = good_data.copy()
normalized_test_data = test_data.copy()

normalized_train_data[columns_to_normalize] = (normalized_train_data[columns_to_normalize] - normalized_train_data[columns_to_normalize].mean()) / normalized_train_data[columns_to_normalize].std()
normalized_test_data[columns_to_normalize] = (normalized_test_data[columns_to_normalize] - normalized_test_data[columns_to_normalize].mean()) / normalized_test_data[columns_to_normalize].std()
normalized_train_data.fillna(0, inplace=True)
normalized_test_data.fillna(0, inplace=True)

normalized_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,FunctionalRating,LotAge,GarageAgeWhenSold,RemodAgeWhenSold,LotAgeWhenSold,NetBuildingMeanPrice,AreaMeanPrice,SFMeanPrice,WeightedFireplaceRating,WeightedNeighborhoodMeanPrice
0,856,854,0,3,1,0,0,0,0,3,...,6.0,16,5,5,5,-4927,23.427929,84.169218,0.0,199203.2875
1,1262,0,0,3,1,0,0,0,0,3,...,5.5,43,31,31,31,35880,24.872187,108.115463,3.75,239328.286047
2,920,866,0,3,1,0,0,0,0,3,...,6.0,18,7,6,7,-4927,17.596978,79.953958,3.75,199065.811111
3,961,756,0,3,1,0,0,0,0,4,...,6.0,104,8,36,91,76534,22.054974,92.217601,5.0,210827.524038
4,1145,1053,0,4,1,0,0,0,0,3,...,6.0,19,8,8,8,132402,23.512973,109.69003,3.75,337059.710526


In [11]:
# min max scaling
# normalized_df=(df-df.min())/(df.max()-df.min())
scaled_train_data = normalized_train_data.copy()
scaled_test_data = normalized_test_data.copy()

scaled_train_data[columns_to_scale] = (scaled_train_data[columns_to_scale] - scaled_train_data[columns_to_scale].min()) / (scaled_train_data[columns_to_scale].max() - scaled_train_data[columns_to_scale].min())
scaled_test_data[columns_to_scale] = (scaled_test_data[columns_to_scale] - scaled_train_data[columns_to_scale].min()) / (scaled_train_data[columns_to_scale].max() - scaled_train_data[columns_to_scale].min())
scaled_train_data.fillna(0, inplace=True)
scaled_test_data.fillna(0, inplace=True)

scaled_train_data.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,BsmtCond,...,FunctionalRating,LotAge,GarageAgeWhenSold,RemodAgeWhenSold,LotAgeWhenSold,NetBuildingMeanPrice,AreaMeanPrice,SFMeanPrice,WeightedFireplaceRating,WeightedNeighborhoodMeanPrice
0,856,854,0,3,1,0,0,0,0,0.75,...,0.857143,0.043796,0.041667,0.083333,0.036765,-4927,23.427929,84.169218,0.0,199203.2875
1,1262,0,0,3,1,0,0,0,0,0.75,...,0.714286,0.240876,0.258333,0.516667,0.227941,35880,24.872187,108.115463,0.5,239328.286047
2,920,866,0,3,1,0,0,0,0,0.75,...,0.857143,0.058394,0.058333,0.1,0.051471,-4927,17.596978,79.953958,0.5,199065.811111
3,961,756,0,3,1,0,0,0,0,1.0,...,0.857143,0.686131,0.066667,0.6,0.669118,76534,22.054974,92.217601,0.666667,210827.524038
4,1145,1053,0,4,1,0,0,0,0,0.75,...,0.857143,0.065693,0.066667,0.133333,0.058824,132402,23.512973,109.69003,0.5,337059.710526


## Feature Selection

References:

- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
- https://seaborn.pydata.org/generated/seaborn.pairplot.html

### Feature variance

- Drop features with no variance

In [12]:
reduced_var_train_data = scaled_train_data.copy()
reduced_var_test_data = scaled_test_data.copy()

def run_var_analysis():
    features = reduced_var_train_data.drop(columns=['Id', 'SalePrice'])

    var_feats = features.var()
    var_feats_ordered = var_feats.sort_values(ascending=True)
    return var_feats_ordered[var_feats_ordered < 0.2]

low_variance_feats = run_var_analysis()
display(reduced_var_train_data.shape)
display(low_variance_feats)

reduced_var_train_data.drop(columns=low_variance_feats.index, inplace=True)
reduced_var_test_data.drop(columns=low_variance_feats.index, inplace=True)

display(run_var_analysis())
display(reduced_var_train_data.shape)

(1028, 223)

RoofMatl_Metal             0.000000
Condition2_PosA            0.000000
Heating_Wall               0.000000
RoofMatl_Roll              0.000000
Exterior1st_AsphShn        0.000000
Exterior1st_CBlock         0.000000
Condition2_PosN            0.000000
Exterior1st_ImStucc        0.000000
RoofMatl_Membran           0.000000
RoofStyle_Shed             0.000000
Condition2_RRAe            0.000000
MSSubClass_150             0.000000
Exterior1st_Stone          0.000000
Exterior2nd_Other          0.000000
RoofMatl_ClyTile           0.000000
Exterior2nd_CBlock         0.000000
Exterior1st_Other          0.000000
Utilities                  0.000000
Exterior2nd_Stone          0.000973
Heating_Floor              0.000973
Foundation_Wood            0.000973
Condition2_RRAn            0.000973
RoofMatl_WdShngl           0.000973
RoofMatl_WdShake           0.000973
Electrical_Mix             0.000973
Electrical_FuseP           0.000973
Heating_OthW               0.001944
Street_Grvl                0

Series([], dtype: float64)

(1028, 42)

### Feature correlation

- Drop highly correlated features

In [13]:
reduced_train_data = reduced_var_train_data.copy()
reduced_test_data = reduced_var_test_data.copy()

def run_corr_analysis():
    features = reduced_train_data.drop(columns=['Id', 'SalePrice'])

    corr = features.corr().abs()
    corr[corr == 1] = 0
    corr_cols = corr.max().sort_values(ascending=False)
    return corr_cols[corr_cols > 0.8]

display(reduced_train_data.shape)

corr_results = run_corr_analysis()
corr_result_index = 0
for corr_result in corr_results.index:
    # display(corr_result)
    
    if (corr_result_index % 2 == 1):
        # display(corr_result)
        reduced_train_data.drop(columns=[corr_result], inplace=True)
        reduced_test_data.drop(columns=[corr_result], inplace=True)
    
    corr_result_index = corr_result_index + 1

display(reduced_train_data.shape)

(1028, 42)

(1028, 34)

### Target correlation

- Drop lowly target-correlated features

In [14]:
def run_target_corr_analysis():
    features = reduced_train_data.drop(columns=['Id'])

    corr = features.corr().abs()
    target_corr = corr['SalePrice'].sort_values(ascending=True)
    return target_corr[target_corr < 0.2]

lowly_correlated_feats = run_target_corr_analysis()
display(reduced_train_data.shape)
display(lowly_correlated_feats)

reduced_train_data.drop(columns=lowly_correlated_feats.index, inplace=True)
reduced_test_data.drop(columns=lowly_correlated_feats.index, inplace=True)

display(run_target_corr_analysis())
display(reduced_train_data.shape)

(1028, 34)

PoolArea             0.001629
MiscVal              0.008387
MSSubClass_20        0.011899
BsmtFinSF2           0.021231
3SsnPorch            0.043836
ScreenPorch          0.073049
LowQualFinSF         0.075244
LotDepth             0.085522
SFMeanPrice          0.099546
HouseStyle_1Story    0.104140
AreaMeanPrice        0.121453
BsmtUnfSF            0.176470
EnclosedPorch        0.199878
Name: SalePrice, dtype: float64

Series([], Name: SalePrice, dtype: float64)

(1028, 21)

### Feature importance

- Drop lowly important features

In [15]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(n_jobs=-1, random_state=42, n_estimators=50)

feature_selected_train_data = reduced_train_data.copy()
feature_selected_test_data = reduced_test_data.copy()

def run_importance_analysis():
    features = feature_selected_train_data.drop(columns=['Id', 'SalePrice'])
    
    rfr_importance_analyser = rfr.fit(features, feature_selected_train_data['SalePrice'])
    return features.columns[rfr_importance_analyser.feature_importances_ < 0.02]

low_importance_feats = run_importance_analysis()
display(feature_selected_train_data.shape)
display(low_importance_feats)

feature_selected_train_data.drop(columns=low_importance_feats, inplace=True)
feature_selected_test_data.drop(columns=low_importance_feats, inplace=True)

display(run_importance_analysis())
display(feature_selected_train_data.shape)

(1028, 21)

Index([u'BedroomAbvGr', u'Exterior1st_VinylSd', u'Foundation_PConc',
       u'GarageTypeMeanPrice', u'HouseStyle_2Story', u'LotArea',
       u'LotFrontage', u'MasVnrArea', u'MasVnrType_BrkFace', u'OpenPorchSF',
       u'TotalBsmtSF', u'WoodDeckSF', u'PorchSF', u'NetBuildingMeanPrice'],
      dtype='object')

Index([], dtype='object')

(1028, 7)

In [16]:
feature_selected_train_data.describe()

Unnamed: 0,BsmtFinSF1,GarageArea,Id,SalePrice,FloorSF,HouseSF,WeightedNeighborhoodMeanPrice
count,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0,1028.0
mean,414.585603,451.845331,732.459144,168071.782101,1400.713035,2157.009241,175341.038294
std,393.970246,191.697934,421.466399,56130.459219,406.976359,537.71624,48316.145644
min,0.0,0.0,1.0,35311.0,334.0,334.0,98777.17551
25%,0.0,312.0,367.75,129500.0,1085.0,1750.3125,137051.1
50%,384.0,461.5,739.5,158000.0,1377.0,2124.375,187022.39
75%,690.5,564.0,1099.25,197600.0,1668.0,2548.125,199130.505882
max,1619.0,1053.0,1458.0,370878.0,2730.0,3856.5,337059.710526


## Clustering

In [17]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

clusters_range = range(2, 11)
scores = []

for n_clusters in clusters_range:
    kmeans = KMeans(n_clusters=n_clusters, n_jobs=-1, random_state=42)
    result = kmeans.fit_predict(feature_selected_train_data.drop(columns=['Id', 'SalePrice']))
    
    curr_score = silhouette_score(feature_selected_train_data, result)
    scores.append(curr_score)
    # display(curr_score)
    
cluster_scores_df = pd.DataFrame({ 'scores': scores, 'n_clusters': clusters_range })
best_score = cluster_scores_df['scores'].max()
n_clusters = cluster_scores_df[cluster_scores_df['scores'] == best_score]['n_clusters'].values[0] + 1
n_clusters

4

In [18]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=n_clusters, n_jobs=-1, random_state=42)
result = kmeans.fit_predict(feature_selected_train_data.drop(columns=['Id', 'SalePrice']))
test_result = kmeans.predict(feature_selected_test_data.drop(columns=['Id']))

# enable clustering
feature_selected_train_data['Cluster'] = result
feature_selected_test_data['Cluster'] = test_result

# disable clustering
# feature_selected_train_data['Cluster'] = 0
# feature_selected_test_data['Cluster'] = 0
# n_clusters = 1

feature_selected_train_data['Cluster'].describe()

count    1028.000000
mean        1.650778
std         1.072644
min         0.000000
25%         1.000000
50%         1.000000
75%         3.000000
max         3.000000
Name: Cluster, dtype: float64

## Regression Model Selection

In [19]:
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold, GridSearchCV

def kfold(model, name, cluster, train_X, train_y):
    kf = KFold(n_splits=2, random_state=42, shuffle=False)
    
    train_scores = []
    test_scores = []

    for kf_chunks in kf.split(train_X):
        train_index = kf_chunks[0]
        test_index = kf_chunks[1]

        X_train = train_X.iloc[train_index]
        X_test = train_X.iloc[test_index]
        y_train = train_y.iloc[train_index]
        y_test = train_y.iloc[test_index]

        model_predictor = model.fit(X_train, y_train)
        y_train_predicted = model_predictor.predict(X_train)
        y_test_predicted = model_predictor.predict(X_test)

        y_train_predicted[y_train_predicted < 0] = 0
        y_test_predicted[y_test_predicted < 0] = 0

        train_score = np.sqrt(mean_squared_log_error(y_train, y_train_predicted))
        test_score = np.sqrt(mean_squared_log_error(y_test, y_test_predicted))
        
        train_scores.append(train_score)
        test_scores.append(test_score)

        print('[{}][{}] Current fold train performance'.format(name, cluster), train_score)
        print('[{}][{}] Current fold test performance'.format(name, cluster), test_score)
        
    return train_scores, test_scores

def custom_scorer(y, y_pred):
    y_pred[y_pred < 0] = 0
    return np.sqrt(mean_squared_log_error(y, y_pred))

gs_scorer = make_scorer(custom_scorer, greater_is_better=False)

def tune_model(Model, name, default_params, cv_params):    
    best_params = []
    merged_params_arr = []
    model_train_scores = []
    model_test_scores = []
    mean_train_scores = []
    mean_test_scores = []

    for cluster in range(n_clusters):
        current_cluster_filter = feature_selected_train_data['Cluster'] == cluster

        gs_train_X = feature_selected_train_data[current_cluster_filter].drop(columns=['Id', 'SalePrice', 'Cluster'])
        gs_train_y = feature_selected_train_data[current_cluster_filter]['SalePrice']

        gs = GridSearchCV(Model(**default_params), cv_params, n_jobs=-1, cv=2, scoring=gs_scorer)

        gs_predictor = gs.fit(gs_train_X, gs_train_y)

        best_params.append(gs_predictor.best_params_)
        
        merged_params = dict()
        merged_params.update(default_params)
        merged_params.update(gs_predictor.best_params_)
        merged_params_arr.append(merged_params)
        
        train_scores, test_scores = kfold(Model(**merged_params), name, cluster, gs_train_X, gs_train_y)
        
        model_train_scores.append(train_scores)
        model_test_scores.append(test_scores)
        
        mean_train_scores.append(np.mean(train_scores))
        mean_test_scores.append(np.mean(test_scores))

    model_arr = np.full((1, n_clusters), name, object)[0]
    return pd.DataFrame({
        'cluster': range(n_clusters),
        'params_cv': best_params,
        'params_full': merged_params_arr,
        'model': model_arr,
        'train_scores': model_train_scores,
        'test_scores': model_test_scores,
        'train_score': mean_train_scores,
        'test_score': mean_test_scores,
    })


### Linear Regression

In [20]:
from sklearn.linear_model import LinearRegression
   
lr_df = tune_model(LinearRegression, 'Linear Regression', {
    'copy_X': True,
    'n_jobs': -1
}, {
    'fit_intercept': [True, False],
    'normalize': [True, False]
})
lr_df

('[Linear Regression][0] Current fold train performance', 0.09556374019547943)
('[Linear Regression][0] Current fold test performance', 0.11396288433248082)
('[Linear Regression][0] Current fold train performance', 0.10137996301819449)
('[Linear Regression][0] Current fold test performance', 0.1166600046774641)
('[Linear Regression][1] Current fold train performance', 0.16035829238122212)
('[Linear Regression][1] Current fold test performance', 0.17304662451182376)
('[Linear Regression][1] Current fold train performance', 0.1700258131669013)
('[Linear Regression][1] Current fold test performance', 0.16792269150596995)
('[Linear Regression][2] Current fold train performance', 0.08944219471678137)
('[Linear Regression][2] Current fold test performance', 0.08041574531884074)
('[Linear Regression][2] Current fold train performance', 0.06852229869342451)
('[Linear Regression][2] Current fold test performance', 0.09995572103130652)
('[Linear Regression][3] Current fold train performance', 0.



Unnamed: 0,cluster,model,params_cv,params_full,test_score,test_scores,train_score,train_scores
0,0,Linear Regression,"{u'normalize': True, u'fit_intercept': True}","{u'copy_X': True, u'normalize': True, u'n_jobs...",0.115311,"[0.11396288433248082, 0.1166600046774641]",0.098472,"[0.09556374019547943, 0.10137996301819449]"
1,1,Linear Regression,"{u'normalize': False, u'fit_intercept': True}","{u'copy_X': True, u'normalize': False, u'n_job...",0.170485,"[0.17304662451182376, 0.16792269150596995]",0.165192,"[0.16035829238122212, 0.1700258131669013]"
2,2,Linear Regression,"{u'normalize': True, u'fit_intercept': False}","{u'copy_X': True, u'normalize': True, u'n_jobs...",0.090186,"[0.08041574531884074, 0.09995572103130652]",0.078982,"[0.08944219471678137, 0.06852229869342451]"
3,3,Linear Regression,"{u'normalize': True, u'fit_intercept': True}","{u'copy_X': True, u'normalize': True, u'n_jobs...",0.114517,"[0.12167767920497417, 0.1073554660425421]",0.111678,"[0.10453007556450254, 0.11882507693263102]"


### Random Forest

In [21]:
from sklearn.ensemble import RandomForestRegressor
   
rfr_df = tune_model(RandomForestRegressor, 'Random Forest', {
    'random_state': 42,
    'n_jobs': -1
}, {
    'n_estimators': [10, 50, 100],
    'max_depth': [2, 5, 10],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [2, 3]
})
rfr_df

('[Random Forest][0] Current fold train performance', 0.05423050917771191)
('[Random Forest][0] Current fold test performance', 0.12134133631557506)
('[Random Forest][0] Current fold train performance', 0.056790264431870255)
('[Random Forest][0] Current fold test performance', 0.11104391322683084)
('[Random Forest][1] Current fold train performance', 0.10500949340909765)
('[Random Forest][1] Current fold test performance', 0.1850527305660619)
('[Random Forest][1] Current fold train performance', 0.11872511894294294)
('[Random Forest][1] Current fold test performance', 0.18276575824184463)




('[Random Forest][2] Current fold train performance', 0.06764295586994745)
('[Random Forest][2] Current fold test performance', 0.091228477923369)
('[Random Forest][2] Current fold train performance', 0.04079609718670491)
('[Random Forest][2] Current fold test performance', 0.1180800795743933)
('[Random Forest][3] Current fold train performance', 0.058183991981758096)
('[Random Forest][3] Current fold test performance', 0.12796121305923425)
('[Random Forest][3] Current fold train performance', 0.07214194990886598)
('[Random Forest][3] Current fold test performance', 0.11385999338639587)


Unnamed: 0,cluster,model,params_cv,params_full,test_score,test_scores,train_score,train_scores
0,0,Random Forest,"{u'min_samples_split': 2, u'n_estimators': 100...","{u'n_estimators': 100, u'random_state': 42, u'...",0.116193,"[0.12134133631557506, 0.11104391322683084]",0.05551,"[0.05423050917771191, 0.056790264431870255]"
1,1,Random Forest,"{u'min_samples_split': 2, u'n_estimators': 50,...","{u'n_estimators': 50, u'random_state': 42, u'n...",0.183909,"[0.1850527305660619, 0.18276575824184463]",0.111867,"[0.10500949340909765, 0.11872511894294294]"
2,2,Random Forest,"{u'min_samples_split': 2, u'n_estimators': 50,...","{u'n_estimators': 50, u'random_state': 42, u'n...",0.104654,"[0.091228477923369, 0.1180800795743933]",0.05422,"[0.06764295586994745, 0.04079609718670491]"
3,3,Random Forest,"{u'min_samples_split': 3, u'n_estimators': 10,...","{u'n_estimators': 10, u'random_state': 42, u'n...",0.120911,"[0.12796121305923425, 0.11385999338639587]",0.065163,"[0.058183991981758096, 0.07214194990886598]"


### Gradient Boosting Regressor

In [22]:
from sklearn.ensemble import GradientBoostingRegressor
   
gbr_df = tune_model(GradientBoostingRegressor, 'GradientBoosting', {
    'random_state': 42,
    'presort': True,

    'learning_rate': 0.1,
    'tol': 0.01
}, {
    'loss': ['ls', 'lad'],
    # 'learning_rate': [0.2, 0.1, 0.01],
    'n_estimators': [25, 150, 350],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2],
    # 'tol': [0.05, 0.01, 0.005]
})
gbr_df

('[GradientBoosting][0] Current fold train performance', 0.038684252323911934)
('[GradientBoosting][0] Current fold test performance', 0.12050727242039133)
('[GradientBoosting][0] Current fold train performance', 0.03323489564138827)
('[GradientBoosting][0] Current fold test performance', 0.10629909458749727)
('[GradientBoosting][1] Current fold train performance', 0.11872530083964154)
('[GradientBoosting][1] Current fold test performance', 0.18337425362947474)
('[GradientBoosting][1] Current fold train performance', 0.1391459299433792)
('[GradientBoosting][1] Current fold test performance', 0.1802657932067858)




('[GradientBoosting][2] Current fold train performance', 0.027136819944411642)
('[GradientBoosting][2] Current fold test performance', 0.08434596755589555)
('[GradientBoosting][2] Current fold train performance', 0.020541113403315523)
('[GradientBoosting][2] Current fold test performance', 0.12333248356823068)
('[GradientBoosting][3] Current fold train performance', 0.06570794573518424)
('[GradientBoosting][3] Current fold test performance', 0.12320001860054487)
('[GradientBoosting][3] Current fold train performance', 0.07133286517660112)
('[GradientBoosting][3] Current fold test performance', 0.10029275315230621)


Unnamed: 0,cluster,model,params_cv,params_full,test_score,test_scores,train_score,train_scores
0,0,GradientBoosting,"{u'min_samples_split': 6, u'loss': u'lad', u'n...","{u'presort': True, u'loss': u'lad', u'learning...",0.113403,"[0.12050727242039133, 0.10629909458749727]",0.03596,"[0.038684252323911934, 0.03323489564138827]"
1,1,GradientBoosting,"{u'min_samples_split': 6, u'loss': u'lad', u'n...","{u'presort': True, u'loss': u'lad', u'learning...",0.18182,"[0.18337425362947474, 0.1802657932067858]",0.128936,"[0.11872530083964154, 0.1391459299433792]"
2,2,GradientBoosting,"{u'min_samples_split': 2, u'loss': u'ls', u'n_...","{u'presort': True, u'loss': u'ls', u'learning_...",0.103839,"[0.08434596755589555, 0.12333248356823068]",0.023839,"[0.027136819944411642, 0.020541113403315523]"
3,3,GradientBoosting,"{u'min_samples_split': 4, u'loss': u'lad', u'n...","{u'presort': True, u'loss': u'lad', u'learning...",0.111746,"[0.12320001860054487, 0.10029275315230621]",0.06852,"[0.06570794573518424, 0.07133286517660112]"


### Lasso

In [23]:
from sklearn.linear_model import Lasso
   
lasso_df = tune_model(Lasso, 'Lasso', {
    'random_state': 42,
    'precompute': True,
    'copy_X': True,
    
    'max_iter': 100
}, {
    'alpha': [1.0, 0.6, 0.25],
    'fit_intercept': [False, True],
    'normalize': [False, True],
    # 'max_iter': [100, 1000],
    'tol': [0.01, 0.001],
    'positive': [False, True],
    'selection': ['cyclic', 'random'],
})
lasso_df



('[Lasso][0] Current fold train performance', 0.09556137642807741)
('[Lasso][0] Current fold test performance', 0.11148983935661085)
('[Lasso][0] Current fold train performance', 0.11241387385333805)
('[Lasso][0] Current fold test performance', 0.10266228845865691)
('[Lasso][1] Current fold train performance', 0.1601632535247619)
('[Lasso][1] Current fold test performance', 0.17199320476033103)
('[Lasso][1] Current fold train performance', 0.1698559087590616)
('[Lasso][1] Current fold test performance', 0.1678543839350031)




('[Lasso][2] Current fold train performance', 0.09172339862372594)
('[Lasso][2] Current fold test performance', 0.07764180835426715)
('[Lasso][2] Current fold train performance', 0.06821864302646692)
('[Lasso][2] Current fold test performance', 0.09951530396739086)
('[Lasso][3] Current fold train performance', 0.10447306641789053)
('[Lasso][3] Current fold test performance', 0.12151966028255737)
('[Lasso][3] Current fold train performance', 0.11879956520649028)
('[Lasso][3] Current fold test performance', 0.1074525379109892)


Unnamed: 0,cluster,model,params_cv,params_full,test_score,test_scores,train_score,train_scores
0,0,Lasso,"{u'normalize': False, u'selection': u'random',...","{u'normalize': False, u'selection': u'random',...",0.107076,"[0.11148983935661085, 0.10266228845865691]",0.103988,"[0.09556137642807741, 0.11241387385333805]"
1,1,Lasso,"{u'normalize': True, u'selection': u'random', ...","{u'normalize': True, u'selection': u'random', ...",0.169924,"[0.17199320476033103, 0.1678543839350031]",0.16501,"[0.1601632535247619, 0.1698559087590616]"
2,2,Lasso,"{u'normalize': False, u'selection': u'random',...","{u'normalize': False, u'selection': u'random',...",0.088579,"[0.07764180835426715, 0.09951530396739086]",0.079971,"[0.09172339862372594, 0.06821864302646692]"
3,3,Lasso,"{u'normalize': True, u'selection': u'cyclic', ...","{u'normalize': True, u'selection': u'cyclic', ...",0.114486,"[0.12151966028255737, 0.1074525379109892]",0.111636,"[0.10447306641789053, 0.11879956520649028]"


### Ridge

In [24]:
from sklearn.linear_model import Ridge
   
ridge_df = tune_model(Ridge, 'Ridge', {
    'random_state': 42,
    'copy_X': True,
    
    'fit_intercept': True,
    'normalize': False
}, {
    'alpha': [50.0, 25.0, 2.5, 0.25],
    # 'fit_intercept': [False, True],
    # 'normalize': [False, True],
    'max_iter': [100, 500, 1000],
    'tol': [0.001, 0.00075, 0.0005],
    'solver': ['lsqr', 'sag'],
})
ridge_df



('[Ridge][0] Current fold train performance', 0.09630955732153397)
('[Ridge][0] Current fold test performance', 0.1104665083351422)
('[Ridge][0] Current fold train performance', 0.10391447624522389)
('[Ridge][0] Current fold test performance', 0.11734788011196891)
('[Ridge][1] Current fold train performance', 0.1603635703073899)
('[Ridge][1] Current fold test performance', 0.17235219702059465)
('[Ridge][1] Current fold train performance', 0.16983443225404707)
('[Ridge][1] Current fold test performance', 0.1678731708876939)




('[Ridge][2] Current fold train performance', 0.09692990835839285)
('[Ridge][2] Current fold test performance', 0.06285097961704977)
('[Ridge][2] Current fold train performance', 0.053965353317970664)
('[Ridge][2] Current fold test performance', 0.1076549845396439)
('[Ridge][3] Current fold train performance', 0.1045300455321846)
('[Ridge][3] Current fold test performance', 0.12167775471265246)
('[Ridge][3] Current fold train performance', 0.11882507651753638)
('[Ridge][3] Current fold test performance', 0.10735546472048592)


Unnamed: 0,cluster,model,params_cv,params_full,test_score,test_scores,train_score,train_scores
0,0,Ridge,"{u'alpha': 50.0, u'max_iter': 100, u'tol': 0.0...","{u'normalize': False, u'fit_intercept': True, ...",0.113907,"[0.1104665083351422, 0.11734788011196891]",0.100112,"[0.09630955732153397, 0.10391447624522389]"
1,1,Ridge,"{u'alpha': 50.0, u'max_iter': 100, u'tol': 0.0...","{u'normalize': False, u'fit_intercept': True, ...",0.170113,"[0.17235219702059465, 0.1678731708876939]",0.165099,"[0.1603635703073899, 0.16983443225404707]"
2,2,Ridge,"{u'alpha': 0.25, u'max_iter': 500, u'tol': 0.0...","{u'normalize': False, u'fit_intercept': True, ...",0.085253,"[0.06285097961704977, 0.1076549845396439]",0.075448,"[0.09692990835839285, 0.053965353317970664]"
3,3,Ridge,"{u'alpha': 0.25, u'max_iter': 100, u'tol': 0.0...","{u'normalize': False, u'fit_intercept': True, ...",0.114517,"[0.12167775471265246, 0.10735546472048592]",0.111678,"[0.1045300455321846, 0.11882507651753638]"


### Multi-layer Perceptron Regressor

In [25]:
from sklearn.neural_network import MLPRegressor

mlpr_df = tune_model(MLPRegressor, 'MLPR', {
    'random_state': 42,
    
    'activation': 'relu',
    'learning_rate': 'constant',
    'solver': 'lbfgs',
    'alpha': 0.00003
}, {
    # 'activation': ['tanh', 'relu'],
    # 'solver': ['lbfgs', 'adam'],
    # 'alpha': [0.001, 0.00005, 0.00003, 0.00001],
    # 'learning_rate': ['constant', 'invscaling', 'adaptive'],
})
mlpr_df

('[MLPR][0] Current fold train performance', 0.09498387091628098)
('[MLPR][0] Current fold test performance', 0.10266168429104927)
('[MLPR][0] Current fold train performance', 0.10350941334225243)
('[MLPR][0] Current fold test performance', 0.11420186184440038)
('[MLPR][1] Current fold train performance', 0.16461084379293592)
('[MLPR][1] Current fold test performance', 0.17761898773086848)
('[MLPR][1] Current fold train performance', 0.1790572180142363)
('[MLPR][1] Current fold test performance', 0.17802757106687028)




('[MLPR][2] Current fold train performance', 0.11075541992725423)
('[MLPR][2] Current fold test performance', 0.07586225311841063)
('[MLPR][2] Current fold train performance', 0.06580241081135016)
('[MLPR][2] Current fold test performance', 0.09684747974305956)
('[MLPR][3] Current fold train performance', 0.1094026867514226)
('[MLPR][3] Current fold test performance', 0.13106589489580275)
('[MLPR][3] Current fold train performance', 0.12695835808272493)
('[MLPR][3] Current fold test performance', 0.11268651166232609)


Unnamed: 0,cluster,model,params_cv,params_full,test_score,test_scores,train_score,train_scores
0,0,MLPR,{},"{u'alpha': 3e-05, u'activation': u'relu', u'ra...",0.108432,"[0.10266168429104927, 0.11420186184440038]",0.099247,"[0.09498387091628098, 0.10350941334225243]"
1,1,MLPR,{},"{u'alpha': 3e-05, u'activation': u'relu', u'ra...",0.177823,"[0.17761898773086848, 0.17802757106687028]",0.171834,"[0.16461084379293592, 0.1790572180142363]"
2,2,MLPR,{},"{u'alpha': 3e-05, u'activation': u'relu', u'ra...",0.086355,"[0.07586225311841063, 0.09684747974305956]",0.088279,"[0.11075541992725423, 0.06580241081135016]"
3,3,MLPR,{},"{u'alpha': 3e-05, u'activation': u'relu', u'ra...",0.121876,"[0.13106589489580275, 0.11268651166232609]",0.118181,"[0.1094026867514226, 0.12695835808272493]"


In [26]:
all_df = lr_df.copy()
# all_df = all_df.append(rfr_df.copy())
# all_df = all_df.append(gbr_df.copy()) # overfit
all_df = all_df.append(lasso_df.copy())
all_df = all_df.append(ridge_df.copy())
all_df = all_df.append(mlpr_df.copy())

selected_models = pd.DataFrame({ 'cluster': [], 'model': [], 'params_full': [], 'test_score': [] })

for cluster in range(n_clusters):
    cluster_best_models = all_df[all_df['cluster'] == cluster].sort_values(by='test_score', ascending=True)
    cluster_best_model = cluster_best_models.head(1)[['cluster', 'model', 'params_full', 'test_score']]
    selected_models = selected_models.append(cluster_best_model)
    display(cluster_best_model)

selected_models['cluster'] = selected_models['cluster'].astype(int)
selected_models

Unnamed: 0,cluster,model,params_full,test_score
0,0,Lasso,"{u'normalize': False, u'selection': u'random',...",0.107076


Unnamed: 0,cluster,model,params_full,test_score
1,1,Lasso,"{u'normalize': True, u'selection': u'random', ...",0.169924


Unnamed: 0,cluster,model,params_full,test_score
2,2,Ridge,"{u'normalize': False, u'fit_intercept': True, ...",0.085253


Unnamed: 0,cluster,model,params_full,test_score
3,3,Lasso,"{u'normalize': True, u'selection': u'cyclic', ...",0.114486


Unnamed: 0,cluster,model,params_full,test_score
0,0,Lasso,"{u'normalize': False, u'selection': u'random',...",0.107076
1,1,Lasso,"{u'normalize': True, u'selection': u'random', ...",0.169924
2,2,Ridge,"{u'normalize': False, u'fit_intercept': True, ...",0.085253
3,3,Lasso,"{u'normalize': True, u'selection': u'cyclic', ...",0.114486


## Results for submission

In [27]:
def build_model(model, params):
    if (model == 'Linear Regression'):
        return LinearRegression(**params)
    if (model == 'Lasso'):
        return Lasso(**params)
    if (model == 'Ridge'):
        return Ridge(**params)    


In [28]:
from sklearn.ensemble import AdaBoostRegressor
   
results = []
ids = []

for cluster in range(n_clusters):
    current_cluster_filter = feature_selected_train_data['Cluster'] == cluster
    
    train_X = feature_selected_train_data[current_cluster_filter].drop(columns=['Id', 'SalePrice', 'Cluster'])
    train_y = feature_selected_train_data[current_cluster_filter]['SalePrice']
    
    current_cluster_filter = feature_selected_test_data['Cluster'] == cluster
    predicting_X = feature_selected_test_data[current_cluster_filter].drop(columns=['Id', 'Cluster'])
    predicting_ids = feature_selected_test_data[current_cluster_filter]['Id']
    
    selected_model_df = selected_models[selected_models['cluster'] == cluster]
    model = build_model(selected_model_df['model'].values[0], selected_model_df['params_full'].values[0])
    
    model = model.fit(train_X, train_y)
    prediction = model.predict(predicting_X)
    
    results = np.concatenate((results, prediction))
    ids = np.concatenate((ids, predicting_ids))

kaggle_sb_df = pd.DataFrame({ 'SalePrice': results, 'Id': ids })

kaggle_sb_df['Id'] = kaggle_sb_df['Id'].astype(int)
kaggle_sb_df = kaggle_sb_df.sort_values(by='Id')

kaggle_sb_df.to_csv('data/kaggle_submission_42.csv', index=False)