In [1]:
# library imports
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
import tqdm

%matplotlib inline

## Import Data

In [2]:
# set the path of the raw data
raw_data_path = os.path.join(os.path.pardir, 'data', 'raw')
train_file_path = os.path.join(raw_data_path, 'train_house.csv')
test_file_path = os.path.join(raw_data_path, 'test_house.csv')

In [3]:
# read the data with all default parameters
train_df = pd.read_csv(train_file_path, index_col='Id')
test_df = pd.read_csv(test_file_path, index_col='Id')

## Organizing data

In [4]:
# get columns with nan values from training data
# numerical columns and categorical columns in the training data
col_with_nan = [col for col in train_df.columns if train_df[col].isnull().any()]
#num_col = [col for col in train_df.columns if train_df[col].dtypes in ['float64', 'int64']]
#cat_col = list(set(train_df.columns) - set(num_col))

## Basic Structure

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1455 non-null   object 
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          107 non-null    object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearB

## Check for duplicates

In [7]:
# Try to find duplicated data
duplicatedID = train_df.duplicated()
train_df[duplicatedID] #no duplicated found

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


## Study the data graphically

In [8]:
def hist_log_hist (df, col, show=True, file_path=''):
    
    '''
        Returns the histogram and log1p of the values histogram of the column col
        with their probability plots.
        
        Args:
            df: Dataframe with features. Shape(m, n)
            col: feature to be plotted
            show: display graphics if True

    '''

    f, axes = plt.subplots(2, 2, figsize=(12, 10));

    # original histogram
    sns.distplot(df[df[col].notnull()][col], fit=norm, ax=axes[0, 0]);
    stats.probplot(df[df[col].notnull()][col], plot=axes[1, 0]);

    # log-histogram (deals with zero and NaN values)
    sns.distplot(np.log1p(df[df[col].notnull()][col]), fit=norm, ax=axes[0, 1]);
    stats.probplot(np.log1p(df[df[col].notnull()][col]), plot=axes[1, 1]);
    
    plt.suptitle(col, fontsize=18)
    plt.tight_layout(rect=[0, 0, 1, 0.95]);
    
    if show:
        plt.show();
    elif file_path:
        plt.savefig(file_path, dpi=300)

In [None]:
# distributions of numerical features
# saving histograms for all numerical columns to see skewness of data

for col in tqdm.tqdm(num_col):
    
    path = os.path.join(os.path.pardir, 'data', 'figures', col)
    path = '.'.join([path, 'png'])
    hist_log_hist(train_df, col, show=False, file_path=path)
    plt.cla()
    plt.clf()
    plt.close()

In [None]:
# very skewed columns without 'SalePrice'
# skew_col = ['1stFlrSF', 'GrLivArea', 'LotArea']

In [None]:
# check scatter plot with the target "SalePrice"
for col in tqdm.tqdm(num_col):
    
    path = os.path.join(os.path.pardir, 'data', 'figures', col)
    path = '.'.join([path, 'png'])
    plt.scatter(train_df[col], train_df['SalePrice'])
    plt.savefig(path, dpi=300)
    plt.cla()
    plt.clf()
    plt.close()

Removing outiliers shown in the scatter plot. Points that did not follow the patterns in the scatter plot were dropped

In [9]:
# outliers from scatter plot
# point > 4000 in 1stFlrSF
# point > 5000 in BsmtFinSF1
# point >~ 4500 in GrLivArea
# point > 100000 in LotArea
# point > 300 in LotFrontage
# drop outliers from above

outliers_removed = train_df.copy()
outliers_removed = outliers_removed.drop(outliers_removed[outliers_removed['1stFlrSF'] >= 4000].index)
outliers_removed = outliers_removed.drop(outliers_removed[outliers_removed.BsmtFinSF1 >= 5000].index)
outliers_removed = outliers_removed.drop(outliers_removed[outliers_removed.GrLivArea > 4500].index)
outliers_removed = outliers_removed.drop(outliers_removed[outliers_removed.LotArea > 100000].index)
outliers_removed = outliers_removed.drop(outliers_removed[outliers_removed.LotFrontage > 300].index)

Check how each categorical feature influence the target sale price for the houses

In [None]:
# check importance of categorical features
for col in tqdm.tqdm(cat_col):
    
    path = os.path.join(os.path.pardir, 'data', 'figures', col)
    path = '.'.join([path, 'png'])
    
    # order bars by its mean
    order = outliers_removed.groupby([col])['SalePrice'].aggregate(
        np.mean).reset_index().sort_values('SalePrice')
    
    plt.figure(figsize=(12, 7))
    
    sns.barplot(x=col, y='SalePrice', data=outliers_removed, order=order[col]);
    
    plt.tight_layout()
    plt.savefig(path, dpi=300)
    plt.cla()
    plt.clf()
    plt.close()

Keeping track of the freatures that showed some ranking in the categories

In [None]:
# # columns with clear rankings
# rank_col = ['Alley', 'BsmtCond', 'BsmtExposure', 'BsmtQual',
#             'CentralAir', 'Electrical', 'ExterCond', 'Condition2', 
#             'Exterior1st', 'Exterior2nd', 'ExterQual', 'Fence', 
#             'FireplaceQU', 'Foundation', 'Functional', 'GarageFinish', 
#             'GarageQual', 'GarageType', 'Heating', 'HeatingQC', 'HouseStyle',
#             'KitchenQual', 'LandContour', 'MasVnrType', 'MSZoning', 'Neighborhood', 
#             'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleType', 'Street',
#             'Utilities']
# # maybe target encoder columns (not so clear ranking) or one hot encoder
# te_col = ['BldgType', 'BsmtFinType1', 'BsmtFinType2', 'Condition1', 'GarageCond', 
#           'LandSlope', 'LotConfig', 'LotShape', 'MiscFeature', 'SaleCondition']

Checking numerical features that are actually categorical

In [10]:
num_cat_col = ['MSSubClass', 'OverallQual', 'OverallCond']

In [None]:
for col in tqdm.tqdm(num_cat_col):
    
    path = os.path.join(os.path.pardir, 'data', 'figures', col)
    path = '.'.join([path, 'png'])
    
    # order bars by its mean
    order = outliers_removed.groupby([col])['SalePrice'].aggregate(
        np.mean).reset_index().sort_values('SalePrice')
    
    plt.figure(figsize=(12, 7))
    
    sns.barplot(x=col, y='SalePrice', data=outliers_removed, order=order[col]);
    
    plt.tight_layout()
    plt.savefig(path, dpi=300)
    plt.cla()
    plt.clf()
    plt.close()

From the figures we can see that MSSubClass is not actually in order, so we will pass it to categorical feature.

In [11]:
# MSSubClass
train_df['MSSubClass'] = train_df['MSSubClass'].astype('object')
test_df['MSSubClass'] = test_df['MSSubClass'].astype('object')

# Year and Month sold are transformed into categorical features.
train_df['YrSold'] = train_df['YrSold'].astype('object')
test_df['YrSold'] = test_df['YrSold'].astype('object')
train_df['MoSold'] = train_df['MoSold'].astype('object')
test_df['MoSold'] = test_df['MoSold'].astype('object')

## Missing values

In [12]:
train_df = outliers_removed.copy()
train_df[col_with_nan].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1453 entries, 1 to 1460
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotFrontage   1197 non-null   float64
 1   Alley         91 non-null     object 
 2   MasVnrType    1445 non-null   object 
 3   MasVnrArea    1445 non-null   float64
 4   BsmtQual      1416 non-null   object 
 5   BsmtCond      1416 non-null   object 
 6   BsmtExposure  1415 non-null   object 
 7   BsmtFinType1  1416 non-null   object 
 8   BsmtFinType2  1415 non-null   object 
 9   Electrical    1452 non-null   object 
 10  FireplaceQu   763 non-null    object 
 11  GarageType    1372 non-null   object 
 12  GarageYrBlt   1372 non-null   float64
 13  GarageFinish  1372 non-null   object 
 14  GarageQual    1372 non-null   object 
 15  GarageCond    1372 non-null   object 
 16  PoolQC        6 non-null      object 
 17  Fence         281 non-null    object 
 18  MiscFeature   52 non-null   

In [13]:
# missing values dataframe to avoid errors in code
missing = train_df.copy()

It is probable that houses have similar LotFrontage in the same Neighborhood

In [14]:
# fill missing values of lotfrontage with values of the median in the neighborhood
missing['LotFrontage'] = missing.groupby(['Neighborhood'])['LotFrontage'].transform(
    lambda x: x.fillna(x.median()))

All these features are probably not in the houses with missing values, so we will impute 'NA'.

In [15]:
# filling NaN values with 'NA'
fill_NA_col = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2',
              'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'Alley',
              'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']

for col in fill_NA_col:
    missing[col].fillna('NA', inplace=True)

For MasVnrArea, Type and Electrical we will impute the most frequent values because there are few missing values. For the the GarageYrBlt we will impute zero, because there is no garage in the house.

In [16]:
# fill with the most frequent value
mode_MasVnr = missing[['MasVnrArea','MasVnrType']].mode().iloc[0,:]
missing.MasVnrArea.fillna(mode_MasVnr[0], inplace=True)
missing.MasVnrType.fillna(mode_MasVnr[1], inplace=True)

# fill the one mv with the most frequent value in the column
mode_Electrical = missing.Electrical.mode()[0]
missing.Electrical.fillna(mode_Electrical, inplace=True)

# filling with year with zero because the garage was not built
missing['GarageYrBlt'].fillna(0, inplace=True)

In [17]:
train_df = missing.copy()

## Missing Values for test data

In [18]:
# get cols with NaN values
test_col_with_nan = [col for col in test_df.columns if test_df[col].isnull().any()]

In [19]:
# protecting original data
test_missing = test_df.copy()

In [20]:
test_missing[test_col_with_nan].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Data columns (total 33 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MSZoning      1455 non-null   object 
 1   LotFrontage   1232 non-null   float64
 2   Alley         107 non-null    object 
 3   Utilities     1457 non-null   object 
 4   Exterior1st   1458 non-null   object 
 5   Exterior2nd   1458 non-null   object 
 6   MasVnrType    1443 non-null   object 
 7   MasVnrArea    1444 non-null   float64
 8   BsmtQual      1415 non-null   object 
 9   BsmtCond      1414 non-null   object 
 10  BsmtExposure  1415 non-null   object 
 11  BsmtFinType1  1417 non-null   object 
 12  BsmtFinSF1    1458 non-null   float64
 13  BsmtFinType2  1417 non-null   object 
 14  BsmtFinSF2    1458 non-null   float64
 15  BsmtUnfSF     1458 non-null   float64
 16  TotalBsmtSF   1458 non-null   float64
 17  BsmtFullBath  1457 non-null   float64
 18  BsmtHalfBath  1457 non-nu

Get columns with categorical features and numerical ones. We will use it to fill missing values in the test data separately.

In [21]:
# getting numerical and categorical cols in cols with nan
cat_col_with_nan = [col for col in test_col_with_nan if test_missing[col].dtypes == 'object']
num_col_with_nan = list(set(test_col_with_nan) - set(cat_col_with_nan))

In [22]:
# filling LotFrontage missing values in test data with training median values
for neighborhood in list(train_df.groupby(['Neighborhood'])['LotFrontage'].median().index):
    
    median = train_df[train_df['Neighborhood'] == neighborhood].median()['LotFrontage']
    
    test_missing.loc[test_missing['Neighborhood'] == neighborhood, 'LotFrontage'] = \
        test_missing.loc[test_missing['Neighborhood'] == neighborhood, 'LotFrontage'].fillna(median)

In [23]:
# replacing missing value in the test data with the most frequent data in train data
# for categorical features, and with the median for numerical features
for col in cat_col_with_nan:
    test_missing[col].fillna(train_df[col].mode()[0], inplace=True)

for col in num_col_with_nan:
    test_missing[col].fillna(train_df[col].median(), inplace=True)

In [24]:
# separating 'object', 'int64' and 'float64' columns
cat_col = [col for col in train_df.columns if train_df[col].dtypes == 'object']
float_col = [col for col in train_df.columns if train_df[col].dtypes == 'float64']
int_col = [col for col in train_df.columns if train_df[col].dtypes == 'int64']

In [25]:
# fixing dtypes of columns in test data
for col in test_missing.columns:
    
    if col in cat_col:
        test_missing = test_missing.astype({col: 'object'})
    elif col in int_col:
        test_missing = test_missing.astype({col: 'int64'})
    elif col in float_col:
        test_missing = test_missing.astype({col: 'float64'})

In [26]:
test_df = test_missing.copy()

## Preparing data

Concatenate train and test to manipulate features

In [27]:
y = train_df['SalePrice']
train_df = train_df.drop('SalePrice', axis=1)
all_data = pd.concat([train_df, test_df])

There is only one sample in the training set that has Utilities as 'NoSeWa'. So it will not use this feature.

In [28]:
all_data.groupby('Utilities')['Utilities'].count()

Utilities
AllPub    2911
NoSeWa       1
Name: Utilities, dtype: int64

In [29]:
all_data = all_data.drop('Utilities', axis=1)

Adding some features that may improve the models predictions

In [30]:
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['TotalPorchArea'] = (all_data['EnclosedPorch'] + all_data['OpenPorchSF'] + 
                              all_data['3SsnPorch'] + all_data['ScreenPorch'])
all_data['HasPool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['Has2ndFloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['HasGarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['HasBsmt'] = all_data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)

In [31]:
all_data['YearBuilt'] = all_data['YearBuilt'].astype('object')
all_data['YearRemodAdd'] = all_data['YearRemodAdd'].astype('object')
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].astype('object')

In [32]:
# Label Encoding ranked features
NA_Ex_dict = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
label_encoding = {'Alley': {'NA': 0, 'Grvl': 1, 'Pave': 2}, 
                 'ExterQual': NA_Ex_dict,
                 'ExterCond': NA_Ex_dict,
                 'BsmtQual': NA_Ex_dict,
                 'BsmtCond': NA_Ex_dict,
                 'HeatingQC': NA_Ex_dict,
                 'KitchenQual': NA_Ex_dict,
                 'FireplaceQu': NA_Ex_dict,
                 'GarageQual': NA_Ex_dict,
                 'GarageCond': NA_Ex_dict,
                 'PoolQC': NA_Ex_dict,
                 'BsmtExposure':{'No': 0, 'Gd': 3, 'Mn': 1, 'Av': 2, 'NA': 0},
                 'CentralAir': {'N': 0, 'Y': 1},
                 'GarageFinish': {'RFn': 2, 'Unf': 1, 'Fin': 3, 'NA': 0},
                 'PavedDrive': {'Y': 2, 'N': 0, 'P': 1},
                 'Fence': {'NA': 0, 'MnPrv': 3, 'GdWo': 2, 'GdPrv': 4, 'MnWw': 1},
                 'Electrical': {'SBrkr': 5, 'FuseF': 3, 'FuseA': 4, 'FuseP': 2, 'Mix': 1}}

all_data.replace(label_encoding, inplace=True)

In [33]:
# get remaining object cols
object_col = [col for col in all_data.columns if all_data[col].dtypes == 'object']
num_col = [col for col in all_data.columns if all_data[col].dtypes != 'object']

In [34]:
skewed_feats = all_data[num_col].apply(lambda x: skew(x)).sort_values(ascending=False)

skewed_feats

MiscVal           21.944005
PoolQC            19.531987
PoolArea          17.673354
HasPool           15.481305
LowQualFinSF      12.073977
3SsnPorch         11.362112
KitchenAbvGr       4.296287
BsmtFinSF2         4.158442
Alley              4.132591
EnclosedPorch      3.998474
ScreenPorch        3.941117
BsmtHalfBath       3.940486
LotArea            3.600580
MasVnrArea         2.627278
OpenPorchSF        2.528695
TotalPorchArea     2.242861
WoodDeckSF         1.837911
Fence              1.750986
MSSubClass         1.373055
ExterCond          1.312957
1stFlrSF           1.263878
BsmtExposure       1.247050
GrLivArea          1.073596
TotalSF            1.017432
BsmtFinSF1         0.984885
BsmtUnfSF          0.918903
2ndFlrSF           0.860378
ExterQual          0.782285
TotRmsAbvGrd       0.751316
Fireplaces         0.727210
HalfBath           0.693943
TotalBsmtSF        0.678083
LotFrontage        0.627478
BsmtFullBath       0.616889
OverallCond        0.570607
KitchenQual        0

Deal with skewness in the data, using BoxCox1p transformation.

In [35]:
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

def fixing_skewness(df):
    """
    This function takes in a dataframe and return fixed skewed dataframe
    """
    
    ## Getting all the data that are not of "object" type. 
    numeric_feats = df.dtypes[df.dtypes != "object"].index

    # Check the skew of all numerical features
    skewed_feats = df[numeric_feats].apply(lambda x: skew(x)).sort_values(ascending=False)
    high_skew = skewed_feats[abs(skewed_feats) > 0.5]
    skewed_features = high_skew.index

    for feat in skewed_features:
        df[feat] = boxcox1p(df[feat], boxcox_normmax(df[feat] + 1))

fixing_skewness(all_data)



In [36]:
# Taking log of y because it makes the training faster and better
y = np.log1p(y)

In [37]:
train_df = all_data[:len(train_df)]
test_df = all_data[len(train_df):]

X = train_df.copy()
X_test = test_df.copy()

In [38]:
# Target Encoding the remaining object columns
import category_encoders as ce

target_encoder = ce.TargetEncoder(cols = object_col)

target_encoder.fit(X[object_col], y)
X[object_col] = target_encoder.transform(X[object_col], y)
X_test[object_col] = target_encoder.transform(X_test[object_col])

In [40]:
# Standardization
from sklearn.preprocessing import RobustScaler

robust_scaler = RobustScaler()
X = robust_scaler.fit_transform(X)
X_test = robust_scaler.transform(X_test)

In [41]:
X = pd.DataFrame(X)
X_test = pd.DataFrame(X_test)

## Modeling of the data

We will try to make a prediction model using regression models such as: Rigde, Lasso, ElasticNet, SVR and RandomForest. Let's analyse which one suits better for this problem.

### Importing modules of the models

In [62]:
from sklearn.linear_model import Ridge, ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

### Reading test data real solution

In [50]:
y_test = pd.read_csv('submission.csv')
y_test.drop(['Id'], axis=1, inplace=True)
y_test['SalePrice'] = np.log1p(y_test['SalePrice'])
y_test.head()

Unnamed: 0,SalePrice
0,11.561725
1,12.055256
2,12.154258
3,12.183321
4,12.162648


In [70]:
# function to fit and do hyperparameter tunning
def hyperparameter_tuning(X, y, X_test, y_test, parameters, model, cv=5, scoring='neg_root_mean_squared_error'):
    clf = GridSearchCV(model, parameters, cv=cv, scoring=scoring, n_jobs=-1)
    clf.fit(X, y)
    test_score = np.sqrt(mean_squared_error(y_test, clf.predict(X_test)))
    print('Best parameters:')
    print(clf.best_params_)
    print('Train score: %.4f' % (-clf.best_score_))
    print('Test score: %.4f' % test_score)
    return clf

### Ridge

In [71]:
param_ridge = {'alpha' : [0.001, 0.01, 0.1, 1.0, 10.0]}
ridge_model = Ridge(normalize=True)
clf_ridge = hyperparameter_tuning(X, y, X_test, y_test, param_ridge, ridge_model)

Best parameters:
{'alpha': 0.1}
Train score: 0.1131
Test score: 0.1302


### ElasticNet Model

In [73]:
param_en = {'alpha' : [0.00001, 0.0001, 0.001],
            'l1_ratio' : [0.1, 0.3, 0.5, 0.7, 0.9]}
elasticnet_model = ElasticNet(normalize=True)
clf_en = hyperparameter_tuning(X, y, X_test, y_test, param_en, elasticnet_model)

Best parameters:
{'alpha': 0.0001, 'l1_ratio': 0.7}
Train score: 0.1114
Test score: 0.1289


### Lasso Model

In [74]:
param_lasso = {'alpha' : [0.00001, 0.0001, 0.001]}
lasso_model = Lasso(normalize=True)
clf_lasso = hyperparameter_tuning(X, y, X_test, y_test, param_lasso, lasso_model)

Best parameters:
{'alpha': 0.0001}
Train score: 0.1117
Test score: 0.1287


### SVR

In [83]:
param_svr = {'kernel' : ['rbf'],
             'C' : [1e4, 1e5, 1e6],
             'epsilon' : [1e-6, 1e-5, 1e-4]
            }
svr_model = SVR()
clf_svr = hyperparameter_tuning(X, y, X_test, y_test, param_svr, svr_model)

Best parameters:
{'C': 100000.0, 'epsilon': 1e-05, 'kernel': 'rbf'}
Train score: 0.1183
Test score: 0.1387


### Random Forest

In [94]:
param_rf = {'n_estimators' : [500, 600, 700]}
rf_model = RandomForestRegressor()
clf_rf = hyperparameter_tuning(X, y, X_test, y_test, param_rf, rf_model)

Best parameters:
{'n_estimators': 600}
Train score: 0.1332
Test score: 0.1482


From all models from above, after parameter tunning, we see that they all have similar good results and they are not overfitting. The best ones were Lasso and ElaticNet regression (ElasticNet is a combination of Lasso and Ridge, so this was expected), they had very little difference from each other. We can also try to merge the models to make one ensemble model. We could even try some neural network to try to beat the highest score.

Just do not forget that we need to retransform the predictions because they are in np.log1p().

### Ensemble Model

Let's try to find a combination of the model above in which we can beat the best test score from before: 0.1287. As ElasticNet is a combination of Lasso and Ridge regression we will only use the ElasticNet model.

In [116]:
#y_ridge = clf_ridge.predict(X_test)
#y_lasso = clf_lasso.predict(X_test)
y_en = clf_en.predict(X_test)
y_svr = clf_svr.predict(X_test)
y_rf = clf_rf.predict(X_test)

lowest_score = 10000
best_params = {'c1': 0, 'c2': 0, 'c3': 1}

for i in tqdm.tqdm(range(100000)):
    c1, c2, c3 = np.random.rand(3)                                      #generate three number between [0, 1)
    total_c = c1 + c2 + c3                                              #getting the total value for normalizing
    c1 /= total_c                                                       #Normalizing
    c2 /= total_c
    c3 /= total_c
    
    y_ensemble = c1 * y_en + c2 * y_svr + c3 * y_rf
    actual_score = np.sqrt(mean_squared_error(y_test, y_ensemble))
    
    if actual_score < lowest_score:
        lowest_score = actual_score
        best_params['c1'] = c1
        best_params['c2'] = c2
        best_params['c3'] = c3

100%|██████████| 100000/100000 [04:50<00:00, 343.89it/s]


In [117]:
print('c1: %.4f' % best_params['c1'])
print('c2: %.4f' % best_params['c2'])
print('c3: %.4f' % best_params['c3'])
print('Lowest_score: %.4f' % lowest_score)

c1: 0.8827
c2: 0.0003
c3: 0.1170
Lowest_score: 0.1286


We see that we got an improvement of 0.0001, so it is not worth it. We can stick with the ElasticNet.