# Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn.preprocessing import OrdinalEncoder
from category_encoders import MEstimateEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate, GridSearchCV

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Lasso, Ridge
from sklearn.linear_model import ElasticNet, BayesianRidge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

## Import Data

In [None]:
# Load the competition files. train/test use the 'Id' column as their index;
# the sample submission keeps 'Id' as a regular column.
train_df = pd.read_csv(
    filepath_or_buffer="../input/house-prices-advanced-regression-techniques/train.csv",
    index_col='Id',
)
test_df = pd.read_csv(
    filepath_or_buffer="../input/house-prices-advanced-regression-techniques/test.csv",
    index_col='Id',
)
submisson = pd.read_csv(
    filepath_or_buffer="../input/house-prices-advanced-regression-techniques/sample_submission.csv",
)

# Data Preprocessing

## Missing Values

### Check Missing Values

In [None]:
# Fraction of missing values per column, restricted to columns that contain at
# least one NaN and sorted from most to least affected. isnull().mean() divides
# by each frame's own row count, so no dataset size is hard-coded.
train_nan_frac = train_df.isnull().mean()
test_nan_frac = test_df.isnull().mean()
colnul_train = train_nan_frac[train_nan_frac > 0].sort_values(ascending=False)
colnul_test = test_nan_frac[test_nan_frac > 0].sort_values(ascending=False)

# One horizontal bar chart per dataset, side by side.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
panels = [
    (colnul_train, 'Missing values in training data'),
    (colnul_test, 'Missing values in testing data'),
]
for ax, (fractions, title) in zip(axes, panels):
    sns.barplot(x=fractions, y=fractions.index, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Nan Values')

### Fill Missing values

Mode :
	
    FireplaceQu
	
     MasVnrType
	
     Electrical
     
     Alley

     Fence


None :
	
    GarageType
	
     GarageFinish
	
     GarageQual
	
     GarageCond

YearBuilt :
	
    GarageYrBlt

NoBsmt :
	
    BsmtExposure
	
     BsmtFinType1
	
     BsmtFinType2
	
     BsmtCond
	
     BsmtQual

0 :

    MasVnrArea

     GarageYrBlt

     GarageCars

     GarageArea

     BsmtFinSF1

     BsmtFinSF2

     BsmtUnfSF

     TotalBsmtSF

     BsmtFullBath

     BsmtHalfBath

In [None]:
# Column groups, keyed by the value used to fill their NaNs.
mode_fill_cols = ['FireplaceQu', 'MasVnrType', 'Electrical', 'Alley', 'Fence']
none_fill_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
nobsmt_fill_cols = ['BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtCond', 'BsmtQual']
zero_fill_cols = ['GarageYrBlt', 'MasVnrArea', 'GarageCars', 'GarageArea', 'BsmtFinSF1',
                  'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
# These columns are only filled in test_df (with the training mode).
test_only_mode_cols = ["Functional", "KitchenQual", "Exterior1st", "Exterior2nd", "MSZoning", "SaleType"]

# Every fill statistic comes from the training data and is computed once up
# front, so both frames are imputed with exactly the same values.
train_modes = {col: train_df[col].mode()[0] for col in mode_fill_cols + test_only_mode_cols}
street_frontage_mean = train_df.groupby('Street')['LotFrontage'].mean()

for df in [train_df, test_df]:
    #Categorical
    for col in mode_fill_cols:
        df[col] = df[col].fillna(train_modes[col])

    for col in none_fill_cols:
        df[col] = df[col].fillna('None')

    for col in nobsmt_fill_cols:
        df[col] = df[col].fillna('NoBsmt')

    df['MiscFeature'] = df['MiscFeature'].fillna('NoMisc')
    # NOTE(review): a missing PoolQC presumably means "no pool"; filling it with
    # 'TA' labels those rows as a typical-quality pool -- confirm this is intended.
    df['PoolQC'] = df['PoolQC'].fillna('TA')

    df.drop("Utilities", axis=1, inplace=True)

    #Numerical
    # Missing LotFrontage takes the training mean of the row's Street type.
    df['LotFrontage'] = df['LotFrontage'].fillna(df['Street'].map(street_frontage_mean))

    for col in zero_fill_cols:
        df[col] = df[col].fillna(0)

for col in test_only_mode_cols:
    test_df[col] = test_df[col].fillna(train_modes[col])

In [None]:
# Total number of NaN cells left in each frame after imputation.
for label, frame in (('train_df', train_df), ('test_df', test_df)):
    print(f'Missing value in {label} : {frame.isna().sum().sum()}')

## Encoding Categorical Data

**Categorical Variable** :

     Nominal -> no intrinsic ordering
     Ordinal -> clear ordering

In [None]:
# Split the training columns by dtype.
# categorical: Series mapping each object-dtype column to its number of unique values.
# numerical:   Index holding the names of all remaining (non-object) columns.
column_dtypes = train_df.dtypes
object_cols = column_dtypes[column_dtypes == 'object'].index
categorical = train_df[object_cols].nunique()
numerical = column_dtypes[column_dtypes != 'object'].index

### Get Unique values for each Categorical Variable

In [None]:
# Map every categorical variable to the sorted list of values it takes in train_df.
cat_var_unique = {col: sorted(train_df[col].unique()) for col in categorical.index}

# One row per variable, one column per value position (shorter rows are padded
# with missing values). Sort on all value columns, whatever their number, so that
# variables sharing the same set of levels end up next to each other.
df_cat_var_unqiue = pd.DataFrame.from_dict(cat_var_unique, orient='index')
df_cat_var_unqiue = df_cat_var_unqiue.sort_values(list(df_cat_var_unqiue.columns))

### Ordinal Encoding

In [None]:
# Ordinal variables are grouped by the set of levels they share. Each
# `ord_varN_cat` lists the levels of group N in the order used for encoding
# (first entry is encoded lowest).

ord_var1, ord_var1_cat = ["ExterCond", "HeatingQC"], ["Po", "Fa", "TA", "Gd", "Ex"]

ord_var2, ord_var2_cat = ["ExterQual", "KitchenQual"], ["Fa", "TA", "Gd", "Ex"]

ord_var3, ord_var3_cat = (
    ["FireplaceQu", "GarageQual", "GarageCond"],
    ["None", "Po", "Fa", "TA", "Gd", "Ex"],
)

ord_var4, ord_var4_cat = ["BsmtQual"], ["NoBsmt", "Fa", "TA", "Gd", "Ex"]

ord_var5, ord_var5_cat = ["BsmtCond"], ["NoBsmt", "Po", "Fa", "TA", "Gd"]

ord_var6, ord_var6_cat = ["BsmtExposure"], ["NoBsmt", "No", "Mn", "Av", "Gd"]

ord_var7, ord_var7_cat = (
    ["BsmtFinType1", "BsmtFinType2"],
    ["NoBsmt", "Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
)

# Parallel lists (group i of variables <-> level order i) for easy iteration,
# plus the flat list of every ordinal variable name.
ord_var = [ord_var1, ord_var2, ord_var3, ord_var4, ord_var5, ord_var6, ord_var7]
ord_var_cat = [
    ord_var1_cat, ord_var2_cat, ord_var3_cat, ord_var4_cat,
    ord_var5_cat, ord_var6_cat, ord_var7_cat,
]
ord_all = [name for group in ord_var for name in group]

In [None]:
# Encode each group of ordinal variables with its shared, explicitly ordered
# level list. The encoder is fitted on the training columns only and then
# applied to the test columns, so both frames use the same integer codes and
# a level outside the declared list raises instead of being silently refit.
for variables, levels in zip(ord_var, ord_var_cat):
    encoder = OrdinalEncoder(categories=[levels] * len(variables))
    train_df[variables] = encoder.fit_transform(train_df[variables])
    test_df[variables] = encoder.transform(test_df[variables])

### One-Hot Encoding

In [None]:
# The ordinal variables are already encoded, so remove them from the pool.
categorical = categorical.drop(ord_all)

# One-hot encode only the variables with fewer than 6 unique values.
low_cardinality = categorical < 6
onehot_var = categorical[low_cardinality].index

train_df, test_df = [
    pd.get_dummies(frame, prefix=onehot_var, columns=onehot_var)
    for frame in (train_df, test_df)
]

In [None]:
# Names of the train_df columns that test_df does not have yet.
add_var = [col for col in train_df.columns if col not in test_df.columns]

# Add each of them to test_df as an all-zero column; the target is skipped.
for col in (name for name in add_var if name != 'SalePrice'):
    test_df[col] = 0

In [None]:
# Give test_df the same column order as train_df (minus the target column).
feature_order = train_df.columns.drop('SalePrice')
test_df = test_df.loc[:, feature_order]

### Target Encoding

The problem with One-Hot Encoding is that the more unique values in a variable, the more new columns will be created. In that case, it can lead to high memory consumption and increase the computational cost. Therefore, we will use target encoding for variables with 6 or more unique values.

Here we will use the M-Estimate Encoder. It is a simplified version of the target encoder, also known as the m-probability estimate or additive smoothing with known incidence rates. Compared with the target encoder, which has two tunable parameters (min_samples_leaf and smoothing), the m-probability estimate has only one tunable parameter (m).

In [None]:
# What is left in `categorical` after removing the one-hot variables is the
# set of variables that will be target encoded.
categorical = categorical.drop(labels=onehot_var)

# Separate the target from the features for the supervised encoder.
y_train = train_df['SalePrice']
x_train = train_df.drop(columns='SalePrice')

In [None]:
# Columns to target encode: the remaining categorical variables plus MoSold.
# `cols` is given as a plain list of column names rather than a DataFrame slice.
target_encoded_cols = list(categorical.index) + ['MoSold']

Mest = MEstimateEncoder(cols=target_encoded_cols)
# Fit on the training features/target only, then reuse the fitted encoder on test.
x_train = Mest.fit_transform(x_train, y_train)
test_df = Mest.transform(test_df)
# Re-attach the target so train_df is again features + SalePrice.
train_df = pd.concat([x_train, y_train], axis=1)

## Correlation

In [None]:
# Annotated correlation heatmap over the columns listed in `numerical`.
sns.set_theme(rc={'figure.figsize': (24, 20)})
numeric_corr = train_df[numerical].corr()
sns.heatmap(numeric_corr, annot=True, fmt='.2f')

In [None]:
# Correlation of every `numerical` column with SalePrice, drawn as horizontal
# bars with a red reference line at 0.5.
train = train_df[numerical].corr().loc[:, ['SalePrice']]
plt.figure(figsize=(20, 15))
sns.barplot(data=train, x='SalePrice', y=train.index)
plt.axvline(x=0.5, color='r')

## Creating New Variables

Now we will create some new variables based on variables that have a high correlation in the data above to avoid collinearity. We will also create new features that might be useful.

In [None]:
# Derive the engineered features in place on both frames.
for df in [train_df, test_df]:
    # GarageYrBlt was filled with 0 for missing values, so a positive year
    # marks the rows that have a garage.
    has_garage = df["GarageYrBlt"] > 0

    # 0 / 0 (no garage) yields NaN, which is replaced by 0.
    df['GarAreaPerCar'] = (df['GarageArea'] / df['GarageCars']).fillna(0)
    df['GrLivAreaPerRoom'] = df['GrLivArea'] / df['TotRmsAbvGrd']
    df["TotalHouseSF"] = df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"]
    df["TotalFullBath"] = df["FullBath"] + df["BsmtFullBath"]
    df["TotalHalfBath"] = df["HalfBath"] + df["BsmtHalfBath"]
    df["InitHouseAge"] = df["YrSold"] - df["YearBuilt"]
    # Algebraically equal to YearRemodAdd - YearBuilt.
    df["RemodHouseAge"] = df["InitHouseAge"] - (df["YrSold"] - df["YearRemodAdd"])
    df["IsRemod"] = (df["YearRemodAdd"] > df["YearBuilt"]).astype(int)
    # Rows without a garage get age 0 instead of YrSold - 0.
    df["GarageAge"] = (df["YrSold"] - df["GarageYrBlt"]).where(has_garage, 0)
    df["IsGarage"] = has_garage.astype(int)
    df['TotalPorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
    df["AvgQualCond"] = (df["OverallQual"] + df["OverallCond"]) / 2

## Deleting Variables

In [None]:
# Columns that were combined into the engineered features above.
redundant_cols = [
    "GarageArea", "GarageCars", "GrLivArea",
    "TotRmsAbvGrd", "TotalBsmtSF", "1stFlrSF",
    "2ndFlrSF", "FullBath", "BsmtFullBath", "HalfBath",
    "BsmtHalfBath", "YrSold", "YearBuilt", "YearRemodAdd",
    "GarageYrBlt", "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
    "ScreenPorch", "OverallQual", "OverallCond",
]

# drop() returns a new frame, so the result must be bound back to train_df and
# test_df themselves; assigning to a loop variable would leave both unchanged.
train_df = train_df.drop(columns=redundant_cols)
test_df = test_df.drop(columns=redundant_cols)

# Modeling

## Splitting Data

In [None]:
# Target and feature matrix for training; test_df is used as-is for prediction.
y_train = train_df['SalePrice']
X_train = train_df.drop(columns='SalePrice')
X_test = test_df

## Scaling Data

In [None]:
# Standardize the features with statistics learned from the training set only,
# and move the target to log10 scale.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
y_train_log = np.log10(y_train)

## Selecting Best Model

In [None]:
# Candidate regressors, all with default hyperparameters.
model = {
    'XGB' : XGBRegressor(),
    'LGBM' : LGBMRegressor(),
    'Lasso' : Lasso(),
    'Ridge' : Ridge(),
    'Elastic Net' : ElasticNet(),
    'Bayesian Ridge' : BayesianRidge(),
    'SVR' : SVR(),
    'Gradient Boosting' : GradientBoostingRegressor()
}

# 10-fold cross-validated RMSE (on the log10 target) for each candidate.
# Rows are collected in a list and turned into a DataFrame once, because
# DataFrame.append is not available in pandas >= 2.0. The loop variable is
# named `estimator` so the `model` dict is not overwritten while iterating.
rows = []
for name, estimator in model.items():
    crossvad = cross_validate(estimator, X_train_scaled, y_train_log, cv=10,
                              scoring=['neg_root_mean_squared_error'])
    # The scorer returns negated RMSE values; take the absolute value before averaging.
    rows.append({'Model' : name,
                 'Avg_RMSE' : np.abs(crossvad['test_neg_root_mean_squared_error']).mean()})

result = pd.DataFrame(rows, columns=['Model', 'Avg_RMSE'])
result = result.sort_values('Avg_RMSE', ascending=True)
# Last expression of the cell: shows the ranking with a fresh 0..n-1 index.
result.reset_index(drop=True)

We will choose the three best models from the cross-validation test above, which are Gradient Boosting, LGBM, and XGB.

## Hyperparameter Tuning

### Gradient Boosting

In [None]:
# Exhaustive grid search for Gradient Boosting with 10-fold CV.
gb = GradientBoostingRegressor(random_state=0)
params = {
    'loss' : ('squared_error', 'absolute_error'),
    'learning_rate' : (1.0, 0.1, 0.01),
    'n_estimators' : (50, 100, 200)
}

# Rank candidates by (negated) RMSE, the metric this notebook reports, rather
# than by the estimator's default score.
mod1 = GridSearchCV(gb, params, cv=10, scoring='neg_root_mean_squared_error')
mod1.fit(X_train_scaled, y_train_log)
print('Best hyperparameter : ', mod1.best_params_)

In [None]:
# RMSE of the tuned Gradient Boosting model on the training data (log10 scale).
# The square root is taken explicitly because the `squared` argument of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
y_pred = mod1.predict(X_train_scaled)
print(f'Train RMSE : {np.sqrt(mean_squared_error(y_train_log, y_pred))}')

### LGBM

In [None]:
# Exhaustive grid search for LightGBM with 10-fold CV.
lgbm = LGBMRegressor(random_state=0)
params = {
    'num_leaves' : (11, 31, 51),
    'learning_rate' : (0.5, 0.1, 0.05),
    'n_estimators' : (50, 100, 200)
}

# Rank candidates by (negated) RMSE, the metric this notebook reports, rather
# than by the estimator's default score.
mod2 = GridSearchCV(lgbm, params, cv=10, scoring='neg_root_mean_squared_error')
mod2.fit(X_train_scaled, y_train_log)
print('Best hyperparameter : ', mod2.best_params_)

In [None]:
# RMSE of the tuned LightGBM model on the training data (log10 scale).
# The square root is taken explicitly because the `squared` argument of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
y_pred = mod2.predict(X_train_scaled)
print(f'Train RMSE : {np.sqrt(mean_squared_error(y_train_log, y_pred))}')

### XGB

In [None]:
# Exhaustive grid search for XGBoost with 10-fold CV.
xgb = XGBRegressor(random_state=0)
params = {
    'max_depth' : (3, 6, 9),
    'learning_rate' : (0.3, 0.1, 0.05),
    'n_estimators' : (50, 100, 200)
}

# Rank candidates by (negated) RMSE, the metric this notebook reports, rather
# than by the estimator's default score.
mod3 = GridSearchCV(xgb, params, cv=10, scoring='neg_root_mean_squared_error')
mod3.fit(X_train_scaled, y_train_log)
print('Best hyperparameter : ', mod3.best_params_)

In [None]:
# RMSE of the tuned XGBoost model on the training data (log10 scale).
# The square root is taken explicitly because the `squared` argument of
# mean_squared_error is deprecated/removed in recent scikit-learn releases.
y_pred = mod3.predict(X_train_scaled)
print(f'Train RMSE : {np.sqrt(mean_squared_error(y_train_log, y_pred))}')

### Stacking 3 Models

In [None]:
def mod_predict(x):
    """Return the weighted average of the three tuned models' predictions.

    The weights are 0.3 for mod1, 0.5 for mod2 and 0.2 for mod3. The models
    were fitted on the log10 target, so the result is on that scale too.

    Args:
        x: Feature matrix accepted by the fitted models' ``predict``.

    Returns:
        One blended prediction per row of ``x``.
    """
    return (3 * mod1.predict(x) + 5 * mod2.predict(x) + 2 * mod3.predict(x)) / 10


y_pred_stack = mod_predict(X_train_scaled)
# Explicit square root: the `squared` argument of mean_squared_error is
# deprecated/removed in recent scikit-learn releases.
print(f'Train RMSE with stacking : {np.sqrt(mean_squared_error(y_train_log, y_pred_stack))}')

# Submission

In [None]:
# Blend the predictions for the test set, then undo the log10 transform of the
# target to get prices back on the original scale.
y_pred = mod_predict(X_test_scaled)
y_pred_inv = np.power(10, y_pred)

In [None]:
# Put the predicted prices into the submission frame and write it out
# without the row index.
submisson = submisson.assign(SalePrice=y_pred_inv)
submisson.to_csv(path_or_buf='submission.csv', index=False)