### 1. Import Library

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


from mpl_toolkits.mplot3d import Axes3D

import math
import shap
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### 2. Load Dataset

This dataset has been cleaned, with missing values imputed and categorical features being stripped and lowered

In [24]:
df_model = pd.read_csv('df_model')

### 3. Data Pre-processing

From the previous model we will use just the top features displayed in feature importance.

These include: 
* OverallQual
* GrLivArea
* TotalBsmtSF
* BsmtFinSF1
* YearBuilt
* YearRemodAdd
* LotArea
* OverallCond
* GarageCars
* BsmtQual_ex
* 1stFirSF
* Fireplaces
* GarageArea
* GarageFinish_unf
* 2ndFirSF
* KitchenQual_gd
* KitchenQual_ex
* BsmtQual_gd
* OpenPorchSF
* CentralAir_n

Create a subset of the dataframe that has these columns within it

In [25]:
df_model_selected = df_model [['OverallQual' , 'GrLivArea', 'TotalBsmtSF', 
                               'BsmtFinSF1', 'YearBuilt', 'YearRemodAdd', 'LotArea',
                               'OverallCond', 'GarageCars', 'BsmtQual_ex', '1stFlrSF',
                               'Fireplaces', 'GarageArea', 'GarageFinish_unf', 
                               '2ndFlrSF', 'KitchenQual_gd', 'KitchenQual_ex',
                               'BsmtQual_gd', 'OpenPorchSF', 'CentralAir_n', 'SalePrice']]

In [26]:
df_model_selected.head()

Unnamed: 0,OverallQual,GrLivArea,TotalBsmtSF,BsmtFinSF1,YearBuilt,YearRemodAdd,LotArea,OverallCond,GarageCars,BsmtQual_ex,1stFlrSF,Fireplaces,GarageArea,GarageFinish_unf,2ndFlrSF,KitchenQual_gd,KitchenQual_ex,BsmtQual_gd,OpenPorchSF,CentralAir_n,SalePrice
0,7,1710,856,706,2003,2003,8450,5,2,0,856,0,548,0,854,1,0,1,61,0,208500
1,6,1262,1262,978,1976,1976,9600,8,2,0,1262,1,460,0,0,0,0,1,0,0,181500
2,7,1786,920,486,2001,2002,11250,5,2,0,920,1,608,0,866,1,0,1,42,0,223500
3,7,1717,756,216,1915,1970,9550,5,3,0,961,1,642,1,756,1,0,0,35,0,140000
4,8,2198,1145,655,2000,2000,14260,5,3,0,1145,1,836,0,1053,1,0,1,84,0,250000


In [27]:
feature_names = df_model[['OverallQual' , 'GrLivArea', 'TotalBsmtSF', 
                               'BsmtFinSF1', 'YearBuilt', 'YearRemodAdd', 'LotArea',
                               'OverallCond', 'GarageCars', 'BsmtQual_ex', '1stFlrSF',
                               'Fireplaces', 'GarageArea', 'GarageFinish_unf', 
                               '2ndFlrSF', 'KitchenQual_gd', 'KitchenQual_ex',
                               'BsmtQual_gd', 'OpenPorchSF', 'CentralAir_n',
                          'SalePrice']]

In [28]:
feature_names.info

<bound method DataFrame.info of       OverallQual  GrLivArea  TotalBsmtSF  BsmtFinSF1  YearBuilt  \
0               7       1710          856         706       2003   
1               6       1262         1262         978       1976   
2               7       1786          920         486       2001   
3               7       1717          756         216       1915   
4               8       2198         1145         655       2000   
5               5       1362          796         732       1993   
6               8       1694         1686        1369       2004   
7               7       2090         1107         859       1973   
8               7       1774          952           0       1931   
9               5       1077          991         851       1939   
10              5       1040         1040         906       1965   
11              9       2324         1175         998       2005   
12              5        912          912         737       1962   
13              

### 4. Train Test Split

In [29]:
X = df_model[['OverallQual' , 'GrLivArea', 'TotalBsmtSF', 
                               'BsmtFinSF1', 'YearBuilt', 'YearRemodAdd', 'LotArea',
                               'OverallCond', 'GarageCars', 'BsmtQual_ex', '1stFlrSF',
                               'Fireplaces', 'GarageArea', 'GarageFinish_unf', 
                               '2ndFlrSF', 'KitchenQual_gd', 'KitchenQual_ex',
                               'BsmtQual_gd', 'OpenPorchSF', 'CentralAir_n']]
y = df_model[['SalePrice']]

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

### 5. Model Preparation

This involves:

1. Normalisation using MinMax Scaler
2. One-hot encoding of categorical variable (Not required here as all values are numeric)

In [31]:
# MinMaxScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns = X_test.columns)
X_train_scaled_df.head()

Unnamed: 0,OverallQual,GrLivArea,TotalBsmtSF,BsmtFinSF1,YearBuilt,YearRemodAdd,LotArea,OverallCond,GarageCars,BsmtQual_ex,1stFlrSF,Fireplaces,GarageArea,GarageFinish_unf,2ndFlrSF,KitchenQual_gd,KitchenQual_ex,BsmtQual_gd,OpenPorchSF,CentralAir_n
0,-0.820445,-0.407093,0.572612,1.037269,-0.455469,-1.346063,-0.212896,0.372217,-1.056544,-0.300948,0.374235,-0.958592,-0.863837,-0.9337,-0.801923,-0.820581,-0.265664,-0.854617,-0.714352,-0.276582
1,-0.088934,0.08317,-0.596547,-0.971996,0.718609,0.439214,-0.265245,1.268609,0.295092,-0.300948,-0.958202,0.59215,-0.456264,-0.9337,0.955088,-0.820581,-0.265664,1.170115,-0.138015,-0.276582
2,-0.820445,-1.39525,-0.603357,-0.971996,-1.988293,-1.683818,-0.177841,1.268609,-2.408179,-0.300948,-0.965964,-0.958592,-2.257169,1.071008,-0.801923,-0.820581,-0.265664,-0.854617,-0.714352,-0.276582
3,-0.820445,0.458975,-0.750921,0.267995,-1.107734,-1.683818,-0.324474,1.268609,-1.056544,-0.300948,-0.487321,2.142892,-1.119755,1.071008,0.989227,1.218649,-0.265664,1.170115,-0.714352,-0.276582
4,-0.820445,0.312087,-0.081209,-0.49692,-1.531707,-1.683818,-0.529035,0.372217,-1.056544,-0.300948,-0.370895,0.59215,-0.797488,1.071008,0.711564,1.218649,-0.265664,-0.854617,-0.714352,-0.276582


In [33]:
X_test_scaled_df.head()

Unnamed: 0,OverallQual,GrLivArea,TotalBsmtSF,BsmtFinSF1,YearBuilt,YearRemodAdd,LotArea,OverallCond,GarageCars,BsmtQual_ex,1stFlrSF,Fireplaces,GarageArea,GarageFinish_unf,2ndFlrSF,KitchenQual_gd,KitchenQual_ex,BsmtQual_gd,OpenPorchSF,CentralAir_n
0,-0.088934,-0.876372,-0.006292,0.472844,-0.259789,0.87347,-0.211594,2.165,-1.056544,-0.300948,-0.26223,-0.958592,-1.006014,-0.9337,-0.801923,-0.820581,-0.265664,-0.854617,-0.714352,-0.276582
1,1.374088,2.088099,0.910874,1.276986,0.751222,0.487465,0.145643,-0.524174,0.295092,3.322836,0.855465,2.142892,1.117159,-0.9337,1.75166,1.218649,-0.265664,-0.854617,-0.253282,-0.276582
2,-0.820445,-0.952678,-0.122072,-0.971996,-1.433867,-1.683818,-0.160826,0.372217,0.295092,-0.300948,-0.36572,0.59215,-0.551048,1.071008,-0.801923,-0.820581,-0.265664,-0.854617,-0.714352,-0.276582
3,-0.088934,0.260581,-0.131153,-0.102477,-0.781602,-1.683818,-0.529035,1.268609,0.295092,-0.300948,-0.427814,2.142892,-0.266695,1.071008,0.700185,-0.820581,-0.265664,-0.854617,-0.36855,-0.276582
4,2.105599,0.176644,1.267297,1.255193,1.175195,1.114724,0.205338,-0.524174,1.646727,3.322836,1.165936,0.59215,2.065003,-0.9337,-0.801923,-0.820581,3.764151,-0.854617,-0.714352,-0.276582


### 6. Model Build

1. XGBoostRegressor
2. RandomForestRegressor

#### XGBoostRegressor

In [34]:
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(X_train_scaled_df, y_train, eval_set=[(X_test_scaled_df, y_test)], verbose=False)
predictions = my_model.predict(X_test_scaled_df, iteration_range=(0, 5))

In [35]:
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, y_test)))

Mean Absolute Error : 139043.25897969285


In [36]:
print("Mean Squared Error : " + str(mean_squared_error(y_test, predictions, squared=False)))

Mean Squared Error : 157331.07238460737


In [37]:
MSE = mean_squared_error(y_test, predictions, squared=False)

In [38]:
rsme = math.sqrt(MSE)  
print("Root Mean Square Error:")  
print(rsme)  

Root Mean Square Error:
396.6498107709209


#### RandomForestRegressor

In [39]:
randforest  = RandomForestRegressor(random_state = 1)
randforest.fit(X_train_scaled_df, y_train)
pred =randforest.predict(X_test_scaled_df)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


In [40]:
print("Mean Absolute Error : " + str(mean_absolute_error(pred, y_test)))

Mean Absolute Error : 17650.444457762555


In [41]:
print("Mean Squared Error : " + str(mean_squared_error(y_test, pred, squared=False)))

Mean Squared Error : 28496.65097315002


In [42]:
MSE = mean_squared_error(y_test, pred, squared=False)

In [43]:
rsme = math.sqrt(MSE)  
print("Root Mean Square Error:")  
print(rsme)  

Root Mean Square Error:
168.80951090844977
