### 1. Import Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


from mpl_toolkits.mplot3d import Axes3D

import math
import shap
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### 2. Load Dataset

This dataset has been cleaned, with missing values imputed and categorical features being stripped and lowered

In [2]:
df_model = pd.read_csv('df_model')

### 3. Data Pre-processing

From the previous model we will use just the top features displayed in feature importance.

These include: 
* OverallQual
* GrLivArea
* TotalBsmtSF
* BsmtFinSF1
* YearBuilt
* YearRemodAdd
* LotArea
* OverallCond
* GarageCars
* BsmtQual_ex
* 1stFirSF
* Fireplaces
* GarageArea
* GarageFinish_unf
* 2ndFirSF
* KitchenQual_gd
* KitchenQual_ex
* BsmtQual_gd
* OpenPorchSF
* CentralAir_n

Create a subset of the dataframe that has these columns within it

In [3]:
df_model_selected = df_model [['OverallQual' , 'GrLivArea', 'TotalBsmtSF', 
                               'BsmtFinSF1', 'YearBuilt', 'YearRemodAdd', 'LotArea',
                               'OverallCond', 'GarageCars', 'BsmtQual_ex', '1stFlrSF',
                               'Fireplaces', 'GarageArea', 'GarageFinish_unf', 
                               '2ndFlrSF', 'KitchenQual_gd', 'KitchenQual_ex',
                               'BsmtQual_gd', 'OpenPorchSF', 'CentralAir_n',
                              'SalePrice']]

In [4]:
df_model_selected.head()

Unnamed: 0,OverallQual,GrLivArea,TotalBsmtSF,BsmtFinSF1,YearBuilt,YearRemodAdd,LotArea,OverallCond,GarageCars,BsmtQual_ex,1stFlrSF,Fireplaces,GarageArea,GarageFinish_unf,2ndFlrSF,KitchenQual_gd,KitchenQual_ex,BsmtQual_gd,OpenPorchSF,CentralAir_n,SalePrice
0,7,1710,856,706,2003,2003,8450,5,2,0,856,0,548,0,854,1,0,1,61,0,208500
1,6,1262,1262,978,1976,1976,9600,8,2,0,1262,1,460,0,0,0,0,1,0,0,181500
2,7,1786,920,486,2001,2002,11250,5,2,0,920,1,608,0,866,1,0,1,42,0,223500
3,7,1717,756,216,1915,1970,9550,5,3,0,961,1,642,1,756,1,0,0,35,0,140000
4,8,2198,1145,655,2000,2000,14260,5,3,0,1145,1,836,0,1053,1,0,1,84,0,250000


### 4. Model Preparation

This involves:

- one hot encoding of categorical feature

This was not done in this case as all the features are numeric. 

In the next notebook we will apply normalisation and scaling

### 5. Train Test Split

In [5]:
feature_names = ['OverallQual' , 'GrLivArea', 'TotalBsmtSF', 
                 'BsmtFinSF1', 'YearBuilt', 'YearRemodAdd', 'LotArea',
                 'OverallCond', 'GarageCars', 'BsmtQual_ex', '1stFlrSF',
                 'Fireplaces', 'GarageArea', 'GarageFinish_unf', 
                 '2ndFlrSF', 'KitchenQual_gd', 'KitchenQual_ex',
                 'BsmtQual_gd', 'OpenPorchSF', 'CentralAir_n']

In [6]:
X = df_model[feature_names]
y = df_model[['SalePrice']]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

### 6. Model Build

1. XGBoostRegressor
2. RandomForestRegressor

##### XGBoostRegressor

In [8]:
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
my_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
predictions = my_model.predict(X_test, iteration_range=(0, 5))

In [9]:
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, y_test)))

Mean Absolute Error : 139043.25897969285


In [10]:
print("Mean Squared Error : " + str(mean_squared_error(y_test, predictions, squared=False)))

Mean Squared Error : 157331.07238460737


In [11]:
MSE = mean_squared_error(y_test, predictions, squared=False)

In [12]:
rsme = math.sqrt(MSE)  
print("Root Mean Square Error:")  
print(rsme)  

Root Mean Square Error:
396.6498107709209


##### RandomForestRegressor

In [13]:
randforest  = RandomForestRegressor(random_state = 1)
randforest.fit(X_train, y_train)
pred =randforest.predict(X_test)

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


In [14]:
print("Mean Absolute Error : " + str(mean_absolute_error(pred, y_test)))

Mean Absolute Error : 17644.504731735156


In [15]:
print("Mean Squared Error : " + str(mean_squared_error(y_test, pred, squared=False)))

Mean Squared Error : 28484.815140125094


In [16]:
MSE = mean_squared_error(y_test, pred, squared=False)

In [17]:
rsme = math.sqrt(MSE)  
print("Root Mean Square Error:")  
print(rsme)  

Root Mean Square Error:
168.77445049569883
