# XGBoost Regressor

## Import libraries

In [4]:
import numpy as np
import pandas as pd

## Import the cleaned dataset

In [5]:
# Read the pre-cleaned resale dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='clean_data.csv')

# Print column names, dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287196 entries, 0 to 287195
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   town                 287196 non-null  object 
 1   flat_type            287196 non-null  object 
 2   floor_area_sqm       287196 non-null  float64
 3   resale_price         287196 non-null  int64  
 4   year_of_sale         287196 non-null  int64  
 5   storey               287196 non-null  object 
 6   year_lease_commence  287196 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 15.3+ MB


## Training

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Separate the predictor columns from the regression target (resale_price).
features = df.drop(columns='resale_price')
target = df['resale_price'].copy()  # copy() is deep by default

# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

X_train

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence
79762,TAMPINES,3 ROOM,60.0,1994,LOW,1985
59669,KALLANG/WHAMPOA,5 ROOM,114.0,1994,MID,1974
75897,PASIR RIS,5 ROOM,122.0,1994,MID,1989
216035,HOUGANG,3 ROOM,73.0,1998,MID,1978
249067,PASIR RIS,4 ROOM,106.0,1999,LOW,1995
...,...,...,...,...,...,...
119879,WOODLANDS,4 ROOM,83.0,1996,LOW,1990
259178,HOUGANG,3 ROOM,67.0,1999,MID,1983
131932,BEDOK,3 ROOM,83.0,1996,LOW,1980
146867,BEDOK,4 ROOM,92.0,1997,LOW,1978


In [8]:
# Carve a validation set (30%) out of the current training rows.
# This overwrites X_train / y_train with the smaller remainder, so running the
# cell a second time without re-running the previous split shrinks them again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=8,
)

X_train

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence
217170,PASIR RIS,4 ROOM,108.0,1998,LOW,1993
99114,TAMPINES,3 ROOM,74.0,1995,MID,1986
230916,CHOA CHU KANG,4 ROOM,104.0,1999,MID,1995
116779,TAMPINES,4 ROOM,93.0,1996,MID,1985
44766,GEYLANG,3 ROOM,62.0,1993,MID,1969
...,...,...,...,...,...,...
58535,BISHAN,5 ROOM,121.0,1994,HIGH,1987
12588,ANG MO KIO,3 ROOM,67.0,1991,LOW,1979
110081,JURONG EAST,4 ROOM,83.0,1995,MID,1986
188228,YISHUN,4 ROOM,92.0,1998,LOW,1984


## Data Preprocessing

In [9]:
from pickle import load

In [10]:
# Load the previously saved town encoder. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
# NOTE(review): unpickling executes arbitrary code - only load files you trust.
# NOTE(review): presumably a leave-one-out target encoder fitted elsewhere - confirm.
with open('town_looe.pkl', 'rb') as encoder_file:
    town_looe = load(encoder_file)

# Encode the town names and rename the resulting 'town' column so it can sit
# next to the original 'town' column without a name clash.
df_enc_town = town_looe.transform(X_train['town']).rename(columns={'town': 'town_enc'})

X_train = pd.concat([X_train, df_enc_town], axis=1)

X_train

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc
217170,PASIR RIS,4 ROOM,108.0,1998,LOW,1993,375197.713106
99114,TAMPINES,3 ROOM,74.0,1995,MID,1986,263294.883347
230916,CHOA CHU KANG,4 ROOM,104.0,1999,MID,1995,302768.354018
116779,TAMPINES,4 ROOM,93.0,1996,MID,1985,263294.883347
44766,GEYLANG,3 ROOM,62.0,1993,MID,1969,184591.699708
...,...,...,...,...,...,...,...
58535,BISHAN,5 ROOM,121.0,1994,HIGH,1987,342266.840904
12588,ANG MO KIO,3 ROOM,67.0,1991,LOW,1979,163762.230305
110081,JURONG EAST,4 ROOM,83.0,1995,MID,1986,199857.145714
188228,YISHUN,4 ROOM,92.0,1998,LOW,1984,215471.592173


In [11]:
# Load the previously saved flat_type one-hot encoder. The context manager closes
# the file handle deterministically instead of leaving it to garbage collection.
# NOTE(review): unpickling executes arbitrary code - only load files you trust.
with open('flat_type_ohe.pkl', 'rb') as encoder_file:
    flat_type_ohe = load(encoder_file)

# The encoder is fed a single-column 2-D array; its output is densified with
# .toarray() and re-labelled with the training index so the concat below aligns
# row by row.
idx = X_train.index
df_flat_type_ohe = pd.DataFrame(
    flat_type_ohe.transform(X_train['flat_type'].values.reshape(-1, 1)).toarray(),
    columns=flat_type_ohe.get_feature_names_out(['flat_type']),
    index=idx,
)
X_train = pd.concat([X_train, df_flat_type_ohe], axis=1)

X_train

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION
217170,PASIR RIS,4 ROOM,108.0,1998,LOW,1993,375197.713106,0.0,0.0,0.0,1.0,0.0,0.0,0.0
99114,TAMPINES,3 ROOM,74.0,1995,MID,1986,263294.883347,0.0,0.0,1.0,0.0,0.0,0.0,0.0
230916,CHOA CHU KANG,4 ROOM,104.0,1999,MID,1995,302768.354018,0.0,0.0,0.0,1.0,0.0,0.0,0.0
116779,TAMPINES,4 ROOM,93.0,1996,MID,1985,263294.883347,0.0,0.0,0.0,1.0,0.0,0.0,0.0
44766,GEYLANG,3 ROOM,62.0,1993,MID,1969,184591.699708,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58535,BISHAN,5 ROOM,121.0,1994,HIGH,1987,342266.840904,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12588,ANG MO KIO,3 ROOM,67.0,1991,LOW,1979,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0
110081,JURONG EAST,4 ROOM,83.0,1995,MID,1986,199857.145714,0.0,0.0,0.0,1.0,0.0,0.0,0.0
188228,YISHUN,4 ROOM,92.0,1998,LOW,1984,215471.592173,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [12]:
# Load the previously saved storey one-hot encoder. The context manager closes
# the file handle deterministically instead of leaving it to garbage collection.
# NOTE(review): unpickling executes arbitrary code - only load files you trust.
with open('storey_ohe.pkl', 'rb') as encoder_file:
    storey_ohe = load(encoder_file)

# The encoder is fed a single-column 2-D array; its output is densified with
# .toarray() and re-labelled with the training index so the concat below aligns
# row by row.
idx = X_train.index
df_storey_ohe = pd.DataFrame(
    storey_ohe.transform(X_train['storey'].values.reshape(-1, 1)).toarray(),
    columns=storey_ohe.get_feature_names_out(['storey']),
    index=idx,
)
X_train = pd.concat([X_train, df_storey_ohe], axis=1)

X_train

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION,storey_HIGH,storey_LOW,storey_MID,storey_VERY HIGH
217170,PASIR RIS,4 ROOM,108.0,1998,LOW,1993,375197.713106,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99114,TAMPINES,3 ROOM,74.0,1995,MID,1986,263294.883347,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
230916,CHOA CHU KANG,4 ROOM,104.0,1999,MID,1995,302768.354018,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
116779,TAMPINES,4 ROOM,93.0,1996,MID,1985,263294.883347,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
44766,GEYLANG,3 ROOM,62.0,1993,MID,1969,184591.699708,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58535,BISHAN,5 ROOM,121.0,1994,HIGH,1987,342266.840904,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
12588,ANG MO KIO,3 ROOM,67.0,1991,LOW,1979,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
110081,JURONG EAST,4 ROOM,83.0,1995,MID,1986,199857.145714,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
188228,YISHUN,4 ROOM,92.0,1998,LOW,1984,215471.592173,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [13]:
# The raw categorical columns have been replaced by their encoded counterparts,
# so only numeric columns remain after this step.
X_train = X_train.drop(columns=['town', 'flat_type', 'storey'])

X_train

Unnamed: 0,floor_area_sqm,year_of_sale,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION,storey_HIGH,storey_LOW,storey_MID,storey_VERY HIGH
217170,108.0,1998,1993,375197.713106,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
99114,74.0,1995,1986,263294.883347,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
230916,104.0,1999,1995,302768.354018,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
116779,93.0,1996,1985,263294.883347,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
44766,62.0,1993,1969,184591.699708,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58535,121.0,1994,1987,342266.840904,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
12588,67.0,1991,1979,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
110081,83.0,1995,1986,199857.145714,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
188228,92.0,1998,1984,215471.592173,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Repeat for the validation and test sets

In [14]:
# Apply the already-loaded town encoder to the validation rows and rename the
# resulting 'town' column so it does not clash with the original one.
df_enc_town_val = town_looe.transform(X_val['town']).rename(columns={'town': 'town_enc'})

X_val = pd.concat([X_val, df_enc_town_val], axis='columns')

X_val

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc
96975,TOA PAYOH,3 ROOM,66.0,1995,MID,1973,183031.462035
165113,TAMPINES,4 ROOM,98.0,1997,LOW,1985,263294.883347
154261,BEDOK,4 ROOM,92.0,1997,MID,1978,195799.628024
29725,ANG MO KIO,3 ROOM,74.0,1992,LOW,1979,163762.230305
5762,CLEMENTI,EXECUTIVE,147.0,1990,MID,1985,189347.588929
...,...,...,...,...,...,...,...
38065,KALLANG/WHAMPOA,5 ROOM,122.0,1992,LOW,1987,190484.991541
44050,TAMPINES,3 ROOM,73.0,1993,MID,1985,263294.883347
23026,ANG MO KIO,3 ROOM,68.0,1991,MID,1980,163762.230305
40780,TAMPINES,5 ROOM,133.0,1993,LOW,1984,263294.883347


In [15]:
# One-hot encode flat_type for the validation rows with the already-loaded encoder.
# The frame is built with the validation index so the concat aligns row by row.
idx = X_val.index
df_flat_type_ohe_val = pd.DataFrame(
    flat_type_ohe.transform(X_val[['flat_type']].to_numpy()).toarray(),
    index=idx,
    columns=flat_type_ohe.get_feature_names_out(['flat_type']),
)
X_val = pd.concat([X_val, df_flat_type_ohe_val], axis='columns')

X_val

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION
96975,TOA PAYOH,3 ROOM,66.0,1995,MID,1973,183031.462035,0.0,0.0,1.0,0.0,0.0,0.0,0.0
165113,TAMPINES,4 ROOM,98.0,1997,LOW,1985,263294.883347,0.0,0.0,0.0,1.0,0.0,0.0,0.0
154261,BEDOK,4 ROOM,92.0,1997,MID,1978,195799.628024,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29725,ANG MO KIO,3 ROOM,74.0,1992,LOW,1979,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5762,CLEMENTI,EXECUTIVE,147.0,1990,MID,1985,189347.588929,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38065,KALLANG/WHAMPOA,5 ROOM,122.0,1992,LOW,1987,190484.991541,0.0,0.0,0.0,0.0,1.0,0.0,0.0
44050,TAMPINES,3 ROOM,73.0,1993,MID,1985,263294.883347,0.0,0.0,1.0,0.0,0.0,0.0,0.0
23026,ANG MO KIO,3 ROOM,68.0,1991,MID,1980,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0
40780,TAMPINES,5 ROOM,133.0,1993,LOW,1984,263294.883347,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# One-hot encode storey for the validation rows with the already-loaded encoder.
# The frame is built with the validation index so the concat aligns row by row.
idx = X_val.index
df_storey_ohe_val = pd.DataFrame(
    storey_ohe.transform(X_val[['storey']].to_numpy()).toarray(),
    index=idx,
    columns=storey_ohe.get_feature_names_out(['storey']),
)
X_val = pd.concat([X_val, df_storey_ohe_val], axis='columns')

X_val

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION,storey_HIGH,storey_LOW,storey_MID,storey_VERY HIGH
96975,TOA PAYOH,3 ROOM,66.0,1995,MID,1973,183031.462035,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
165113,TAMPINES,4 ROOM,98.0,1997,LOW,1985,263294.883347,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
154261,BEDOK,4 ROOM,92.0,1997,MID,1978,195799.628024,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
29725,ANG MO KIO,3 ROOM,74.0,1992,LOW,1979,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5762,CLEMENTI,EXECUTIVE,147.0,1990,MID,1985,189347.588929,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38065,KALLANG/WHAMPOA,5 ROOM,122.0,1992,LOW,1987,190484.991541,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
44050,TAMPINES,3 ROOM,73.0,1993,MID,1985,263294.883347,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23026,ANG MO KIO,3 ROOM,68.0,1991,MID,1980,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40780,TAMPINES,5 ROOM,133.0,1993,LOW,1984,263294.883347,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
# Drop the raw categorical columns now that their encoded versions are present.
X_val = X_val.drop(columns=['town', 'flat_type', 'storey'])

X_val

Unnamed: 0,floor_area_sqm,year_of_sale,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION,storey_HIGH,storey_LOW,storey_MID,storey_VERY HIGH
96975,66.0,1995,1973,183031.462035,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
165113,98.0,1997,1985,263294.883347,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
154261,92.0,1997,1978,195799.628024,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
29725,74.0,1992,1979,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5762,147.0,1990,1985,189347.588929,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38065,122.0,1992,1987,190484.991541,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
44050,73.0,1993,1985,263294.883347,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23026,68.0,1991,1980,163762.230305,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40780,133.0,1993,1984,263294.883347,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [18]:
# Apply the already-loaded town encoder to the test rows and rename the
# resulting 'town' column so it does not clash with the original one.
df_enc_town_test = town_looe.transform(X_test['town']).rename(columns={'town': 'town_enc'})

X_test = pd.concat([X_test, df_enc_town_test], axis='columns')

X_test

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc
157428,BUKIT BATOK,3 ROOM,73.0,1997,LOW,1987,195904.688669
229280,YISHUN,4 ROOM,84.0,1998,LOW,1987,215471.592173
53158,YISHUN,EXECUTIVE,146.0,1993,MID,1988,215471.592173
71598,JURONG WEST,EXECUTIVE,146.0,1994,LOW,1988,198286.127110
226187,CLEMENTI,3 ROOM,67.0,1998,LOW,1980,189347.588929
...,...,...,...,...,...,...,...
73507,JURONG EAST,4 ROOM,103.0,1994,MID,1984,199857.145714
184971,BEDOK,3 ROOM,73.0,1998,MID,1976,195799.628024
117709,BEDOK,4 ROOM,84.0,1996,MID,1986,195799.628024
53645,BISHAN,4 ROOM,84.0,1993,LOW,1986,342266.840904


In [19]:
# One-hot encode flat_type for the test rows with the already-loaded encoder.
# The frame is built with the test index so the concat aligns row by row.
idx = X_test.index
df_flat_type_ohe_test = pd.DataFrame(
    flat_type_ohe.transform(X_test[['flat_type']].to_numpy()).toarray(),
    index=idx,
    columns=flat_type_ohe.get_feature_names_out(['flat_type']),
)
X_test = pd.concat([X_test, df_flat_type_ohe_test], axis='columns')

X_test

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION
157428,BUKIT BATOK,3 ROOM,73.0,1997,LOW,1987,195904.688669,0.0,0.0,1.0,0.0,0.0,0.0,0.0
229280,YISHUN,4 ROOM,84.0,1998,LOW,1987,215471.592173,0.0,0.0,0.0,1.0,0.0,0.0,0.0
53158,YISHUN,EXECUTIVE,146.0,1993,MID,1988,215471.592173,0.0,0.0,0.0,0.0,0.0,1.0,0.0
71598,JURONG WEST,EXECUTIVE,146.0,1994,LOW,1988,198286.127110,0.0,0.0,0.0,0.0,0.0,1.0,0.0
226187,CLEMENTI,3 ROOM,67.0,1998,LOW,1980,189347.588929,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73507,JURONG EAST,4 ROOM,103.0,1994,MID,1984,199857.145714,0.0,0.0,0.0,1.0,0.0,0.0,0.0
184971,BEDOK,3 ROOM,73.0,1998,MID,1976,195799.628024,0.0,0.0,1.0,0.0,0.0,0.0,0.0
117709,BEDOK,4 ROOM,84.0,1996,MID,1986,195799.628024,0.0,0.0,0.0,1.0,0.0,0.0,0.0
53645,BISHAN,4 ROOM,84.0,1993,LOW,1986,342266.840904,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [20]:
# One-hot encode storey for the test rows with the already-loaded encoder.
# The frame is built with the test index so the concat aligns row by row.
idx = X_test.index
df_storey_ohe_test = pd.DataFrame(
    storey_ohe.transform(X_test[['storey']].to_numpy()).toarray(),
    index=idx,
    columns=storey_ohe.get_feature_names_out(['storey']),
)
X_test = pd.concat([X_test, df_storey_ohe_test], axis='columns')

X_test

Unnamed: 0,town,flat_type,floor_area_sqm,year_of_sale,storey,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION,storey_HIGH,storey_LOW,storey_MID,storey_VERY HIGH
157428,BUKIT BATOK,3 ROOM,73.0,1997,LOW,1987,195904.688669,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
229280,YISHUN,4 ROOM,84.0,1998,LOW,1987,215471.592173,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
53158,YISHUN,EXECUTIVE,146.0,1993,MID,1988,215471.592173,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
71598,JURONG WEST,EXECUTIVE,146.0,1994,LOW,1988,198286.127110,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
226187,CLEMENTI,3 ROOM,67.0,1998,LOW,1980,189347.588929,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73507,JURONG EAST,4 ROOM,103.0,1994,MID,1984,199857.145714,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
184971,BEDOK,3 ROOM,73.0,1998,MID,1976,195799.628024,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
117709,BEDOK,4 ROOM,84.0,1996,MID,1986,195799.628024,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53645,BISHAN,4 ROOM,84.0,1993,LOW,1986,342266.840904,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [21]:
# Drop the raw categorical columns now that their encoded versions are present.
X_test = X_test.drop(columns=['town', 'flat_type', 'storey'])

X_test

Unnamed: 0,floor_area_sqm,year_of_sale,year_lease_commence,town_enc,flat_type_1 ROOM,flat_type_2 ROOM,flat_type_3 ROOM,flat_type_4 ROOM,flat_type_5 ROOM,flat_type_EXECUTIVE,flat_type_MULTI GENERATION,storey_HIGH,storey_LOW,storey_MID,storey_VERY HIGH
157428,73.0,1997,1987,195904.688669,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
229280,84.0,1998,1987,215471.592173,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
53158,146.0,1993,1988,215471.592173,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
71598,146.0,1994,1988,198286.127110,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
226187,67.0,1998,1980,189347.588929,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73507,103.0,1994,1984,199857.145714,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
184971,73.0,1998,1976,195799.628024,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
117709,84.0,1996,1986,195799.628024,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53645,84.0,1993,1986,342266.840904,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Dummy Regressor as baseline

In [22]:
from sklearn.dummy import DummyRegressor

# Baseline model: ignores the features and always predicts the mean of the
# training target, giving a reference RMSE that any real model should beat.
dummy_reg = DummyRegressor(strategy='mean')
dummy_reg.fit(X=X_train, y=y_train)

In [23]:
from sklearn.metrics import mean_squared_error

# Score the baseline on the same data it was fitted on.
y_pred_dummy = dummy_reg.predict(X_train)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root of the MSE is taken explicitly; this works on every version.
RMSE_dummy = np.sqrt(mean_squared_error(y_train, y_pred_dummy))

print('The training root mean squared error (RMSE) of the dummy regressor is:', RMSE_dummy)

The training root mean squared error (RMSE) of the dummy regressor is: 128039.24623415218


## Train the XGBoost model


In [26]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [25]:
# First reference point: an XGBoost regressor with the library's default
# hyperparameters, fitted on the encoded training features.
XGBoost_reg = XGBRegressor()
XGBoost_reg.fit(X=X_train, y=y_train)

In [28]:
# Training-set error of the default XGBoost model.
y_pred_XGB = XGBoost_reg.predict(X_train)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root of the MSE is taken explicitly; this works on every version.
RMSE_XGB = np.sqrt(mean_squared_error(y_train, y_pred_XGB))

print('The training root mean squared error (RMSE) of the XGBoost Regressor is:', RMSE_XGB)

The training root mean squared error (RMSE) of the XGBoost Regressor is: 22151.679812298804


In [29]:
# Validation-set error of the default XGBoost model (rows not used for fitting).
y_pred_val_XGB = XGBoost_reg.predict(X_val)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root of the MSE is taken explicitly; this works on every version.
RMSE_val_XGB = np.sqrt(mean_squared_error(y_val, y_pred_val_XGB))

print('The validation root mean squared error (RMSE) of the XGBoost Regressor is:', RMSE_val_XGB)

The validation root mean squared error (RMSE) of the XGBoost Regressor is: 23052.55184262129


## Hyperparameter tuning

Let's see if we can further improve the performance of the model by tuning some hyperparameters.

The hyperparameters that will be tweaked are:
* n_estimators: number of trees in the ensemble, i.e. the number of boosting rounds
* max_depth: maximum depth of each tree (weak learner)
* learning_rate: shrinkage applied to each new tree's contribution; smaller values learn more slowly but reduce the risk of overfitting
* colsample_bytree: fraction of columns sampled when building each tree

Some recommended values for each hyperparameter can be found [here](https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663).

In [32]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the grid search tries every combination.
params_dict = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.05, 0.1],
    colsample_bytree=[0.3, 0.7],
)

In [33]:
# Exhaustive search over every combination in params_dict, scored by negative
# mean squared error (higher is better). `cv` is not set, so GridSearchCV's
# default cross-validation scheme is used.
grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=params_dict,
    scoring='neg_mean_squared_error',
)
XGBoost_reg_tuned_grid = grid_search.fit(X_train, y_train)

In [34]:
# Keep the estimator the grid search exposes as its best one, i.e. the model
# built with the best-scoring hyperparameter combination.
XGBoost_reg_tuned = XGBoost_reg_tuned_grid.best_estimator_

In [35]:
from pickle import dump

# Persist the tuned model. The context manager flushes and closes the file as
# soon as the dump finishes; a bare open() would leave the handle (and possibly
# buffered bytes) dangling until garbage collection.
with open('XGBoost_model.pkl', 'wb') as model_file:
    dump(XGBoost_reg_tuned, model_file)

In [36]:
# Performance on training data

y_pred_tuned = XGBoost_reg_tuned.predict(X_train)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root of the MSE is taken explicitly; this works on every version.
RMSE_tuned = np.sqrt(mean_squared_error(y_train, y_pred_tuned))

print('The training root mean squared error (RMSE) for the tuned XGBoost model is:', RMSE_tuned)

The training root mean squared error (RMSE) for the tuned XGBoost model is: 20711.22092328809


In [37]:
# Performance on validation data

y_pred_tuned_val = XGBoost_reg_tuned.predict(X_val)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root of the MSE is taken explicitly; this works on every version.
RMSE_tuned_val = np.sqrt(mean_squared_error(y_val, y_pred_tuned_val))

print('The validation root mean squared error (RMSE) for the tuned XGBoost model is:', RMSE_tuned_val)

The validation root mean squared error (RMSE) for the tuned XGBoost model is: 22777.053081767073


In [38]:
# Performance on test data

y_pred_tuned_test = XGBoost_reg_tuned.predict(X_test)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# square root of the MSE is taken explicitly; this works on every version.
RMSE_tuned_test = np.sqrt(mean_squared_error(y_test, y_pred_tuned_test))

print('The test root mean squared error (RMSE) for the tuned XGBoost model is:', RMSE_tuned_test)

The test root mean squared error (RMSE) for the tuned XGBoost model is: 22686.363043912806


In [39]:
# Display the hyperparameter combination the grid search selected as best.
XGBoost_reg_tuned_grid.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.05,
 'max_depth': 10,
 'n_estimators': 300}

In [40]:
# Per-feature importance scores, in the same order as the columns of X_train.
importances = XGBoost_reg_tuned.feature_importances_

# Walk the ascending argsort backwards to get the five largest scores, highest first.
top5_idx = np.argsort(importances)[:-6:-1]

print('The top 5 most important features in determining resale prices are:')
for feature_name in X_train.columns[top5_idx]:
    print(feature_name)

The top 5 most important features in determining resale prices are:
flat_type_3 ROOM
flat_type_4 ROOM
year_of_sale
floor_area_sqm
flat_type_EXECUTIVE


## Conclusion

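After hyperparameter tuning, the XGBoost model performs only slightly better than the model with default hyperparameters (validation RMSE of about 22,777 versus 23,053).
(The best `max_depth` and `n_estimators` both sit at the upper edge of the searched grid, so a wider range of values or additional hyperparameters should be considered in order to fine-tune the model further.)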

The best performing hyperparameters are:
* n_estimators = 300
* max_depth = 10
* learning_rate = 0.05
* colsample_bytree = 0.7

Some of the more important features that determine the resale price include:
* floor area 
* year of sale
* flat type