# Training a Random Forest Regressor 

## Import libraries


In [21]:
import numpy as np
import pandas as pd

## Import the dataset

In [22]:
# Load the resale transactions, asking pandas to parse the two date-like
# columns while reading instead of leaving them as strings.
date_columns = ['month', 'lease_commence_date']
raw_data = pd.read_csv('raw_flat_prices_data.csv', parse_dates=date_columns)

# Print the column names, non-null counts and dtypes of the loaded frame.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287196 entries, 0 to 287195
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   month                287196 non-null  datetime64[ns]
 1   town                 287196 non-null  object        
 2   flat_type            287196 non-null  object        
 3   block                287196 non-null  object        
 4   street_name          287196 non-null  object        
 5   storey_range         287196 non-null  object        
 6   floor_area_sqm       287196 non-null  float64       
 7   flat_model           287196 non-null  object        
 8   lease_commence_date  287196 non-null  datetime64[ns]
 9   resale_price         287196 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(1), object(6)
memory usage: 21.9+ MB


## Feature engineering and data preprocessing plan

* month - only use the year of transaction
* town - use Leave One Out Encoding because of the large number of categories (26) and to preserve the information about price in relation to town
* flat_type - utilize one hot encoding (7 categories)
* block and street_name - will not be used, to keep the model simple
* storey_range - grouped into 4 groups (low, mid, high, very high) and then one hot encoded
* floor_area_sqm - used as a numeric value
* flat_model - will not be used, to keep the model simple (many of the flat models are under-represented)
* lease_commence_date - used as a numeric value
* resale_price - used as a numeric value (target)


In [23]:
# Columns carried into feature engineering (block, street_name and flat_model
# are left out). An independent copy is taken so that later edits to `df_fe`
# do not touch `raw_data`.
selected_columns = [
    'month',
    'town',
    'flat_type',
    'storey_range',
    'floor_area_sqm',
    'lease_commence_date',
    'resale_price',
]
df_fe = raw_data.loc[:, selected_columns].copy()

df_fe

Unnamed: 0,month,town,flat_type,storey_range,floor_area_sqm,lease_commence_date,resale_price
0,1990-01-01,ANG MO KIO,1 ROOM,10 TO 12,31.0,1977-01-01,9000
1,1990-01-01,ANG MO KIO,1 ROOM,04 TO 06,31.0,1977-01-01,6000
2,1990-01-01,ANG MO KIO,1 ROOM,10 TO 12,31.0,1977-01-01,8000
3,1990-01-01,ANG MO KIO,1 ROOM,07 TO 09,31.0,1977-01-01,6000
4,1990-01-01,ANG MO KIO,3 ROOM,04 TO 06,73.0,1976-01-01,47200
...,...,...,...,...,...,...,...
287191,1999-12-01,YISHUN,EXECUTIVE,10 TO 12,142.0,1987-01-01,456000
287192,1999-12-01,YISHUN,EXECUTIVE,01 TO 03,142.0,1988-01-01,408000
287193,1999-12-01,YISHUN,EXECUTIVE,07 TO 09,146.0,1988-01-01,469000
287194,1999-12-01,YISHUN,EXECUTIVE,04 TO 06,146.0,1988-01-01,440000


In [24]:
# Only the calendar year of the transaction is used as a feature, so pull it
# out of the `month` timestamp column.
sale_month = df_fe['month']
df_fe['year_of_sale'] = sale_month.dt.year

df_fe

Unnamed: 0,month,town,flat_type,storey_range,floor_area_sqm,lease_commence_date,resale_price,year_of_sale
0,1990-01-01,ANG MO KIO,1 ROOM,10 TO 12,31.0,1977-01-01,9000,1990
1,1990-01-01,ANG MO KIO,1 ROOM,04 TO 06,31.0,1977-01-01,6000,1990
2,1990-01-01,ANG MO KIO,1 ROOM,10 TO 12,31.0,1977-01-01,8000,1990
3,1990-01-01,ANG MO KIO,1 ROOM,07 TO 09,31.0,1977-01-01,6000,1990
4,1990-01-01,ANG MO KIO,3 ROOM,04 TO 06,73.0,1976-01-01,47200,1990
...,...,...,...,...,...,...,...,...
287191,1999-12-01,YISHUN,EXECUTIVE,10 TO 12,142.0,1987-01-01,456000,1999
287192,1999-12-01,YISHUN,EXECUTIVE,01 TO 03,142.0,1988-01-01,408000,1999
287193,1999-12-01,YISHUN,EXECUTIVE,07 TO 09,146.0,1988-01-01,469000,1999
287194,1999-12-01,YISHUN,EXECUTIVE,04 TO 06,146.0,1988-01-01,440000,1999


In [25]:
# Storey bands (as spelled in the `storey_range` column) grouped by the
# height category each one is assigned to.
_storey_groups = {
    'LOW': ('01 TO 03', '04 TO 06'),
    'MID': ('07 TO 09', '10 TO 12'),
    'HIGH': ('13 TO 15', '16 TO 18'),
    'VERY HIGH': ('19 TO 21', '22 TO 24', '25 TO 27'),
}

# Flatten into the band -> category lookup used when labelling rows.
storey_class = {
    band: category
    for category, bands in _storey_groups.items()
    for band in bands
}

In [26]:
# Translate every storey band into its height category with a single
# vectorised lookup against `storey_class`, rather than appending to a list
# one row at a time in a Python loop.
storey_mapped = df_fe['storey_range'].map(storey_class)

# Series.map leaves NaN where a band has no entry in the mapping. Raise
# KeyError in that case, as a plain dict lookup would, so an unexpected band
# cannot slip through as a missing value.
unmapped_bands = df_fe.loc[storey_mapped.isna(), 'storey_range'].unique()
if len(unmapped_bands) > 0:
    raise KeyError(f"storey_range values missing from storey_class: {list(unmapped_bands)}")

# Keep `storey_cat` as a plain list of category labels, one per row.
storey_cat = storey_mapped.tolist()

df_fe['storey'] = storey_cat

df_fe

Unnamed: 0,month,town,flat_type,storey_range,floor_area_sqm,lease_commence_date,resale_price,year_of_sale,storey
0,1990-01-01,ANG MO KIO,1 ROOM,10 TO 12,31.0,1977-01-01,9000,1990,MID
1,1990-01-01,ANG MO KIO,1 ROOM,04 TO 06,31.0,1977-01-01,6000,1990,LOW
2,1990-01-01,ANG MO KIO,1 ROOM,10 TO 12,31.0,1977-01-01,8000,1990,MID
3,1990-01-01,ANG MO KIO,1 ROOM,07 TO 09,31.0,1977-01-01,6000,1990,MID
4,1990-01-01,ANG MO KIO,3 ROOM,04 TO 06,73.0,1976-01-01,47200,1990,LOW
...,...,...,...,...,...,...,...,...,...
287191,1999-12-01,YISHUN,EXECUTIVE,10 TO 12,142.0,1987-01-01,456000,1999,MID
287192,1999-12-01,YISHUN,EXECUTIVE,01 TO 03,142.0,1988-01-01,408000,1999,LOW
287193,1999-12-01,YISHUN,EXECUTIVE,07 TO 09,146.0,1988-01-01,469000,1999,MID
287194,1999-12-01,YISHUN,EXECUTIVE,04 TO 06,146.0,1988-01-01,440000,1999,LOW


In [27]:
# The lease commencement date is used as a numeric feature: keep just its
# calendar year.
lease_start = df_fe['lease_commence_date']
df_fe['year_lease_commence'] = lease_start.dt.year

df_fe

Unnamed: 0,month,town,flat_type,storey_range,floor_area_sqm,lease_commence_date,resale_price,year_of_sale,storey,year_lease_commence
0,1990-01-01,ANG MO KIO,1 ROOM,10 TO 12,31.0,1977-01-01,9000,1990,MID,1977
1,1990-01-01,ANG MO KIO,1 ROOM,04 TO 06,31.0,1977-01-01,6000,1990,LOW,1977
2,1990-01-01,ANG MO KIO,1 ROOM,10 TO 12,31.0,1977-01-01,8000,1990,MID,1977
3,1990-01-01,ANG MO KIO,1 ROOM,07 TO 09,31.0,1977-01-01,6000,1990,MID,1977
4,1990-01-01,ANG MO KIO,3 ROOM,04 TO 06,73.0,1976-01-01,47200,1990,LOW,1976
...,...,...,...,...,...,...,...,...,...,...
287191,1999-12-01,YISHUN,EXECUTIVE,10 TO 12,142.0,1987-01-01,456000,1999,MID,1987
287192,1999-12-01,YISHUN,EXECUTIVE,01 TO 03,142.0,1988-01-01,408000,1999,LOW,1988
287193,1999-12-01,YISHUN,EXECUTIVE,07 TO 09,146.0,1988-01-01,469000,1999,MID,1988
287194,1999-12-01,YISHUN,EXECUTIVE,04 TO 06,146.0,1988-01-01,440000,1999,LOW,1988


In [28]:
# Remove the raw columns that the engineered features `year_of_sale`,
# `storey` and `year_lease_commence` were derived from.
# `df_fe` is rebound to the result instead of being mutated with
# inplace=True, and errors='ignore' skips columns that are already absent,
# so re-running this cell does not raise KeyError.
superseded_columns = ['month', 'storey_range', 'lease_commence_date']
df_fe = df_fe.drop(columns=superseded_columns, errors='ignore')

df_fe

Unnamed: 0,town,flat_type,floor_area_sqm,resale_price,year_of_sale,storey,year_lease_commence
0,ANG MO KIO,1 ROOM,31.0,9000,1990,MID,1977
1,ANG MO KIO,1 ROOM,31.0,6000,1990,LOW,1977
2,ANG MO KIO,1 ROOM,31.0,8000,1990,MID,1977
3,ANG MO KIO,1 ROOM,31.0,6000,1990,MID,1977
4,ANG MO KIO,3 ROOM,73.0,47200,1990,LOW,1976
...,...,...,...,...,...,...,...
287191,YISHUN,EXECUTIVE,142.0,456000,1999,MID,1987
287192,YISHUN,EXECUTIVE,142.0,408000,1999,LOW,1988
287193,YISHUN,EXECUTIVE,146.0,469000,1999,MID,1988
287194,YISHUN,EXECUTIVE,146.0,440000,1999,LOW,1988
