## **Data Preprocessing**
### This notebook preprocesses the data. The objective is to transform the dataset into a format suitable for downstream Exploratory Data Analysis (EDA) and model training. The table below gives a summary and explanation of the preprocessing steps applied.

In [1]:
import pandas as pd
# NOTE(review): wildcard import — the std_* and preprocess_* helpers called in
# later cells are not defined in this notebook, so they presumably come from
# here; consider importing them by name so their origin is explicit.
from src.preprocessing import *

# IPython magics: load the autoreload extension, set it to mode 2, and select
# the inline matplotlib backend.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Input CSV that is read into the working DataFrame at the start of the notebook.
CSV_FILE = "data/train_cleaned.csv"
# Output CSV that the final cell writes once all preprocessing steps have run.
NEW_CSV_FILE = "data/train_preprocessed.csv"

In [3]:
# Load the dataset at CSV_FILE into `df`, the frame every later step transforms.
# NOTE(review): the previews below show `Unnamed: 0.1` and `Unnamed: 0` columns,
# which look like row indexes written out by earlier to_csv calls — confirm
# whether they should be dropped before modelling.
df = pd.read_csv(CSV_FILE)

#### Step 1: Convert `flat_type` data into numerical labels that reflect the number of bedrooms

In [4]:
def _flat_type_label(row):
    """Return the `std_flat_type` value for one record of the dataset."""
    return std_flat_type(row['flat_type'], row['lease_commence_date'])


# Applied row-wise because the helper takes two columns from each record.
df['std_flat_type'] = df.apply(_flat_type_label, axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,rent_approval_date,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,planning_area,region,monthly_rent,std_flat_type
0,0,2021-09,3-room,new generation,67.0,1983,1.344518,103.73863,jurong east,west region,1600,3.0
1,1,2022-05,4-room,new generation,92.0,1978,1.330186,103.938717,bedok,east region,2250,4.0
2,2,2022-10,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh,central region,1900,3.0
3,3,2021-08,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris,east region,2850,4.0
4,4,2022-11,3-room,improved,68.0,1972,1.320502,103.863341,kallang,central region,2100,3.0


#### Step 2: Reduce the number of categories of `flat_model`

In [5]:
# `std_flat_model` only needs the `flat_model` value, so map it over that one
# column. A row-wise DataFrame.apply(axis=1) would build a Series for every
# record and returns a DataFrame (not a Series) when the frame is empty.
df['std_flat_model'] = df['flat_model'].map(std_flat_model)
df.head()

Unnamed: 0.1,Unnamed: 0,rent_approval_date,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,planning_area,region,monthly_rent,std_flat_type,std_flat_model
0,0,2021-09,3-room,new generation,67.0,1983,1.344518,103.73863,jurong east,west region,1600,3.0,NG
1,1,2022-05,4-room,new generation,92.0,1978,1.330186,103.938717,bedok,east region,2250,4.0,NG
2,2,2022-10,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh,central region,1900,3.0,I
3,3,2021-08,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris,east region,2850,4.0,OTH
4,4,2022-11,3-room,improved,68.0,1972,1.320502,103.863341,kallang,central region,2100,3.0,I


#### Step 3: Perform variable scaling to derive remaining lease period based on `lease_commence_date`

In [6]:
# Second argument passed to `std_remaining_lease` for every record
# (presumably the year the remaining lease is measured from — confirm in
# src.preprocessing). Named here so it is not a bare magic number in the call.
REFERENCE_YEAR = 2023

# Only `lease_commence_date` is read, so map over that column instead of
# applying row-wise across the whole frame.
df['std_remaining_lease'] = df['lease_commence_date'].map(
    lambda commence_year: std_remaining_lease(commence_year, REFERENCE_YEAR)
)
df.head()

Unnamed: 0.1,Unnamed: 0,rent_approval_date,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,planning_area,region,monthly_rent,std_flat_type,std_flat_model,std_remaining_lease
0,0,2021-09,3-room,new generation,67.0,1983,1.344518,103.73863,jurong east,west region,1600,3.0,NG,59
1,1,2022-05,4-room,new generation,92.0,1978,1.330186,103.938717,bedok,east region,2250,4.0,NG,54
2,2,2022-10,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh,central region,1900,3.0,I,47
3,3,2021-08,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris,east region,2850,4.0,OTH,69
4,4,2022-11,3-room,improved,68.0,1972,1.320502,103.863341,kallang,central region,2100,3.0,I,48


#### Step 4: Perform one-hot encoding on categorical variables

In [7]:
# Run the `region` preprocessing helper; in the preview below the `region`
# column is replaced by one indicator column per region.
df = df.pipe(preprocess_region)
df.head()

Unnamed: 0.1,Unnamed: 0,rent_approval_date,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,planning_area,monthly_rent,std_flat_type,std_flat_model,std_remaining_lease,central region,east region,north region,north-east region,west region
0,0,2021-09,3-room,new generation,67.0,1983,1.344518,103.73863,jurong east,1600,3.0,NG,59,0,0,0,0,1
1,1,2022-05,4-room,new generation,92.0,1978,1.330186,103.938717,bedok,2250,4.0,NG,54,0,1,0,0,0
2,2,2022-10,3-room,improved,67.0,1971,1.332242,103.845643,toa payoh,1900,3.0,I,47,1,0,0,0,0
3,3,2021-08,executive,apartment,149.0,1993,1.370239,103.962894,pasir ris,2850,4.0,OTH,69,0,1,0,0,0
4,4,2022-11,3-room,improved,68.0,1972,1.320502,103.863341,kallang,2100,3.0,I,48,1,0,0,0,0


In [8]:
# Run the `planning_area` preprocessing helper; in the preview below the
# `planning_area` column is replaced by `planning_area_*` indicator columns.
# NOTE(review): the generated names contain a space after the prefix
# (e.g. `planning_area_ punggol`) — confirm this is intended.
df = df.pipe(preprocess_planning_area)
df.head()

Unnamed: 0.1,Unnamed: 0,rent_approval_date,flat_type,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,monthly_rent,std_flat_type,...,planning_area_ punggol,planning_area_ queenstown,planning_area_ rochor,planning_area_ sembawang,planning_area_ sengkang,planning_area_ serangoon,planning_area_ tampines,planning_area_ toa payoh,planning_area_ woodlands,planning_area_ yishun
0,0,2021-09,3-room,new generation,67.0,1983,1.344518,103.73863,1600,3.0,...,0,0,0,0,0,0,0,0,0,0
1,1,2022-05,4-room,new generation,92.0,1978,1.330186,103.938717,2250,4.0,...,0,0,0,0,0,0,0,0,0,0
2,2,2022-10,3-room,improved,67.0,1971,1.332242,103.845643,1900,3.0,...,0,0,0,0,0,0,0,1,0,0
3,3,2021-08,executive,apartment,149.0,1993,1.370239,103.962894,2850,4.0,...,0,0,0,0,0,0,0,0,0,0
4,4,2022-11,3-room,improved,68.0,1972,1.320502,103.863341,2100,3.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Run the flat-model preprocessing helper; in the preview below `flat_model`
# and `std_flat_model` are gone and indicator columns A, I, NG, OTH, P, S and
# STD appear in their place.
df = df.pipe(preprocess_flat_model)
df.head()

Unnamed: 0.1,Unnamed: 0,rent_approval_date,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,monthly_rent,std_flat_type,std_remaining_lease,...,planning_area_ toa payoh,planning_area_ woodlands,planning_area_ yishun,A,I,NG,OTH,P,S,STD
0,0,2021-09,3-room,67.0,1983,1.344518,103.73863,1600,3.0,59,...,0,0,0,0,0,1,0,0,0,0
1,1,2022-05,4-room,92.0,1978,1.330186,103.938717,2250,4.0,54,...,0,0,0,0,0,1,0,0,0,0
2,2,2022-10,3-room,67.0,1971,1.332242,103.845643,1900,3.0,47,...,1,0,0,0,1,0,0,0,0,0
3,3,2021-08,executive,149.0,1993,1.370239,103.962894,2850,4.0,69,...,0,0,0,0,0,0,1,0,0,0
4,4,2022-11,3-room,68.0,1972,1.320502,103.863341,2100,3.0,48,...,0,0,0,0,1,0,0,0,0,0


#### Step 5: Split the `rent_approval_date` into `rent_approval_month` and `rent_approval_year`

In [10]:
# Run the rent-approval-date helper; in the preview below `rent_approval_date`
# is replaced by numeric `rent_approval_year` and `rent_approval_month` columns.
df = df.pipe(preprocess_rent_approval_date)
df.head()

Unnamed: 0.1,Unnamed: 0,flat_type,floor_area_sqm,lease_commence_date,latitude,longitude,monthly_rent,std_flat_type,std_remaining_lease,central region,...,planning_area_ yishun,A,I,NG,OTH,P,S,STD,rent_approval_year,rent_approval_month
0,0,3-room,67.0,1983,1.344518,103.73863,1600,3.0,59,0,...,0,0,0,1,0,0,0,0,2021,9
1,1,4-room,92.0,1978,1.330186,103.938717,2250,4.0,54,0,...,0,0,0,1,0,0,0,0,2022,5
2,2,3-room,67.0,1971,1.332242,103.845643,1900,3.0,47,1,...,0,0,1,0,0,0,0,0,2022,10
3,3,executive,149.0,1993,1.370239,103.962894,2850,4.0,69,0,...,0,0,0,0,1,0,0,0,2021,8
4,4,3-room,68.0,1972,1.320502,103.863341,2100,3.0,48,1,...,0,0,1,0,0,0,0,0,2022,11


#### Step 6: Save preprocessed dataset as a new CSV file

In [11]:
# index=False: by default to_csv also writes the row index as an unnamed first
# column, which comes back as a stray `Unnamed: 0` column when the file is
# read with read_csv. The index here carries no information, so leave it out.
df.to_csv(NEW_CSV_FILE, index=False)