### Data Generation

This notebook generates the training and validation datasets used to train a model that predicts housing rental prices.

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.cleaning import *
from src.preprocessing import *
from src.feature_eng import *

# Enable the autoreload extension in mode 2 so imported modules are reloaded
# before each cell executes (edits to the helper modules are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Lift pandas' column and row display limits (None = no limit) so frames are
# shown in full; both options are set in a single call.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Location of the raw training CSV (relative path, resolved against the
# notebook's working directory).
RAW_DATAFILE: str = "data/train.csv"

In [13]:
# Load the raw data; the dataset variants built in this notebook all start from this frame.
raw_df = pd.read_csv(RAW_DATAFILE)

### Dataset 1: Baseline
This is a baseline dataset with only minimal data cleaning and preprocessing done. It is used to train a baseline model against which we compare all other trained models.

#### Data Cleaning
In this step, we perform:
- standardisation of strings to lowercase
- standardisation of flat type labels
- removal of single-valued and otherwise unused columns (`furnished`, `elevation`, `street_name`)

In [14]:
# Columns discarded at the end of the cleaning step.
columns_to_remove = ['furnished', 'elevation', 'street_name']

# Cleaning pipeline, innermost call first: lowercase the strings, standardise
# the flat type labels, then drop the columns listed above. The helpers come
# from the wildcard `src` imports at the top of the notebook.
cleaned_df = drop_data(
    clean_flat_type_labels(convert_strings_to_lowercase(raw_df)),
    columns_to_remove,
)

#### Data Preprocessing
In this step, we perform the following preprocessing steps:
- convert `flat_type` to numerical form
- reduce the number of categories of `flat_model`
- derive the remaining lease period from `lease_commence_date` (variable scaling)
- perform one-hot-encoding on categorical variables
- split `rent_approval_date` into month and year columns
- convert `block` to integer form, removing any letters

In [15]:
# Work on an independent copy: the column assignments below would otherwise be
# written into `cleaned_df` itself (plain assignment only aliases the frame),
# and `cleaned_df` is read again later when Dataset 3 is built.
preprocessed_df = cleaned_df.copy()

# Derived columns computed row by row with helpers from the wildcard `src` imports.
preprocessed_df['std_flat_type'] = preprocessed_df.apply(lambda x: std_flat_type(x['flat_type'], x['lease_commence_date']), axis=1)
preprocessed_df['std_flat_model'] = preprocessed_df.apply(lambda x: std_flat_model(x['flat_model']), axis=1)
# NOTE(review): 2023 is presumably the reference year for the remaining-lease
# calculation — confirm against std_remaining_lease.
preprocessed_df['std_remaining_lease'] = preprocessed_df.apply(lambda x: std_remaining_lease(x['lease_commence_date'], 2023), axis=1)

# Per-column preprocessing of the categorical location / model columns
# (one-hot encoding, per the step list above — see the helpers for details).
preprocessed_df = preprocess_region(preprocessed_df)
preprocessed_df = preprocess_planning_area(preprocessed_df)
preprocessed_df = preprocess_flat_model(preprocessed_df)

# `town` and `subzone` are only processed for this baseline; Dataset 3 drops them instead.
preprocessed_df = preprocess_town(preprocessed_df)
preprocessed_df = preprocess_subzone(preprocessed_df)

preprocessed_df = preprocess_rent_approval_date(preprocessed_df)
preprocessed_df['block_num_int'] = preprocessed_df.apply(lambda x: preprocess_block(x['block']), axis=1)
# Drop the source columns that have been replaced by the derived columns above.
preprocessed_df = preprocessed_df.drop(columns=['block', 'flat_type', 'lease_commence_date'])

In [16]:
# Hold out 10% of the rows for validation; the fixed random_state makes the split repeatable.
baseline_train, baseline_val = train_test_split(preprocessed_df, test_size=0.1, random_state=42)

In [17]:
# Write both splits to CSV; index=False leaves the DataFrame index out of the files.
baseline_train.to_csv("data/train/baseline_train.csv", index=False)
baseline_val.to_csv("data/train/baseline_val.csv", index=False)

### Dataset 2: Dataset with Additional Features
In this dataset, the following additional features are added, based on our initial data analysis:
- Mean monthly COE prices
- Number of amenities near a house

In [18]:
# Preview the preprocessed baseline frame before the extra features are added.
preprocessed_df.head()

Unnamed: 0,floor_area_sqm,latitude,longitude,monthly_rent,std_flat_type,std_remaining_lease,central region,east region,north region,north-east region,west region,planning_area_ ang mo kio,planning_area_ bedok,planning_area_ bishan,planning_area_ bukit batok,planning_area_ bukit merah,planning_area_ bukit panjang,planning_area_ bukit timah,planning_area_ choa chu kang,planning_area_ clementi,planning_area_ downtown core,planning_area_ geylang,planning_area_ hougang,planning_area_ jurong east,planning_area_ jurong west,planning_area_ kallang,planning_area_ marine parade,planning_area_ novena,planning_area_ outram,planning_area_ pasir ris,planning_area_ punggol,planning_area_ queenstown,planning_area_ rochor,planning_area_ sembawang,planning_area_ sengkang,planning_area_ serangoon,planning_area_ tampines,planning_area_ toa payoh,planning_area_ woodlands,planning_area_ yishun,A,I,NG,OTH,P,S,STD,ang mo kio,bedok,bishan,bukit batok,bukit merah,bukit panjang,bukit timah,central,choa chu kang,clementi,geylang,hougang,jurong east,jurong west,kallang/whampoa,marine parade,pasir ris,punggol,queenstown,sembawang,sengkang,serangoon,tampines,toa payoh,woodlands,yishun,subzone admiralty,subzone alexandra hill,subzone aljunied,subzone anak bukit,subzone anchorvale,subzone ang mo kio town centre,subzone balestier,subzone bangkit,subzone bedok north,subzone bedok reservoir,subzone bedok south,subzone bencoolen,subzone bendemeer,subzone bishan east,subzone boon keng,subzone boon lay place,subzone boon teck,subzone braddell,subzone brickworks,subzone bugis,subzone bukit batok central,subzone bukit batok east,subzone bukit batok south,subzone bukit batok west,subzone bukit ho swee,subzone bukit merah,subzone cheng san,subzone china square,subzone chinatown,subzone choa chu kang central,subzone choa chu kang north,subzone chong boon,subzone city hall,subzone clementi central,subzone clementi north,subzone clementi west,subzone clementi woods,subzone commonwealth,subzone 
compassvale,subzone crawford,subzone depot road,subzone dover,subzone everton park,subzone fajar,subzone farrer park,subzone fernvale,subzone frankel,subzone geylang bahru,subzone geylang east,subzone ghim moh,subzone gombak,subzone guilin,subzone henderson hill,subzone holland drive,subzone hong kah,subzone hong kah north,subzone hougang central,subzone hougang east,subzone hougang west,subzone jelebu,subzone joo seng,subzone jurong west central,subzone kaki bukit,subzone kampong bugis,subzone kampong java,subzone kampong tiong bahru,subzone kampong ubi,subzone kangkar,subzone keat hong,subzone kebun bahru,subzone kembangan,subzone khatib,subzone kim keat,subzone kovan,subzone lavender,subzone little india,subzone lorong 8 toa payoh,subzone lorong ah soo,subzone lower seletar,subzone macpherson,subzone margaret drive,subzone marine parade,subzone marymount,subzone matilda,subzone mei chin,subzone midview,subzone moulmein,subzone north coast,subzone northland,subzone pasir panjang 2,subzone pasir ris central,subzone pasir ris drive,subzone pasir ris west,subzone pearl's hill,subzone pei chun,subzone peng siang,subzone potong pasir,subzone punggol field,subzone punggol town centre,subzone redhill,subzone rivervale,subzone saujana,subzone sembawang central,subzone sembawang east,subzone sembawang north,subzone sengkang town centre,subzone senja,subzone serangoon central,subzone serangoon garden,subzone serangoon north,subzone shangri-la,subzone simei,subzone sungei road,subzone sunset way,subzone swiss club,subzone tai seng,subzone taman jurong,subzone tampines east,subzone tampines west,subzone tanglin halt,subzone tanjong rhu,subzone teban gardens,subzone teck whye,subzone telok blangah drive,subzone telok blangah rise,subzone telok blangah way,subzone tiong bahru,subzone tiong bahru station,subzone toa payoh central,subzone toa payoh west,subzone toh guan,subzone townsville,subzone trafalgar,subzone ulu pandan,subzone upper paya lebar,subzone upper thomson,subzone 
victoria,subzone waterway east,subzone wenya,subzone woodgrove,subzone woodlands east,subzone woodlands south,subzone woodlands west,subzone yew tee,subzone yio chu kang west,subzone yishun central,subzone yishun east,subzone yishun south,subzone yishun west,subzone yuhua east,subzone yuhua west,subzone yunnan,rent_approval_year,rent_approval_month,block_num_int
0,67.0,1.344518,103.73863,1600,3.0,59,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2021,9,257
1,92.0,1.330186,103.938717,2250,4.0,54,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,5,119
2,67.0,1.332242,103.845643,1900,3.0,47,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,10,157
3,149.0,1.370239,103.962894,2850,4.0,69,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021,8,250
4,68.0,1.320502,103.863341,2100,3.0,48,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,11,34


In [19]:
# Load the auxiliary COE price data and derive `feature_eng_df` from the
# preprocessed baseline frame (it carries the `mean_coe_price` column shown
# in the later preview).
# NOTE(review): `mean_coe_prices` is not used again in the visible cells.
coe_prices = pd.read_csv("data/auxiliary-data/auxiliary-data/sg-coe-prices.csv")
mean_coe_prices, feature_eng_df = compute_mean_coe_prices(coe_prices, preprocessed_df)

In [20]:
# Load the amenity data and add the amenity feature (the `amenity_count`
# column shown in the later preview).
# NOTE(review): 0.8 is presumably the search radius around each flat (units
# unclear, km?) — confirm against count_amenity.
amenities_df = pd.read_csv("data/auxiliary-data/auxiliary-data/amenity_data_allmrt.csv")
feature_eng_df = count_amenity(feature_eng_df, amenities_df, 0.8)

In [21]:
# Split 90/10 with a fixed seed, then drop rows containing missing values from
# each split independently before writing them out.
baseline_w_feature_eng_train, baseline_w_feature_eng_val = (
    subset.dropna()
    for subset in train_test_split(feature_eng_df, test_size=0.1, random_state=42)
)

# index=False leaves the DataFrame index out of the CSV files.
baseline_w_feature_eng_train.to_csv(
    path_or_buf="data/train/baseline_w_feature_eng_train.csv",
    index=False,
)
baseline_w_feature_eng_val.to_csv(
    path_or_buf="data/train/baseline_w_feature_eng_val.csv",
    index=False,
)

### Dataset 3: Baseline with Extra Columns Removed
This is a truncated version of the baseline dataset with the following columns removed to check if their inclusion/exclusion has any impact on predicted rental prices.
- block
- town
- subzone
- street_name


In [22]:
# Start from an independent copy of the cleaned data: plain assignment would
# only alias `cleaned_df`, so the column assignments below would modify the
# shared frame in place.
extra_preprocessed_df = cleaned_df.copy()

# Same derived columns as the baseline dataset (helpers from the wildcard `src` imports).
extra_preprocessed_df['std_flat_type'] = extra_preprocessed_df.apply(lambda x: std_flat_type(x['flat_type'], x['lease_commence_date']), axis=1)
extra_preprocessed_df['std_flat_model'] = extra_preprocessed_df.apply(lambda x: std_flat_model(x['flat_model']), axis=1)
# NOTE(review): 2023 is presumably the reference year for the remaining-lease
# calculation — confirm against std_remaining_lease.
extra_preprocessed_df['std_remaining_lease'] = extra_preprocessed_df.apply(lambda x: std_remaining_lease(x['lease_commence_date'], 2023), axis=1)

extra_preprocessed_df = preprocess_region(extra_preprocessed_df)
extra_preprocessed_df = preprocess_planning_area(extra_preprocessed_df)
extra_preprocessed_df = preprocess_flat_model(extra_preprocessed_df)

extra_preprocessed_df = preprocess_rent_approval_date(extra_preprocessed_df)
# Unlike the baseline, `block`, `town` and `subzone` are dropped outright
# rather than preprocessed; `street_name` was already removed during cleaning.
extra_preprocessed_df = extra_preprocessed_df.drop(columns=['block', 'flat_type', 'town', 'subzone', 'lease_commence_date'])

In [23]:
# 90/10 train/validation split of the truncated dataset, with a fixed seed for
# repeatability.
baseline_truncated_train, baseline_truncated_val = train_test_split(
    extra_preprocessed_df,
    test_size=0.1,
    random_state=42,
)

# index=False leaves the DataFrame index out of the CSV files.
baseline_truncated_train.to_csv(
    path_or_buf="data/train/baseline_truncated_train.csv",
    index=False,
)
baseline_truncated_val.to_csv(
    path_or_buf="data/train/baseline_truncated_val.csv",
    index=False,
)

### Dataset 4: Baseline Using Price Per Sq Metre
This dataset uses the rent per square metre (`monthly_rent` divided by `floor_area_sqm`) as the value to predict, in place of the monthly rent.

In [24]:
# Derive rent per square metre on a NEW frame: `assign` returns a copy with the
# extra column appended, so `preprocessed_df` (the baseline data) never gains
# `rent_per_sqm`. Assigning through an alias would modify the baseline frame
# in place and leak the new column into it if earlier cells were re-run.
ppsm_baseline = preprocessed_df.assign(
    rent_per_sqm=preprocessed_df['monthly_rent'] / preprocessed_df['floor_area_sqm']
)

In [25]:
# Remove the two columns that `rent_per_sqm` was computed from.
# Note: this reassigns the same name, so re-running this cell alone raises a
# KeyError because the columns are already gone.
ppsm_baseline = ppsm_baseline.drop(columns=['monthly_rent', 'floor_area_sqm'])

In [26]:
# 90/10 train/validation split of the per-square-metre baseline, with a fixed
# seed for repeatability.
ppsm_baseline_train, ppsm_baseline_val = train_test_split(
    ppsm_baseline,
    test_size=0.1,
    random_state=42,
)

# index=False leaves the DataFrame index out of the CSV files.
ppsm_baseline_train.to_csv(
    path_or_buf="data/train/ppsm_baseline_train.csv",
    index=False,
)
ppsm_baseline_val.to_csv(
    path_or_buf="data/train/ppsm_baseline_val.csv",
    index=False,
)

### Dataset 5: Feature Engineering Set Using Price Per Sq Metre
This dataset uses the rent per square metre as the value to predict and includes the additional features from Dataset 2.

In [27]:
# Preview the feature-engineered frame that the per-square-metre variant is built from.
feature_eng_df.head()

Unnamed: 0,floor_area_sqm,latitude,longitude,monthly_rent,std_flat_type,std_remaining_lease,central region,east region,north region,north-east region,west region,planning_area_ ang mo kio,planning_area_ bedok,planning_area_ bishan,planning_area_ bukit batok,planning_area_ bukit merah,planning_area_ bukit panjang,planning_area_ bukit timah,planning_area_ choa chu kang,planning_area_ clementi,planning_area_ downtown core,planning_area_ geylang,planning_area_ hougang,planning_area_ jurong east,planning_area_ jurong west,planning_area_ kallang,planning_area_ marine parade,planning_area_ novena,planning_area_ outram,planning_area_ pasir ris,planning_area_ punggol,planning_area_ queenstown,planning_area_ rochor,planning_area_ sembawang,planning_area_ sengkang,planning_area_ serangoon,planning_area_ tampines,planning_area_ toa payoh,planning_area_ woodlands,planning_area_ yishun,A,I,NG,OTH,P,S,STD,ang mo kio,bedok,bishan,bukit batok,bukit merah,bukit panjang,bukit timah,central,choa chu kang,clementi,geylang,hougang,jurong east,jurong west,kallang/whampoa,marine parade,pasir ris,punggol,queenstown,sembawang,sengkang,serangoon,tampines,toa payoh,woodlands,yishun,subzone admiralty,subzone alexandra hill,subzone aljunied,subzone anak bukit,subzone anchorvale,subzone ang mo kio town centre,subzone balestier,subzone bangkit,subzone bedok north,subzone bedok reservoir,subzone bedok south,subzone bencoolen,subzone bendemeer,subzone bishan east,subzone boon keng,subzone boon lay place,subzone boon teck,subzone braddell,subzone brickworks,subzone bugis,subzone bukit batok central,subzone bukit batok east,subzone bukit batok south,subzone bukit batok west,subzone bukit ho swee,subzone bukit merah,subzone cheng san,subzone china square,subzone chinatown,subzone choa chu kang central,subzone choa chu kang north,subzone chong boon,subzone city hall,subzone clementi central,subzone clementi north,subzone clementi west,subzone clementi woods,subzone commonwealth,subzone 
compassvale,subzone crawford,subzone depot road,subzone dover,subzone everton park,subzone fajar,subzone farrer park,subzone fernvale,subzone frankel,subzone geylang bahru,subzone geylang east,subzone ghim moh,subzone gombak,subzone guilin,subzone henderson hill,subzone holland drive,subzone hong kah,subzone hong kah north,subzone hougang central,subzone hougang east,subzone hougang west,subzone jelebu,subzone joo seng,subzone jurong west central,subzone kaki bukit,subzone kampong bugis,subzone kampong java,subzone kampong tiong bahru,subzone kampong ubi,subzone kangkar,subzone keat hong,subzone kebun bahru,subzone kembangan,subzone khatib,subzone kim keat,subzone kovan,subzone lavender,subzone little india,subzone lorong 8 toa payoh,subzone lorong ah soo,subzone lower seletar,subzone macpherson,subzone margaret drive,subzone marine parade,subzone marymount,subzone matilda,subzone mei chin,subzone midview,subzone moulmein,subzone north coast,subzone northland,subzone pasir panjang 2,subzone pasir ris central,subzone pasir ris drive,subzone pasir ris west,subzone pearl's hill,subzone pei chun,subzone peng siang,subzone potong pasir,subzone punggol field,subzone punggol town centre,subzone redhill,subzone rivervale,subzone saujana,subzone sembawang central,subzone sembawang east,subzone sembawang north,subzone sengkang town centre,subzone senja,subzone serangoon central,subzone serangoon garden,subzone serangoon north,subzone shangri-la,subzone simei,subzone sungei road,subzone sunset way,subzone swiss club,subzone tai seng,subzone taman jurong,subzone tampines east,subzone tampines west,subzone tanglin halt,subzone tanjong rhu,subzone teban gardens,subzone teck whye,subzone telok blangah drive,subzone telok blangah rise,subzone telok blangah way,subzone tiong bahru,subzone tiong bahru station,subzone toa payoh central,subzone toa payoh west,subzone toh guan,subzone townsville,subzone trafalgar,subzone ulu pandan,subzone upper paya lebar,subzone upper thomson,subzone 
victoria,subzone waterway east,subzone wenya,subzone woodgrove,subzone woodlands east,subzone woodlands south,subzone woodlands west,subzone yew tee,subzone yio chu kang west,subzone yishun central,subzone yishun east,subzone yishun south,subzone yishun west,subzone yuhua east,subzone yuhua west,subzone yunnan,rent_approval_year,rent_approval_month,block_num_int,mean_coe_price,amenity_count
0,67.0,1.344518,103.73863,1600,3.0,59,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2021,9,257,54951.625,3.0
1,92.0,1.330186,103.938717,2250,4.0,54,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,5,119,77035.625,4.0
2,67.0,1.332242,103.845643,1900,3.0,47,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,10,157,89580.25,5.0
3,149.0,1.370239,103.962894,2850,4.0,69,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021,8,250,51997.375,4.0
4,68.0,1.320502,103.863341,2100,3.0,48,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,11,34,99472.875,4.0


In [28]:
# Build the per-square-metre variant without touching `feature_eng_df`:
# `assign` returns a new frame with `rent_per_sqm` appended, and the two
# columns it was computed from are then dropped. Assigning through an alias
# would add `rent_per_sqm` to the Dataset 2 frame in place.
feature_eng_ppsm = (
    feature_eng_df
    .assign(rent_per_sqm=feature_eng_df['monthly_rent'] / feature_eng_df['floor_area_sqm'])
    .drop(columns=['monthly_rent', 'floor_area_sqm'])
)

In [29]:
# Check the result: `rent_per_sqm` is present, `monthly_rent` and `floor_area_sqm` are gone.
feature_eng_ppsm.head()

Unnamed: 0,latitude,longitude,std_flat_type,std_remaining_lease,central region,east region,north region,north-east region,west region,planning_area_ ang mo kio,planning_area_ bedok,planning_area_ bishan,planning_area_ bukit batok,planning_area_ bukit merah,planning_area_ bukit panjang,planning_area_ bukit timah,planning_area_ choa chu kang,planning_area_ clementi,planning_area_ downtown core,planning_area_ geylang,planning_area_ hougang,planning_area_ jurong east,planning_area_ jurong west,planning_area_ kallang,planning_area_ marine parade,planning_area_ novena,planning_area_ outram,planning_area_ pasir ris,planning_area_ punggol,planning_area_ queenstown,planning_area_ rochor,planning_area_ sembawang,planning_area_ sengkang,planning_area_ serangoon,planning_area_ tampines,planning_area_ toa payoh,planning_area_ woodlands,planning_area_ yishun,A,I,NG,OTH,P,S,STD,ang mo kio,bedok,bishan,bukit batok,bukit merah,bukit panjang,bukit timah,central,choa chu kang,clementi,geylang,hougang,jurong east,jurong west,kallang/whampoa,marine parade,pasir ris,punggol,queenstown,sembawang,sengkang,serangoon,tampines,toa payoh,woodlands,yishun,subzone admiralty,subzone alexandra hill,subzone aljunied,subzone anak bukit,subzone anchorvale,subzone ang mo kio town centre,subzone balestier,subzone bangkit,subzone bedok north,subzone bedok reservoir,subzone bedok south,subzone bencoolen,subzone bendemeer,subzone bishan east,subzone boon keng,subzone boon lay place,subzone boon teck,subzone braddell,subzone brickworks,subzone bugis,subzone bukit batok central,subzone bukit batok east,subzone bukit batok south,subzone bukit batok west,subzone bukit ho swee,subzone bukit merah,subzone cheng san,subzone china square,subzone chinatown,subzone choa chu kang central,subzone choa chu kang north,subzone chong boon,subzone city hall,subzone clementi central,subzone clementi north,subzone clementi west,subzone clementi woods,subzone commonwealth,subzone compassvale,subzone crawford,subzone depot 
road,subzone dover,subzone everton park,subzone fajar,subzone farrer park,subzone fernvale,subzone frankel,subzone geylang bahru,subzone geylang east,subzone ghim moh,subzone gombak,subzone guilin,subzone henderson hill,subzone holland drive,subzone hong kah,subzone hong kah north,subzone hougang central,subzone hougang east,subzone hougang west,subzone jelebu,subzone joo seng,subzone jurong west central,subzone kaki bukit,subzone kampong bugis,subzone kampong java,subzone kampong tiong bahru,subzone kampong ubi,subzone kangkar,subzone keat hong,subzone kebun bahru,subzone kembangan,subzone khatib,subzone kim keat,subzone kovan,subzone lavender,subzone little india,subzone lorong 8 toa payoh,subzone lorong ah soo,subzone lower seletar,subzone macpherson,subzone margaret drive,subzone marine parade,subzone marymount,subzone matilda,subzone mei chin,subzone midview,subzone moulmein,subzone north coast,subzone northland,subzone pasir panjang 2,subzone pasir ris central,subzone pasir ris drive,subzone pasir ris west,subzone pearl's hill,subzone pei chun,subzone peng siang,subzone potong pasir,subzone punggol field,subzone punggol town centre,subzone redhill,subzone rivervale,subzone saujana,subzone sembawang central,subzone sembawang east,subzone sembawang north,subzone sengkang town centre,subzone senja,subzone serangoon central,subzone serangoon garden,subzone serangoon north,subzone shangri-la,subzone simei,subzone sungei road,subzone sunset way,subzone swiss club,subzone tai seng,subzone taman jurong,subzone tampines east,subzone tampines west,subzone tanglin halt,subzone tanjong rhu,subzone teban gardens,subzone teck whye,subzone telok blangah drive,subzone telok blangah rise,subzone telok blangah way,subzone tiong bahru,subzone tiong bahru station,subzone toa payoh central,subzone toa payoh west,subzone toh guan,subzone townsville,subzone trafalgar,subzone ulu pandan,subzone upper paya lebar,subzone upper thomson,subzone victoria,subzone waterway east,subzone 
wenya,subzone woodgrove,subzone woodlands east,subzone woodlands south,subzone woodlands west,subzone yew tee,subzone yio chu kang west,subzone yishun central,subzone yishun east,subzone yishun south,subzone yishun west,subzone yuhua east,subzone yuhua west,subzone yunnan,rent_approval_year,rent_approval_month,block_num_int,mean_coe_price,amenity_count,rent_per_sqm
0,1.344518,103.73863,3.0,59,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2021,9,257,54951.625,3.0,23.880597
1,1.330186,103.938717,4.0,54,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,5,119,77035.625,4.0,24.456522
2,1.332242,103.845643,3.0,47,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,10,157,89580.25,5.0,28.358209
3,1.370239,103.962894,4.0,69,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2021,8,250,51997.375,4.0,19.127517
4,1.320502,103.863341,3.0,48,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2022,11,34,99472.875,4.0,30.882353


In [30]:
# Split 90/10 with a fixed seed, then drop rows containing missing values from
# each split independently before writing them out.
feature_eng_ppsm_train, feature_eng_ppsm_val = (
    subset.dropna()
    for subset in train_test_split(feature_eng_ppsm, test_size=0.1, random_state=42)
)

# index=False leaves the DataFrame index out of the CSV files.
feature_eng_ppsm_train.to_csv(
    path_or_buf="data/train/feature_eng_ppsm_train.csv",
    index=False,
)
feature_eng_ppsm_val.to_csv(
    path_or_buf="data/train/feature_eng_ppsm_val.csv",
    index=False,
)