# Feature Engineering

In [1]:
# Imports

# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import requests
import random
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import time
import scipy.stats as stats
from pprint import pprint
import re

from sklearn.model_selection import train_test_split
import gc

In [2]:
# Read data file
df = pd.read_csv('gathered_data.csv')

In [3]:
# Define categorical variable
cat_vars = [var for var in df.columns if df[var].dtype == 'O']
print (cat_vars)

# Define numerical variable
num_vars = [var for var in df.columns if var not in cat_vars and var != "resale_price"]
print (num_vars)

['flat_type', 'storey_range', 'flat_model', 'district_code']
['floor_area_sqm', 'lease_commence_yr', 'remaining_lease', 'lat', 'lon', 'dist_mrt', 'dist_mall', 'sale_yr']


In [4]:
# Print unique values of each cat_vars
for var in cat_vars:
    print(f"Unique values for {var}: {df[var].unique()}")

Unique values for flat_type: ['2 ROOM' '3 ROOM' '4 ROOM' '5 ROOM' 'EXECUTIVE' '1 ROOM'
 'MULTI-GENERATION']
Unique values for storey_range: ['MidToHigh' 'Low' 'LowToMid' 'Middle' 'High' 'UltraHigh']
Unique values for flat_model: ['Improved' 'New Generation' 'DBSS' 'Standard' 'Apartment' 'Simplified'
 'Model A' 'Premium Apartment' 'Adjoined flat' 'Model A-Maisonette'
 'Maisonette' 'Type S1' 'Type S2' 'Model A2' 'Terrace'
 'Improved-Maisonette' 'Premium Maisonette' 'Multi Generation'
 'Premium Apartment Loft' '2-room']
Unique values for district_code: ['56' '46' '47' '41' '57' '65' '16' '10' '09' '15' '67' '59' '26' '20'
 '18' '05' '08' '68' '12' '37' '36' '40' '43' '39' '38' '53' '60' '64'
 '61' '32' '33' '21' '58' '19' '44' '50' '51' '82' '14' '27' '13' '75'
 '79' '54' '55' '52' '31' '35' '73' '76' 'NI' '42' '30' '11']


In [5]:
# Encode flat type and storey range
# flatype
map_flat_type = { '2 ROOM': 1, '3 ROOM': 2, '4 ROOM': 3, '5 ROOM': 4, 'MULTI-GENERATION': 5, 'EXECUTIVE': 6, 'Missing': 0, 'NaN': 0 }
df["flat_type_enc"] = df["flat_type"].map(map_flat_type)

# storey_rangem
map_storey_range = { 'Low': 1, 'LowToMid': 2, 'Mid': 3, 'Middle': 4, 'MidToHigh': 5, 'High': 6, 'UltraHigh': 7, 'Missing': 0, 'NA': 0}
df["storey_range_enc"] = df["storey_range"].map(map_storey_range)

print(df.head())

  flat_type storey_range  floor_area_sqm      flat_model  lease_commence_yr  \
0    2 ROOM    MidToHigh            44.0        Improved               1979   
1    3 ROOM          Low            67.0  New Generation               1978   
2    3 ROOM          Low            67.0  New Generation               1980   
3    3 ROOM     LowToMid            68.0  New Generation               1980   
4    3 ROOM          Low            67.0  New Generation               1980   

   remaining_lease  resale_price       lat         lon     dist_mrt  \
0        61.333333      232000.0  1.362005  103.853880  1003.996014   
1        60.583333      250000.0  1.370966  103.838202   189.874559   
2        62.416667      262000.0  1.380709  103.835368   535.116600   
3        62.083333      265000.0  1.366201  103.857201   945.527315   
4        62.416667      265000.0  1.381041  103.835132   501.151239   

     dist_mall district_code  sale_yr  flat_type_enc  storey_range_enc  
0  1101.335572           

In [6]:
# Mean encode for flat_model and district_code based on mean of target

# Separate into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df, df['resale_price'], test_size=0.5, random_state=0)

# Function to calculate mean target value for each category and create mapping
def find_category_mappings(df, variable, target):
    return df.groupby([variable])[target].mean().to_dict()

# Function to apply the mapping
def mean_encode(train, test, variable, ordinal_mapping):
    train[variable + '_mean_enc'] = train[variable].map(ordinal_mapping)
    test[variable + '_mean_enc'] = test[variable].map(ordinal_mapping)
    return train, test

# Apply mean encoding
mappings = find_category_mappings(X_train, 'flat_model', 'resale_price')
X_train, X_test = mean_encode(X_train, X_test, 'flat_model', mappings)

mappings = find_category_mappings(X_train, 'district_code', 'resale_price')
X_train, X_test = mean_encode(X_train, X_test, 'district_code', mappings)

In [7]:
print("Mean Encoding with Pandas:\n", X_train.head())

Mean Encoding with Pandas:
        flat_type storey_range  floor_area_sqm  flat_model  lease_commence_yr  \
59513     4 ROOM       Middle            93.0     Model A               2015   
43132     4 ROOM          Low            93.0     Model A               2015   
16632     2 ROOM    MidToHigh            43.0    Standard               1967   
61020     5 ROOM       Middle           121.0    Improved               1987   
1635   EXECUTIVE    MidToHigh           146.0  Maisonette               1986   

       remaining_lease  resale_price       lat         lon     dist_mrt  \
59513        94.583333      395000.0  1.400309  103.898075   564.346922   
43132        95.250000      350000.0  1.389041  103.910719   579.566170   
16632        48.750000      225000.0  1.337357  103.850391   506.629900   
61020        66.333333      638000.0  1.319789  103.767788   594.339306   
1635         68.416667      620000.0  1.320604  103.737792  1482.151624   

         dist_mall district_code  sale_y

In [8]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
Index: 38947 entries, 59513 to 68268
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   flat_type               38947 non-null  object 
 1   storey_range            38947 non-null  object 
 2   floor_area_sqm          38947 non-null  float64
 3   flat_model              38947 non-null  object 
 4   lease_commence_yr       38947 non-null  int64  
 5   remaining_lease         38947 non-null  float64
 6   resale_price            38947 non-null  float64
 7   lat                     38947 non-null  float64
 8   lon                     38947 non-null  float64
 9   dist_mrt                38947 non-null  float64
 10  dist_mall               38947 non-null  float64
 11  district_code           38947 non-null  object 
 12  sale_yr                 38947 non-null  int64  
 13  flat_type_enc           38922 non-null  float64
 14  storey_range_enc        38947 non-null 

In [9]:
# Drop the object variables
X_train = X_train.drop(columns=cat_vars)
X_test = X_test.drop(columns=cat_vars)

# Drop the target variable
X_train = X_train.drop(columns="resale_price")
X_test = X_test.drop(columns="resale_price")

In [10]:
print(X_train.head())

       floor_area_sqm  lease_commence_yr  remaining_lease       lat  \
59513            93.0               2015        94.583333  1.400309   
43132            93.0               2015        95.250000  1.389041   
16632            43.0               1967        48.750000  1.337357   
61020           121.0               1987        66.333333  1.319789   
1635            146.0               1986        68.416667  1.320604   

              lon     dist_mrt    dist_mall  sale_yr  flat_type_enc  \
59513  103.898075   564.346922  1148.501196     2019            3.0   
43132  103.910719   579.566170   778.139275     2019            3.0   
16632  103.850391   506.629900  1233.770732     2017            1.0   
61020  103.767788   594.339306   660.200359     2019            4.0   
1635   103.737792  1482.151624  1745.766412     2017            6.0   

       storey_range_enc  flat_model_mean_enc  district_code_mean_enc  
59513                 4        422435.228412           452594.588939  
4313

In [11]:
print(y_train.head())

59513    395000.0
43132    350000.0
16632    225000.0
61020    638000.0
1635     620000.0
Name: resale_price, dtype: float64


In [12]:
# Export x_train, x_test, y_train and y_test to ./data-prepared folder, named the csv files as x_train.csv etc
# Export X_train, X_test, y_train and y_test to ./data-prepared folder
X_train.to_csv('./data-prepared/X_train.csv', index=False)
X_test.to_csv('./data-prepared/X_test.csv', index=False)
y_train.to_csv('./data-prepared/y_train.csv', index=False)
y_test.to_csv('./data-prepared/y_test.csv', index=False)