In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 8. Data Preprocessing <a name='data-preprocess'></a>
[Back to top](#Contents)<br>

In [116]:
# Load the input dataset from disk into a DataFrame.
data_eng_path = '../data/data_eng.csv'
data = pd.read_csv(data_eng_path)

First, we one-hot encode the categorical variables.

In [117]:
# One-hot encode `Type` into `type_<value>` indicator columns.
# dtype is pinned to uint8 (the pre-2.0 pandas default) because pandas >= 2.0
# would otherwise emit bool columns, changing the saved CSVs from 0/1 to True/False.
data = pd.get_dummies(data, prefix='type', columns=['Type'], dtype='uint8')

# Move the two type indicators and the target to the end of the frame, in a
# fixed order, so later cells can rely on "the last three columns".
trailing_cols = ['type_HDB', 'type_Condo', 'price_sqft']
leading_cols = [c for c in data.columns if c not in trailing_cols]
data = data[leading_cols + trailing_cols]

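Next, we split the dataset into training and test sets, stratified by property type: HDB and condo rows are split separately (90% train / 10% test each) and then recombined.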

In [118]:
# Fixed seed so that re-running the notebook reproduces the same split
# (and therefore the same saved train/test files and fitted scaler).
SPLIT_SEED = 42

# Split each property type separately so that both the training and the test
# set receive 90% / 10% of the rows of that type.
hdb = data[data['type_HDB'] == 1]
condo = data[data['type_Condo'] == 1]

train_hdb, test_hdb = train_test_split(hdb, test_size=0.1, random_state=SPLIT_SEED)
train_condo, test_condo = train_test_split(condo, test_size=0.1, random_state=SPLIT_SEED)

# Recombine the per-type partitions and discard the original row labels.
train = pd.concat([train_hdb, train_condo], axis=0).reset_index(drop=True)
test = pd.concat([test_hdb, test_condo], axis=0).reset_index(drop=True)

After that, we scale the features by z-scoring. To avoid data leakage, the scaler is fitted on the training data only, and that same fitted scaler is then used to transform the test data.

In [119]:
# Every column except the last three (type_HDB, type_Condo, price_sqft) is a
# feature to be z-scored.
feature_cols = list(train.columns[:-3])

scaler = StandardScaler()

# Assign by column label rather than `iloc[:, :-3] = ...`: label assignment
# replaces the columns with the float results, whereas positional assignment
# writes in place and hits the incompatible-dtype deprecation on recent pandas
# when a feature column is integer-typed.
# NOTE(review): assumes column names are unique - confirm for data_eng.csv.
train[feature_cols] = scaler.fit_transform(train[feature_cols])  # fit on training data only
test[feature_cols] = scaler.transform(test[feature_cols])        # reuse training statistics

Finally, we save the training and test data to their respective files, along with the scaler that was fitted on the training data. This scaler can be reused to scale any future test data.

In [120]:
# Persist the scaled splits without the (already reset) row index.
train.to_csv('../data/train.csv', index=False)
test.to_csv('../data/test.csv', index=False)

# Persist the fitted scaler; the context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('../model/object/scaler.pickle', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)