# Singapore Resale Price Prediction


In [80]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt


## Load the resale flat prices dataset (from Kaggle)

In [73]:
# Load the resale transactions. The first CSV column has no header and would
# otherwise be loaded as a spurious "Unnamed: 0" data column (it appears to be a
# saved row index), so read it as the DataFrame index instead.
data = pd.read_csv('flat-prices.csv', index_col=0)
data.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200


## Encode categorical data and select columns to form the feature matrix

Choose:

1. town
2. flat_type
3. storey_range
4. floor_area_sqm
5. flat_model
6. lease_commence_date

In [74]:
# Integer-encode each categorical column with its own LabelEncoder. Keeping one
# fitted encoder per column preserves every category <-> code mapping
# (encoders[column].classes_), so codes can be looked up or inverted later; a
# single encoder refitted in turn only remembers the last column it saw.
# NOTE: the columns of `data` are overwritten in place, so reload `data` before
# re-running this cell if the original strings are needed.
categorical_columns = ['town', 'flat_type', 'storey_range', 'flat_model']
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Backward compatibility: `label` previously ended up fitted on 'flat_model'.
label = encoders['flat_model']

selected_features = data[['town', 'flat_type', 'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date']]

# Prepend a column of ones so that the first weight acts as the bias term.
X_nobias = selected_features.values
x1 = np.ones((len(X_nobias), 1))
X = np.hstack((x1, X_nobias))

# Regression target.
y = data['resale_price']


## Summary of Encoding

Town:
0 - ANG MO KIO
1 - BEDOK
2 - BISHAN
3 - BUKIT BATOK
4 - BUKIT MERAH
5 - BUKIT PANJANG
6 - BUKIT TIMAH
7 - CENTRAL AREA
8 - CHOA CHU KANG
9 - CLEMENTI
10 - GEYLANG
11 - HOUGANG
12 - JURONG EAST
13 - JURONG WEST
14 - KALLANG/WHAMPOA
15 - LIM CHU KANG
16 - MARINE PARADE
17 - PASIR RIS
18 - QUEENSTOWN
19 - SEMBAWANG
20 - SENGKANG
21 - SERANGOON
22 - TAMPINES
23 - TOA PAYOH
24 - WOODLANDS
25 - YISHUN

Flat Type:
0 - 1 ROOM
1 - 2 ROOM
2 - 3 ROOM
3 - 4 ROOM
4 - 5 ROOM
5 - EXECUTIVE 
6 - MULTI GENERATION

Storey Range:
0 - 01 TO 03 
1 - 04 TO 06 
2 - 07 TO 09
3 - 10 TO 12
4 - 13 TO 15
5 - 16 TO 18
6 - 19 TO 21
7 - 22 TO 24
8 - 25 TO 27

Flat Model:
0 - 2-ROOM
1 - APARTMENT
2 - IMPROVED
3 - IMPROVED-MAISONETTE
4 - MAISONETTE
5 - MODEL A
6 - MODEL A-MAISONETTE
7 - MULTI GENERATION
8 - NEW GENERATION
9 - PREMIUM APARTMENT
10 - SIMPLIFIED
11 - STANDARD
12 - TERRACE

## Training the model

Model:

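X_train is the feature matrix. It is a tall matrix (more rows than columns), so the system $Xw = y$ is overdetermined and generally has no exact solution. The least-squares weight vector $w$ is therefore given by the normal equations:

$$ w = (X^T X)^{-1} X^T y $$

where $X^T$ is the transpose of $X$.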

In [75]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Least-squares weights from the normal equations (X^T X) w = X^T y.
# The linear system is solved directly instead of forming inv(X^T X): this gives
# the same w mathematically, but avoids the explicit inverse (less accurate and
# more expensive) and the intermediate inv(X^T X) X^T matrix.
X_train_transpose = X_train.T
XTX = X_train_transpose.dot(X_train)
XTy = X_train_transpose.dot(y_train)
w = np.linalg.solve(XTX, XTy)

## Evaluate the model on the training set

Evaluate the model using the metric: Root Mean Squared Error (RMSE)

$$ RMSE = \sqrt{\frac{1}{n} \sum \limits_{i = 1}^{n} (\hat{y}_i - y_i)^2} $$

In [76]:
# RMSE on the training split: square root of the mean squared residual.
y_pred_train = np.dot(X_train, w)
residuals_train = y_pred_train - y_train
RMSE_train = np.sqrt(np.square(residuals_train).mean())

RMSE_train


75762.36670795342

## Evaluate the model using the test set

In [77]:
# RMSE on the held-out test split, computed the same way as for the training split.
y_pred_test = np.dot(X_test, w)
residuals_test = y_pred_test - y_test
RMSE_test = np.sqrt(np.square(residuals_test).mean())

RMSE_test

75962.30851042933

## Predictions Example 

Let us see how our model predicts resale prices.
Take an example: A 5-ROOM flat in BEDOK, FLOOR 04 TO 06, IMPROVED, FLOOR AREA = 121, LEASE COMMENCED in 1980


The feature vector entries are:
1 - Bias
1 - BEDOK
4 - 5-ROOM
1 - 04 TO 06
121 - FLOOR AREA
2 - IMPROVED
1980 - LEASE COMMENCEMENT DATE


Actual Resale Price: 145,000

In [78]:
# One feature vector, in the same column order as X:
# bias, town, flat_type, storey_range, floor_area_sqm, flat_model, lease_commence_date
X_example = np.array([
    1,      # bias
    1,      # town: BEDOK
    4,      # flat_type: 5 ROOM
    1,      # storey_range: 04 TO 06
    121,    # floor_area_sqm
    2,      # flat_model: IMPROVED
    1980,   # lease_commence_date
])
y_example = np.dot(X_example, w)

y_example

322035.08284687204

### In the next section, the process is repeated using `LinearRegression` from scikit-learn to compare the results

In [88]:
# X_train already contains an explicit column of ones for the bias, so disable
# scikit-learn's own intercept. Otherwise the model carries two redundant bias
# terms; with fit_intercept=False it is exactly the model solved in closed form
# earlier in the notebook.
reg = LinearRegression(fit_intercept=False).fit(X_train, y_train)
y_pred_trainsk = reg.predict(X_train)
# mean_squared_error expects (y_true, y_pred), in that order.
RMSE_trainsk = sqrt(mean_squared_error(y_train, y_pred_trainsk))

RMSE_trainsk

75762.36670795392

In [84]:
# Test-set RMSE for the scikit-learn model.
y_pred_testsk = reg.predict(X_test)
# mean_squared_error expects (y_true, y_pred), in that order; the value is the
# same either way, but the documented order avoids surprises if other metrics
# or options are used later.
RMSE_testsk = sqrt(mean_squared_error(y_test, y_pred_testsk))

RMSE_testsk

75962.30851042214

In [94]:
# Same example as before, in the column order of X:
# bias, town, flat_type, storey_range, floor_area_sqm, flat_model, lease_commence_date
X_examplesk = np.array([
    1,      # bias
    1,      # town: BEDOK
    4,      # flat_type: 5 ROOM
    1,      # storey_range: 04 TO 06
    121,    # floor_area_sqm
    2,      # flat_model: IMPROVED
    1980,   # lease_commence_date
])
# predict() expects a 2-D array, so add a leading axis to make a single row.
y_examplesk = reg.predict(X_examplesk[np.newaxis, :])

y_examplesk[0]

322035.08283840865