# Singapore Resale Price Prediction


In [98]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error


## Read the resale flat price dataset (downloaded from Kaggle)

In [109]:
# Load the resale flat transactions from the CSV file (path is relative to the
# notebook's working directory) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='flat-prices.csv')
data.head(n=5)

NEW GENERATION         78898
IMPROVED               73589
MODEL A                70381
SIMPLIFIED             23258
STANDARD               17375
MAISONETTE             12215
APARTMENT               9901
MODEL A-MAISONETTE       982
MULTI GENERATION         279
TERRACE                  247
IMPROVED-MAISONETTE       44
2-ROOM                    21
PREMIUM APARTMENT          6
Name: flat_model, dtype: int64

## Encode categorical data and select columns to form the feature matrix

Choose:

1. town
2. flat_type
3. storey_range
4. floor_area_sqm
5. flat_model
6. lease_commence_date

In [110]:
# Encode every categorical column with its OWN LabelEncoder. A single shared
# encoder only remembers the classes of the last column it was fitted on, so
# the label <-> integer mapping of the other columns would be lost. Keeping the
# fitted encoders lets the mappings be inspected later (`encoders[col].classes_`)
# or reversed (`encoders[col].inverse_transform(...)`).
#
# NOTE: the columns of `data` are overwritten in place with their integer codes,
# so re-running this cell without re-loading `data` re-fits the encoders on the
# integer codes instead of the original string labels.
categorical_columns = ['town', 'flat_type', 'storey_range', 'flat_model']
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    data[column] = encoders[column].fit_transform(data[column])

# Backward compatibility: `label` used to end up fitted on the last encoded
# column (flat_model), so keep that name pointing at the same encoder.
label = encoders['flat_model']

selected_features = data[['town', 'flat_type', 'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date']]

# Prepend a column of ones so the first weight of the linear model acts as the
# bias / intercept term.
X_nobias = selected_features.values
x1 = np.ones((len(X_nobias), 1))
X = np.hstack((x1, X_nobias))

# Regression target.
y = data['resale_price']


8     78898
2     73589
5     70381
10    23258
11    17375
4     12215
1      9901
6       982
7       279
12      247
3        44
0        21
9         6
Name: flat_model, dtype: int64

## Summary of Encoding

Town:
0 - ANG MO KIO
1 - BEDOK
2 - BISHAN
3 - BUKIT BATOK
4 - BUKIT MERAH
5 - BUKIT PANJANG
6 - BUKIT TIMAH
7 - CENTRAL AREA
8 - CHOA CHU KANG
9 - CLEMENTI
10 - GEYLANG
11 - HOUGANG
12 - JURONG EAST
13 - JURONG WEST
14 - KALLANG/WHAMPOA
15 - LIM CHU KANG
16 - MARINE PARADE
17 - PASIR RIS
18 - QUEENSTOWN
19 - SEMBAWANG
20 - SENGKANG
21 - SERANGOON
22 - TAMPINES
23 - TOA PAYOH
24 - WOODLANDS
25 - YISHUN

Flat Type:
0 - 1 ROOM
1 - 2 ROOM
2 - 3 ROOM
3 - 4 ROOM
4 - 5 ROOM
5 - EXECUTIVE 
6 - MULTI GENERATION

Storey Range:
0 - 01 TO 03 
1 - 04 TO 06 
2 - 07 TO 09
3 - 10 TO 12
4 - 13 TO 15
5 - 16 TO 18
6 - 19 TO 21
7 - 22 TO 24
8 - 25 TO 27

Flat Model:
0 - 2-ROOM
1 - APARTMENT
2 - IMPROVED
3 - IMPROVED-MAISONETTE
4 - MAISONETTE
5 - MODEL A
6 - MODEL A-MAISONETTE
7 - MULTI GENERATION
8 - NEW GENERATION
9 - PREMIUM APARTMENT
10 - SIMPLIFIED
11 - STANDARD
12 - TERRACE

## Training the Model

Model:

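`X_train` is the feature matrix, which is likely to be a tall matrix (more rows than columns), so the system $Xw = y$ is over-determined and generally has no exact solution.
Instead, the weight vector $w$ is taken to be the least-squares solution:

$$w = (X^T X)^{-1} X^T y$$

where $X^T$ is the transpose of $X$.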

In [93]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.25, random_state= 42)

# Ordinary least squares via the normal equations: (X^T X) w = X^T y.
X_transpose = X_train.T
XTX = X_transpose.dot(X_train)

# Solve the linear system directly instead of forming inv(X^T X) explicitly and
# multiplying it out: the solution is the same mathematically, but solving is
# cheaper and less sensitive to round-off error.
# np.linalg.solve raises numpy.linalg.LinAlgError if X^T X is singular.
w = np.linalg.solve(XTX, X_transpose.dot(y_train))