# Linear Regression

In [1]:
%matplotlib notebook

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Read the preprocessed training table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='processed_train2.csv')
df.head(n=5)

Unnamed: 0,flat_type,storey_range,year,remaining_lease,price_per_sqm,nearest_mrt_distance,exist_primary_school,nearest_shopping_mall_distance,nearest_secondary_school_distance,nearest_market_distance,...,rochor,sembawang,sengkang,serangoon,sungei kadut,tampines,tanglin,toa payoh,woodlands,yishun
0,1.609438,0.0,2001,87,1777.118644,7.037487,0,1033.21754,6.06216,6.852835,...,0,0,0,0,0,0,0,0,0,0
1,1.791759,1.098612,2014,88,3657.272727,6.714843,1,813.177496,5.747428,8.238378,...,0,0,0,0,0,0,0,0,0,0
2,1.791759,0.0,2020,83,3133.928571,7.71052,0,452.557047,6.32683,7.498285,...,0,0,1,0,0,0,0,0,0,0
3,1.386294,0.693147,2000,79,2256.716418,6.05049,1,456.500122,6.429934,6.428578,...,0,0,0,0,0,0,0,0,0,0
4,1.386294,0.693147,2013,71,4364.383562,6.653148,1,764.172989,5.388664,6.637156,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Split the training frame into a feature matrix (every column except the
# target) and a flattened target vector of price_per_sqm values.
X_train = df.drop('price_per_sqm', axis=1).to_numpy()
y_train = np.squeeze(df[['price_per_sqm']].to_numpy())

In [5]:
# Display the target vector built from the price_per_sqm column.
y_train

array([1777.11864407, 3657.27272727, 3133.92857143, ..., 3814.92537313,
       4134.14634146, 1557.69230769])

In [6]:
# Display the training feature matrix (all columns except price_per_sqm).
X_train

array([[1.60943791e+00, 0.00000000e+00, 2.00100000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.79175947e+00, 1.09861229e+00, 2.01400000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.79175947e+00, 0.00000000e+00, 2.02000000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.38629436e+00, 0.00000000e+00, 2.01100000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.79175947e+00, 1.09861229e+00, 2.01300000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.60943791e+00, 0.00000000e+00, 2.00700000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [7]:
# Read the preprocessed test table and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='processed_test2.csv')
df_test.head(n=5)

Unnamed: 0,flat_type,storey_range,year,remaining_lease,nearest_mrt_distance,exist_primary_school,nearest_shopping_mall_distance,nearest_secondary_school_distance,nearest_market_distance,nearest_commercial_distance,...,rochor,sembawang,sengkang,serangoon,sungei kadut,tampines,tanglin,toa payoh,woodlands,yishun
0,1.609438,0.693147,2004,84,6.505111,1,693.942863,6.05755,6.676841,7.46801,...,0,0,0,0,0,0,0,0,0,0
1,1.791759,0.693147,2001,95,6.64299,1,1933.981858,4.380676,7.636065,7.507018,...,0,0,0,0,0,1,0,0,0,0
2,1.386294,0.0,2002,79,6.2524,1,524.612777,5.996569,6.671233,6.835331,...,0,0,0,0,0,0,0,0,0,0
3,1.386294,0.693147,2015,65,6.306135,1,829.129343,6.349547,6.865476,7.61496,...,0,0,0,0,0,0,0,0,0,0
4,1.791759,0.0,2004,73,6.247808,1,604.665329,6.096252,4.73619,7.511288,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Build the test feature matrix from the training feature columns (every
# training column except the target), selected by name so the column order
# matches X_train exactly. A feature that is missing from the test file raises
# a KeyError here instead of silently shifting columns under the model.
feature_columns = df.columns.drop('price_per_sqm')
X_test = df_test[feature_columns].to_numpy()

In [9]:
# Display the test feature matrix.
X_test

array([[1.60943791e+00, 6.93147181e-01, 2.00400000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.79175947e+00, 6.93147181e-01, 2.00100000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.38629436e+00, 0.00000000e+00, 2.00200000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [1.38629436e+00, 6.93147181e-01, 2.00000000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.60943791e+00, 1.09861229e+00, 2.00900000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.60943791e+00, 6.93147181e-01, 2.01000000e+03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [20]:
# Read the unprocessed test table; its floor_area_sqm column is used later to
# turn per-square-metre predictions into resale prices.
df_test_raw = pd.read_csv(filepath_or_buffer='test.csv')
df_test_raw.head(n=5)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region
0,2004-01,bukit batok,4 room,186,bukit batok west avenue 6,04 to 06,94.0,new generation,uncategorized,1989,1.346581,103.744085,0.0,bukit batok west,bukit batok,west region
1,2001-11,tampines,5 room,366,tampines street 34,04 to 06,122.0,improved,uncategorized,1997,1.357618,103.961379,0.0,tampines east,tampines,east region
2,2002-07,jurong east,3 room,206,jurong east street 21,01 to 03,67.0,new generation,uncategorized,1982,1.337804,103.741998,0.0,toh guan,jurong east,west region
3,2015-04,ang mo kio,3 room,180,Ang Mo Kio Avenue 5,04 to 06,82.0,new generation,uncategorized,1981,1.380084,103.849574,0.0,yio chu kang east,ang mo kio,north-east region
4,2004-04,clementi,5 room,356,clementi avenue 2,01 to 03,117.0,standard,uncategorized,1978,1.31396,103.769831,0.0,clementi north,clementi,west region


In [10]:
# Learn the standardization statistics from the training matrix only, then
# apply that one fitted scaler to both the training and the test matrices.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## Modelling

In [13]:
# Expand the standardized features into all polynomial terms up to degree 2.
# The transformer is fitted on the training matrix only; the test matrix goes
# through the already-fitted transformer (transform, not fit_transform) so
# that no preprocessing step is ever re-fitted on test data.
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [14]:
# Ordinary least squares on the polynomial features. No separate intercept is
# fitted: PolynomialFeatures was left at its default include_bias setting, so
# the design matrix already carries a constant column.
linear = LinearRegression(fit_intercept=False)
linear.fit(X=X_train_poly, y=y_train)

LinearRegression(fit_intercept=False)

In [16]:
# Predict price_per_sqm for every row of the expanded test matrix.
y_test_pred = linear.predict(X=X_test_poly)

In [None]:
# # Ridge
# ridge = Ridge(alpha=0.1, fit_intercept=False)
# ridge.fit(X_train_poly, y_train)

In [None]:
# y_test_pred = ridge.predict(X_test_poly)

In [None]:
# # Lasso
# lasso = Lasso(fit_intercept=False)
# lasso.fit(X_train_poly, y_train)

In [None]:
# y_test_pred = lasso.predict(X_test_poly)

In [17]:
# Wrap the predicted per-square-metre prices in a one-column frame; this is
# the starting point for converting back to resale prices.
df_kaggle = pd.DataFrame({"price_per_sqm": y_test_pred})

In [21]:
# Resale price = predicted price per square metre x floor area taken from the
# raw test table (the two frames are matched row-by-row on their index).
df_kaggle["resale_price"] = df_kaggle["price_per_sqm"].mul(df_test_raw["floor_area_sqm"])

In [22]:
# Display the predictions together with the derived resale_price column.
df_kaggle

Unnamed: 0,price_per_sqm,resale_price
0,2440.861633,229440.993530
1,2307.089989,281464.978622
2,2131.690491,142823.262878
3,4148.542694,340180.500916
4,2939.847412,343962.147217
...,...,...
107929,3270.979187,359807.710571
107930,4135.267090,421797.243164
107931,1523.488892,103597.244629
107932,2615.512573,272013.307617


In [23]:
# Expose the row position as an explicit "index" column (it is written out as
# the submission Id when the frame is exported). The guard keeps this cell
# idempotent: calling reset_index a second time would insert an additional
# "level_0" column, and the frame would no longer line up with the two header
# aliases supplied at export time.
if "index" not in df_kaggle.columns:
    df_kaggle = df_kaggle.reset_index()
df_kaggle

Unnamed: 0,index,price_per_sqm,resale_price
0,0,2440.861633,229440.993530
1,1,2307.089989,281464.978622
2,2,2131.690491,142823.262878
3,3,4148.542694,340180.500916
4,4,2939.847412,343962.147217
...,...,...,...
107929,107929,3270.979187,359807.710571
107930,107930,4135.267090,421797.243164
107931,107931,1523.488892,103597.244629
107932,107932,2615.512573,272013.307617


In [24]:
# Remove the intermediate per-square-metre column in place; only the index and
# resale_price columns remain afterwards.
df_kaggle.drop(columns=['price_per_sqm'], inplace=True)

In [25]:
# Display the frame after price_per_sqm has been removed.
df_kaggle

Unnamed: 0,index,resale_price
0,0,229440.993530
1,1,281464.978622
2,2,142823.262878
3,3,340180.500916
4,4,343962.147217
...,...,...
107929,107929,359807.710571
107930,107930,421797.243164
107931,107931,103597.244629
107932,107932,272013.307617


In [26]:
# Write the submission file: the frame's columns are exported under the
# header names "Id" and "Predicted", without the DataFrame's own index.
filename = "kaggle_final.csv"
df_kaggle.to_csv(path_or_buf=filename, header=["Id", "Predicted"], index=False)

In [27]:
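# 48532.37206  <- NOTE(review): presumably the Kaggle score obtained by this submission; confirm which metric it is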