# Linear Regression

In [1]:
# Select matplotlib's interactive "notebook" backend for this kernel session.
# NOTE(review): no figure is drawn in the cells visible here, and this backend
# is only available in the classic Notebook front end - confirm it is needed.
%matplotlib notebook

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split

- How to handle categorical variables (nominal ones, e.g. `planning_area`)
- Regression analysis

References:

- https://www.analyticsvidhya.com/blog/2015/11/easy-methods-deal-categorical-variables-predictive-modeling/
- https://medium.com/analytics-vidhya/implementing-linear-regression-using-sklearn-76264a3c073c
- https://www.datacamp.com/community/tutorials/categorical-data

In [3]:
# Read the preprocessed training table into `df` and show its first five rows.
df = pd.read_csv(filepath_or_buffer='processed_train.csv')
df.head(n=5)

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,price_per_sqm,nearest_mrt_distance,exist_primary_school
0,2001-08,4,L,118.0,pasir ris,87,1777.118644,1137.523316,False
1,2014-10,5,H,110.0,punggol,88,3657.272727,823.554217,True
2,2020-09,5,L,112.0,sengkang,83,3133.928571,2230.703009,False
3,2000-10,3,M,67.0,clementi,79,2256.716418,423.320893,True
4,2013-01,3,M,73.0,bukit batok,71,4364.383562,774.220785,True


In [4]:
# storey_range: ordinal encoding L -> 0, M -> 1, H -> 2.
# Any other value is kept as it is; the column stays object-typed here.
df["storey_range"] = (
    df["storey_range"]
    .where(df["storey_range"] != "L", 0)
    .where(df["storey_range"] != "M", 1)
    .where(df["storey_range"] != "H", 2)
)

In [5]:
# exist_primary_school: turn the boolean flag into 0 / 1.
df["exist_primary_school"] = (
    df["exist_primary_school"]
    .where(df["exist_primary_school"] != False, 0)
    .where(df["exist_primary_school"] != True, 1)
)

In [6]:
# flat_type: give the two letter-coded categories numeric codes so the whole
# column can be cast to int later ("e" -> 6, "m" -> 7).
# NOTE(review): presumably "e" / "m" are the executive / multi-generation flat
# types - confirm against the preprocessing step that produced the CSV.
df["flat_type"] = (
    df["flat_type"]
    .where(df["flat_type"] != "e", 6)
    .where(df["flat_type"] != "m", 7)
)

In [7]:
# Check the first five rows after the storey_range / exist_primary_school /
# flat_type recoding.
df.head(n=5)

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,price_per_sqm,nearest_mrt_distance,exist_primary_school
0,2001-08,4,0,118.0,pasir ris,87,1777.118644,1137.523316,0
1,2014-10,5,2,110.0,punggol,88,3657.272727,823.554217,1
2,2020-09,5,0,112.0,sengkang,83,3133.928571,2230.703009,0
3,2000-10,3,1,67.0,clementi,79,2256.716418,423.320893,1
4,2013-01,3,1,73.0,bukit batok,71,4364.383562,774.220785,1


In [8]:
# Column dtypes and non-null counts; the recoded columns are still
# object-typed at this point and are cast to int in a later cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431732 entries, 0 to 431731
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   month                 431732 non-null  object 
 1   flat_type             431732 non-null  object 
 2   storey_range          431732 non-null  object 
 3   floor_area_sqm        431732 non-null  float64
 4   planning_area         431732 non-null  object 
 5   remaining_lease       431732 non-null  int64  
 6   price_per_sqm         431732 non-null  float64
 7   nearest_mrt_distance  431732 non-null  float64
 8   exist_primary_school  431732 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 29.6+ MB


In [9]:
# one hot encoding planning_area
# https://www.analyticsvidhya.com/blog/2015/11/easy-methods-deal-categorical-variables-predictive-modeling/
# First list the distinct planning areas (sorted) that will become columns.
values = pd.unique(df["planning_area"])
print("planning_area", np.sort(values), sep="\n")

planning_area
['ang mo kio' 'bedok' 'bishan' 'bukit batok' 'bukit merah' 'bukit panjang'
 'bukit timah' 'changi' 'choa chu kang' 'clementi' 'downtown core'
 'geylang' 'hougang' 'jurong east' 'jurong west' 'kallang' 'marine parade'
 'novena' 'outram' 'pasir ris' 'punggol' 'queenstown' 'rochor' 'sembawang'
 'sengkang' 'serangoon' 'sungei kadut' 'tampines' 'tanglin' 'toa payoh'
 'woodlands' 'yishun']


In [10]:
# Build one indicator column per planning_area level and preview the result.
onehot = pd.get_dummies(data=df['planning_area'])
onehot.head(n=5)

Unnamed: 0,ang mo kio,bedok,bishan,bukit batok,bukit merah,bukit panjang,bukit timah,changi,choa chu kang,clementi,...,rochor,sembawang,sengkang,serangoon,sungei kadut,tampines,tanglin,toa payoh,woodlands,yishun
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Attach the planning_area indicator columns to the feature table.
# Any indicator columns already present in `df` are dropped first, so running
# this cell again does not append a second copy of every indicator (which
# would silently duplicate features downstream).
df = pd.concat(
    [df.drop(columns=onehot.columns, errors="ignore"), onehot],
    axis=1,
)

In [12]:
# First five rows, now including the planning_area indicator columns.
df.head(n=5)

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,price_per_sqm,nearest_mrt_distance,exist_primary_school,ang mo kio,...,rochor,sembawang,sengkang,serangoon,sungei kadut,tampines,tanglin,toa payoh,woodlands,yishun
0,2001-08,4,0,118.0,pasir ris,87,1777.118644,1137.523316,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2014-10,5,2,110.0,punggol,88,3657.272727,823.554217,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2020-09,5,0,112.0,sengkang,83,3133.928571,2230.703009,0,0,...,0,0,1,0,0,0,0,0,0,0
3,2000-10,3,1,67.0,clementi,79,2256.716418,423.320893,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2013-01,3,1,73.0,bukit batok,71,4364.383562,774.220785,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Parse the month strings into timestamps, then replace each timestamp with
# its integer `.value` (nanoseconds since the Unix epoch) so that month can be
# used as a numeric feature.
df['month'] = pd.to_datetime(df['month']).apply(lambda stamp: stamp.value)

In [14]:
# The recoded columns are still object-typed; go through str (stripping any
# surrounding whitespace) and cast each of them to int.
for encoded_column in ('flat_type', 'storey_range', 'exist_primary_school'):
    df[encoded_column] = (
        df[encoded_column].astype(str).str.strip().astype(int)
    )

In [15]:
# First five rows after the month conversion and the integer casts.
df.head(n=5)

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,price_per_sqm,nearest_mrt_distance,exist_primary_school,ang mo kio,...,rochor,sembawang,sengkang,serangoon,sungei kadut,tampines,tanglin,toa payoh,woodlands,yishun
0,996624000000000000,4,0,118.0,pasir ris,87,1777.118644,1137.523316,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1412121600000000000,5,2,110.0,punggol,88,3657.272727,823.554217,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1598918400000000000,5,0,112.0,sengkang,83,3133.928571,2230.703009,0,0,...,0,0,1,0,0,0,0,0,0,0
3,970358400000000000,3,1,67.0,clementi,79,2256.716418,423.320893,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1356998400000000000,3,1,73.0,bukit batok,71,4364.383562,774.220785,1,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Confirm the dtypes: every column except planning_area should now be numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431732 entries, 0 to 431731
Data columns (total 41 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   month                 431732 non-null  int64  
 1   flat_type             431732 non-null  int64  
 2   storey_range          431732 non-null  int64  
 3   floor_area_sqm        431732 non-null  float64
 4   planning_area         431732 non-null  object 
 5   remaining_lease       431732 non-null  int64  
 6   price_per_sqm         431732 non-null  float64
 7   nearest_mrt_distance  431732 non-null  float64
 8   exist_primary_school  431732 non-null  int64  
 9   ang mo kio            431732 non-null  uint8  
 10  bedok                 431732 non-null  uint8  
 11  bishan                431732 non-null  uint8  
 12  bukit batok           431732 non-null  uint8  
 13  bukit merah           431732 non-null  uint8  
 14  bukit panjang         431732 non-null  uint8  
 15  

In [17]:
# Split the table into numpy arrays for modelling:
#   y - the target, price_per_sqm, squeezed down from a single-column array
#   X - every remaining column except the raw planning_area strings
#       (they are represented by the indicator columns instead)
y = np.squeeze(df[['price_per_sqm']].to_numpy())
X = df.drop(['price_per_sqm', 'planning_area'], axis=1).to_numpy()

In [18]:
# Display the target array (numpy abbreviates the repr of long arrays).
y

array([1777.11864407, 3657.27272727, 3133.92857143, ..., 3814.92537313,
       4134.14634146, 1557.69230769])

In [19]:
# Display the feature matrix; the first column is month in epoch nanoseconds,
# hence the ~1e18 magnitudes before scaling.
X

array([[9.9662400e+17, 4.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.4121216e+18, 5.0000000e+00, 2.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.5989184e+18, 5.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [1.2938400e+18, 3.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.3673664e+18, 5.0000000e+00, 2.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.1859264e+18, 4.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00]])

In [20]:
# Fit the standardizer on X, which was built from processed_train.csv only,
# i.e. the scaler is fitted on the training data only.
scaler = StandardScaler()
scaler.fit(X)

In [21]:
# Replace X with its standardized version.
# NOTE: this overwrites its own input, so run the cell exactly once per
# kernel session; running it again applies the scaler to already-scaled data.
X = scaler.transform(X)

In [22]:
import statsmodels.api as sm

In [23]:
# Regress price_per_sqm on the standardized features with statsmodels to get
# the full inference table (standard errors, t-statistics, p-values).
#
# X contains one indicator column for every planning_area level. Combined with
# the intercept added below, those columns are perfectly collinear (exactly
# one indicator is 1 in each row), so the design matrix is rank-deficient and
# the individual area coefficients are not identified. One indicator is
# therefore left out of the OLS design matrix and serves as the reference
# level. X itself is left untouched for the other models.
feature_names = df.drop(columns=['price_per_sqm', 'planning_area']).columns
reference_area = onehot.columns[0]
keep = feature_names != reference_area
# Guard against hidden state: the columns of df must still line up with X.
assert keep.size == X.shape[1], "df columns no longer match the columns of X"

X2 = sm.add_constant(X[:, keep])
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.717
Model:                            OLS   Adj. R-squared:                  0.717
Method:                 Least Squares   F-statistic:                 2.883e+04
Date:                Sun, 04 Apr 2021   Prob (F-statistic):               0.00
Time:                        00:28:20   Log-Likelihood:            -3.3742e+06
No. Observations:              431732   AIC:                         6.748e+06
Df Residuals:                  431693   BIC:                         6.749e+06
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3123.8459      0.913   3422.038      0.0