# Linear Regression

In [1]:
%matplotlib notebook

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split

- How to handle categorical variables (nominal ones, e.g. `planning_area`)
- Regression analysis

https://www.analyticsvidhya.com/blog/2015/11/easy-methods-deal-categorical-variables-predictive-modeling/
https://medium.com/analytics-vidhya/implementing-linear-regression-using-sklearn-76264a3c073c
https://www.datacamp.com/community/tutorials/categorical-data

In [3]:
# Read the training CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='processed_train.csv')
df.head(n=5)

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,price_per_sqm,nearest_mrt_distance,exist_primary_school
0,2001-08,4,L,118.0,pasir ris,87,1777.118644,1137.523316,False
1,2014-10,5,H,110.0,punggol,88,3657.272727,823.554217,True
2,2020-09,5,L,112.0,sengkang,83,3133.928571,2230.703009,False
3,2000-10,3,M,67.0,clementi,79,2256.716418,423.320893,True
4,2013-01,3,M,73.0,bukit batok,71,4364.383562,774.220785,True


In [4]:
# storey_range: swap the letter codes for integers (L -> 0, M -> 1, H -> 2).
# Values that match none of the letters are left untouched.
for level, level_code in (("L", 0), ("M", 1), ("H", 2)):
    df["storey_range"] = df["storey_range"].mask(df["storey_range"] == level, level_code)

In [5]:
# exist_primary_school: turn the boolean flag into 0 / 1 (False first, then True).
for flag, bit in ((False, 0), (True, 1)):
    df["exist_primary_school"] = df["exist_primary_school"].mask(
        df["exist_primary_school"] == flag, bit
    )

In [6]:
# flat_type: the letter-coded types "e" and "m" get the numeric codes 6 and 7;
# all other values are kept as they are.
for type_letter, type_code in (("e", 6), ("m", 7)):
    df["flat_type"] = df["flat_type"].mask(df["flat_type"] == type_letter, type_code)

In [7]:
df.head()

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,price_per_sqm,nearest_mrt_distance,exist_primary_school
0,2001-08,4,0,118.0,pasir ris,87,1777.118644,1137.523316,0
1,2014-10,5,2,110.0,punggol,88,3657.272727,823.554217,1
2,2020-09,5,0,112.0,sengkang,83,3133.928571,2230.703009,0
3,2000-10,3,1,67.0,clementi,79,2256.716418,423.320893,1
4,2013-01,3,1,73.0,bukit batok,71,4364.383562,774.220785,1


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431732 entries, 0 to 431731
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   month                 431732 non-null  object 
 1   flat_type             431732 non-null  object 
 2   storey_range          431732 non-null  object 
 3   floor_area_sqm        431732 non-null  float64
 4   planning_area         431732 non-null  object 
 5   remaining_lease       431732 non-null  int64  
 6   price_per_sqm         431732 non-null  float64
 7   nearest_mrt_distance  431732 non-null  float64
 8   exist_primary_school  431732 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 29.6+ MB


In [9]:
# planning_area
# TODO: one-hot encode this column; it is currently dropped before modelling

In [10]:
# month: parse the strings as timestamps, then keep each timestamp's integer
# `.value` (nanoseconds since the Unix epoch) so the column is purely numeric.
df['month'] = pd.to_datetime(df['month']).apply(lambda stamp: stamp.value)

In [11]:
# The three encoded columns are still dtype object at this point; go through
# str (stripping stray whitespace) to end up with plain integer columns.
for column in ('flat_type', 'storey_range', 'exist_primary_school'):
    df[column] = df[column].astype(str).str.strip().astype(int)

In [12]:
df.head()

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,price_per_sqm,nearest_mrt_distance,exist_primary_school
0,996624000000000000,4,0,118.0,pasir ris,87,1777.118644,1137.523316,0
1,1412121600000000000,5,2,110.0,punggol,88,3657.272727,823.554217,1
2,1598918400000000000,5,0,112.0,sengkang,83,3133.928571,2230.703009,0
3,970358400000000000,3,1,67.0,clementi,79,2256.716418,423.320893,1
4,1356998400000000000,3,1,73.0,bukit batok,71,4364.383562,774.220785,1


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431732 entries, 0 to 431731
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   month                 431732 non-null  int64  
 1   flat_type             431732 non-null  int64  
 2   storey_range          431732 non-null  int64  
 3   floor_area_sqm        431732 non-null  float64
 4   planning_area         431732 non-null  object 
 5   remaining_lease       431732 non-null  int64  
 6   price_per_sqm         431732 non-null  float64
 7   nearest_mrt_distance  431732 non-null  float64
 8   exist_primary_school  431732 non-null  int64  
dtypes: float64(3), int64(5), object(1)
memory usage: 29.6+ MB


In [14]:
# Convert data to numpy arrays
# Target: select the column as a Series so y is always 1-D. The previous
# `[[col]].to_numpy().squeeze()` form collapses to a 0-d array for a one-row frame.
y = df['price_per_sqm'].to_numpy()
# Features: every remaining column except the target and the text column
# planning_area, which has not been encoded.
X = df.drop(columns=['price_per_sqm', 'planning_area']).to_numpy()

In [15]:
y

array([1777.11864407, 3657.27272727, 3133.92857143, ..., 3814.92537313,
       4134.14634146, 1557.69230769])

In [16]:
X

array([[9.96624000e+17, 4.00000000e+00, 0.00000000e+00, ...,
        8.70000000e+01, 1.13752332e+03, 0.00000000e+00],
       [1.41212160e+18, 5.00000000e+00, 2.00000000e+00, ...,
        8.80000000e+01, 8.23554217e+02, 1.00000000e+00],
       [1.59891840e+18, 5.00000000e+00, 0.00000000e+00, ...,
        8.30000000e+01, 2.23070301e+03, 0.00000000e+00],
       ...,
       [1.29384000e+18, 3.00000000e+00, 0.00000000e+00, ...,
        7.40000000e+01, 5.68873283e+02, 1.00000000e+00],
       [1.36736640e+18, 5.00000000e+00, 2.00000000e+00, ...,
        8.50000000e+01, 6.13929489e+02, 1.00000000e+00],
       [1.18592640e+18, 4.00000000e+00, 0.00000000e+00, ...,
        8.10000000e+01, 9.53334745e+02, 1.00000000e+00]])

In [17]:
# Split the dataset into a training set and a test set (20% held out);
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

for template, split in (("Size of training set: {}", X_train), ("Size of test: {}", X_test)):
    print(template.format(len(split)))

Size of training set: 345385
Size of test: 86347


In [18]:
# Fit the scaler on the training rows only, so nothing about the held-out rows
# influences the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply that one fitted scaler to both splits.
# NOTE: X_train / X_test are overwritten with their scaled versions, so run this
# cell exactly once after the split cell. Re-running it refits `scaler` on
# already-scaled data, and `scaler` is reused further down for the Kaggle features.
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))

In [19]:
# Fit one polynomial regression per degree (1..4) and compare the training error
# with the error on the held-out split.
# NOTE: after the loop, `poly` and `poly_reg` still hold the objects from the LAST
# iteration (degree 4); the Kaggle prediction cells further down rely on that.
for p in range(1, 5):

    # Expand the scaled features into all polynomial terms up to degree p
    poly = PolynomialFeatures(p)
    X_train_poly = poly.fit_transform(X_train)
    # Reuse the transformer fitted on the training data; do not refit on held-out data
    X_test_poly = poly.transform(X_test)

    # Train the linear regressor on the transformed data
    # fit_intercept=False because PolynomialFeatures already adds a constant (bias)
    # column for every degree, so a separate intercept would be redundant
    poly_reg = LinearRegression(fit_intercept=False).fit(X_train_poly, y_train)

    # Predict values for training and test set
    y_train_pred = poly_reg.predict(X_train_poly)
    y_test_pred = poly_reg.predict(X_test_poly)

    # Calculate MSE
    mse_train = mean_squared_error(y_train, y_train_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)

    print('Degree of polynomial: {} => MSE (train/test): {:.2f}/{:.2f} (#terms: {})'.format(p, mse_train, mse_test, X_train_poly.shape[1]))

Degree of polynomial: 1 => MSE (train/test): 575561.22/574344.43 (#terms: 8)
Degree of polynomial: 2 => MSE (train/test): 474308.10/472594.86 (#terms: 36)
Degree of polynomial: 3 => MSE (train/test): 324981.39/321927.34 (#terms: 120)
Degree of polynomial: 4 => MSE (train/test): 294357.82/290950.98 (#terms: 330)


## Test

In [21]:
# Read the Kaggle test CSV into a DataFrame and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='processed_test.csv')
df_test.head(n=5)

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,nearest_mrt_distance,exist_primary_school
0,2004-01,4,M,94.0,bukit batok,84,667.550041,True
1,2001-11,5,M,122.0,tampines,95,766.38576,True
2,2002-07,3,L,67.0,jurong east,79,518.257497,True
3,2015-04,3,M,82.0,ang mo kio,65,546.923279,True
4,2004-04,5,L,117.0,clementi,73,515.878471,True


In [22]:
# storey_range: same letter -> integer encoding as used for the training frame
# (L -> 0, M -> 1, H -> 2); values matching none of the letters are left untouched.
for level, level_code in (("L", 0), ("M", 1), ("H", 2)):
    df_test["storey_range"] = df_test["storey_range"].mask(
        df_test["storey_range"] == level, level_code
    )

In [23]:
# exist_primary_school: turn the boolean flag into 0 / 1 (False first, then True).
for flag, bit in ((False, 0), (True, 1)):
    df_test["exist_primary_school"] = df_test["exist_primary_school"].mask(
        df_test["exist_primary_school"] == flag, bit
    )

In [24]:
# flat_type: the letter-coded types "e" and "m" get the numeric codes 6 and 7;
# all other values are kept as they are.
for type_letter, type_code in (("e", 6), ("m", 7)):
    df_test["flat_type"] = df_test["flat_type"].mask(
        df_test["flat_type"] == type_letter, type_code
    )

In [25]:
# month: parse the strings as timestamps, then keep each timestamp's integer
# `.value` (nanoseconds since the Unix epoch) so the column is purely numeric.
df_test['month'] = pd.to_datetime(df_test['month']).apply(lambda stamp: stamp.value)

In [26]:
# Cast the three encoded columns to plain integers, going through str so that
# stray whitespace can be stripped first.
for column in ('flat_type', 'storey_range', 'exist_primary_school'):
    df_test[column] = df_test[column].astype(str).str.strip().astype(int)

In [27]:
df_test.head()

Unnamed: 0,month,flat_type,storey_range,floor_area_sqm,planning_area,remaining_lease,nearest_mrt_distance,exist_primary_school
0,1072915200000000000,4,1,94.0,bukit batok,84,667.550041,1
1,1004572800000000000,5,1,122.0,tampines,95,766.38576,1
2,1025481600000000000,3,0,67.0,jurong east,79,518.257497,1
3,1427846400000000000,3,1,82.0,ang mo kio,65,546.923279,1
4,1080777600000000000,5,0,117.0,clementi,73,515.878471,1


In [28]:
# Feature matrix for the Kaggle test set: every column except the planning_area text.
# The remaining columns must be in the same order as the training feature matrix,
# because the scaler and the model only see positional numpy columns.
X_kaggle_test = df_test.drop(labels=['planning_area'], axis=1).to_numpy()

In [29]:
# Scale the Kaggle features with the scaler fitted on the training split
# (it is not refitted here).
# The input is rebuilt from df_test rather than rescaling X_kaggle_test in place,
# so re-running this cell cannot apply the scaling a second time.
X_kaggle_test = scaler.transform(df_test.drop(columns=['planning_area']).to_numpy())

In [31]:
# Expand the Kaggle features with the transformer that was fitted on the training
# data. `poly` is whatever the degree loop left behind (its last iteration).
# transform() is used instead of fit_transform() so the transformer is not refitted
# on test data and a feature-count mismatch is reported here, not at predict time.
X_kaggle_test_poly = poly.transform(X_kaggle_test)

In [32]:
# Predict price per square metre for the Kaggle rows.
# `poly_reg` is the model left over from the last iteration of the degree loop,
# so it must be paired with the `poly` transformer from that same iteration.
y_kaggle_test_pred = poly_reg.predict(X_kaggle_test_poly)

In [33]:
y_kaggle_test_pred

array([1775.68136597, 2282.15625   , 1742.59375   , ..., 2589.40625   ,
       2891.0625    , 3839.2434082 ])

In [36]:
# Convert back to resale prices: start by putting the 1-D array of predicted
# price-per-sqm values into a single-column DataFrame.
df_kaggle = pd.DataFrame({"price_per_sqm": y_kaggle_test_pred})

In [38]:
# resale price = predicted price per sqm x floor area of the same row.
# The two Series are aligned on their row index, so df_kaggle and df_test must
# share the same index for the rows to pair up correctly.
df_kaggle["resale_price"] = df_kaggle["price_per_sqm"].mul(df_test["floor_area_sqm"])

In [39]:
df_kaggle

Unnamed: 0,price_per_sqm,resale_price
0,1775.681366,166914.048401
1,2282.156250,278423.062500
2,1742.593750,116753.781250
3,4269.265625,350079.781250
4,2160.679688,252799.523438
...,...,...
107929,2994.125000,329353.750000
107930,2671.398682,272482.665527
107931,2589.406250,176079.625000
107932,2891.062500,300670.500000


In [43]:
# Turn the row index into a regular "index" column (exported later as the Id).
# Guarded and non-inplace so that re-running the cell does not add a second
# ("level_0") column, which would break the two-column CSV export.
if "index" not in df_kaggle.columns:
    df_kaggle = df_kaggle.reset_index()

In [44]:
df_kaggle

Unnamed: 0,index,price_per_sqm,resale_price
0,0,1775.681366,166914.048401
1,1,2282.156250,278423.062500
2,2,1742.593750,116753.781250
3,3,4269.265625,350079.781250
4,4,2160.679688,252799.523438
...,...,...,...
107929,107929,2994.125000,329353.750000
107930,107930,2671.398682,272482.665527
107931,107931,2589.406250,176079.625000
107932,107932,2891.062500,300670.500000


In [46]:
# The per-sqm prediction is no longer needed once resale_price has been computed;
# drop it so only the columns meant for export remain.
df_kaggle = df_kaggle.drop(columns=["price_per_sqm"])

In [47]:
df_kaggle

Unnamed: 0,index,resale_price
0,0,166914.048401
1,1,278423.062500
2,2,116753.781250
3,3,350079.781250
4,4,252799.523438
...,...,...
107929,107929,329353.750000
107930,107930,272482.665527
107931,107931,176079.625000
107932,107932,300670.500000


In [48]:
filename = "kaggle3.csv"

# Select and rename the two submission columns by NAME. Passing header=[...] to
# to_csv relabels columns purely by position, which mislabels or fails as soon as
# df_kaggle carries an extra or reordered column (e.g. after re-running a cell).
submission = df_kaggle[["index", "resale_price"]].rename(
    columns={"index": "Id", "resale_price": "Predicted"}
)
submission.to_csv(filename, index=False)