**Linear Models: OLS Linear Regression, Ridge Regression, Lasso Regression**

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold
from utilities import cross_val_metrics_calculate

In [2]:
# Load the training table. Column 0 is left out of the predictors (it looks like
# a row index that was written to the CSV -- TODO confirm), and the last column
# is used as the regression target.
data = pd.read_csv("data/train_data.csv")
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

In [3]:
# Column labels of the predictors, reused when printing fitted coefficients.
feature_names = X.columns
feature_names

Index(['Area (m2)', 'Property Type', 'Bedrooms', 'Bathrooms', 'Address',
       'Law Document', 'Quarter', 'Year', 'Latitude', 'Longitude',
       'Postal Code', 'Importance', 'Place Rank', 'City'],
      dtype='object')

In [4]:
# Shared cross-validation splitter for every model in this notebook.
# KFold is used with its defaults apart from the number of splits.
n_folds = 5
kfold = KFold(n_splits=n_folds)

*OLS Linear Regression*

In [5]:
# Baseline: ordinary least squares on the features as loaded (no scaling).
lr = LinearRegression()

# Evaluate over the shared folds and show the metrics the helper returns.
cv_results = cross_val_metrics_calculate(
    lr,
    X,
    y,
    kfold.split(X),
)
print(cv_results)

{'mse': 632.468830132135, 'rmse': 25.017572626383934, 'mae': 8.212152228879576, 'mape': 3.141047653766202}


In [6]:
# Inspect feature importance: refit on the full training set and list the
# weight learned for each input column.
lr.fit(X, y)
print("Coefficients of each feature:")
for column, weight in zip(lr.feature_names_in_, lr.coef_):
    print("%s: %.10f" % (column, weight))

Coefficients of each feature:
Area (m2): -0.0000000026
Property Type: 1.8538001384
Bedrooms: 1.1447473330
Bathrooms: 0.6138181405
Address: 0.0000234878
Law Document: -0.6589475639
Quarter: 0.8402210161
Year: 2.2655791710
Latitude: -0.4456004574
Longitude: -0.0596235735
Postal Code: -0.0000352078
Importance: 5.6861188845
Place Rank: 0.5189685878
City: 5.7587266858


- City has a high coefficient because it takes only 2 values (0 and 1, for HN and HCMC); it should be ignored here

In [7]:
# Same OLS estimator object, now preceded by a StandardScaler step.
lr_with_standardize = make_pipeline(
    StandardScaler(),
    lr,
)

cv_results = cross_val_metrics_calculate(
    lr_with_standardize,
    X,
    y,
    kfold.split(X),
)
print(cv_results)

{'mse': 632.468830131915, 'rmse': 25.017572626379895, 'mae': 8.212152228879072, 'mape': 3.1410476537655887}


In [8]:
# The pipeline holds the very same `lr` object as its last step, so fitting the
# pipeline refits `lr`; its coefficients now refer to the standardized features.
lr_with_standardize.fit(X, y)
print("Coefficients of each feature:")
for column, weight in zip(feature_names, lr.coef_):
    print("%s: %.10f" % (column, weight))

Coefficients of each feature:
Area (m2): -0.0840317744
Property Type: 2.5879579094
Bedrooms: 3.1753859481
Bathrooms: 1.6273673215
Address: 0.0457313986
Law Document: -0.8560025202
Quarter: 0.8647916908
Year: 3.4013313504
Latitude: -2.1479206553
Longitude: -0.3859431556
Postal Code: -1.0055887278
Importance: 0.2871384447
Place Rank: 1.1502652721
City: 2.4570140106


- Latitude has more influence on price, because the dataset consists of properties in HN and HCMC, which have noticeably different latitudes
- With standardized data, area contributes more to price

- No significant difference in metrics whether data is standardized or not
- RMSE and MAE show huge average errors, but MAPE shows only 3.14% loss?
- Time (year and quarter) and bedroom num. have high impact on price

*Ridge Regression*

In [9]:
# Select the Ridge penalty strength by 5-fold cross-validation over a fixed grid.
# `scoring` is left at its default value (None).
ridge_cv = RidgeCV(
    alphas=[0.1, 0.01, 0.001, 0.005, 0.05, 0.5, 0.0025, 0.025, 0.25, 1, 2.5, 5, 10],
    cv=5,
)

# fit() returns the estimator itself, so the chosen alpha can be read directly.
ridge_cv.fit(X, y).alpha_

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, 

0.001

In [10]:
# Repeat the alpha search on standardized features. The pipeline wraps the same
# `ridge_cv` object, so fitting the pipeline refits it and replaces its alpha_.
# NOTE(review): the scaler is fitted on all rows before RidgeCV makes its
# internal splits, so scaling statistics include each validation fold.
ridge_cv_standardize = make_pipeline(StandardScaler(), ridge_cv)
ridge_cv_standardize.fit(X, y)

# Last pipeline step is `ridge_cv` itself.
ridge_cv_standardize[-1].alpha_

10.0

In [11]:
# Ridge on the raw features, using the alpha selected for unscaled data.
ridge = Ridge(alpha=0.001)

cv_results = cross_val_metrics_calculate(
    ridge,
    X,
    y,
    kfold.split(X),
)
print(cv_results)

{'mse': 632.4699994325721, 'rmse': 25.0175941176528, 'mae': 8.212152723465088, 'mape': 3.1410478716618657}


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [12]:
# Refit on the full training set and list the weight of every feature.
ridge.fit(X, y)
print("Coefficients of each feature:")
for position, weight in enumerate(ridge.coef_):
    print("%s: %.10f" % (feature_names[position], weight))

Coefficients of each feature:
Area (m2): -0.0000000026
Property Type: 1.8538002828
Bedrooms: 1.1447473810
Bathrooms: 0.6138181227
Address: 0.0000234878
Law Document: -0.6589476933
Quarter: 0.8402209831
Year: 2.2655791673
Latitude: -0.4456002635
Longitude: -0.0596235547
Postal Code: -0.0000352078
Importance: 5.6860108792
Place Rank: 0.5189672887
City: 5.7587226984


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [13]:
# Ridge behind a StandardScaler, using the alpha selected for standardized data.
ridge_s = Ridge(alpha=10)
ridge_with_standardize = make_pipeline(
    StandardScaler(),
    ridge_s,
)

cv_results = cross_val_metrics_calculate(
    ridge_with_standardize,
    X,
    y,
    kfold.split(X),
)
print(cv_results)

{'mse': 632.5834290906681, 'rmse': 25.01965254835598, 'mae': 8.211891692091521, 'mape': 3.1412124914245365}


In [14]:
# Fitting the pipeline fits `ridge_s` (its final step) on the scaled features,
# so the weights printed here are per standardized feature.
ridge_with_standardize.fit(X, y)
print("Coefficients of each feature:")
for column, weight in zip(feature_names, ridge_s.coef_):
    print("%s: %.10f" % (column, weight))

Coefficients of each feature:
Area (m2): -0.0839903224
Property Type: 2.5871107640
Bedrooms: 3.1737837449
Bathrooms: 1.6288411277
Address: 0.0457728467
Law Document: -0.8564458082
Quarter: 0.8641687263
Year: 3.3997160077
Latitude: -2.1323084033
Longitude: -0.3833866406
Postal Code: -0.9952011969
Importance: 0.2869860827
Place Rank: 1.1480216658
City: 2.4517145005


- Ridge gives nearly identical results (coefficients and metrics) to OLS linear regression

*Lasso Regression*

In [15]:
# Select the Lasso penalty strength by cross-validation over an explicit grid.
# Only `alphas` is passed: an explicit grid fully determines the candidates, so
# additionally passing `n_alphas` had no effect (and that parameter is
# deprecated in newer scikit-learn releases). `cv=None` keeps the default
# cross-validation strategy.
lasso_cv = LassoCV(
    alphas=[0.1, 0.01, 0.001, 0.005, 0.05, 0.5, 0.0025, 0.025, 0.25, 1, 2.5, 5, 10],
    cv=None,
)

lasso_cv.fit(X, y)
lasso_cv.alpha_

0.001

In [16]:
# Repeat the alpha search on standardized features. The pipeline wraps the same
# `lasso_cv` object, so fitting the pipeline refits it and replaces its alpha_.
# NOTE(review): the scaler is fitted on all rows before LassoCV makes its
# internal splits, so scaling statistics include each validation fold.
lasso_cv_standardize = make_pipeline(StandardScaler(), lasso_cv)
lasso_cv_standardize.fit(X, y)

# Last pipeline step is `lasso_cv` itself.
lasso_cv_standardize[-1].alpha_

0.01

In [17]:
# Lasso on the raw features, using the alpha selected for unscaled data.
lasso = Lasso(alpha=0.001)

cv_results = cross_val_metrics_calculate(
    lasso,
    X,
    y,
    kfold.split(X),
)
print(cv_results)

{'mse': 637.1560829995988, 'rmse': 25.103060384250846, 'mae': 8.214043205603366, 'mape': 3.1419656229985815}


MSE is higher than Ridge and OLS

In [18]:
# Refit on the full training set and list the weight of every feature.
lasso.fit(X, y)
print("Coefficients of each feature:")
for position, weight in enumerate(lasso.coef_):
    print("%s: %.10f" % (feature_names[position], weight))

Coefficients of each feature:
Area (m2): -0.0000000026
Property Type: 1.8540976967
Bedrooms: 1.1450006102
Bathrooms: 0.6136878989
Address: 0.0000237333
Law Document: -0.6590586962
Quarter: 0.8391350650
Year: 2.2651175155
Latitude: -0.4440416760
Longitude: -0.0594227729
Postal Code: -0.0000352408
Importance: 5.1327398858
Place Rank: 0.5119131722
City: 5.7379510583


In [19]:
# Lasso behind a StandardScaler, using the alpha selected for standardized data.
lasso_s = Lasso(alpha=0.01)
lasso_with_standardize = make_pipeline(
    StandardScaler(),
    lasso_s,
)

cv_results = cross_val_metrics_calculate(
    lasso_with_standardize,
    X,
    y,
    kfold.split(X),
)
print(cv_results)

{'mse': 592.3173889801952, 'rmse': 24.22096808463137, 'mae': 8.181506729618162, 'mape': 3.133375429947581}


- MSE: Ridge and OLS (both with and without standardization) ~ 632; Lasso without standardization ~ 637; with standardization ~ 592
- MAE and MAPE also slightly better

In [20]:
# Fitting the pipeline fits `lasso_s` (its final step) on the scaled features,
# so the weights printed here are per standardized feature.
lasso_with_standardize.fit(X, y)
print("Coefficients of each feature:")
for column, weight in zip(feature_names, lasso_s.coef_):
    print("%s: %.10f" % (column, weight))

Coefficients of each feature:
Area (m2): -0.0741505959
Property Type: 2.5803581823
Bedrooms: 3.1747022864
Bathrooms: 1.6277370889
Address: 0.0356610596
Law Document: -0.8556108244
Quarter: 0.8497483808
Year: 3.3860209299
Latitude: -1.8750650418
Longitude: -0.3339115390
Postal Code: -0.8074715852
Importance: 0.2711666537
Place Rank: 1.0986252068
City: 2.3830120456


**Conclusion**
- Lasso with data standardization performs the best, although not too different from others
- Selecting a different regularization coefficient when the data is standardized gives better results; Ridge stays the same, but Lasso is noticeably better
- Ridge and Lasso do not really improve performance -> Linear Regression may not be suitable
- Pattern in feature coefficients:
    + property type, bedroom num., post year, and regional features (city and latitude+longitude) contribute the most; especially city and latitude, signaling a noticeable price difference in real estates at HN and HCMC
    + area has small value of coefficient, only noticeable after standardization. This may be due to areas having large values while prices measured in billion VND are smaller

In [21]:
# Save a version of each model fitted on the training set, for comparison on the
# testing set. The pipelines with the standard scaler are used (better results).
import pickle

# Output path -> fitted pipeline. The ".h5" names are kept unchanged so anything
# that already reads these paths keeps working, but the contents are pickle
# payloads (not HDF5); only unpickle them from a trusted location.
fitted_models = {
    "models/LinearRegression.h5": lr_with_standardize,
    "models/Ridge.h5": ridge_with_standardize,
    "models/Lasso.h5": lasso_with_standardize,
}

# Fit everything first, so nothing is written if any fit fails.
for model in fitted_models.values():
    model.fit(X, y)

for model_path, model in fitted_models.items():
    # The context manager closes (and flushes) the file handle deterministically
    # instead of leaving it open until garbage collection.
    with open(model_path, "wb") as model_file:
        pickle.dump(model, model_file)