# Linear Models: OLS Linear Regression, Ridge Regression, Lasso Regression

# Libraries and functions

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold
# from utilities import cross_val_metrics_calculate

In [2]:
def median_absolute_percentage_error(y_true, y_pred):
    """Return the median of |y_true - y_pred| / |y_true| as a fraction (not multiplied by 100).

    Parameters
    ----------
    y_true, y_pred : array-likes of equal length (Series, ndarray or list).
        Values are compared positionally; any pandas index is ignored.

    Returns
    -------
    float
        Median relative error. Entries that evaluate to NaN (0/0) are skipped;
        a zero target with a non-zero error contributes ``inf``.
    """
    # Convert to plain arrays so the result does not depend on index alignment
    # and so that non-pandas inputs (which have no .median()) are accepted.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Divide by the magnitude of the target so negative targets still yield a
    # non-negative percentage error.
    with np.errstate(divide='ignore', invalid='ignore'):
        relative_error = np.abs(actual - predicted) / np.abs(actual)
    return float(np.nanmedian(relative_error))

In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, median_absolute_error, mean_absolute_percentage_error, accuracy_score, precision_score, recall_score, f1_score
from pandas import DataFrame, Series
def cross_val_metrics_calculate(model, X:DataFrame, y:Series, splits, metrics=['mse', 'rmse', 'mae', 'mape', 'medae', 'medape']):
    """Fit `model` on every training split and average the requested metrics over the folds.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``. The same
        object is refit in place on every fold.
    X, y : feature table and target; rows are selected positionally via ``.iloc``.
    splits : iterable of ``(train_index, test_index)`` pairs, e.g. ``KFold(...).split(X)``.
    metrics : names of the metrics to compute. Supported names are
        'mse', 'rmse', 'mae', 'mape', 'medae', 'medape' (regression) and
        'accuracy', 'precision', 'recall', 'f1' (classification; the last three
        use ``average='macro', zero_division=0``). 'mape' and 'medape' are
        fractions, not percentages. The default list is only read, never mutated.

    Returns
    -------
    dict
        Metric name -> mean value over the folds, in the order the names were given.

    Raises
    ------
    ValueError
        If a metric name is not supported, or if `splits` yields no folds.
    """
    def _macro(score_fn):
        # Classification scores are macro-averaged, with undefined cases counted as 0.
        return lambda y_test, y_pred: score_fn(y_test, y_pred, average='macro', zero_division=0)

    scorers = {
        'mse': mean_squared_error,
        'rmse': root_mean_squared_error,
        'mae': mean_absolute_error,
        'mape': mean_absolute_percentage_error,
        'medae': median_absolute_error,
        'medape': median_absolute_percentage_error,
        'accuracy': accuracy_score,
        'precision': _macro(precision_score),
        'recall': _macro(recall_score),
        'f1': _macro(f1_score),
    }

    # Reject unknown names up front instead of silently reporting them as 0.
    unknown = [name for name in metrics if name not in scorers]
    if unknown:
        raise ValueError("Unsupported metric(s): %s. Supported: %s" % (unknown, sorted(scorers)))

    result = {name: 0 for name in metrics}
    n_folds = 0
    for train_index, test_index in splits:
        n_folds += 1
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        for name in result:
            result[name] += scorers[name](y_test, y_pred)

    if n_folds == 0:
        raise ValueError("`splits` produced no folds; nothing to average")

    # Iterate over the dict keys (not `metrics`) so that a name listed twice is
    # still divided exactly once.
    for name in result:
        result[name] /= n_folds
    return result

In [4]:
# Load the training table. Column 0 is an index column that pandas reads as
# ordinary data, so it is dropped; the last column is the regression target.
data = pd.read_csv("../data/train_data_2nd.csv")
X, y = data.iloc[:, 1:-1], data.iloc[:, -1]

In [5]:
# Keep the column labels so that coefficients can be printed by name in later cells.
feature_names = X.columns
feature_names

Index(['Area (m2)', 'Property Type', 'Bedrooms', 'Bathrooms', 'Address',
       'Law Document', 'Quarter', 'Year', 'Latitude', 'Longitude',
       'Postal Code', 'Importance', 'Place Rank', 'City'],
      dtype='object')

In [6]:
# One shared splitter so every model below is scored on the same folds.
n_folds = 5
kfold = KFold(n_splits=n_folds)

# OLS Linear Regression

In [7]:
# Baseline: ordinary least squares on the raw features, scored on the shared folds.
lr = LinearRegression()
cv_results = cross_val_metrics_calculate(model=lr, X=X, y=y, splits=kfold.split(X))
print(cv_results)

{'mse': 19016.64659590109, 'rmse': 79.90874796632568, 'mae': 8.972131741802396, 'mape': 3.2385168596430063, 'medae': 4.545275997365727, 'medape': 0.7432247988298927}


In [6]:
# Check feature importance through coefficients:
# refit on the full training set, then list each feature with its raw coefficient.
lr.fit(X, y)
print("Coefficients of each feature:")
for name, coef in zip(lr.feature_names_in_, lr.coef_):
    print("%s: %.10f" % (name, coef))

Coefficients of each feature:
Area (m2): 0.0000003972
Property Type: 1.8701870470
Bedrooms: 1.2339251888
Bathrooms: 0.5317770477
Address: -0.0000345431
Law Document: -0.7078823375
Quarter: 0.8369317418
Year: 2.2741343499
Latitude: -0.6181603787
Longitude: -0.0783127077
Postal Code: -0.0000539938
Importance: 5.2315554653
Place Rank: 0.6088397524
City: 6.0078190460


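- City has a high coefficient because it takes only two values (0 and 1, for HN and HCMC), so it should be ignored when ranking features by raw coefficient
- Other features with large coefficients: Year, Property Type and number of bedrooms; Importance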

**With standardize**

In [8]:
# Same estimator behind a StandardScaler. The pipeline is built around the
# existing `lr` object, so fitting the pipeline refits `lr` itself.
lr_with_standardize = make_pipeline(StandardScaler(), lr)
cv_results = cross_val_metrics_calculate(model=lr_with_standardize, X=X, y=y, splits=kfold.split(X))
print(cv_results)

{'mse': 19016.646595901762, 'rmse': 79.90874796632674, 'mae': 8.972131741802269, 'mape': 3.238516859642897, 'medae': 4.545275997365263, 'medape': 0.7432247988298}


In [8]:
# Fit the scaled pipeline on all rows; `lr` is its final step, so `lr.coef_`
# now holds the coefficients learned on standardized features.
lr_with_standardize.fit(X, y)
print("Coefficients of each feature:")
for name, coef in zip(feature_names, lr.coef_):
    print("%s: %.10f" % (name, coef))

Coefficients of each feature:
Area (m2): 0.0233702667
Property Type: 2.6056100965
Bedrooms: 3.4921102582
Bathrooms: 1.3887746780
Address: -0.0672583797
Law Document: -0.9201180132
Quarter: 0.8619879178
Year: 3.4003806859
Latitude: -2.9731446239
Longitude: -0.4629818853
Postal Code: -1.5270823366
Importance: 0.2591036951
Place Rank: 1.3504607094
City: 2.5645609597


- Latitude has more importance to price, due to dataset consisting of real estates at HN and HCMC which have noticeably different latitudes
- with standardized data, area has more contribution to price

- No significant difference in metrics whether data is standardized or not
- RMSE (≈ 79.9) and MAE (≈ 8.97) show huge average errors, and MAPE agrees: sklearn reports MAPE as a fraction, so 3.24 means ≈ 324% average error, not ≈ 3%
- Time (year and quarter), property type, location features (city, latitude, postal code) and bedroom num. have high impact on price

# Ridge Regression

In [15]:
# Select parameters
ridge_cv = RidgeCV(alphas=[0.1, 0.01, 0.001, 0.005, 0.05, 0.5, 0.0025, 0.025, 0.25, 1, 2.5, 5, 10],
                   scoring='neg_mean_absolute_error',
                   cv=5)

ridge_cv.fit(X, y)
ridge_cv.alpha_

10.0

In [16]:
# Repeat the alpha search on standardized inputs. The pipeline is built around the
# same `ridge_cv` object, so fitting it refits `ridge_cv` and `ridge_cv.alpha_`
# below reports the alpha chosen by this fit.
ridge_cv_standardize = make_pipeline(StandardScaler(), ridge_cv)

ridge_cv_standardize.fit(X,y)
ridge_cv.alpha_

10.0

In [9]:
# Ridge on the raw features with the alpha chosen by the search above.
ridge = Ridge(alpha=10)
cv_results = cross_val_metrics_calculate(model=ridge, X=X, y=y, splits=kfold.split(X))
print(cv_results)

{'mse': 19013.768244523155, 'rmse': 79.9040088979126, 'mae': 8.971763497115978, 'mape': 3.238188158926131, 'medae': 4.546524884464401, 'medape': 0.7445116553600659}


In [10]:
# Refit on the full training set and list each feature with its ridge coefficient.
ridge.fit(X, y)
print("Coefficients of each feature:")
for name, coef in zip(feature_names, ridge.coef_):
    print("%s: %.10f" % (name, coef))

Coefficients of each feature:
Area (m2): 0.0000004139
Property Type: 1.8714108956
Bedrooms: 1.2342593413
Bathrooms: 0.5318908765
Address: -0.0000342230
Law Document: -0.7092743496
Quarter: 0.8362379541
Year: 2.2739671471
Latitude: -0.6159401868
Longitude: -0.0780961782
Postal Code: -0.0000541674
Importance: 4.3518363981
Place Rank: 0.5981853895
City: 5.9664079953


**With standardize**

In [10]:
# A separate Ridge instance for the scaled pipeline, so `ridge` from the raw run is left untouched.
ridge_s = Ridge(alpha=10)
ridge_with_standardize = make_pipeline(StandardScaler(), ridge_s)
cv_results = cross_val_metrics_calculate(model=ridge_with_standardize, X=X, y=y, splits=kfold.split(X))
print(cv_results)

{'mse': 19001.71609043715, 'rmse': 79.88420721202944, 'mae': 8.971390695121217, 'mape': 3.2386693593554683, 'medae': 4.543566892223657, 'medape': 0.7434995342679571}


In [12]:
# Fit the scaled pipeline on all rows; `ridge_s` is its final step and carries the coefficients.
ridge_with_standardize.fit(X, y)
print("Coefficients of each feature:")
for name, coef in zip(feature_names, ridge_s.coef_):
    print("%s: %.10f" % (name, coef))

Coefficients of each feature:
Area (m2): 0.0233852824
Property Type: 2.6047439077
Bedrooms: 3.4901717669
Bathrooms: 1.3906924428
Address: -0.0671420499
Law Document: -0.9207184217
Quarter: 0.8612884301
Year: 3.3987672259
Latitude: -2.9493007493
Longitude: -0.4594834600
Postal Code: -1.5101863830
Importance: 0.2590759337
Place Rank: 1.3471811709
City: 2.5578403527


- Ridge gives almost identical results (both coefficients and metrics) to OLS linear regression

# Lasso Regression

In [21]:
# Select parameters
lasso_cv = LassoCV(n_alphas=100,
                   alphas=[0.1, 0.01, 0.001, 0.005, 0.05, 0.5, 0.0025, 0.025, 0.25, 1, 2.5, 5, 10],
                   cv=None)

lasso_cv.fit(X, y)
lasso_cv.alpha_

0.01

In [22]:
# Repeat the alpha search on standardized inputs. The pipeline is built around the
# same `lasso_cv` object, so fitting it refits `lasso_cv` and `lasso_cv.alpha_`
# below reports the alpha chosen by this fit.
lasso_cv_standardize = make_pipeline(StandardScaler(), lasso_cv)

lasso_cv_standardize.fit(X,y)
lasso_cv.alpha_

0.025

In [11]:
# Lasso on the raw features with the alpha chosen by the unscaled search.
lasso = Lasso(alpha=0.01)
cv_results = cross_val_metrics_calculate(model=lasso, X=X, y=y, splits=kfold.split(X))
print(cv_results)

{'mse': 19008.312878635214, 'rmse': 79.8956963196177, 'mae': 8.97105879884, 'mape': 3.238413514180036, 'medae': 4.539586791895326, 'medape': 0.7466637228201058}


In [24]:
# Refit on the full training set and list each feature with its lasso coefficient.
lasso.fit(X, y)
print("Coefficients of each feature:")
for name, coef in zip(feature_names, lasso.coef_):
    print("%s: %.10f" % (name, coef))

Coefficients of each feature:
Area (m2): 0.0000004959
Property Type: 1.8724589291
Bedrooms: 1.2357145027
Bathrooms: 0.5317423261
Address: -0.0000321686
Law Document: -0.7088079211
Quarter: 0.8247430152
Year: 2.2695205031
Latitude: -0.6021371304
Longitude: -0.0762069336
Postal Code: -0.0000542255
Importance: 0.0000000000
Place Rank: 0.5415563853
City: 5.7993527302


**With standardize**

In [12]:
# A separate Lasso instance for the scaled pipeline, using the alpha chosen on standardized inputs.
lasso_s = Lasso(alpha=0.025)
lasso_with_standardize = make_pipeline(StandardScaler(), lasso_s)
cv_results = cross_val_metrics_calculate(model=lasso_with_standardize, X=X, y=y, splits=kfold.split(X))
print(cv_results)

{'mse': 18534.27078426646, 'rmse': 79.11256183363977, 'mae': 8.954101480570717, 'mape': 3.2442162989717884, 'medae': 4.535761514307711, 'medape': 0.7474921237681549}


- MSE: Ridge and OLS (both with and without standardization) and Lasso without standardization ~ 19000; Lasso with standardization ~ 18500
- Small change in the other metrics for standardized Lasso: MAE better, MAPE worse

In [14]:
# Fit the scaled pipeline on all rows; `lasso_s` is its final step and carries the coefficients.
lasso_with_standardize.fit(X, y)
print("Coefficients of each feature:")
for name, coef in zip(feature_names, lasso_s.coef_):
    print("%s: %.10f" % (name, coef))

Coefficients of each feature:
Area (m2): 0.0000000000
Property Type: 2.5808152311
Bedrooms: 3.4865549563
Bathrooms: 1.3939165325
Address: -0.0398882575
Law Document: -0.9192615343
Quarter: 0.8228822271
Year: 3.3601097165
Latitude: -2.2014596042
Longitude: -0.3302537720
Postal Code: -0.9519989211
Importance: 0.2195174479
Place Rank: 1.2114204117
City: 2.3779500840


# Conclusion
- Lasso with data standardization performs the best, although not too different from others
- Selecting a separate regularization coefficient for the standardized data gives better results; Ridge stays the same but Lasso is noticeably better
- Ridge and Lasso do not really improve performance
- Pattern in feature coefficients:
    + property type, bedroom num., post year, and regional features (city and latitude+longitude) contribute the most; especially city and latitude, signaling a noticeable price difference in real estates at HN and HCMC
    + area has small value of coefficient, only noticeable after standardization. This may be due to areas having large values while prices measured in billion VND are smaller

In [15]:
# Save a version of each model fitted on the training set for comparison on the testing set
# Use the models with the standard scaler (better results)
import pickle

lr_with_standardize.fit(X, y)
ridge_with_standardize.fit(X, y)
lasso_with_standardize.fit(X, y)

# The files are pickles despite the ".h5" extension; the paths are kept as-is so
# existing loaders keep working. `with` closes each file, so the pickle is fully
# flushed to disk before the cell finishes.
for model_path, fitted_model in (
    ("../models/LinearRegression.h5", lr_with_standardize),
    ("../models/Ridge.h5", ridge_with_standardize),
    ("../models/Lasso.h5", lasso_with_standardize),
):
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)