In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive; the
# DIRECTORY path used by the rest of the notebook lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Base folder on the mounted Drive; the "data/..." and "models/..." paths used
# in the cells below are joined onto it with os.path.join.
DIRECTORY = "/content/drive/MyDrive/Colab Notebooks/Data Science"

In [7]:
import os
import pandas as pd

# Read the test split from Drive.
test_csv_path = os.path.join(DIRECTORY, "data/test_data_2nd.csv")
data_test = pd.read_csv(test_csv_path)

# Features are every column except the first and the last; the last column is
# the regression target.
X, y = data_test.iloc[:, 1:-1], data_test.iloc[:, -1]

In [5]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import pickle

def _load_pickled_model(relative_path):
    """Unpickle one estimator stored under DIRECTORY and return it.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the estimator has been read, instead of being left open.

    Parameters
    ----------
    relative_path : str
        Path of the pickle file relative to DIRECTORY, e.g. "models/Ridge.h5".
        Note that these ".h5" files are plain pickles, not HDF5.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    SECURITY: pickle.load can execute arbitrary code contained in the file.
    Only load model files that were produced by this project.
    """
    with open(os.path.join(DIRECTORY, relative_path), 'rb') as model_file:
        return pickle.load(model_file)


# Linear models
lr = _load_pickled_model("models/LinearRegression.h5")
ridge = _load_pickled_model("models/Ridge.h5")
lasso = _load_pickled_model("models/Lasso.h5")

# SVM models
linear_svm = _load_pickled_model("models/LinearSVM.h5")
svm = _load_pickled_model("models/SVM.h5")

# Tree ensembles
gb = _load_pickled_model("models/GradientBoosting.h5")
rf = _load_pickled_model("models/RandomForest.h5")

# XGBoost is restored from its own JSON format rather than from a pickle.
xgb = XGBRegressor()
xgb.load_model(os.path.join(DIRECTORY, "models/XGBoostRegressor.json"))
# Set the device parameter to CPU (presumably the model was trained on a GPU
# runtime -- TODO confirm).
xgb.set_params(device='cpu')


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Run every loaded model on the current feature matrix X. Predictions are
# grouped by model family and computed in the same order as before.

# Linear models
y_lr, y_ridge, y_lasso = (model.predict(X) for model in (lr, ridge, lasso))

# SVM models
y_linear_svm, y_svm = (model.predict(X) for model in (linear_svm, svm))

# Tree ensembles
y_gb, y_rf = (model.predict(X) for model in (gb, rf))

# XGBoost
y_xgb = xgb.predict(X)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error

# Metric name -> scoring function; every function is called as f(y_true, y_pred).
metrics_dict = {
    'mae': mean_absolute_error,
    'mape': mean_absolute_percentage_error,
    'rmse': root_mean_squared_error,
}

# Model label -> predictions, listed in the order the rows are printed.
predictions = {
    'lr': y_lr,
    'ridge': y_ridge,
    'lasso': y_lasso,
    'svm': y_svm,
    'linearsvm': y_linear_svm,
    'gb': y_gb,
    'rf': y_rf,
    'xgb': y_xgb,
}

# losses[model][metric] = metric evaluated on (y, predictions of that model)
losses = {
    model: {loss: metrics_dict[loss](y, y_pred) for loss in ['mae', 'mape', 'rmse']}
    for model, y_pred in predictions.items()
}

# One (model, metrics) tuple per line.
for item in losses.items():
    print(item)

('lr', {'mae': 8.3599254761376, 'mape': 3.111050682877996, 'rmse': 34.38298487668231})
('ridge', {'mae': 8.35959132556248, 'mape': 3.1110874296026636, 'rmse': 34.39549202938774})
('lasso', {'mae': 8.036135455047038, 'mape': 2.992216267557128, 'rmse': 22.339548303823815})
('svm', {'mae': 7.36885184398889, 'mape': 1.6401658399062982, 'rmse': 23.551251229815872})
('linearsvm', {'mae': 7.178288239382752, 'mape': 1.6719883891728784, 'rmse': 45.675751632075894})
('gb', {'mae': 5.81645647902222, 'mape': 1.714644147340918, 'rmse': 21.92017820481343})
('rf', {'mae': 5.198979468589072, 'mape': 2.6571721376149644, 'rmse': 18.281289348584238})
('xgb', {'mae': 4.940948400261332, 'mape': 2.05431800619016, 'rmse': 19.92407339432346})


**26/11:**
- Lasso looks good
- SVM (with RBF kernel) > Linear SVM
- SVM has better MAE & MAPE, but Lasso has slightly better RMSE; should not be a problem

**1/12:**
- Added Random Forest, Gradient Boosting and XGBoost's GB
- XGBoost Gradient Boosting: 2.05 MAPE, 4.94 MAE -> best MAE of all models (SVM still has the lowest MAPE at 1.64, Random Forest the lowest RMSE at 18.28) -> should use for demo if needed

In [8]:
# Refit XGBoost on the test rows plus the training rows.
#
# The combined frames are rebuilt from `data_test` instead of from the current
# `X` / `y`, so re-running this cell no longer appends the training rows a
# second (third, ...) time.
#
# NOTE: `X` and `y` are rebound to the combined data because the cells below
# refit on them. After this cell has run they no longer hold the test split
# alone, so re-running the evaluation cells above would score the models on
# data that includes the training rows.
data_train = pd.read_csv(os.path.join(DIRECTORY, "data/train_data_2nd.csv"))

# Same column slicing as for the test split: features are every column except
# the first and the last, the target is the last column. Test rows come first.
X = pd.concat([data_test.iloc[:, 1:-1], data_train.iloc[:, 1:-1]])
y = pd.concat([data_test.iloc[:, -1], data_train.iloc[:, -1]])
xgb.fit(X, y)

In [11]:
# Refit the SVM and random-forest models on the current X / y (the
# concatenated test + train rows once the cell above has been run).
svm.fit(X, y)
rf.fit(X, y)

In [12]:
# Show the feature importances of the refitted random forest (one value per
# feature; presumably in the column order of X -- compare with feature_names_in_).
rf.feature_importances_

array([0.3064747 , 0.05938746, 0.06461407, 0.0622189 , 0.08788003,
       0.03485235, 0.03932847, 0.04769204, 0.08683863, 0.09187964,
       0.06600922, 0.02488651, 0.01772129, 0.01021669])

In [9]:
# Show the feature importances of the refitted XGBoost model (one value per
# feature; presumably in the order given by xgb.feature_names_in_).
xgb.feature_importances_

array([0.04859125, 0.06785666, 0.0277437 , 0.02498208, 0.02790911,
       0.03167776, 0.02835268, 0.03060718, 0.05101044, 0.05988442,
       0.04876573, 0.0229215 , 0.2960473 , 0.23365015], dtype=float32)

In [10]:
# Show the feature names stored on the fitted XGBoost model; useful for
# labelling the importance arrays displayed in this notebook.
xgb.feature_names_in_

array(['Area (m2)', 'Property Type', 'Bedrooms', 'Bathrooms', 'Address',
       'Law Document', 'Quarter', 'Year', 'Latitude', 'Longitude',
       'Postal Code', 'Importance', 'Place Rank', 'City'], dtype='<U13')

In [13]:
# Save the refitted XGBoost model as JSON under a new "_final" file name, so
# the originally loaded models/XGBoostRegressor.json is not overwritten.
xgb.save_model(os.path.join(DIRECTORY, "models/XGBoostRegressor_final.json"))

In [14]:
# Persist the refitted random forest and SVM with pickle under "_final" file
# names. Each file is written inside a `with` block so it is flushed and closed
# deterministically instead of leaving the handle to be cleaned up later.
# Note: despite the ".h5" extension these files are plain pickles, not HDF5.
with open(os.path.join(DIRECTORY, "models/RandomForest_final.h5"), 'wb') as rf_file:
    pickle.dump(rf, rf_file)
with open(os.path.join(DIRECTORY, "models/SVM_final.h5"), 'wb') as svm_file:
    pickle.dump(svm, svm_file)