# Prediction for houses and appartments with different regressions models

In [8]:
from sklearn.pipeline import Pipeline
from preprocessing import trainTestClean
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Basic sklearn models
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [9]:
df_train, df_test = trainTestClean()

X_train = df_train.drop(columns=['price', 'id'])
y_train = df_train['price']
X_test = df_test.drop(columns=['price', 'id'])
y_test = df_test['price']

X_test = X_test[X_train.columns]



In [None]:
x_scaler = StandardScaler()
X_train = x_scaler.fit_transform(X_train)
X_test = x_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test = y_scaler.transform(y_test.values.reshape(-1, 1))

In [10]:
# select multiple models

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(alpha=0.1),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'ElasticNet': ElasticNet(random_state=0),
    'XGBoost': xgb.XGBRegressor(n_estimators=2000, random_state=42, learning_rate=0.1),
    'XGBoostElsa': xgb.XGBRegressor(n_estimators=2000, random_state=42, learning_rate=0.05, subsample= 0.8),
    'XGBoostAlex': xgb.XGBRegressor(n_estimators=2500, random_state=42, learning_rate=0.08, subsample= 0.8),
    'XGBoostAlex2': xgb.XGBRegressor(n_estimators=2500, random_state=42, learning_rate=0.08),
    'XGBoostGridCV': xgb.XGBRegressor(n_estimators=3000, model__max_depth=7,random_state=42, learning_rate=0.01, subsample= 0.8),
    'XGBoostBrutForce': xgb.XGBRegressor(colsample_bylevel=0.9289879319689553, colsample_bytree=0.7245003417617129, learning_rate=0.05183941032332593, max_depth=9, n_estimators=2496, reg_alpha=1.9905053073241674, reg_lambda=0.05061583846218687, subsample=0.7482424154252496),
    'SVR': SVR(kernel='rbf', C=1.0, epsilon=0.2),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'LightGBM': lgb.LGBMRegressor(random_state=42),
    'CatBoost': CatBoostRegressor(random_state=42, silent=True),
    'Ridge': Ridge(alpha=1.0),
    'MLP': MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
}



In [11]:
results = {}
best_mae = float('inf')
best_model_name = ''
best_pipeline = Pipeline([])

for name, model in models.items():
    pipeline = Pipeline([
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)

    preds = pipeline.predict(X_test)

    preds_train = pipeline.predict(X_train)
    preds_test = pipeline.predict(X_test)

    #preds_train = y_scaler.inverse_transform(preds_train.reshape(-1, 1))
    #preds_test = y_scaler.inverse_transform(preds_test.reshape(-1, 1))
    #y_train = y_scaler.inverse_transform(y_train.reshape(-1, 1))
    #y_test = y_scaler.inverse_transform(y_test.reshape(-1, 1))

    r2_train = r2_score(y_train, preds_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, preds_train))
    mae_train = mean_absolute_error(y_train, preds_train)
    mse_train = mean_squared_error(y_train, preds_train)

    print(f"{name} - Train : R2 = {r2_train:.4f}, MAE = {mae_train:.4f}, MSE = {mse_train:.4f}, RMSE = {rmse_train:.4f}")

    r2_test = r2_score(y_test, preds_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, preds_test))
    mae_test = mean_absolute_error(y_test, preds_test)
    mse_test = mean_squared_error(y_test, preds_test)

    print(f"{name} - Test : R2 = {r2_test:.4f}, MAE = {mae_test:.4f}, MSE = {mse_test:.4f}, RMSE = {rmse_test:.4f}")

    results[name] = mae_test

    if mae_test < best_mae:
        best_mae = mae_test
        best_mse = mse_test
        best_r2 = r2_test
        best_model_name = name
        best_pipeline = pipeline
        best_model = model

print("Models results :")
for model_name, mae in results.items():
    print(f"{model_name} : MAE = {mae:.4f}")

print(f"\n -> Best Model : {best_model_name} with MAE = {best_mae:.4f} and MSE = {best_mse:.4f}; r2 = {best_r2:.4f}")

LinearRegression - Train : R2 = 0.6867, MAE = 71720.5729, MSE = 10822262051.7261, RMSE = 104030.1017
LinearRegression - Test : R2 = 0.6889, MAE = 72497.5984, MSE = 10806612087.9954, RMSE = 103954.8560


  model = cd_fast.enet_coordinate_descent(


Lasso - Train : R2 = 0.6867, MAE = 71703.3783, MSE = 10822570642.6641, RMSE = 104031.5848
Lasso - Test : R2 = 0.6890, MAE = 72460.1801, MSE = 10802712202.6497, RMSE = 103936.0967
DecisionTree - Train : R2 = 1.0000, MAE = 0.0000, MSE = 0.0000, RMSE = 0.0000
DecisionTree - Test : R2 = 0.9919, MAE = 4879.7646, MSE = 281147297.0871, RMSE = 16767.4475
RandomForest - Train : R2 = 0.9997, MAE = 766.8673, MSE = 8666948.0418, RMSE = 2943.9681
RandomForest - Test : R2 = 0.9983, MAE = 1876.8360, MSE = 60179974.8425, RMSE = 7757.5753


  model = cd_fast.enet_coordinate_descent(


ElasticNet - Train : R2 = 0.5608, MAE = 83818.2875, MSE = 15171019840.6093, RMSE = 123170.6939
ElasticNet - Test : R2 = 0.5584, MAE = 84558.8567, MSE = 15339566815.5196, RMSE = 123853.0049
XGBoost - Train : R2 = 0.9999, MAE = 1097.3945, MSE = 2345429.7508, RMSE = 1531.4796
XGBoost - Test : R2 = 0.9967, MAE = 3447.7501, MSE = 114130985.4916, RMSE = 10683.2104
XGBoostElsa - Train : R2 = 0.9999, MAE = 1621.8871, MSE = 4760763.7889, RMSE = 2181.9175
XGBoostElsa - Test : R2 = 0.9970, MAE = 3512.3140, MSE = 103262827.2953, RMSE = 10161.8319
XGBoostAlex - Train : R2 = 0.9999, MAE = 986.9748, MSE = 1788913.3623, RMSE = 1337.5027
XGBoostAlex - Test : R2 = 0.9965, MAE = 3557.0809, MSE = 122877792.9664, RMSE = 11085.0256
XGBoostAlex2 - Train : R2 = 0.9999, MAE = 1042.5075, MSE = 2170148.1252, RMSE = 1473.1423
XGBoostAlex2 - Test : R2 = 0.9970, MAE = 3343.4120, MSE = 104378981.3017, RMSE = 10216.6032


Parameters: { "model__max_depth" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoostGridCV - Train : R2 = 0.9995, MAE = 2734.2174, MSE = 15987454.7181, RMSE = 3998.4315
XGBoostGridCV - Test : R2 = 0.9970, MAE = 3846.1223, MSE = 105533429.2667, RMSE = 10272.9465
XGBoostBrutForce - Train : R2 = 1.0000, MAE = 312.5983, MSE = 201012.6662, RMSE = 448.3444
XGBoostBrutForce - Test : R2 = 0.9925, MAE = 9435.2359, MSE = 260505747.6874, RMSE = 16140.1904
SVR - Train : R2 = -0.0694, MAE = 137352.6645, MSE = 36941441552.4349, RMSE = 192201.5649
SVR - Test : R2 = -0.0649, MAE = 137813.2190, MSE = 36991505135.6860, RMSE = 192331.7580
GradientBoosting - Train : R2 = 0.9925, MAE = 10385.3162, MSE = 259162138.9572, RMSE = 16098.5136
GradientBoosting - Test : R2 = 0.9916, MAE = 10881.9580, MSE = 290808701.8479, RMSE = 17053.1141
AdaBoost - Train : R2 = 0.7131, MAE = 86154.7274, MSE = 9909749903.2223, RMSE = 99547.7268
AdaBoost - Test : R2 = 0.7084, MAE = 87251.9402, MSE = 10127823804.2742, RMSE = 100637.0896
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead 