In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import root_mean_squared_error,root_mean_squared_log_error,mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor,RandomForestRegressor
from sklearn.svm import SVR

In [2]:
df = pd.read_csv("formatted_data.csv")

# Lower/upper reference values for each numeric column, named once so the IQR
# and both fences are always derived from the same numbers.
# NOTE(review): these look like Q1/Q3 copied from an earlier df.describe() --
# confirm against the data. The size IQR previously used 43 while the lower
# fence used 43.3; 43.3 is assumed to be the intended Q1 and is now used for both.
SIZE_Q1, SIZE_Q3 = 43.3, 74
PRICE_Q1, PRICE_Q3 = 729000, 1330000
RENT_Q1, RENT_Q3 = 612, 1090

# Fences sit 1.5 * IQR outside the two reference values.
IQR_SIZE = SIZE_Q3 - SIZE_Q1
size_outliers_upper = SIZE_Q3 + 1.5*IQR_SIZE
size_outliers_lower = SIZE_Q1 - 1.5*IQR_SIZE

IQR_PRICE = PRICE_Q3 - PRICE_Q1
price_outliers_upper = PRICE_Q3 + 1.5*IQR_PRICE
price_outliers_lower = PRICE_Q1 - 1.5*IQR_PRICE

IQR_RENT = RENT_Q3 - RENT_Q1
rent_outliers_upper = RENT_Q3 + 1.5*IQR_RENT
rent_outliers_lower = RENT_Q1 - 1.5*IQR_RENT

# deleting outliers to check their importance to the models
# w_o meaning without outliers
# A row is kept only when 'rozmiar', 'czynsz' and 'cena' all lie inside their
# fences (Series.between is inclusive on both ends by default).
df_w_o = df[(df['rozmiar'].between(size_outliers_lower,size_outliers_upper))
        & (df['czynsz'].between(rent_outliers_lower,rent_outliers_upper))
        & (df['cena'].between(price_outliers_lower,price_outliers_upper)) ]

In [3]:
# Drop the 'ulica' column from each frame, then run pd.get_dummies with int
# indicator columns. The full frame is processed first, the outlier-free
# frame second.
dummies_df, dummies_df_w_o = (
    pd.get_dummies(frame.drop('ulica', axis=1), dtype=int)
    for frame in (df, df_w_o)
)

In [4]:
# --- data with outliers -------------------------------------------------
# 'cena' is the regression target; every other column is a feature.
X = dummies_df.drop('cena',axis=1)
y = dummies_df['cena']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

# --- data without outliers ----------------------------------------------
X_W_O = dummies_df_w_o.drop('cena',axis=1)
y_w_o = dummies_df_w_o['cena']

X_train_w_o, X_test_w_o, y_train_w_o, y_test_w_o = train_test_split(X_W_O, y_w_o, test_size=0.2, random_state=42)

# A separate scaler instance is used here: re-fitting `scaler` would overwrite
# the statistics learned above, leaving it inconsistent with X_train_trans /
# X_test_trans for any later use.
scaler_w_o = StandardScaler()
X_train_trans_w_o = scaler_w_o.fit_transform(X_train_w_o)
X_test_trans_w_o = scaler_w_o.transform(X_test_w_o)

In [5]:
# Result tables: one independent, initially empty list per metric.
# getModelWithBestParams appends one formatted score line to each of them.
RMSE, RMSLE, MAPE = [], [], []

In [6]:
def getModelWithBestParams(model,x_train,x_test,y_train,y_test,data,param_grid,
                           scoring='neg_root_mean_squared_log_error',cv=5):
    """Grid-search `model`, score the refit search on the test split and log the scores.

    Parameters
    ----------
    model : estimator handed to GridSearchCV.
    x_train, y_train : data the grid search is fitted on.
    x_test, y_test : data the fitted search is evaluated on.
    data : str, label describing the data variant; embedded in the logged lines.
    param_grid : dict, hyper-parameter grid handed to GridSearchCV.
    scoring : scorer used to rank candidates (default keeps the original
        'neg_root_mean_squared_log_error').
    cv : cross-validation setting handed to GridSearchCV (default 5).

    Returns
    -------
    The fitted GridSearchCV object.

    Side effects
    ------------
    Appends exactly one formatted line to each of the module-level lists
    RMSE, RMSLE and MAPE.
    """
    grid_model = GridSearchCV(model,param_grid,cv=cv,scoring=scoring)
    grid_model.fit(x_train,y_train)
    y_pred = grid_model.predict(x_test)

    # Evaluate every metric BEFORE touching the result tables, so a failing
    # metric can never leave RMSE / RMSLE / MAPE with different lengths.
    rmse = root_mean_squared_error(y_test,y_pred)
    try:
        rmsle = root_mean_squared_log_error(y_test,y_pred)
    except ValueError as err:
        # RMSE above already accepted the same inputs, so the expected cause is
        # the log-error metric rejecting negative values (e.g. a model that
        # predicts below zero). Record NaN instead of discarding the fitted
        # search after an expensive fit.
        warnings.warn(f'RMSLE could not be computed on {data}: {err}', stacklevel=2)
        rmsle = float('nan')
    mape = mean_absolute_percentage_error(y_test,y_pred)

    best = f'{grid_model.best_estimator_}'
    RMSE.append(f'{best} RMSE score on {data}: {rmse}')
    RMSLE.append(f'{best} RMSLE score on {data}: {rmsle}')
    MAPE.append(f'{best} MAPE score on {data}: {mape}')
    return grid_model

In [7]:
# Seeded with the same value as train_test_split so a Restart & Run All
# reproduces the same forest; unseeded, differences between the four runs
# below could be sampling noise rather than an effect of the data variant.
rf = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators':[100,150,200],'criterion':['squared_error','friedman_mse'],'max_features':[0.33],'max_depth':[5,10,15]}


# One grid search per data variant; each call also appends to RMSE/RMSLE/MAPE.
rf_1 = getModelWithBestParams(rf,X_train,X_test,y_train,y_test,"unscaled data with outliers",param_grid)
rf_2 = getModelWithBestParams(rf,X_train_trans,X_test_trans,y_train,y_test,"scaled data with outliers",param_grid)
rf_3 = getModelWithBestParams(rf,X_train_w_o,X_test_w_o,y_train_w_o,y_test_w_o,"unscaled data without outliers",param_grid)
rf_4 = getModelWithBestParams(rf,X_train_trans_w_o,X_test_trans_w_o,y_train_w_o,y_test_w_o,"scaled data without outliers",param_grid)

In [8]:
# Both the base tree (max_features=0.33 draws a random feature subset) and the
# boosting procedure are stochastic; seed them with the same value used for
# train_test_split so the searches are reproducible.
dtForAdaBoost = DecisionTreeRegressor(max_depth=15,max_features=0.33,random_state=42)
ab = AdaBoostRegressor(dtForAdaBoost,random_state=42)
param_grid = {'n_estimators':[50,75,100],'learning_rate':[0.001,0.01],'loss':['square','exponential']}

# One grid search per data variant; each call also appends to RMSE/RMSLE/MAPE.
ab_1 = getModelWithBestParams(ab,X_train,X_test,y_train,y_test,"unscaled data with outliers",param_grid)
ab_2 = getModelWithBestParams(ab,X_train_trans,X_test_trans,y_train,y_test,"scaled data with outliers",param_grid)
ab_3 = getModelWithBestParams(ab,X_train_w_o,X_test_w_o,y_train_w_o,y_test_w_o,"unscaled data without outliers",param_grid)
ab_4 = getModelWithBestParams(ab,X_train_trans_w_o,X_test_trans_w_o,y_train_w_o,y_test_w_o,"scaled data without outliers",param_grid)

In [9]:
# Seeded with the same value as train_test_split so the four searches below
# are reproducible across kernel restarts.
gb = GradientBoostingRegressor(random_state=42)
param_grid = {'n_estimators':[200,300],'learning_rate':[0.01,0.001],'loss':['squared_error'],'max_depth':[5,10,15]}

# One grid search per data variant; each call also appends to RMSE/RMSLE/MAPE.
gb_1 = getModelWithBestParams(gb,X_train,X_test,y_train,y_test,"unscaled data with outliers",param_grid)
gb_2 = getModelWithBestParams(gb,X_train_trans,X_test_trans,y_train,y_test,"scaled data with outliers",param_grid)
gb_3 = getModelWithBestParams(gb,X_train_w_o,X_test_w_o,y_train_w_o,y_test_w_o,"unscaled data without outliers",param_grid)
gb_4 = getModelWithBestParams(gb,X_train_trans_w_o,X_test_trans_w_o,y_train_w_o,y_test_w_o,"scaled data without outliers",param_grid)

In [10]:
svr = SVR()
# RBF-kernel grid: three C values, both gamma heuristics, two epsilon widths.
param_grid = dict(
    kernel=['rbf'],
    C=[5_000_000, 10_000_000, 50_000_000],
    gamma=['scale', 'auto'],
    epsilon=[5e-07, 1e-06],
)

# (x_train, x_test, y_train, y_test) for each of the four data variants.
raw_with_outliers = (X_train, X_test, y_train, y_test)
scaled_with_outliers = (X_train_trans, X_test_trans, y_train, y_test)
raw_without_outliers = (X_train_w_o, X_test_w_o, y_train_w_o, y_test_w_o)
scaled_without_outliers = (X_train_trans_w_o, X_test_trans_w_o, y_train_w_o, y_test_w_o)

# Searches run in the same order as before; each call also appends to RMSE/RMSLE/MAPE.
svr_1 = getModelWithBestParams(svr, *raw_with_outliers, "unscaled data with outliers", param_grid)
svr_2 = getModelWithBestParams(svr, *scaled_with_outliers, "scaled data with outliers", param_grid)
svr_3 = getModelWithBestParams(svr, *raw_without_outliers, "unscaled data without outliers", param_grid)
svr_4 = getModelWithBestParams(svr, *scaled_without_outliers, "scaled data without outliers", param_grid)

Traceback (most recent call last):
  File "c:\Users\Marcin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Marcin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Marcin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Marcin\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_param_validatio

In [14]:
# Best hyper-parameters of the four random-forest searches, one per line.
print(*(search.best_params_ for search in (rf_1, rf_2, rf_3, rf_4)), sep="\n")

{'criterion': 'friedman_mse', 'max_depth': 15, 'max_features': 0.33, 'n_estimators': 100}
{'criterion': 'squared_error', 'max_depth': 15, 'max_features': 0.33, 'n_estimators': 100}
{'criterion': 'squared_error', 'max_depth': 15, 'max_features': 0.33, 'n_estimators': 200}
{'criterion': 'friedman_mse', 'max_depth': 15, 'max_features': 0.33, 'n_estimators': 200}


In [15]:
# Best hyper-parameters of the four AdaBoost searches, one per line.
print(*(search.best_params_ for search in (ab_1, ab_2, ab_3, ab_4)), sep="\n")

{'learning_rate': 0.001, 'loss': 'square', 'n_estimators': 75}
{'learning_rate': 0.001, 'loss': 'square', 'n_estimators': 75}
{'learning_rate': 0.001, 'loss': 'exponential', 'n_estimators': 75}
{'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 100}


In [16]:
# Best hyper-parameters of the four gradient-boosting searches, one per line.
print(*(search.best_params_ for search in (gb_1, gb_2, gb_3, gb_4)), sep="\n")

{'learning_rate': 0.01, 'loss': 'squared_error', 'max_depth': 10, 'n_estimators': 300}
{'learning_rate': 0.01, 'loss': 'squared_error', 'max_depth': 10, 'n_estimators': 300}
{'learning_rate': 0.01, 'loss': 'squared_error', 'max_depth': 10, 'n_estimators': 300}
{'learning_rate': 0.01, 'loss': 'squared_error', 'max_depth': 10, 'n_estimators': 300}


In [17]:
# Best hyper-parameters of the four SVR searches, one per line.
print(*(search.best_params_ for search in (svr_1, svr_2, svr_3, svr_4)), sep="\n")

{'C': 50000000, 'epsilon': 1e-06, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 5000000, 'epsilon': 5e-07, 'gamma': 'auto', 'kernel': 'rbf'}
{'C': 50000000, 'epsilon': 1e-06, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 5000000, 'epsilon': 5e-07, 'gamma': 'auto', 'kernel': 'rbf'}
