In [None]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import catboost as cb
import xgboost as xgb

In [None]:
#Define random seed for result reproduction
random_seed = 42
np.random.seed(random_seed)

In [None]:
# Load the pickle file
pickle_path = r"C:\Users\pinte\OneDrive\Plocha\DAB\Semestr 2\Data X\Assignment\data_for_model.pck"

with open(pickle_path, 'rb') as f:
    data = pickle.load(f)

In [None]:
# target variable
target = 'price'  

# drop rows with missing target variable values
df = data.dropna(subset=[target])

# Identify and remove outliers
Q1 = df[target].quantile(0.25)
Q3 = df[target].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df[target] >= lower_bound) & (df[target] <= upper_bound)]

In [None]:
# separate features and target variable
X = df.drop(target, axis=1)
y = df[target]

# remove columns with any missing values
X = X.dropna(axis=1, how='any')

# one hot encode categorical variables
X = pd.get_dummies(X, drop_first=True)

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

Linear regression

In [None]:
# Modeling
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

y_pred = linear_regressor.predict(X_test)

rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred))
r2_lr = r2_score(y_test, y_pred)

print("Linear regression results")
print(f"RMSE: {rmse_lr}")
print(f"R^2: {r2_lr}")

Random forest

In [None]:
# initialize and train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=random_seed)
rf.fit(X_train, y_train)

# predictions for the test set
y_pred_rf = rf.predict(X_test)

# calculating r2 for random forest
r2_rf = r2_score(y_test, y_pred_rf)
print(f'Random Forest R²: {r2_rf:.4f}')

Optimalized random forest

In [None]:
# Inicializace RandomForestRegressor
rf_regressor = RandomForestRegressor(random_state=random_seed, n_jobs=-1)


param_grid_rf = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': [4,8,9],
    'min_samples_split': [5,10, 20],
    'bootstrap': [True, False]
}

# Inicializace GridSearchCV s menším počtem křížových validací (cv)
grid_search_rf = GridSearchCV(estimator=rf_regressor, param_grid=param_grid_rf, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

# Trénování modelu pomocí GridSearchCV
grid_search_rf.fit(X_train, y_train)

# Nejlepší parametry
best_params_rf = grid_search_rf.best_params_
print(f"Best params: {best_params_rf}")

# Nejlepší model
best_model_rf = grid_search_rf.best_estimator_

# Predikce na testovacích datech
predictions_rf = best_model_rf.predict(X_test)

# Hodnocení modelu
rmse_rf = np.sqrt(mean_squared_error(y_test, predictions_rf))
r2_rf = r2_score(y_test, predictions_rf)

print("Výsledky optimalizovaného modelu Random Forest")
print(f"RMSE: {rmse_rf}")
print(f"R^2: {r2_rf}")

In [None]:
# display attribute importances
importances = rf.feature_importances_
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# plotting the attributes importance
plt.figure(figsize=(10, 8))
plt.barh(feature_importances['Feature'], feature_importances['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()
plt.show()

feature_importances.head(10)

Xboost

In [31]:
xgb_regressor = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators=100, random_state=random_seed)
xgb_regressor.fit(X_train, y_train)
y_pred_xgb = xgb_regressor.predict(X_test)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("R2: {:.2f}".format(r2_xgb))

R2: 0.55


In [30]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import uniform, randint

# Custom XGBoost wrapper to include early stopping
class XGBRegressorEarlyStopping(xgb.XGBRegressor):
    def fit(self, X, y, eval_set=None, **kwargs):
        if eval_set is None:
            eval_set = [(X, y)]
        return super().fit(X, y, eval_set=eval_set, early_stopping_rounds=10, **kwargs)

# Pipeline with scaling and model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressorEarlyStopping(objective='reg:squarederror', random_state=random_seed))
])

# Define parameter distribution for randomized search
param_dist = {
    'xgb__n_estimators': randint(50, 200),
    'xgb__learning_rate': uniform(0.01, 0.3),
    'xgb__max_depth': randint(3, 10),
    'xgb__min_child_weight': randint(1, 6),
    'xgb__subsample': uniform(0.5, 0.5),
    'xgb__colsample_bytree': uniform(0.5, 0.5),
    'xgb__gamma': uniform(0, 0.4),
    'xgb__reg_alpha': uniform(0, 1),
    'xgb__reg_lambda': uniform(0.5, 2)
}

# Perform randomized search
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist, n_iter=50, 
                                   cv=3, n_jobs=-1, verbose=2, scoring='r2', random_state=random_seed)
random_search.fit(X_train, y_train)

# Best model
best_pipeline = random_search.best_estimator_

# Predictions and evaluation
y_pred_xgb = best_pipeline.predict(X_test)
r2_xgb = r2_score(y_test, y_pred_xgb)
print("Best R2: {:.2f}".format(r2_xgb))
print("Best Parameters: ", random_search.best_params_)


Fitting 3 folds for each of 50 candidates, totalling 150 fits
[0]	validation_0-rmse:1328.12588
[1]	validation_0-rmse:1269.90480
[2]	validation_0-rmse:1251.80689
[3]	validation_0-rmse:1192.72277
[4]	validation_0-rmse:1180.32766
[5]	validation_0-rmse:1127.50866
[6]	validation_0-rmse:1076.06514
[7]	validation_0-rmse:1067.90056
[8]	validation_0-rmse:1035.71203
[9]	validation_0-rmse:1026.70691
[10]	validation_0-rmse:1010.46477
[11]	validation_0-rmse:997.39238
[12]	validation_0-rmse:970.97151
[13]	validation_0-rmse:943.40821
[14]	validation_0-rmse:919.62997




[15]	validation_0-rmse:906.69478
[16]	validation_0-rmse:893.06106
[17]	validation_0-rmse:881.17177
[18]	validation_0-rmse:872.91598
[19]	validation_0-rmse:863.94514
[20]	validation_0-rmse:853.08570
[21]	validation_0-rmse:840.04872
[22]	validation_0-rmse:827.34475
[23]	validation_0-rmse:817.53552
[24]	validation_0-rmse:808.18107
[25]	validation_0-rmse:795.81586
[26]	validation_0-rmse:787.79372
[27]	validation_0-rmse:780.85252
[28]	validation_0-rmse:772.01386
[29]	validation_0-rmse:762.02501
[30]	validation_0-rmse:753.87830
[31]	validation_0-rmse:748.27043
[32]	validation_0-rmse:738.43150
[33]	validation_0-rmse:735.14489
[34]	validation_0-rmse:723.85931
[35]	validation_0-rmse:721.25439
[36]	validation_0-rmse:719.88047
[37]	validation_0-rmse:714.74254
[38]	validation_0-rmse:708.21823
[39]	validation_0-rmse:703.45544
[40]	validation_0-rmse:696.56297
[41]	validation_0-rmse:689.24097
[42]	validation_0-rmse:683.90692
[43]	validation_0-rmse:681.24745
[44]	validation_0-rmse:678.17877
[45]	valid

Catboost

In [29]:
import catboost as cb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


# Inicializace CatBoostRegressor
cb_regressor = cb.CatBoostRegressor(random_seed=42, silent=True)

# RandomizedSearch parameters setting
param_distributions_cb = {
    'iterations': [500, 1000, 1500],
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS'],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bylevel': [0.8, 0.9, 1.0]
}

# Inicializace RandomizedSearchCV
random_search_cb = RandomizedSearchCV(
    estimator=cb_regressor,
    param_distributions=param_distributions_cb,
    n_iter=50,  # Počet náhodných kombinací, které chceme vyzkoušet
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Trénování modelu pomocí RandomizedSearchCV
random_search_cb.fit(X_train, y_train)

# Nejlepší parametry
best_params_cb = random_search_cb.best_params_
print(f"Best params: {best_params_cb}")

# Nejlepší model
best_model_cb = random_search_cb.best_estimator_

# Predikce na testovacích datech
predictions_cb = best_model_cb.predict(X_test)

# Hodnocení modelu
rmse_cb = np.sqrt(mean_squared_error(y_test, predictions_cb))
r2_cb = r2_score(y_test, predictions_cb)

print("Výsledky optimalizovaného modelu CatBoost s vybranými atributy")
print(f"RMSE: {rmse_cb}")
print(f"R^2: {r2_cb}")

48 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\pinte\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\pinte\AppData\Local\Programs\Python\Python312\Lib\site-packages\catboost\core.py", line 5827, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  Fil

Best params: {'subsample': 0.9, 'learning_rate': 0.1, 'l2_leaf_reg': 7, 'iterations': 1500, 'depth': 8, 'colsample_bylevel': 0.8, 'bootstrap_type': 'MVS'}
Výsledky optimalizovaného modelu CatBoost s vybranými atributy
RMSE: 926.3776581213591
R^2: 0.5767799826413864
