In [1]:
import pandas as pd
import pickle

In [2]:
pickle_path = "../data/processed/final_data.pck"

# Load the pickle file
with open(pickle_path, 'rb') as f:
    data = pickle.load(f)

print(data.columns)

categorical_features = ['host_response_time', 'neighbourhood_cleansed', 'room_type', 'season']

Index(['host_id', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'neighbourhood_cleansed',
       'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'price',
       'minimum_nights', 'maximum_nights', 'instant_bookable',
       'reviews_per_month', 'amenities_count', 'count_verifications',
       'seasonal_availability', 'season', 'min_rating', 'max_rating',
       'distance_from_city_centre'],
      dtype='object')


In [337]:
# Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

# Dummy encode categorical columns

dummy_df = pd.get_dummies(data, columns=categorical_columns)

# Split features (X) and target variable (y)
X = dummy_df.drop('price', axis=1)  # Features
y = dummy_df['price']  # Target variable

# Impute missing values in X
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Initialize and fit linear regression model
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

# Predict on the testing set
y_pred_linear = linear_regressor.predict(X_test)

# Calculate R-squared score
r2_linear = r2_score(y_test, y_pred_linear)
print("R2: {:.2f}".format(r2_linear))


R2: 0.03


In [338]:
# Catboost 1 - all selected features

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

X = data.drop('price', axis=1) 
y = data['price']

catboost_regressor = CatBoostRegressor(iterations=1000,
                                       learning_rate=0.1,
                                       depth=8,
                                       loss_function='RMSE',
                                       verbose=0)
catboost_regressor.fit(X_train, y_train)

# Predict on the testing set
y_pred_catboost = catboost_regressor.predict(X_test)

# R2
r2_catboost = r2_score(y_test, y_pred_catboost)
print("R2 (CatBoost): {:.2f}".format(r2_catboost))

# MSE
mse_catboost = mean_squared_error(y_test, y_pred_catboost)

# RMSE
rmse_catboost = np.sqrt(mse_catboost)

print("RMSE (CatBoost): {:.2f}".format(rmse_catboost))

R2 (CatBoost): 0.56
RMSE (CatBoost): 24911.48


In [339]:
# Catboost 2
for feature in categorical_features:
    data[feature] = data[feature].astype('category')
    if 'Missing' not in data[feature].cat.categories:
        data[feature] = data[feature].cat.add_categories('Missing')
    data[feature].fillna('Missing', inplace=True)
    
train_size = 7000

x_train = data.iloc[:train_size]
y_train = data.iloc[:train_size]['price']

x_test = data.iloc[train_size:]
y_test = data.iloc[train_size:]['price']

x_test = data.drop(x_train.index)

y_test = data.drop(y_train.index)['price']
regressor = CatBoostRegressor(eval_metric='R2',
                              iterations = 400, 
                              cat_features = categorical_features, 
                              random_state = 123
                              )
rfe_dict = regressor.select_features(X = x_train, 
                                     y = y_train, 
                                     eval_set = (x_test,y_test),
                                     features_for_select = '0-19',
                                     num_features_to_select = 8, 
                                     steps = 4, 
                                     verbose = 10,
                                     train_final_model = True, 
                                     plot = True
                                     )


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.12087
Step #1 out of 4
0:	learn: 0.1043200	test: 0.2027838	best: 0.2027838 (0)	total: 7.29ms	remaining: 2.91s
10:	learn: 0.5788850	test: 0.6497267	best: 0.6497267 (10)	total: 87.9ms	remaining: 3.11s
20:	learn: 0.7943710	test: 0.4913172	best: 0.6503638 (11)	total: 148ms	remaining: 2.66s
30:	learn: 0.8930556	test: 0.0035191	best: 0.6503638 (11)	total: 208ms	remaining: 2.47s
40:	learn: 0.9405800	test: -0.4394253	best: 0.6503638 (11)	total: 253ms	remaining: 2.21s
50:	learn: 0.9674468	test: -1.0268974	best: 0.6503638 (11)	total: 297ms	remaining: 2.03s
60:	learn: 0.9833810	test: -1.2733901	best: 0.6503638 (11)	total: 371ms	remaining: 2.06s
70:	learn: 0.9918853	test: -1.3687757	best: 0.6503638 (11)	total: 436ms	remaining: 2.02s
80:	learn: 0.9956712	test: -1.4693121	best: 0.6503638 (11)	total: 486ms	remaining: 1.91s
90:	learn: 0.9974620	test: -1.5504525	best: 0.6503638 (11)	total: 538ms	remaining: 1.83s
100:	learn: 0.9984643	test: -1.6251751	best: 0.6503638 (11)	total: 5

In [340]:
from sklearn.metrics import mean_squared_error

y_pred = regressor.predict(x_test)

r2_cat = r2_score(y_test, y_pred)
print("R2 on test set:", r2_cat)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

R2 on test set: 0.8882335651347875
RMSE: 2685.968331383495



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



In [341]:
# křížová validace, která je úplně k ničemu

from sklearn.model_selection import KFold

# Define number of folds for cross-validation
n_splits = 20
kf = KFold(n_splits=n_splits, shuffle=True, random_state=123)

# Initialize CatBoostRegressor

regressor = CatBoostRegressor(eval_metric='R2',
                              iterations = 400, 
                              cat_features = categorical_features, 
                              random_state = 123
                              )
rfe_dict = regressor.select_features(X = x_train, 
                                     y = y_train, 
                                     eval_set = (x_test,y_test),
                                     features_for_select = '0-19',
                                     num_features_to_select = 8, 
                                     steps = 4, 
                                     verbose = 10,
                                     train_final_model = True, 
                                     plot = True
                                     )

# Perform cross-validation
rmses = []
for train_index, val_index in kf.split(x_train):
    X_train_fold, X_val_fold = x_train.iloc[train_index], x_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Fit the model
    regressor.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold))

    # Predict on the validation set
    y_pred_val = regressor.predict(X_val_fold)

    # Calculate RMSE
    rmse_fold = np.sqrt(mean_squared_error(y_val_fold, y_pred_val))
    rmses.append(rmse_fold)

# Calculate average RMSE across all folds
avg_rmse = np.mean(rmses)
print("Average RMSE across all folds:", avg_rmse)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.12087
Step #1 out of 4
0:	learn: 0.1043200	test: 0.2027838	best: 0.2027838 (0)	total: 9.65ms	remaining: 3.85s
10:	learn: 0.5788850	test: 0.6497267	best: 0.6497267 (10)	total: 120ms	remaining: 4.24s
20:	learn: 0.7943710	test: 0.4913172	best: 0.6503638 (11)	total: 152ms	remaining: 2.75s
30:	learn: 0.8930556	test: 0.0035191	best: 0.6503638 (11)	total: 192ms	remaining: 2.28s
40:	learn: 0.9405800	test: -0.4394253	best: 0.6503638 (11)	total: 237ms	remaining: 2.07s
50:	learn: 0.9674468	test: -1.0268974	best: 0.6503638 (11)	total: 280ms	remaining: 1.92s
60:	learn: 0.9833810	test: -1.2733901	best: 0.6503638 (11)	total: 333ms	remaining: 1.85s
70:	learn: 0.9918853	test: -1.3687757	best: 0.6503638 (11)	total: 395ms	remaining: 1.83s
80:	learn: 0.9956712	test: -1.4693121	best: 0.6503638 (11)	total: 447ms	remaining: 1.76s
90:	learn: 0.9974620	test: -1.5504525	best: 0.6503638 (11)	total: 502ms	remaining: 1.7s
100:	learn: 0.9984643	test: -1.6251751	best: 0.6503638 (11)	total: 539

KeyboardInterrupt: 