# Importing data and packages

In [75]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime
from scipy.stats import uniform, randint

In [76]:
df = pd.read_csv("PrzefiltrowanaRamka\\PrzefiltrowanaRamka2.csv")

In [77]:
df['created_at'] = pd.to_datetime(df['created_at']).astype("int64")
df['free_from'] = pd.to_datetime(df['free_from']).astype("int64")

In [78]:
df.columns

Index(['market', 'created_at', 'description', 'price', 'm', 'price_per_m',
       'rooms_num', 'building_type', 'floor_no', 'building_floors_num',
       'building_material', 'windows_type', 'heating', 'build_year',
       'construction_status', 'rent', 'building_ownership', 'free_from',
       'remote_services', 'balcony', 'separate_kitchen', 'basement', 'lift',
       'garage', 'garden', 'air_conditioning', 'terrace', 'usable_room',
       'two_storey', 'cable-television', 'internet', 'phone', 'fridge',
       'furniture', 'stove', 'washing_machine', 'oven', 'dishwasher', 'tv',
       'entryphone', 'closed_area', 'monitoring', 'anti_burglary_door',
       'roller_shutters', 'alarm', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4',
       'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9', 'Topic 10',
       'Topic 11', 'Topic 12', 'Topic 13', 'Topic 14', 'Topic 15', 'Topic 16',
       'Topic 17', 'Topic 18', 'Topic 19', 'Topic 20', 'Topic 21', 'Topic 22',
       'Topic 23', 'Topic 24',

In [79]:
df = df.drop(columns=['description', 'building_ownership', 'price', 'm'])

In [80]:
def print_full(x):
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', None)
    print(x)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')

# Feature selection

In [81]:
train_features, test_features, train_target, test_target = train_test_split(df.drop(columns=['price_per_m']),
                                                                            df['price_per_m'],
                                                                            test_size=0.3,
                                                                            random_state=21)

In [82]:
rewards = [0] * len(train_features.columns)

for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(train_features, train_target, test_size=0.3, random_state=2024)
    
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'n_estimators': [100, 250, 500],
    }
    
    kf = KFold(n_splits=5, shuffle=True, random_state=2024)
    
    grid_search = RandomizedSearchCV(
        xgb.XGBRegressor(random_state=2024, device='cuda'),
        param_grid,
        scoring='neg_root_mean_squared_error',
        cv=kf,
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    
    feature_importances = grid_search.best_estimator_.feature_importances_
    top_15_indices = np.argsort(feature_importances)[-15:]
    
    for i, idx in enumerate(top_15_indices):
        rewards[idx] += i

sorted_features = sorted(range(len(rewards)), key=lambda i: rewards[i], reverse=True)[:15]


In [83]:
print("Top 15 features with rewards:")
for idx in sorted_features:
    print(f"Feature {train_features.columns[idx]} with reward {rewards[idx]}")

Top 15 features with rewards:
Feature tv with reward 131
Feature rooms_num with reward 130
Feature alarm with reward 126
Feature monitoring with reward 108
Feature garage with reward 91
Feature building_type with reward 73
Feature closed_area with reward 71
Feature build_year with reward 67
Feature dishwasher with reward 56
Feature created_at with reward 52
Feature air_conditioning with reward 45
Feature building_floors_num with reward 40
Feature basement with reward 20
Feature heating with reward 19
Feature stove with reward 16


In [84]:
cols = train_features.columns[sorted_features]

In [85]:
train_features = train_features[cols]
test_features = test_features[cols]

In [102]:
train_target

3144      6050.00
9862      9596.06
13203     7625.99
4015      7300.00
1092      7800.00
           ...   
9336      8029.68
48       11006.29
8964      7350.00
5944      9264.71
5327      7236.84
Name: price_per_m, Length: 9621, dtype: float64

In [101]:
test_target

11600    9377.02
3121     7268.41
7719     7600.00
12968    8150.01
11495    7975.00
          ...   
4882     6632.65
2525     7419.19
9308     7706.67
4763     5189.87
10260    8098.10
Name: price_per_m, Length: 4124, dtype: float64

# Hyperparameter tuning

In [130]:
model1 = xgb.XGBRegressor(random_state=21, device='cuda')

In [131]:
grid_params_rs = [{
    "n_estimators": randint(1,500),
    "learning_rate": uniform(0,1),
    "max_depth": randint(1,10),
    'subsample': uniform(0,1),
    'colsample_bytree': uniform(0,1),
    "min_child_weight": uniform(0,200),
    'gamma': uniform(0.5,0.5)
}]

In [132]:
grid = RandomizedSearchCV(model1, grid_params_rs, scoring='neg_root_mean_squared_error', random_state=2024, n_iter=100)

In [133]:
grid.fit(train_features, train_target)

In [134]:
grid.best_params_

{'colsample_bytree': 0.9826048568521379,
 'gamma': 0.6555744457704711,
 'learning_rate': 0.39910537736891816,
 'max_depth': 8,
 'min_child_weight': 72.75480672757548,
 'n_estimators': 140,
 'subsample': 0.891264524167909}

## Fitting model with best params

In [141]:
best_model = xgb.XGBRegressor(random_state=21, device='cuda', **grid.best_params_, early_stopping_rounds=20)

In [142]:
train_features, val_features, train_target, val_target = train_test_split(train_features, train_target, random_state=21, train_size=0.9 )

In [143]:
best_model.fit(train_features, train_target, eval_set=[(train_features, train_target), (val_features, val_target)])

[0]	validation_0-rmse:991.15956	validation_1-rmse:1049.16315
[1]	validation_0-rmse:888.63380	validation_1-rmse:961.69353
[2]	validation_0-rmse:821.92914	validation_1-rmse:915.12716
[3]	validation_0-rmse:782.92809	validation_1-rmse:896.41151
[4]	validation_0-rmse:761.20332	validation_1-rmse:873.60184
[5]	validation_0-rmse:749.19089	validation_1-rmse:862.18292
[6]	validation_0-rmse:739.57986	validation_1-rmse:858.18167
[7]	validation_0-rmse:734.15922	validation_1-rmse:859.13704
[8]	validation_0-rmse:725.92303	validation_1-rmse:852.23174
[9]	validation_0-rmse:718.17268	validation_1-rmse:846.36867
[10]	validation_0-rmse:712.52513	validation_1-rmse:842.70170
[11]	validation_0-rmse:708.58663	validation_1-rmse:839.78285
[12]	validation_0-rmse:706.44873	validation_1-rmse:838.83168
[13]	validation_0-rmse:702.36935	validation_1-rmse:836.46178
[14]	validation_0-rmse:699.26465	validation_1-rmse:834.61032
[15]	validation_0-rmse:696.94258	validation_1-rmse:835.39716
[16]	validation_0-rmse:695.33140	

In [144]:
best_model.score(train_features, train_target)

0.6653218695818983

In [145]:
best_model.score(test_features, test_target)

0.006207623352661451

In [139]:
default = xgb.XGBRegressor(random_state=20)
default.fit(train_features, train_target)

In [140]:
default.score(test_features,test_target)

0.00624059569505564