# Importing data and packages

In [82]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime
from scipy.stats import uniform, randint

In [83]:
def print_full(x):
    """Print ``x`` with pandas display truncation temporarily disabled.

    Rows, columns and column width are unlimited, the console width is set to
    2000 characters and floats are rendered as ``{:20,.2f}`` while ``x`` is
    printed.

    The options are applied through ``pd.option_context``, so whatever values
    were active before the call are restored afterwards (not the pandas
    defaults), and they are restored even if printing raises.

    Parameters
    ----------
    x : object
        Anything accepted by ``print``; typically a DataFrame or Series.

    Returns
    -------
    None
    """
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        'display.max_colwidth', None,
    ):
        print(x)

In [84]:
# Load the input table. The path is relative, so the CSV must sit in the
# notebook's current working directory.
df = pd.read_csv("PrzefiltrowanaRamka.csv")

In [85]:
# Turn both timestamp columns into plain int64 values so they can be fed to
# the model as numeric features.
# NOTE(review): how missing timestamps are handled by the int64 cast is not
# checked here — confirm neither column contains unparsable/empty values.
for _ts_col in ('created_at', 'free_from'):
    df[_ts_col] = pd.to_datetime(df[_ts_col]).astype("int64")

In [86]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print_full() only appended a stray "None" line and the display
# options never affected the report. Call it directly; verbose=True forces the
# full per-column listing regardless of the column count.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13745 entries, 0 to 13744
Data columns (total 74 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   market               13745 non-null  int64  
 1   created_at           13745 non-null  int64  
 2   price                13745 non-null  float64
 3   m                    13745 non-null  float64
 4   rooms_num            13745 non-null  int64  
 5   building_type        11011 non-null  float64
 6   floor_no             12777 non-null  float64
 7   building_floors_num  11657 non-null  float64
 8   building_material    5950 non-null   float64
 9   windows_type         9049 non-null   float64
 10  heating              9864 non-null   float64
 11  build_year           8587 non-null   float64
 12  construction_status  8660 non-null   float64
 13  rent                 3847 non-null   float64
 14  building_ownership   9127 non-null   float64
 15  free_from            13745 non-null 

In [88]:
# Remove the raw price and area columns from the frame.
# Re-running this cell on the already-reduced frame raises KeyError, because
# drop() refuses missing labels by default.
df = df.drop(['price', 'm'], axis='columns')

In [89]:
# Reduced dataset: the target plus four columns, used for the second model.
# Select first and copy afterwards: this duplicates only the five needed
# columns (instead of the whole frame) and leaves df2 as an independent frame.
df2 = df[['market', 'created_at', 'price_per_m', 'rooms_num', 'district_label']].copy()
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13745 entries, 0 to 13744
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   market          13745 non-null  int64  
 1   created_at      13745 non-null  int64  
 2   price_per_m     13745 non-null  float64
 3   rooms_num       13745 non-null  int64  
 4   district_label  13745 non-null  int64  
dtypes: float64(1), int64(4)
memory usage: 537.0 KB


# Model on full data

## Feature selection

In [90]:
# Hold out 30% of the rows as the test set; `price_per_m` is the target and
# every other column of `df` is a candidate feature. The seed fixes the split.
train_features, test_features, train_target, test_target = train_test_split(
    df.drop(columns=['price_per_m']),
    df['price_per_m'],
    random_state=21,
    test_size=0.3,
)

In [91]:
# Feature selection by repeated voting.
#
# Ten randomized hyperparameter searches are run on the same 70% slice of the
# training data. After each search the best estimator's 15 most important
# features receive points according to their rank (0 for the 15th, 14 for the
# most important). The 15 features with the highest point totals are kept.
rewards = [0] * len(train_features.columns)

# The split, the search space and the CV folds all use fixed seeds, so they are
# identical in every round: build them once instead of once per round.
X_train, X_test, y_train, y_test = train_test_split(
    train_features, train_target, test_size=0.3, random_state=2024
)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'n_estimators': [100, 250, 500],
}

kf = KFold(n_splits=5, shuffle=True, random_state=2024)

for round_no in range(10):
    # The sampled hyperparameter candidates are the only thing that differs
    # between rounds. Seeding the search per round keeps that variation while
    # making the final ranking reproducible on a fresh kernel.
    # `scoring` is left at its default, i.e. the estimator's own .score().
    # NOTE(review): n_jobs=-1 starts parallel workers that all target the same
    # CUDA device — confirm this is intended and that a GPU is available.
    grid_search = RandomizedSearchCV(
        xgb.XGBRegressor(random_state=2024, device='cuda'),
        param_grid,
        cv=kf,
        n_jobs=-1,
        random_state=2024 + round_no,
    )

    grid_search.fit(X_train, y_train)

    feature_importances = grid_search.best_estimator_.feature_importances_
    # argsort is ascending, so the slice holds the top 15 from weakest to strongest.
    top_15_indices = np.argsort(feature_importances)[-15:]

    for i, idx in enumerate(top_15_indices):
        rewards[idx] += i

# Column positions of the 15 highest-scoring features, best first.
sorted_features = sorted(range(len(rewards)), key=lambda i: rewards[i], reverse=True)[:15]


In [92]:
# Report the selected features, best first, with their accumulated points.
print("Top 15 features with rewards:")
for idx in sorted_features:
    print(f"Feature {train_features.columns[idx]} with reward {rewards[idx]}")

Top 15 features with rewards:
Feature rooms_num with reward 132
Feature alarm with reward 128
Feature monitoring with reward 116
Feature tv with reward 115
Feature garage with reward 83
Feature build_year with reward 73
Feature building_type with reward 71
Feature closed_area with reward 71
Feature created_at with reward 65
Feature dishwasher with reward 53
Feature heating with reward 30
Feature basement with reward 29
Feature stove with reward 29
Feature building_floors_num with reward 26
Feature washing_machine with reward 9


In [93]:
# Translate the selected column positions into column labels.
# The positions refer to the column order `train_features` had when the
# rewards were computed, so this must run before the frame is narrowed.
cols = train_features.columns[sorted_features]

In [94]:
# Restrict both the training and the test frame to the selected columns.
train_features, test_features = (
    frame[cols] for frame in (train_features, test_features)
)

In [95]:
# Inspect the training target (the `price_per_m` series of the training rows).
train_target

3144      6050.00
9862      9596.06
13203     7625.99
4015      7300.00
1092      7800.00
           ...   
9336      8029.68
48       11006.29
8964      7350.00
5944      9264.71
5327      7236.84
Name: price_per_m, Length: 9621, dtype: float64

In [96]:
# Inspect the held-out target (the `price_per_m` series of the test rows).
test_target

11600    9377.02
3121     7268.41
7719     7600.00
12968    8150.01
11495    7975.00
          ...   
4882     6632.65
2525     7419.19
9308     7706.67
4763     5189.87
10260    8098.10
Name: price_per_m, Length: 4124, dtype: float64

## Hyperparameter tuning

In [97]:
# Base estimator handed to the randomized search; only the seed is fixed here,
# all other hyperparameters are supplied by the search.
model1 = xgb.XGBRegressor(random_state=21)

In [98]:
# Search space for the randomized hyperparameter search (a one-element list of
# distributions, shared by both models in this notebook).
# scipy conventions: uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
grid_params_rs = [
    dict(
        n_estimators=randint(1, 500),
        learning_rate=uniform(0, 0.2),
        max_depth=randint(1, 8),
        subsample=uniform(0, 1),
        colsample_bytree=uniform(0, 0.5),
        min_child_weight=uniform(0, 200),
        gamma=uniform(0.5, 0.5),
    )
]

In [99]:
# Randomized search over `grid_params_rs`: 100 sampled configurations with a
# fixed seed. `cv` and `scoring` are left at the scikit-learn defaults.
grid = RandomizedSearchCV(
    estimator=model1,
    param_distributions=grid_params_rs,
    n_iter=100,
    random_state=2024,
)

In [100]:
# Run the search on the training rows, restricted to the selected features.
grid.fit(train_features, train_target)

In [101]:
# Hyperparameters of the best configuration found by the search.
grid.best_params_

{'colsample_bytree': 0.4710433924108117,
 'gamma': 0.8470987322522179,
 'learning_rate': 0.1442372714910396,
 'max_depth': 5,
 'min_child_weight': 19.900294953730867,
 'n_estimators': 490,
 'subsample': 0.8814587995546425}

### Fitting model with best params

In [102]:
# Final estimator: the tuned hyperparameters plus early stopping after 20
# rounds without improvement on the evaluation set passed to fit().
# NOTE(review): device='cuda' means this cell needs a CUDA-capable GPU.
best_model = xgb.XGBRegressor(
    random_state=21,
    device='cuda',
    **grid.best_params_,
    early_stopping_rounds=20,
)

In [103]:
# Carve a 10% validation set out of the training data for early stopping.
# This rebinds `train_features` / `train_target` to the remaining 90%, so the
# cell is not idempotent: every re-run shrinks the training set again.
train_features, val_features, train_target, val_target = train_test_split(
    train_features,
    train_target,
    train_size=0.9,
    random_state=21,
)

In [104]:
# Fit with both the training and the validation set as evaluation sets, so the
# log reports RMSE for each (validation_0 = train, validation_1 = validation).
best_model.fit(
    train_features,
    train_target,
    eval_set=[(train_features, train_target), (val_features, val_target)],
)

[0]	validation_0-rmse:1149.95784	validation_1-rmse:1197.86867
[1]	validation_0-rmse:1101.94227	validation_1-rmse:1153.51901
[2]	validation_0-rmse:1056.69963	validation_1-rmse:1111.91032
[3]	validation_0-rmse:1036.53465	validation_1-rmse:1093.74165
[4]	validation_0-rmse:987.67792	validation_1-rmse:1045.56346
[5]	validation_0-rmse:960.99618	validation_1-rmse:1023.72714


[6]	validation_0-rmse:941.86897	validation_1-rmse:1009.66281
[7]	validation_0-rmse:921.75270	validation_1-rmse:995.87261
[8]	validation_0-rmse:908.02255	validation_1-rmse:984.18974
[9]	validation_0-rmse:886.90966	validation_1-rmse:962.82236
[10]	validation_0-rmse:877.90135	validation_1-rmse:958.62682
[11]	validation_0-rmse:854.39224	validation_1-rmse:937.91636
[12]	validation_0-rmse:843.76100	validation_1-rmse:929.70090
[13]	validation_0-rmse:830.03497	validation_1-rmse:917.79689
[14]	validation_0-rmse:822.03535	validation_1-rmse:911.91536
[15]	validation_0-rmse:808.99713	validation_1-rmse:900.20245
[16]	validation_0-rmse:804.97896	validation_1-rmse:898.11908
[17]	validation_0-rmse:794.30802	validation_1-rmse:890.52063
[18]	validation_0-rmse:786.89194	validation_1-rmse:885.85808
[19]	validation_0-rmse:782.65939	validation_1-rmse:882.21372
[20]	validation_0-rmse:776.90393	validation_1-rmse:877.63605
[21]	validation_0-rmse:768.77228	validation_1-rmse:872.86022
[22]	validation_0-rmse:764.

In [105]:
# Score of the tuned model on the rows it was fitted on.
best_model.score(train_features, train_target)

0.6709709528403816

In [106]:
# Score of the tuned model on the held-out test rows.
# NOTE(review): the recorded output is ~0.006 here versus ~0.67 on the training
# rows. A gap this large needs investigating before the model is trusted;
# extreme values in the test target are one possible cause — unverified.
best_model.score(test_features, test_target)

0.0063986905669292105

In [107]:
# Baseline for comparison: XGBRegressor with library-default hyperparameters.
# NOTE(review): seed 13 differs from the 21 used for the tuned model — confirm
# this is intentional.
default = xgb.XGBRegressor(random_state=13)
default.fit(train_features, train_target)

In [108]:
# Score of the untuned baseline on the held-out test rows.
default.score(test_features,test_target)

0.006949672130936557

# Model on data without the text variables

In [109]:
# Same 70/30 split (same seed) applied to the reduced frame `df2`.
train2_features, test2_features, train2_target, test2_target = train_test_split(
    df2.drop(columns=['price_per_m']),
    df2['price_per_m'],
    random_state=21,
    test_size=0.3,
)

In [110]:
# Base estimator for the search on the reduced feature set.
model2 = xgb.XGBRegressor(random_state=21)


In [111]:
# Randomized search for the reduced feature set; same search space, seed and
# number of sampled configurations as the search on the full feature set.
grid2 = RandomizedSearchCV(
    estimator=model2,
    param_distributions=grid_params_rs,
    n_iter=100,
    random_state=2024,
)

In [112]:
# Run the search on the reduced-feature training rows.
grid2.fit(train2_features, train2_target)

In [113]:
# Hyperparameters of the best configuration found for the reduced feature set.
grid2.best_params_

{'colsample_bytree': 0.4564777607652652,
 'gamma': 0.7136062655304964,
 'learning_rate': 0.11883021019734086,
 'max_depth': 3,
 'min_child_weight': 22.224174320023415,
 'n_estimators': 277,
 'subsample': 0.5990454905580306}

In [114]:
# Baseline on the reduced feature set: default hyperparameters, fitted on the
# training rows and scored on the held-out test rows.
default2 = xgb.XGBRegressor(random_state=13)
default2.fit(train2_features, train2_target)
default2.score(test2_features,test2_target)

0.0035980458103337387

In [115]:
# Tuned estimator for the reduced feature set. No early stopping is configured
# here, so all `n_estimators` boosting rounds are used.
# NOTE(review): device='cuda' means this cell needs a CUDA-capable GPU.
best_model2 = xgb.XGBRegressor(
    random_state=21,
    device='cuda',
    **grid2.best_params_,
)

In [116]:
# Fit the tuned reduced-feature model on its full training split.
best_model2.fit(train2_features, train2_target)

In [117]:
# Score of the tuned reduced-feature model on the rows it was fitted on.
best_model2.score(train2_features, train2_target)

0.3925270915084612

In [118]:
# Score of the tuned reduced-feature model on the held-out test rows.
# NOTE(review): the recorded output is ~0.003 here versus ~0.39 on the training
# rows — the same train/test gap as the full-feature model shows; investigate.
best_model2.score(test2_features, test2_target)

0.0031818947464469227