# Importing data and packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime
from scipy.stats import uniform, randint
from dateutil.relativedelta import relativedelta
from sklearn.metrics import root_mean_squared_error

In [2]:
def print_full(x):
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', None)
    print(x)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')

In [3]:
df = pd.read_csv("PrzefiltrowanaRamka\\PrzefiltrowanaRamka2.csv")

In [4]:
df['created_at'] = pd.to_datetime(df['created_at'])
df['free_from'] = pd.to_datetime(df['free_from'])
maping = {
    'co_operative_ownership_with_a_land_and_mortgage_registe': 3,
    'co_operative_ownership': 2,
    'full_ownership': 1,
    'share': 4
}
df['building_ownership'] = df['building_ownership'].map(maping)

In [5]:
df = df.drop(columns=['description'])

In [6]:
print_full(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13732 entries, 0 to 13731
Data columns (total 73 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   market               13732 non-null  int64         
 1   created_at           13732 non-null  datetime64[ns]
 2   price                13732 non-null  float64       
 3   m                    13732 non-null  float64       
 4   rooms_num            13732 non-null  int64         
 5   building_type        10999 non-null  float64       
 6   floor_no             12764 non-null  float64       
 7   building_floors_num  11644 non-null  float64       
 8   building_material    5943 non-null   float64       
 9   windows_type         9037 non-null   float64       
 10  heating              9853 non-null   float64       
 11  build_year           8577 non-null   float64       
 12  construction_status  8649 non-null   float64       
 13  rent                 3843 non-n

In [7]:
cutoff = df['created_at'].max() - relativedelta(months=6)

In [8]:
df_train, df_test = df.loc[df['created_at'] < cutoff].copy(), df.loc[df['created_at'] >= cutoff].copy()

In [9]:
df_train['created_at'] = df_train['created_at'].astype("int64")
df_train['free_from'] = df_train['free_from'].astype("int64")
df_test['created_at'] = df_test['created_at'].astype("int64")
df_test['free_from'] = df_test['free_from'].astype("int64")

In [10]:
print_full(df_train.info())
print_full(df_test.info())

<class 'pandas.core.frame.DataFrame'>
Index: 10048 entries, 0 to 10817
Data columns (total 73 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   market               10048 non-null  int64  
 1   created_at           10048 non-null  int64  
 2   price                10048 non-null  float64
 3   m                    10048 non-null  float64
 4   rooms_num            10048 non-null  int64  
 5   building_type        8050 non-null   float64
 6   floor_no             9195 non-null   float64
 7   building_floors_num  8537 non-null   float64
 8   building_material    4522 non-null   float64
 9   windows_type         6894 non-null   float64
 10  heating              7668 non-null   float64
 11  build_year           6348 non-null   float64
 12  construction_status  6281 non-null   float64
 13  rent                 3080 non-null   float64
 14  building_ownership   7064 non-null   float64
 15  free_from            10048 non-null  int6

In [11]:
train_features, test_features = df_train.drop(columns=['price']).copy(), df_test.drop(columns=['price']).copy()
train_target, test_target = df_train[['price']].copy(), df_test[['price']].copy()

In [12]:
train2_features, test2_features = df_train[['market', 'created_at', 'm', 'rooms_num', 'district_label']].copy(), df_test[['market', 'created_at', 'm', 'rooms_num', 'district_label']].copy()
train2_target, test2_target = df_train[['price']].copy(), df_test[['price']].copy()

# Model on full data

## Feature selection

In [13]:
rewards = [0] * len(train_features.columns)

for _ in range(10):
    X_train, X_test, y_train, y_test = train_test_split(train_features, train_target, test_size=0.3, random_state=2024)
    
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'n_estimators': [100, 250, 500],
    }
    
    kf = KFold(n_splits=5, shuffle=True, random_state=2024)
    
    grid_search = RandomizedSearchCV(
        xgb.XGBRegressor(random_state=2024),
        param_grid,
        cv=kf,
        n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    
    feature_importances = grid_search.best_estimator_.feature_importances_
    top_15_indices = np.argsort(feature_importances)[-15:]
    
    for i, idx in enumerate(top_15_indices):
        rewards[idx] += i

sorted_features = sorted(range(len(rewards)), key=lambda i: rewards[i], reverse=True)[:15]


In [14]:
print("Top 15 features with rewards:")
for idx in sorted_features:
    print(f"Feature {train_features.columns[idx]} with reward {rewards[idx]}")

Top 15 features with rewards:
Feature m with reward 140
Feature garage with reward 122
Feature monitoring with reward 104
Feature dishwasher with reward 94
Feature build_year with reward 91
Feature basement with reward 86
Feature building_floors_num with reward 65
Feature building_type with reward 63
Feature rooms_num with reward 52
Feature tv with reward 40
Feature building_ownership with reward 38
Feature construction_status with reward 29
Feature two_storey with reward 25
Feature garden with reward 23
Feature building_material with reward 17


In [15]:
cols = train_features.columns[sorted_features]

In [16]:
train_features = train_features[cols]
test_features = test_features[cols]

## Hyperparameter tuning

In [17]:
model1 = xgb.XGBRegressor(random_state=21)

In [18]:
grid_params_rs = [{
    "n_estimators": randint(1,500),
    "learning_rate": uniform(0,0.2),
    "max_depth": randint(1,8),
    'subsample': uniform(0,1),
    'colsample_bytree': uniform(0,0.5),
    "min_child_weight": uniform(0,200),
    'gamma': uniform(0.5,0.5)
}]

In [19]:
grid = RandomizedSearchCV(model1, grid_params_rs, random_state=2024, n_iter=100)

In [20]:
grid.fit(train_features, train_target)

In [21]:
grid.best_params_

{'colsample_bytree': 0.4710433924108117,
 'gamma': 0.8470987322522179,
 'learning_rate': 0.1442372714910396,
 'max_depth': 5,
 'min_child_weight': 19.900294953730867,
 'n_estimators': 490,
 'subsample': 0.8814587995546425}

### Fitting model with best params

In [22]:
best_model = xgb.XGBRegressor(random_state=21, **grid.best_params_)

In [23]:
best_model.fit(train_features, train_target)

In [24]:
best_model.score(train_features, train_target)

0.9554511687962649

In [25]:
best_model.score(test_features, test_target)

0.9024027520686917

In [26]:
predicted_target = best_model.predict(test_features)
root_mean_squared_error(test_target, predicted_target)

45635.142710360036

#### default model (for comparison)

In [27]:
default = xgb.XGBRegressor(random_state=13)
default.fit(train_features, train_target)

In [28]:
default.score(test_features,test_target)

0.881381186427364

# Model on data without the text variables

In [29]:
model2 = xgb.XGBRegressor(random_state=21)


In [30]:
grid2 = RandomizedSearchCV(model2, grid_params_rs, random_state=2024, n_iter=100)

In [31]:
grid2.fit(train2_features, train2_target)

In [32]:
grid2.best_params_

{'colsample_bytree': 0.4564777607652652,
 'gamma': 0.7136062655304964,
 'learning_rate': 0.11883021019734086,
 'max_depth': 3,
 'min_child_weight': 22.224174320023415,
 'n_estimators': 277,
 'subsample': 0.5990454905580306}

In [33]:
best_model2 = xgb.XGBRegressor(random_state=21, **grid2.best_params_)

In [34]:
best_model2.fit(train2_features, train2_target)

In [35]:
best_model2.score(train2_features, train2_target)

0.8646124990251315

In [36]:
best_model2.score(test2_features, test2_target)

0.8853994833082246

In [37]:
predicted2_target = best_model2.predict(test2_features)
root_mean_squared_error(test_target, predicted2_target)

49450.8675471495

#### default model (for comparison)

In [38]:
default2 = xgb.XGBRegressor(random_state=13)
default2.fit(train2_features, train2_target)
default2.score(test2_features,test2_target)

0.8654291946989527