In [10]:
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split




def encode_top_n_cities(df, city_column, n):
    """
    Keep the `n` most frequent values of a column, bucket the rest, and label-encode it.

    Parameters:
        df (pd.DataFrame): Input frame. It is copied; the caller's frame is not modified.
        city_column (str): Name of the column to bucket and encode.
        n (int): Number of most frequent values that keep their own label.

    Returns:
        pd.DataFrame: Copy of `df` in which every value of `city_column` outside the
        `n` most frequent ones is replaced by 'Other', plus a new
        '<city_column>_encoded' column produced by a freshly fitted LabelEncoder.
    """
    encoded_df = df.copy()
    frequent_values = encoded_df[city_column].value_counts().nlargest(n).index

    def keep_or_other(value):
        # Anything outside the top-n vocabulary collapses into one shared bucket.
        if value in frequent_values:
            return value
        return 'Other'

    encoded_df[city_column] = encoded_df[city_column].apply(keep_or_other)
    encoded_df[f'{city_column}_encoded'] = LabelEncoder().fit_transform(encoded_df[city_column])
    return encoded_df

def train_xgb_with_hyperopt(train_set, test_set, numerical_features, target_column, categorical_features=None, max_evals=100):
    """
    Tune an XGBoost regressor with hyperopt (TPE) and evaluate the best configuration.

    The search minimises the negative mean R² over a shuffled 5-fold split of the
    training set. The best configuration is then refit on the whole training set,
    scored on the test set, and cross-validated once more on the training set to
    report R², RMSE and MAE.

    Parameters:
        train_set (pd.DataFrame): Pre-separated training set.
        test_set (pd.DataFrame): Pre-separated testing set.
        numerical_features (list): List of numerical feature column names.
        target_column (str): Target column name.
        categorical_features (list, optional): List of categorical feature column names. Default is None.
        max_evals (int): Maximum number of hyperparameter optimization iterations. Default is 100.

    Returns:
        dict: 'final_model' (regressor fit on the full training set), 'best_params',
        'test_rmse', 'test_mae', 'test_r2', and the training-set cross-validation
        means 'cv_mean_r2', 'cv_mean_rmse', 'cv_mean_mae'.
    """
    if categorical_features:
        feature_columns = numerical_features + categorical_features
    else:
        feature_columns = numerical_features

    X_train, y_train = train_set[feature_columns], train_set[target_column]
    X_test, y_test = test_set[feature_columns], test_set[target_column]

    def iter_folds():
        # A fresh splitter with a fixed seed yields the same five folds on every call.
        splitter = KFold(n_splits=5, shuffle=True, random_state=42)
        for fit_idx, holdout_idx in splitter.split(X_train):
            yield (
                X_train.iloc[fit_idx],
                y_train.iloc[fit_idx],
                X_train.iloc[holdout_idx],
                y_train.iloc[holdout_idx],
            )

    def objective(params):
        # These two parameters are cast to int before being handed to XGBoost.
        for integer_param in ('max_depth', 'min_child_weight'):
            params[integer_param] = int(params[integer_param])
        regressor = xgb.XGBRegressor(enable_categorical=True, **params)

        fold_r2 = []
        for X_fit, y_fit, X_holdout, y_holdout in iter_folds():
            regressor.fit(X_fit, y_fit)
            holdout_pred = regressor.predict(X_holdout)
            fold_r2.append(r2_score(y_holdout, holdout_pred))

        # hyperopt minimises the loss, so the mean R² is negated.
        return {'loss': -np.mean(fold_r2), 'status': STATUS_OK}

    search_space = {
        'n_estimators': hp.choice('n_estimators', [100, 200, 300]),
        'learning_rate': hp.uniform('learning_rate', 0.09, 0.5),
        'max_depth': hp.choice('max_depth', [3, 5, 7, 9, 12]),
        'min_child_weight': hp.choice('min_child_weight', [1, 3, 5, 7]),
        'subsample': hp.uniform('subsample', 0.6, 1.0),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
        'gamma': hp.uniform('gamma', 0, 0.4),
        'reg_alpha': hp.uniform('reg_alpha', 0, 0.05),
        'reg_lambda': hp.uniform('reg_lambda', 0, 0.05),
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'seed': 42
    }

    best_assignment = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=Trials(),
    )
    # fmin's result is passed through space_eval to obtain the concrete parameter values.
    best_params = space_eval(search_space, best_assignment)

    # Refit on the full training set and score once on the held-out test set.
    final_model = xgb.XGBRegressor(enable_categorical=True, **best_params)
    final_model.fit(X_train, y_train)
    test_pred = final_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    test_mae = mean_absolute_error(y_test, test_pred)
    test_r2 = r2_score(y_test, test_pred)

    # Cross-validate the winning configuration to report all three metrics.
    cv_r2, cv_rmse, cv_mae = [], [], []
    for X_fit, y_fit, X_holdout, y_holdout in iter_folds():
        fold_model = xgb.XGBRegressor(enable_categorical=True, **best_params)
        fold_model.fit(X_fit, y_fit)

        holdout_pred = fold_model.predict(X_holdout)
        cv_r2.append(r2_score(y_holdout, holdout_pred))
        cv_rmse.append(np.sqrt(mean_squared_error(y_holdout, holdout_pred)))
        cv_mae.append(mean_absolute_error(y_holdout, holdout_pred))

    return {
        'final_model': final_model,
        'best_params': best_params,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_r2': test_r2,
        'cv_mean_r2': np.mean(cv_r2),
        'cv_mean_rmse': np.mean(cv_rmse),
        'cv_mean_mae': np.mean(cv_mae)
    }


def remove_outliers_iqr(df, column, quantiles=(0.25, 0.75)):
    """
    Drop rows whose `column` value lies outside the 1.5 * IQR fences.

    Parameters:
        df (pd.DataFrame): Input frame (not modified).
        column (str): Column the fences are computed on.
        quantiles (tuple): Lower and upper quantile used as Q1 and Q3. Default is (0.25, 0.75).

    Returns:
        pd.DataFrame: Rows of `df` with Q1 - 1.5*IQR <= value <= Q3 + 1.5*IQR.
        The number of removed rows is printed.
    """
    values = df[column]
    first_quartile = values.quantile(quantiles[0])
    third_quartile = values.quantile(quantiles[1])
    whisker = 1.5 * (third_quartile - first_quartile)

    # `between` is inclusive on both ends, so values sitting exactly on a fence are kept.
    within_fences = values.between(first_quartile - whisker, third_quartile + whisker)
    kept_rows = df[within_fences]

    removed_count = df.shape[0] - kept_rows.shape[0]
    print(f"IQR Method: Removed {removed_count} rows.")
    return kept_rows

def get_collection(client, name):
    """
    Load every document of collection `name` from the 'Houses' database into a DataFrame.

    Parameters:
        client: Mapping-style database client, indexed as client['Houses'][name].
        name (str): Collection name.

    Returns:
        pd.DataFrame: One row per document, without the '_id', 'title', 'url'
        and 'description' columns.
    """
    documents = client['Houses'][name].find({})
    frame = pd.DataFrame(documents)
    return frame.drop(columns=['_id', 'title', 'url', 'description'])
def convert_price_to_tnd(df, price_column='price', eur_to_tnd=3.34):
    """
    Parse free-text prices into numbers and convert EUR amounts to TND.

    The first run of digits (optionally separated by whitespace) in each price
    string is taken as the amount. Rows whose text contains 'EUR' are multiplied
    by `eur_to_tnd`; all other rows are multiplied by 1.

    Parameters:
        df (pd.DataFrame): Input frame (not modified).
        price_column (str): Name of the column holding the price text. Default is 'price'.
        eur_to_tnd (float): EUR to TND conversion rate. Default is 3.34.

    Returns:
        pd.DataFrame: Copy of `df` with `price_column` replaced by the numeric
        amount and the index reset to 0..n-1. Missing or digit-free prices become <NA>.
    """
    h = df.copy()
    raw_prices = h[price_column]

    # NOTE(review): a comma ends the digit match, so '1,200 TND' parses as 1.
    # The comma-stripping step that used to follow could never match and was dropped;
    # confirm the source never uses commas as thousands separators.
    amounts = (
        raw_prices
        .str.extract(r'(\d[\d\s]*)')[0]
        .str.replace(r'\s+', '', regex=True)
        .astype('Int64')
    )

    # The isinstance guard keeps missing prices from raising TypeError on `'EUR' in x`.
    rates = raw_prices.apply(
        lambda text: eur_to_tnd if isinstance(text, str) and 'EUR' in text else 1
    )

    h[price_column] = amounts * rates
    return h.reset_index(drop=True)


In [11]:
import pandas as pd
import numpy as np
import pymongo
import warnings
import os
import dotenv
# Load environment variables from the local 'secret.env' file
# (presumably where MONGODB_URI is defined) so credentials stay out of the notebook.
dotenv.load_dotenv('secret.env')

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below;
# consider narrowing the filter.
warnings.filterwarnings('ignore')


# os.getenv returns None when MONGODB_URI is not set.
# NOTE(review): presumably MongoClient then falls back to its default host — confirm
# that a silent fallback is acceptable here.
mongodb_uri = os.getenv('MONGODB_URI')
client = pymongo.MongoClient(mongodb_uri)



<h3>Affare data pre-processing</h3>

In [12]:
# Load the raw 'Affare' listings; get_collection drops the '_id', 'title', 'url' and 'description' columns.
affare = get_collection(client, 'Affare')

In [13]:
# NOTE: this cell rewrites `affare` in place of the raw frame, so it is not re-runnable
# on its own; re-execute the loading cell first.

# Keep only listings that have a price, then turn the price text into a numeric TND amount.
affare = affare[affare['price'].notna()]
affare = convert_price_to_tnd(affare, 'price')
affare = affare.drop(columns=['Meublée', 'posting_date', 'Adresse'])

# Raw-string patterns: '\d' inside a normal string literal is an invalid escape sequence.
affare['Chambre'] = affare['Chambre'].str.extract(r'(\d+)', expand=False).astype('Int64')
affare['Salles de bains'] = affare['Salles de bains'].str.extract(r'(\d+)', expand=False).astype('Int64')
affare['Superficie'] = affare['Superficie'].str.extract(r'(\d+)', expand=False).astype('Int64')

# 'location' is split on ' - ': part 0 becomes the state, part 1 the city. Split once, use twice.
affare_location_parts = affare['location'].str.split(' - ', expand=True)
affare['city'] = affare_location_parts[1].str.lower()
affare['state'] = affare_location_parts[0].str.lower()

# NOTE(review): listings without a type are assumed to be villas — confirm against the source site.
affare['Type'] = affare['Type'].fillna('villa')
affare = affare.rename(
    columns={
        'Chambre': 'n_bedrooms',
        'Salles de bains': 'n_bathrooms',
        'Superficie': 'area',
    }
)

# Collapse every city label containing 'hammamet' into plain 'hammamet'.
# str.contains(..., na=False) leaves missing cities untouched instead of raising TypeError.
affare_is_hammamet = affare['city'].str.contains('hammamet', regex=False, na=False)
affare['city'] = affare['city'].mask(affare_is_hammamet, 'hammamet')

# Area and bathroom count are required downstream.
affare = affare[affare['area'].notna() & affare['n_bathrooms'].notna()]


<h3>Menzili data pre-processing</h3>

In [14]:
# Load the raw 'menzili' listings; get_collection drops the '_id', 'title', 'url' and 'description' columns.
menzili = get_collection(client, 'menzili')

In [15]:
# NOTE: this cell rewrites `menzili` in place of the raw frame, so it is not re-runnable
# on its own; re-execute the loading cell first.

# Strip spaces from the price text and keep the first run of digits.
# regex=False is explicit everywhere below: the default of str.replace differs between
# pandas versions, and '+' on its own is not a valid regular expression.
menzili['price'] = (
    menzili['price']
    .str.replace(' ', '', regex=False)
    .str.extract(r'(\d+)', expand=False)
    .astype('Int64')
)
menzili = menzili.dropna(subset=['Surf terrain', 'Salle de bain', 'Chambres', 'price', 'location'])
menzili = menzili.drop(columns=['Piéces Totale', 'Année construction', 'Surf habitable', 'misc'])
menzili = menzili.rename(
    columns={'Chambres': 'n_bedrooms', 'Salle de bain': 'n_bathrooms', 'Surf terrain': 'area'}
)

# Counts may carry a '+' character (e.g. '5+'); it is removed before the integer cast.
menzili['n_bedrooms'] = menzili['n_bedrooms'].str.replace('+', '', regex=False).astype('Int64')
menzili['n_bathrooms'] = menzili['n_bathrooms'].str.replace('+', '', regex=False).astype('Int64')
menzili['area'] = menzili['area'].str.extract(r'(\d+)', expand=False).astype('Int64')

# 'location' is split on ', ': part 1 becomes the city, part 2 the state. Split once, use twice.
menzili_location_parts = menzili['location'].str.split(', ', expand=True)
menzili['state'] = menzili_location_parts[2].str.replace('é', 'e', regex=False).str.lower()
menzili['city'] = menzili_location_parts[1].str.replace('é', 'e', regex=False).str.lower()
menzili = menzili.dropna(subset=['city', 'state'])

# Replace 'djerba - midoun' with 'djerba' and collapse every label containing 'hammamet'.
menzili['city'] = menzili['city'].str.replace('djerba - midoun', 'djerba', regex=False)
menzili_is_hammamet = menzili['city'].str.contains('hammamet', regex=False, na=False)
menzili['city'] = menzili['city'].mask(menzili_is_hammamet, 'hammamet')


Filter the aggregated dataset: only houses priced above 80,000 TND and up to 1,000,000 TND are kept.

Only houses with an area between 100 m² and 1,500 m² are considered, due to data imbalance.

In [18]:
# Stack both sources. ignore_index=True gives the combined frame a fresh 0..n-1 index;
# without it the two sources' row labels collide and the index contains duplicates.
aggregate_df = pd.concat([affare, menzili], ignore_index=True)

# Keep listings inside the modelling range. Rows with a missing value in any of these
# columns are dropped as well, because a missing comparison result does not select the row.
in_modelling_range = (
    (aggregate_df['price'] > 80000) & (aggregate_df['price'] <= 1000000)
    & (aggregate_df['n_bedrooms'] >= 1) & (aggregate_df['n_bedrooms'] <= 7)
    & (aggregate_df['n_bathrooms'] >= 1) & (aggregate_df['n_bathrooms'] < 7)
    & (aggregate_df['area'] >= 100) & (aggregate_df['area'] <= 1500)
)
# Explicit copy so the column assignments below write to an independent frame.
aggregate_df = aggregate_df[in_modelling_range].copy()

# Integer-encode the property type. The encoder is fit on the full filtered dataset.
label_encoder = LabelEncoder()
aggregate_df['type_encoded'] = label_encoder.fit_transform(aggregate_df['Type'])

# Log-transformed copies of price and area.
aggregate_df['price_log'] = np.log1p(aggregate_df['price'])
aggregate_df['area_log'] = np.log1p(aggregate_df['area'])

In [20]:


# Hold out 20% of the filtered listings as the test set; the fixed random_state keeps the split reproducible.
train_set, test_set = train_test_split(aggregate_df, test_size=0.2, random_state=42)

def preprocess_data(train_df):
    """
    Prepare the training split for modelling.

    Keeps the 30 most frequent cities and the 10 most frequent states (everything
    else becomes 'Other'), adds 'city_encoded' / 'state_encoded' through
    encode_top_n_cities, and casts 'city' and 'state' to the category dtype.

    Parameters:
        train_df (pd.DataFrame): Training split (not modified).

    Returns:
        pd.DataFrame: Processed copy of `train_df`.
    """
    return (
        train_df.copy()
        .pipe(encode_top_n_cities, 'city', 30)
        .pipe(encode_top_n_cities, 'state', 10)
        .astype({'city': 'category', 'state': 'category'})
    )

def _encode_with_train_vocabulary(values, train_values, train_codes):
    """Map `values` onto the training vocabulary; returns (category labels, numeric codes)."""
    # Pair labels and codes by position; the first code seen for a label wins.
    code_by_label = {}
    for label, code in zip(train_values, train_codes):
        if pd.notna(label):
            code_by_label.setdefault(label, code)

    # Labels never seen in training (including missing values) fall into 'Other'.
    labels = pd.Series(
        [label if label in code_by_label else 'Other' for label in values],
        index=values.index,
        dtype=object,
    )
    # Codes are looked up from the plain labels, before the category cast, so the
    # result is a numeric column. A label with no training code (e.g. 'Other' when
    # training never produced it) maps to NaN instead of raising IndexError.
    return labels.astype('category'), labels.map(code_by_label)


def preprocss_test_data(test_set, cities,cities_encoding,states,states_encoding):
    """
    Apply the training-set city/state vocabulary and encodings to the test split.

    Parameters:
        test_set (pd.DataFrame): Test split with 'city' and 'state' columns (not modified).
        cities (pd.Series): 'city' column of the preprocessed training set.
        cities_encoding (pd.Series): Matching 'city_encoded' column of the training set.
        states (pd.Series): 'state' column of the preprocessed training set.
        states_encoding (pd.Series): Matching 'state_encoded' column of the training set.

    Returns:
        pd.DataFrame: Copy of `test_set` where unseen cities/states are replaced by
        'Other', 'city' and 'state' are categorical, and 'city_encoded' /
        'state_encoded' hold the training codes as numeric columns (NaN when the
        training data has no code for a label).
    """
    temp = test_set.copy()
    temp['city'], temp['city_encoded'] = _encode_with_train_vocabulary(
        temp['city'], cities, cities_encoding
    )
    temp['state'], temp['state_encoded'] = _encode_with_train_vocabulary(
        temp['state'], states, states_encoding
    )
    return temp
    
    

# Model inputs; the target passed below is the raw 'price' column.
FEATURES = ['area', 'n_bedrooms', 'n_bathrooms','city_encoded','state_encoded','type_encoded']

# Fit the city/state vocabulary on the training split only, then reuse it for the test split.
train_set_preprocessed = preprocess_data(train_set)
test_set_preprocessed = preprocss_test_data(
    test_set,
    train_set_preprocessed['city'],
    train_set_preprocessed['city_encoded'],
    train_set_preprocessed['state'],
    train_set_preprocessed['state_encoded'],
)

model_results = train_xgb_with_hyperopt(
    train_set_preprocessed,
    test_set_preprocessed,
    FEATURES,
    'price',
)


100%|██████████| 100/100 [01:32<00:00,  1.08trial/s, best loss: -0.7772433638572693]


In [21]:
model_results

{'final_model': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7334342542663905, device=None,
              early_stopping_rounds=None, enable_categorical=True,
              eval_metric='rmse', feature_types=None, gamma=0.048799306537731214,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.0999968145747301,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=3, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 'best_params': {'colsample_bytree': 0.7334342542663905,
  'eval_metric': 'rmse',
  'gamma': 0.048799306537731214,
  'learning_rate': 0.0999968145747301,
  'max_depth': 7,
  