In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
import joblib
from model.transformers import MapFloorValues, StatusValues, BasicTransformer
from model.transformers import remove_index
import numpy as np
import pandas as pd

pd.set_option("future.no_silent_downcasting", True)
pd.set_option('display.max_columns', 50)  
pd.set_option('display.max_rows', 50)  

In [78]:
def load_data(file_path, target_column):
    """
    Load data for TSV files.
    
    Args:
    - file_path (str): Path to the TSV file.
    - target_column (str): The column to be used as the target for the model.
    
    Returns:
    - X, y 
    """
    # Load the data from the CSV file
    data = pd.read_csv(file_path, sep='\t')
    
    # Check for missing values (optional step)
    # data = data.dropna()
    
    # Split features (X) and target (y)
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return X, y

In [79]:
rents_file_path = '../output/rents/output.csv'
rents_X, rents_y = load_data(rents_file_path, 'price')
rents_X = rents_X.drop(columns=['tenantGender', 'newDevelopmentFinished', 'garageType', 'isSmokingAllowed', 'externalReference', 'thumbnail','topNewDevelopment', 'superTopHighlight', 'hasStaging'])

In [80]:
sales_file_path = '../output/sales/output.csv'
sales_X, sales_y = load_data(sales_file_path, 'price')
sales_X = sales_X.drop(columns=['tenantGender', 'newDevelopmentFinished', 'garageType', 'isSmokingAllowed', 'externalReference', 'thumbnail', 'topNewDevelopment', 'superTopHighlight', 'hasStaging'])

In [81]:
param_grid = {
    'n_estimators': [100],
    'min_child_weight': [0.5, 1],
    'gamma': [0, 0.25],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [1, 2],
    'reg_lambda': [5, 10],
}

# Define the preprocessing steps
categorical_features = ['district', 'neighborhood', 'propertyType']
binary_features = ['newDevelopment', 'exterior', 'hasLift', 'garage', 'storage_room', 'suite_bath', 'janitor', 'pool', 'animal', 'mansard']
numerical_features = ['bathrooms', 'size', 'rooms', 'created', 'latitude', 'longitude']


preprocessor = ColumnTransformer(
    transformers=[
        ('bin', 'passthrough', binary_features),  # Keep numerical features as they are
        ('num', 'passthrough', numerical_features),
        ('status', StatusValues(), ['status']),
        ('floor', MapFloorValues(), ['floor']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan), categorical_features)  # One-hot encode categorical features
    ]
)

grid_search = GridSearchCV(estimator=xgb.XGBRegressor(), param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
xboostregressor_normalized_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', grid_search) 
])

pipeline = TransformedTargetRegressor(
    regressor=xboostregressor_normalized_model_pipeline,
    transformer=StandardScaler()  
    )

In [82]:
def split_dataframe(df, target_column):
    y_pandas = df[[target_column]]         # Select the target column as a DataFrame
    X_pandas = df.drop(columns=[target_column])  # Drop the target column for X
    return X_pandas, y_pandas

In [83]:
def evaluate_model(model, X_test, y_test):
    """
    Evaluate a regression model on test data and print performance metrics.

    Parameters:
    - model: The trained regression model.
    - X_test: Features of the test dataset.
    - y_test: True target values for the test dataset.

    Returns:
    - metrics: A dictionary containing MSE, MAE, and R² score.
    """
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test,(y_pred))
    mae = mean_absolute_error(y_test, (y_pred))
    r2 = r2_score(y_test, (y_pred))
    
    # Print metrics
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"R² Score: {r2:.4f}")
    
    # Return metrics as a dictionary
    return {
        'mse': mse,
        'mae': mae,
        'r2': r2
    }

In [84]:
def filter_abs_percent(df, column, percent):
    threshold = df[column].abs().quantile(percent) 
    return df[df[column].abs() <= threshold] 

In [85]:
rents_X = BasicTransformer(training=True).transform(rents_X)
sales_X = BasicTransformer(training=True).transform(sales_X)

In [86]:
rents_X, rents_y = remove_index(rents_X, rents_y)
sales_X, sales_y = remove_index(sales_X, sales_y)

In [88]:
rents_X_train, rents_X_test, rents_y_train, rents_y_test = train_test_split(rents_X, rents_y, test_size=0.2, random_state=42)
sales_X_train, sales_X_test, sales_y_train, sales_y_test = train_test_split(sales_X, sales_y, test_size=0.2, random_state=42)

In [89]:
xboostregressor_rents = clone(pipeline)
xboostregressor_rents.fit(rents_X_train, rents_y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=  -0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=  -0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=  -0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=   2.8s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=   2.8s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_d

In [90]:
xboostregressor_sales = clone(pipeline)
xboostregressor_sales.fit(sales_X_train, sales_y_train)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_d

In [91]:
print("Valor rents")
evaluate_model(xboostregressor_rents, rents_X_test, rents_y_test)
print("Valor sales")
evaluate_model(xboostregressor_sales, sales_X_test, sales_y_test)
print("--------------")

Valor rents
Mean Squared Error: 40718.4865
Mean Absolute Error: 155.0056
R² Score: 0.5952
Valor sales
Mean Squared Error: 1894346650.0967
Mean Absolute Error: 32965.5624
R² Score: 0.6915
--------------


In [92]:
print("Valor rents")
evaluate_model(xboostregressor_rents, rents_X_train,  rents_y_train)
print("Valor sales")
evaluate_model(xboostregressor_sales, sales_X_train, sales_y_train)
print("--------------")

Valor rents
Mean Squared Error: 29548.0741
Mean Absolute Error: 131.7056
R² Score: 0.7048
Valor sales
Mean Squared Error: 1139675283.8062
Mean Absolute Error: 25359.3626
R² Score: 0.8130
--------------


In [93]:
sales_y_pred = xboostregressor_sales.predict(sales_X)
rents_y_pred= xboostregressor_rents.predict(rents_X)
sales_y_diff_series = sales_y - sales_y_pred
rents_y_diff_series = rents_y - rents_y_pred
sales_pred = pd.concat([sales_X, sales_y_diff_series.rename('y_diff'),sales_y.rename('y')], axis=1)
rents_pred = pd.concat([rents_X, rents_y_diff_series.rename('y_diff'),rents_y.rename('y')], axis=1)


In [94]:
sales_pred_quantil = filter_abs_percent(sales_pred, 'y_diff', 0.975)
rents_pred_quantil = filter_abs_percent(rents_pred, 'y_diff', 0.9)

In [95]:
rents_X_quantil, rents_y_quantil = split_dataframe(rents_pred_quantil, 'y')
sales_X_quantil, sales_y_quantil = split_dataframe(sales_pred_quantil, 'y')

In [96]:
pipeline_rents = clone(pipeline)
pipeline_rents.fit(rents_X_quantil, rents_y_quantil)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_d

In [97]:
pipeline_sales = clone(pipeline)
pipeline_sales.fit(sales_X_quantil, sales_y_quantil)

Fitting 3 folds for each of 192 candidates, totalling 576 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=5, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=3, min_child_weight=0.5, n_estimators=100, reg_alpha=1, reg_lambda=10, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_d

In [98]:
print("Valor rents")
evaluate_model(pipeline_rents, rents_X_test, rents_y_test)
print("Valor sales")
evaluate_model(pipeline_sales, sales_X_test, sales_y_test)
print("--------------")

Valor rents
Mean Squared Error: 38720.0660
Mean Absolute Error: 144.4919
R² Score: 0.6151
Valor sales
Mean Squared Error: 1523692346.0037
Mean Absolute Error: 27831.8902
R² Score: 0.7519
--------------


In [99]:
joblib.dump(pipeline_rents, "../model/rents_model.pkl")
joblib.dump(pipeline_sales, "../model/sales_model.pkl")

['../model/sales_model.pkl']