# Training a gradient-boosting model (scikit-learn `HistGradientBoostingRegressor`) for price prediction

### Importing libs and defining functions

In [0]:
import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

In [0]:
def clip_transformer(a_min, a_max):
    """Build a stateless transformer that clips values with ``np.clip``.

    Args:
        a_min: Lower bound, forwarded to ``np.clip`` as ``a_min``.
        a_max: Upper bound, forwarded to ``np.clip`` as ``a_max``.

    Returns:
        A ``FunctionTransformer`` wrapping ``np.clip`` whose output feature
        names are declared identical to its input names ('one-to-one').
    """
    bounds = {'a_min': a_min, 'a_max': a_max}
    return FunctionTransformer(
        func=np.clip,
        kw_args=bounds,
        feature_names_out='one-to-one',
    )

In [0]:
def zero_to_nan(x):
    """Return an array equal to ``x`` with every entry that equals 0 replaced by NaN.

    The input is not modified; the result comes from ``np.where``.
    NOTE(review): presumably a zero here stands for a missing value rather
    than a genuine zero — confirm against the data source.

    Args:
        x: Array-like of numeric values that supports ``x == 0`` element-wise.

    Returns:
        Array with NaN where ``x`` was 0 and the original values elsewhere.
    """
    is_zero = (x == 0)
    return np.where(is_zero, np.nan, x)

In [0]:
# Stateless transformer applying log(1 + x) element-wise; output feature names
# are declared identical to the input names.
log_transformer = FunctionTransformer(func=np.log1p, feature_names_out='one-to-one')

### Reading files

In [0]:
# Load the listings CSV. The file's first column is read as the index and then
# discarded in favour of a fresh 0..n-1 index.
# NOTE(review): this is an absolute, user-specific workspace path — consider a
# configurable data directory.
path = '/Workspace/Users/vinicius.araujo@quintoandar.com.br/DS Upskilling/Database extraction/available_houses.csv'
df = pd.read_csv(path, index_col=0).reset_index(drop=True)
df.head()

Unnamed: 0,type,region_name,total_area,bathrooms,bedrooms,iptu,parking_slots,price
0,Apartamento,Brooklin,48,1,1,190,1,990000
1,Apartamento,Brooklin,396,2,4,1500,3,3600000
2,Apartamento,Brooklin,77,1,2,180,1,767000
3,Apartamento,Brooklin,204,4,4,1750,4,3650000
4,Casa,Brooklin,300,2,3,1334,4,2125000


### Preparing the file

In [0]:
# Separate the target (price) from the features, then hold out 20% of the rows
# for validation; the fixed random_state keeps the split reproducible.
y = df['price']
X = df.drop(columns=['price'])
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Names of the object-dtype feature columns (the ones handed to the one-hot encoder).
categorical_cols = [name for name, dtype in X.dtypes.items() if dtype == 'object']

### Model

In [0]:
# IPTU preprocessing: zeros are first turned into NaN, then log(1 + x) is
# applied. The order matters: without the first step a zero would map to
# log1p(0) = 0 instead of NaN.
iptu_pipeline = Pipeline([
    ('zero_to_nan', FunctionTransformer(func=zero_to_nan, feature_names_out='one-to-one')),
    ('log', FunctionTransformer(func=np.log1p, feature_names_out='one-to-one')),
])

# Column-wise preprocessing:
#   * room / parking counts are clipped to a fixed range,
#   * total_area is log1p-transformed,
#   * object-dtype columns are one-hot encoded (categories unseen at fit time
#     are encoded as all zeros via handle_unknown='ignore'),
#   * iptu goes through its own zero -> NaN -> log1p pipeline,
#   * any column not listed is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('bedrooms_clip', clip_transformer(1, 4), ['bedrooms']),
        ('bathrooms_clip', clip_transformer(1, 5), ['bathrooms']),
        ('parking_clip', clip_transformer(0, 5), ['parking_slots']),
        ('log_num', log_transformer, ['total_area']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('iptu', iptu_pipeline, ['iptu'])
    ],
    remainder='passthrough',
    # OneHotEncoder emits a sparse matrix by default, and ColumnTransformer
    # returns a sparse result whenever the stacked output's density falls below
    # sparse_threshold (default 0.3). The downstream
    # HistGradientBoostingRegressor needs dense input, so always produce a
    # dense array regardless of how many categories the data contains.
    sparse_threshold=0
)

# Gradient-boosted trees. With early stopping enabled, max_iter acts as an upper
# bound on the number of boosting iterations; random_state fixes the seed.
model = HistGradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=8,
    max_iter=1000,
    early_stopping=True,
    random_state=0,
)

# Chain preprocessing and the regressor so both are fitted and applied together.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Wrap the pipeline so it is trained on log1p(price); predictions are mapped
# back to the original price scale with expm1.
final_model = TransformedTargetRegressor(
    func=np.log1p,
    inverse_func=np.expm1,
    regressor=pipe,
)

### Results

In [0]:
# Fit on the training split, then score the hold-out split in price units.
final_model.fit(X_train, y_train)
preds = final_model.predict(X_valid)

valid_mae = mean_absolute_error(y_valid, preds)
print('MAE:', valid_mae)

🏃 View run rogue-snake-284 at: https://dbc-931ee6e0-6803.cloud.databricks.com/ml/experiments/1145737259598963/runs/11727a0ce44d4f6d91763f5e98fba898
🧪 View experiment at: https://dbc-931ee6e0-6803.cloud.databricks.com/ml/experiments/1145737259598963
MAE: 394426.6061823446


### Cross-validation

In [0]:
# 5-fold cross-validation on the training split only; the hold-out split is not
# touched. `cross_val_score` is imported in the notebook's import cell rather
# than here, so all dependencies are declared in one place.
scores = cross_val_score(
    final_model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,  # use every available core
)

# The scorer returns negated MAE (higher is better), so flip the sign back.
mae_scores = -scores

print("MAE por fold:", mae_scores)
print("MAE médio:", mae_scores.mean())
print("Std:", mae_scores.std())

MAE por fold: [281869.84132406 285072.32471413 300396.37432778 296927.33518446
 293685.07843185]
MAE médio: 291590.19079645537
Std: 7034.064329293996


### Exporting pickle

In [0]:
# Persist the whole TransformedTargetRegressor (preprocessing + model + target
# transform) as a single artifact.
# NOTE(review): the pipeline wraps `zero_to_nan`, a function defined in this
# notebook. Pickle-based formats store functions by reference (module + name),
# so the process that loads this file presumably has to provide `zero_to_nan`
# under the same module path — verify in the serving environment.
joblib.dump(final_model, 'house_price_model.joblib')

['house_price_model.joblib']

In [0]:
pip freeze > requirements.txt

### Importing pickle for test purposes

In [0]:
# Reload the exported artifact to check that it deserializes.
# NOTE(review): this runs in the same kernel that defined `zero_to_nan`, so it
# does not exercise loading from a fresh process — confirm that separately.
production_test = joblib.load('house_price_model.joblib')

In [0]:
# One hand-written listing used as a smoke test for the exported model.
sample = pd.DataFrame([{
    "bedrooms": 3,
    "bathrooms": 2,
    "parking_slots": 1,
    "total_area": 85,
    "iptu": 1200,
    "region_name": "Pinheiros",
    "type": "Apartamento"
}])

# Prediction from the artifact reloaded from disk.
production_pred = production_test.predict(sample)
print("Production predicted price:", production_pred[0])

# Prediction from the model still held in memory.
price_pred = final_model.predict(sample)
print("Staging predicted price:", price_pred[0])

# Fail loudly if the round-tripped model diverges from the in-memory one,
# instead of relying on a visual comparison of the two printed numbers.
np.testing.assert_allclose(production_pred, price_pred)

Production predicted price: 879611.4764407098
Staging predicted price: 879611.4764407098
