# Modelling

In [8]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from house_prices_ml.config import RAW_DATA_DIR, TARGET
from house_prices_ml.modelling import build_pipeline, tune_hyperparameters, evaluate_model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Read the raw training table, using its "Id" column as the index.
train = pd.read_csv(RAW_DATA_DIR / "train.csv", index_col="Id")

# Separate the target column from the predictor columns.
y = train[TARGET]
X = train.drop(columns=TARGET)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Seeded XGBoost regressor, wrapped by the project's pipeline builder.
model = XGBRegressor(random_state=42)
pipeline = build_pipeline(model)

# Run the hyperparameter search on the training split only, so the
# validation split stays untouched for the evaluation further down.
search = tune_hyperparameters(
    pipeline,
    X_train,
    y_train,
)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [5]:
# Take the winning estimator from the search result; the evaluation cell predicts with it.
# NOTE(review): assumes `search` is a scikit-learn style search object that was
# refit on X_train / y_train — confirm in tune_hyperparameters.
best_model = search.best_estimator_

In [9]:
# Score the tuned model on the split it was fitted on ...
y_train_pred = best_model.predict(X_train)
train_mse = evaluate_model(y_train, y_train_pred)

# ... and on the held-out validation split.
y_val_pred = best_model.predict(X_val)
val_mse = evaluate_model(y_val, y_val_pred)

# NOTE(review): the metric computed by evaluate_model and the scoring used by the
# search are not visible in this notebook — confirm that the "MSE" labels are
# accurate and that the search score is comparable to them.
print(
    f"Train MSE: {train_mse:,.2f}",
    f"Validation MSE: {val_mse:,.2f}",
    f"Search best score: {search.best_score_}",
    sep="\n",
)

Train MSE: 0.11
Validation MSE: 0.13
Search best score: 0.8655501961708069
