# Model Training

## import libraries

In [16]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# For preprocessing (from previous step)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


## load and split data

In [17]:
# Load dataset and drop the Id column when it is present
data = pd.read_csv("../data/raw/train.csv").drop(columns=["Id"], errors="ignore")

# Split features and target
X = data.drop(columns="SalePrice")
y = data["SalePrice"]

# Identify numeric and categorical columns.
# "number" matches every numeric dtype rather than only int64/float64, and
# "category" is accepted next to "object"; a column that lands in neither
# list is never handed to a transformer downstream, so the selection is kept
# as broad as the two groups allow.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(include=["object", "category"]).columns

# Train-Test Split (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## preprocessing pipeline

In [18]:
# Numeric columns: median imputation followed by standard scaling.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


## model pipeline

In [19]:
def build_model_pipeline(model):
    """Return a Pipeline of a private preprocessor copy followed by ``model``.

    The preprocessor is cloned so that every pipeline owns its own unfitted
    transformer. Pipeline does not copy its steps, so passing the same
    ``preprocessor`` object to several pipelines would make them share one
    fitted state: refitting any of them would silently change the others.

    Parameters
    ----------
    model : estimator
        The final regressor placed in the 'model' step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps 'preprocessor' and 'model'.
    """
    return Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model),
    ])


# Linear Regression
lr_pipeline = build_model_pipeline(LinearRegression())

# Random Forest
rf_pipeline = build_model_pipeline(
    RandomForestRegressor(n_estimators=200, random_state=42)
)

# XGBoost
xgb_pipeline = build_model_pipeline(
    XGBRegressor(n_estimators=500, learning_rate=0.05, random_state=42)
)


## train model

In [20]:
# Fit Linear Regression, Random Forest and XGBoost, in that order, on the
# training split.
for unfitted_pipeline in (lr_pipeline, rf_pipeline, xgb_pipeline):
    unfitted_pipeline.fit(X_train, y_train)


## evaluate model

In [21]:
def evaluate_model(pipeline, X_test, y_test):
    """Score a fitted pipeline against the given features and targets.

    Parameters
    ----------
    pipeline : object with a ``predict`` method
        The fitted model pipeline to evaluate.
    X_test : features passed to ``pipeline.predict``.
    y_test : true target values compared against the predictions.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and the R2 score.
    """
    predictions = pipeline.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)
    return (
        mean_absolute_error(y_test, predictions),
        np.sqrt(squared_error),
        r2_score(y_test, predictions),
    )

# Display name -> fitted pipeline, in reporting order.
models = {
    'Linear Regression': lr_pipeline,
    'Random Forest': rf_pipeline,
    'XGBoost': xgb_pipeline,
}

# Build one report line per model, then emit them together.
report_lines = []
for name, model in models.items():
    mae, rmse, r2 = evaluate_model(model, X_test, y_test)
    report_lines.append(f"{name} -> MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.3f}")
print("\n".join(report_lines))


Linear Regression -> MAE: 18288.19, RMSE: 29473.85, R2: 0.887
Random Forest -> MAE: 17412.83, RMSE: 28512.65, R2: 0.894
XGBoost -> MAE: 15919.65, RMSE: 25194.87, R2: 0.917


## save best model

In [22]:
# Save the pipeline with the lowest RMSE on the held-out split instead of a
# hard-coded one, so the file on disk always matches the metrics reported above.
# NOTE(review): the choice is made on the same split used for reporting, so the
# reported score of the saved model is slightly optimistic.
test_rmse = {
    name: evaluate_model(model, X_test, y_test)[1]
    for name, model in models.items()
}
best_model_name = min(test_rmse, key=test_rmse.get)
print(f"Saving {best_model_name} (test RMSE: {test_rmse[best_model_name]:.2f})")

# Create the output directory first; joblib.dump does not create it.
model_path = Path("../models/home_price_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(models[best_model_name], model_path)


['../models/home_price_model.pkl']