In [2]:
# Imports
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import os

## 📥 Load Cleaned Data

In [3]:
# Read the cleaned dataset, then separate the target column from the features
df = pd.read_csv('../data/cleaned/cleaned_data.csv')
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 🌲 Decision Tree Regressor – GridSearchCV

In [4]:
# Candidate hyper-parameter values explored for the decision tree
dt_params = dict(
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated grid search, ranked by R², parallelised with n_jobs=-1
dt_grid = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=dt_params,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

# Keep the best estimator found by the search and report its parameters
best_dt = dt_grid.best_estimator_
print("Best Decision Tree Parameters:", dt_grid.best_params_)

Best Decision Tree Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}


## 🌳 Random Forest Regressor – GridSearchCV

In [5]:
# Candidate hyper-parameter values explored for the random forest
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 3-fold cross-validated grid search, ranked by R², parallelised with n_jobs=-1
rf_grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_params,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# Keep the best estimator found by the search and report its parameters
best_rf = rf_grid.best_estimator_
print("Best Random Forest Parameters:", rf_grid.best_params_)

Best Random Forest Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


## 📊 Model Evaluation

In [8]:
# Evaluate both tuned models
def evaluate_model(name, model, X=None, y=None):
    """Score a fitted regressor and print a one-line metric summary.

    Parameters
    ----------
    name : str
        Label shown at the start of the printed summary line.
    model : object
        Fitted estimator exposing a ``predict`` method.
    X, y : optional
        Features and true targets to score against. When omitted, the
        notebook-level ``X_test`` / ``y_test`` hold-out split is used, which
        matches the original two-argument behaviour.

    Returns
    -------
    list
        ``[r2, rmse, mae]`` in that order.
    """
    features = X_test if X is None else X
    targets = y_test if y is None else y

    preds = model.predict(features)
    r2 = r2_score(targets, preds)
    # RMSE is derived from the mean_squared_error already imported at the top of
    # the notebook: no function-scope import, and no dependency on the
    # root_mean_squared_error helper that older scikit-learn releases lack.
    # For a single target column the two computations give the same value.
    rmse = float(np.sqrt(mean_squared_error(targets, preds)))
    mae = mean_absolute_error(targets, preds)
    print(f"{name} -> R²: {r2:.4f}, RMSE: {rmse:.2f}, MAE: {mae:.2f}")
    return [r2, rmse, mae]

# Column-oriented accumulator: one list per output column of the summary table
results = {"Model": [], "R2 Score": [], "RMSE": [], "MAE": []}

for name, model in (
    ("Tuned Decision Tree", best_dt),
    ("Tuned Random Forest", best_rf),
):
    r2, rmse, mae = evaluate_model(name, model)
    # Map each metric to its column, then append the row in one pass
    row = {"Model": name, "R2 Score": r2, "RMSE": rmse, "MAE": mae}
    for column, value in row.items():
        results[column].append(value)

# Last expression of the cell: rendered as the comparison table
pd.DataFrame(results)

Tuned Decision Tree -> R²: 0.8202, RMSE: 37137.27, MAE: 25053.69
Tuned Random Forest -> R²: 0.8948, RMSE: 28405.01, MAE: 17544.94


Unnamed: 0,Model,R2 Score,RMSE,MAE
0,Tuned Decision Tree,0.820193,37137.265615,25053.692744
1,Tuned Random Forest,0.89481,28405.012769,17544.942611


## 💾 Save Best Models

In [9]:
# Make sure the output folder exists before writing the model files
os.makedirs('../models/tuned', exist_ok=True)

# Persist each tuned estimator to its own pickle file
for fitted_model, target_path in (
    (best_dt, '../models/tuned/best_decision_tree.pkl'),
    (best_rf, '../models/tuned/best_random_forest.pkl'),
):
    joblib.dump(fitted_model, target_path)

print("✅ Tuned models saved in 'models/tuned/' folder.")

✅ Tuned models saved in 'models/tuned/' folder.
