In [1]:
# Import libraries
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

## 📥 Load Cleaned Data

In [2]:
# Read the cleaned CSV into a DataFrame and preview the first rows
cleaned_csv = '../data/cleaned/cleaned_data.csv'
df = pd.read_csv(cleaned_csv)
df.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,3,65.0,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
1,20,3,80.0,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
2,60,3,68.0,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
3,70,3,60.0,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,60,3,84.0,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000


## ✂️ Train-Test Split

In [3]:
# Target is the SalePrice column; every other column is used as a feature
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 📈 Linear Regression

In [4]:
# Fit on the training split (fit() returns the estimator itself), then predict the held-out rows
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Evaluation: each metric is called as metric(y_true, y_pred) on the test split
r2_lr, mse_lr, mae_lr = (
    metric(y_test, y_pred_lr)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
rmse_lr = np.sqrt(mse_lr)  # RMSE is the square root of the MSE computed above

print(f"Linear Regression -> R²: {r2_lr:.4f}, RMSE: {rmse_lr:.2f}, MAE: {mae_lr:.2f}")

Linear Regression -> R²: 0.8493, RMSE: 33999.26, MAE: 21338.37


## 🌲 Decision Tree Regressor

In [5]:
# Fit a seeded decision tree on the training split, then predict the held-out rows
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Evaluation: each metric is called as metric(y_true, y_pred) on the test split
r2_dt, mse_dt, mae_dt = (
    metric(y_test, y_pred_dt)
    for metric in (r2_score, mean_squared_error, mean_absolute_error)
)
rmse_dt = np.sqrt(mse_dt)  # RMSE is the square root of the MSE computed above

print(f"Decision Tree -> R²: {r2_dt:.4f}, RMSE: {rmse_dt:.2f}, MAE: {mae_dt:.2f}")

Decision Tree -> R²: 0.7977, RMSE: 39390.50, MAE: 26411.79


## 📊 Performance Comparison

In [6]:
# One record per model; the dict keys become the table's columns in this order
results = pd.DataFrame([
    {'Model': 'Linear Regression', 'R2 Score': r2_lr, 'RMSE': rmse_lr, 'MAE': mae_lr},
    {'Model': 'Decision Tree', 'R2 Score': r2_dt, 'RMSE': rmse_dt, 'MAE': mae_dt},
])
results

Unnamed: 0,Model,R2 Score,RMSE,MAE
0,Linear Regression,0.849296,33999.25851,21338.366858
1,Decision Tree,0.797712,39390.495519,26411.791096


## 💾 Save Models

In [7]:
# Save both fitted models to disk with joblib.
# `os` comes from the notebook's top import cell, so all dependencies are declared in one place.
MODELS_DIR = '../models'  # single definition of the output folder used below

# exist_ok=True lets this cell be re-run without failing when the folder already exists
os.makedirs(MODELS_DIR, exist_ok=True)

joblib.dump(lr_model, f'{MODELS_DIR}/linear_regression.pkl')
joblib.dump(dt_model, f'{MODELS_DIR}/decision_tree.pkl')
print(f"✅ Models saved in '{MODELS_DIR}/' folder")

✅ Models saved in '../models/' folder
