In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import joblib


In [2]:
# Read the sales dataset from the project's data directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/BigMartSales.csv")


In [3]:
# Target: the sales column. Features: every remaining column of df.
y = df["Item_Outlet_Sales"]
X = df.drop(columns=["Item_Outlet_Sales"])


In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [5]:
# Baseline model: fit a linear regression on the training split
# (fit() returns the estimator itself, so the call can be chained).
lr_model = LinearRegression().fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Score the baseline's predictions against the held-out targets.
lr_r2 = r2_score(y_test, lr_pred)
lr_mse = mean_squared_error(y_test, lr_pred)

print("Linear Regression MSE:", lr_mse)
print("Linear Regression R2:", lr_r2)


Linear Regression MSE: 1291620.8742570821
Linear Regression R2: 0.5247847317667971


In [6]:
# Random forest with 100 trees; the seed is fixed so repeated runs build the same forest.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Kept as the cell's last expression so the fitted estimator is displayed.
rf_model.fit(X_train, y_train)


RandomForestRegressor(random_state=42)

In [7]:
# Predict on the held-out split with the fitted forest.
rf_pred = rf_model.predict(X_test)

# Same two metrics as the linear baseline, so the models can be compared directly.
rf_r2 = r2_score(y_test, rf_pred)
rf_mse = mean_squared_error(y_test, rf_pred)

print("Random Forest MSE:", rf_mse)
print("Random Forest R2:", rf_r2)


Random Forest MSE: 1168155.6463721595
Random Forest R2: 0.5702102606942034


In [8]:
# One row per model, holding the test-set metrics computed in the cells above.
comparison = pd.DataFrame(
    [
        {"Model": "Linear Regression", "MSE": lr_mse, "R2 Score": lr_r2},
        {"Model": "Random Forest", "MSE": rf_mse, "R2 Score": rf_r2},
    ]
)

comparison


Unnamed: 0,Model,MSE,R2 Score
0,Linear Regression,1291621.0,0.524785
1,Random Forest,1168156.0,0.57021


In [9]:
# Persist whichever model scored the higher R2 on the held-out split, instead of
# assuming the random forest always wins (ties keep the random forest).
best_model = rf_model if rf_r2 >= lr_r2 else lr_model

# Create the output directory first so a fresh checkout does not fail on save.
os.makedirs("../models", exist_ok=True)

# Kept as the last expression so the cell still displays the written file path(s).
joblib.dump(best_model, "../models/best_model.pkl")


['../models/best_model.pkl']

In [10]:
# Final cell: print a marker showing the notebook ran through to the end.
status_message = "Day 13 Model Improvement Completed Successfully ✅"
print(status_message)


Day 13 Model Improvement Completed Successfully ✅
