# BigMart Sales Prediction - Model Training

## 1. Import Libraries & Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [None]:
# Load the feature-engineered train and test sets from the processed-data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/processed/feat_eng_train.csv',
        '../dataset/processed/feat_eng_test.csv',
    )
)

# Report (rows, columns) for each frame as a quick sanity check.
print("Train Shape:", train_df.shape)
print("Test Shape:", test_df.shape)

## 2. Train/Validation Split
Drop identifier and redundant columns, then split the training data into training and validation sets.

In [None]:
# Separate the target from the input features.
X = train_df.drop(columns=['Item_Outlet_Sales'])
y = train_df['Item_Outlet_Sales']

# Identifier columns might prevent model training if they are not numerical,
# so remove them from both the train and test inputs.
cols_to_drop = ['Item_Identifier', 'Outlet_Identifier']
X = X.drop(columns=cols_to_drop, errors='ignore')
# The test inputs must not carry the target either; errors='ignore' makes this
# a no-op when the test file has no such column.
test_X = test_df.drop(columns=['Item_Outlet_Sales', *cols_to_drop], errors='ignore')

# Also drop Outlet_Establishment_Year if it's still there, as we have Outlet_Years.
# Each frame is handled independently with errors='ignore', so a column present
# in only one of them neither raises KeyError nor survives in the other.
X = X.drop(columns=['Outlet_Establishment_Year'], errors='ignore')
test_X = test_X.drop(columns=['Outlet_Establishment_Year'], errors='ignore')

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Set:", X_train.shape)
print("Validation Set:", X_val.shape)

## 3. Model Training & Evaluation
Using RMSE as the evaluation metric.

In [None]:
def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and return its validation RMSE.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is
            called on this object, so the caller's instance ends up trained.
        X_train: Training features.
        y_train: Training target values.
        X_val: Validation features.
        y_val: Validation target values.

    Returns:
        The square root of the mean squared error between ``y_val`` and the
        model's predictions for ``X_val``.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    val_mse = mean_squared_error(y_val, val_predictions)
    return np.sqrt(val_mse)

### Linear Regression

In [None]:
# Baseline: linear regression with default settings, scored on the validation split.
lr = LinearRegression()
rmse_lr = evaluate_model(
    model=lr,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
print("Linear Regression RMSE:", rmse_lr)

### Decision Tree

In [None]:
# Single decision tree with default hyperparameters; the seed is fixed so the
# fitted tree is reproducible.
dt = DecisionTreeRegressor(random_state=42)
rmse_dt = evaluate_model(
    model=dt,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
print("Decision Tree RMSE:", rmse_dt)

### Random Forest

In [None]:
# Random forest of 100 trees; the seed is fixed so the ensemble is reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rmse_rf = evaluate_model(
    model=rf,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
print("Random Forest RMSE:", rmse_rf)

### XGBoost

In [None]:
# Gradient-boosted trees: 100 boosting rounds, learning rate 0.1, squared-error
# objective. random_state is pinned to the same seed as the Decision Tree and
# Random Forest cells so every seedable model in this comparison is reproducible.
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
rmse_xgb = evaluate_model(xgb, X_train, y_train, X_val, y_val)
print("XGBoost RMSE:", rmse_xgb)