# 🛍️ Sales Forecasting Project - Multi-Model Evaluation

This notebook demonstrates sales forecasting using multiple models:
- Random Forest
- XGBoost
- CatBoost
- Prophet (time series model)

We'll perform exploratory data analysis (EDA), feature engineering, model training, and evaluation.

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from prophet import Prophet
import warnings
warnings.filterwarnings("ignore")


## 📂 Load Dataset

In [None]:
# Location of the sales CSV, relative to the notebook's working directory.
# Edit this if the data lives somewhere else.
file_path = "data/sales_data.csv"

# Read the file, converting the "Date" column to datetimes on load so the
# `.dt` accessor can be used on it in later cells.
df = pd.read_csv(filepath_or_buffer=file_path, parse_dates=["Date"])

# Preview the first five rows.
df.head(n=5)


## 📊 Data Overview

In [None]:
# Summary statistics for the loaded frame (pandas' default `describe()` settings).
df.describe()

In [None]:
# Print the frame's column names, dtypes and non-null counts.
df.info()

## 🧪 Feature Engineering

In [None]:
# Derive calendar features from the parsed "Date" column in one step:
#   Week  - ISO week number
#   Month - calendar month
#   Year  - calendar year
date_parts = df["Date"].dt
df = df.assign(
    Week=date_parts.isocalendar().week,
    Month=date_parts.month,
    Year=date_parts.year,
)
df.head()

## 📈 Sales Trend Overview

In [None]:
# Sum sales over all rows that share a date, then plot the resulting series.
sales_by_date = df.groupby("Date")["Sales"].sum()

fig, ax = plt.subplots(figsize=(14, 5))
sns.lineplot(data=sales_by_date, ax=ax)
ax.set_title("Total Weekly Sales Over Time")
ax.set_ylabel("Sales")
ax.grid(True)
plt.show()

## 🤖 Model 1: Random Forest, XGBoost, CatBoost

In [None]:

# Features and target.
# Rows are put in chronological order first so that the hold-out set below is
# the most recent slice of history. A shuffled split would let the models train
# on rows that come *after* the rows they are evaluated on, which overstates
# forecasting accuracy.
df_sorted = df.sort_values("Date", kind="stable")
X = df_sorted[["Store", "Promo", "Week", "Month", "Year"]]
y = df_sorted["Sales"]

categorical_features = ["Store"]
numeric_features = ["Promo", "Week", "Month", "Year"]

# One-hot encode the store id; pass the remaining features through unchanged.
# Listing the numeric columns explicitly yields the same output columns, in the
# same order, as remainder="passthrough" would for this feature set.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", "passthrough", numeric_features),
])

# Chronological split: the last 20% of rows (by date) are held out for testing.
# shuffle=False keeps the sorted order, so no random_state is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Define models
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

# Train each model inside the same preprocessing pipeline and score it on the
# held-out rows.
results = {}
for name, model in models.items():
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    results[name] = {
        "MAE": mean_absolute_error(y_test, preds),
        # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
        # mean_squared_error was deprecated and later removed from
        # scikit-learn, so this form works across versions.
        "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        "R2": r2_score(y_test, preds)
    }

# Display results: one row per model, one column per metric.
pd.DataFrame(results).T


## 🔮 Model 2: Prophet (Time Series)

In [None]:

# Aggregate total sales per date and rename the columns to "ds" (timestamp)
# and "y" (value), the names the Prophet model is fitted on below.
df_prophet = df.groupby("Date")[["Sales"]].sum().reset_index()
df_prophet.columns = ["ds", "y"]

model = Prophet()
model.fit(df_prophet)

# freq="W" is anchored on Sundays. If the history is weekly on another weekday
# (e.g. every Friday), plain "W" would generate future dates that do not line
# up with the data's own cadence. Use the weekly anchor inferred from the
# history when there is one; otherwise fall back to the original "W".
# (infer_freq needs at least three dates and returns None for irregular data.)
inferred_freq = (
    pd.infer_freq(pd.DatetimeIndex(df_prophet["ds"]))
    if len(df_prophet) >= 3
    else None
)
forecast_freq = (
    inferred_freq if inferred_freq and inferred_freq.startswith("W") else "W"
)

# Extend the history by 12 weekly periods and predict over history + future.
future = model.make_future_dataframe(periods=12, freq=forecast_freq)
forecast = model.predict(future)

# Plot forecast
model.plot(forecast)
plt.title("Prophet Forecast: Weekly Sales")
plt.show()


## ✅ Summary & Observations

In [None]:
# Put Prophet's point predictions (yhat) next to the aggregated actual sales,
# both indexed by date.
forecast_tail = (
    forecast[["ds", "yhat"]]
    .set_index("ds")
    .rename(columns={"yhat": "Prophet_Predicted"})
)
actual = (
    df_prophet[["ds", "y"]]
    .set_index("ds")
    .rename(columns={"y": "Actual_Sales"})
)

# Align the two on the date index and drop any date where either value is
# missing, so only dates with both an actual and a prediction are plotted.
merged = pd.concat([actual, forecast_tail], axis=1).dropna()

fig, ax = plt.subplots(figsize=(14, 5))
for column, label in (
    ("Actual_Sales", "Actual"),
    ("Prophet_Predicted", "Prophet Forecast"),
):
    ax.plot(merged.index, merged[column], label=label)
ax.legend()
ax.set_title("Actual vs Prophet Forecast")
ax.grid(True)
plt.show()
