# Supply Chain Demand Forecasting

This project aims to predict the number of products sold using various supply chain features. We use a Random Forest Regressor to forecast demand based on structured features from the dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

import warnings
# Silence every warning for the rest of the session so cell output stays clean.
# NOTE(review): this blanket filter also hides deprecation notices from
# pandas / scikit-learn; consider narrowing it before upgrading libraries.
warnings.filterwarnings("ignore")

## Data Loading

In [None]:
# Location of the raw CSV export. Adjust this path as needed for your
# environment (presumably a Colab upload directory -- confirm).
file_path = '/content/supply_chain_data.csv'

# Read the whole file into an in-memory DataFrame.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows; as the last expression of the cell the
# notebook renders it as a table.
data.head(n=5)

## Initial Data Overview

In [None]:
# Print a concise summary of the frame: column names, dtypes, non-null counts
# and memory usage.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max). For a
# mixed-dtype frame pandas summarizes only the numeric columns by default.
data.describe()

In [None]:
# Count missing values per column to see whether any cleaning or imputation
# is needed before modelling.
data.isna().sum()

## Exploratory Data Analysis (EDA)

In [None]:
# Distribution of the target variable: histogram with a kernel density
# estimate drawn on top, on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['number_of_products_sold'], kde=True, ax=ax)
ax.set_title('Distribution of Products Sold')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, rendered as a heatmap
# with each cell annotated to two decimal places.
fig, ax = plt.subplots(figsize=(12, 6))
corr_matrix = data.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

## Feature Engineering and Preprocessing

In [None]:
# Separate the predictors from the target column.
X = data.drop(columns='number_of_products_sold')
y = data['number_of_products_sold']

# Route columns to a transformer by dtype. ColumnTransformer discards any
# column that is not listed in one of its transformers (remainder='drop' is
# the default), so the selectors are kept broad:
#   * 'number' matches every numeric dtype, not only int64/float64
#     (e.g. int32 or float32 columns would otherwise be lost silently);
#   * category and bool columns are one-hot encoded alongside object columns.
# Bool is not matched by 'number', so the two lists do not overlap.
categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Categories that were not seen during fit are encoded as all zeros at
# predict time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
# Standardize numeric features to zero mean and unit variance.
numerical_transformer = StandardScaler()

# Apply each transformer to its own column subset and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

## Model Training with Random Forest

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bundle preprocessing and the regressor into one estimator so the same
# transformations are applied at fit time and at predict time.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=42)),
    ]
)

# Fit on the training split only; as the last expression of the cell the
# fitted pipeline is also displayed by the notebook.
pipeline.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Predict on the held-out test split.
y_pred = pipeline.predict(X_test)

# MAE: average absolute gap between actual and predicted units sold.
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as the square root of the MSE rather than through the
# `squared=False` argument, which scikit-learn deprecated in 1.4 and removed
# in 1.6 (where passing it raises a TypeError).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R²: proportion of the target's variance explained by the predictions.
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

## Saving the Model

In [None]:
# Serialize the fitted pipeline (preprocessing + model) to a file in the
# current working directory; it can be restored later with joblib.load().
joblib.dump(pipeline, 'final_supply_chain_model.joblib')

## Next Steps

- Try hyperparameter tuning for further improvements
- Explore more advanced models (XGBoost, LightGBM)
- Perform time-based train/test splits if applicable
- Deploy model using Flask or Streamlit for real-world use