<a href="https://colab.research.google.com/github/adadoun/inventoryPlanningRecommendation/blob/main/NNSalesPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview:

This notebook tackles the problem of sales prediction using Neural Networks that benefits of encoding the skus efficiently

## Library Import

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import joblib

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [4]:
# Load the dataset
train_data = pd.read_csv('drive/MyDrive/Collab_DATA/PolarData/train_data.csv')
test_data = pd.read_csv('drive/MyDrive/Collab_DATA/PolarData/test_data.csv')

## Data Preparation for Neural Network Training

In [6]:
# Prepare data for PyTorch
feature_columns = [col for col in train_data.columns if col not in ['DATE',
                                                                    'QUANTITY_SOLD',
                                                                    'SKU',
                                                                    'CURRENT_LEVEL',
                                                                    'SKU_INDEX']]

X_train = train_data[feature_columns]
y_train = train_data['QUANTITY_SOLD']
sku_train = train_data['SKU_INDEX']

X_test = test_data[feature_columns]
y_test = test_data['QUANTITY_SOLD']
sku_test = test_data['SKU_INDEX']

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Save the scaler for later usage
scaler_filename = "drive/MyDrive/Collab_DATA/PolarData/scaler.save"
joblib.dump(scaler, scaler_filename)
print(f"Scaler saved to {scaler_filename}")

X_test_scaled = scaler.transform(X_test)

Scaler saved to drive/MyDrive/Collab_DATA/PolarData/scaler.save


## Pytorch code for NN Development

### Sales DataSet for loading

In [7]:
class SalesDataset(Dataset):
    """
    Custom PyTorch Dataset for sales data.

    This dataset holds the features (X), target values (y), and SKU identifiers
    for sales prediction tasks.

    Args:
        X (np.ndarray): Input features.
        y (np.ndarray or pd.Series): Target values.
        sku (np.ndarray or pd.Series): SKU identifiers.

    Attributes:
        X (torch.FloatTensor): Input features tensor.
        y (torch.FloatTensor): Target values tensor.
        sku (torch.LongTensor): SKU identifiers tensor.
    """

    def __init__(self, X, y, sku):
        self.X = torch.FloatTensor(X).to(device)
        self.y = torch.FloatTensor(y.values).to(device)
        self.sku = torch.LongTensor(sku.values).to(device)

    def __len__(self):
        """Returns the number of samples in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """
        Retrieves a single sample from the dataset.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            tuple: (features, sku, target) for the requested sample.
        """
        return self.X[idx], self.sku[idx], self.y[idx]

In [12]:
# Create datasets and dataloaders
train_dataset = SalesDataset(X_train_scaled, y_train, sku_train)
test_dataset = SalesDataset(X_test_scaled, y_test, sku_test)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

### Neural Network Definition

In [20]:
class SalesNN(nn.Module):
    """
    Neural Network model for sales prediction.

    This model combines numerical features with SKU embeddings to predict sales.

    Args:
        num_features (int): Number of numerical input features.
        num_skus (int): Number of unique SKUs in the dataset.
        embedding_dim (int, optional): Dimension of the SKU embedding. Defaults to 32.

    Attributes:
        sku_embedding (nn.Embedding): Embedding layer for SKUs.
        fc1, fc2, fc3, fc4 (nn.Linear): Fully connected layers.
        relu (nn.ReLU): ReLU activation function.
    """

    def __init__(self, num_features, num_skus, embedding_dim=16):
        super(SalesNN, self).__init__()
        self.sku_embedding = nn.Embedding(num_skus, embedding_dim)
        self.fc1 = nn.Linear(num_features + embedding_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, x, sku):
        """
        Forward pass of the neural network.

        Args:
            x (torch.Tensor): Tensor of numerical features.
            sku (torch.Tensor): Tensor of SKU indices.

        Returns:
            torch.Tensor: Predicted sales values.
        """
        sku_emb = self.sku_embedding(sku)
        x = torch.cat((x, sku_emb), dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        return self.fc4(x).squeeze()

def initialize_model(feature_columns, sku_train, device):
    """
    Initialize the SalesNN model, loss function, and optimizer.

    Args:
        feature_columns (list): List of feature column names.
        sku_train (pd.Series or np.array): Training data SKU identifiers.
        device (torch.device): Device to run the model on (CPU or GPU).

    Returns:
        tuple: (model, criterion, optimizer)
    """
    # Initialize model
    salesnn_model = SalesNN(num_features=len(feature_columns), num_skus=sku_train.nunique()).to(device)

    # Define loss function
    criterion = nn.MSELoss()

    # Initialize optimizer
    optimizer = optim.Adam(salesnn_model.parameters(), lr=0.00003)

    return salesnn_model, criterion, optimizer

salesnn_model, criterion, optimizer = initialize_model(feature_columns, sku_train, device)

In [21]:
def print_model_architecture(model, num_features, num_skus):
    """
    Print a detailed summary of the model architecture.

    Args:
        model (nn.Module): The PyTorch model to summarize.
        num_features (int): The number of input features.
        num_skus (int): The number of unique SKUs.
    """
    def get_layer_info(module):
        """Helper function to get info about a layer"""
        return {
            "Layer Type": type(module).__name__,
            "Output Shape": str(tuple(module.weight.shape)) if hasattr(module, 'weight') else "N/A",
            "Param #": sum(p.numel() for p in module.parameters() if p.requires_grad)
        }

    print("Model Architecture:")
    print("==================")

    layers_info = []
    for name, module in model.named_children():
        layer_info = get_layer_info(module)
        if name == "sku_embedding":
            layer_info["Input Shape"] = f"({num_skus},)"
        elif name == "fc1":
            layer_info["Input Shape"] = f"({num_features + model.sku_embedding.embedding_dim},)"
        else:
            prev_layer = layers_info[-1] if layers_info else None
            layer_info["Input Shape"] = prev_layer["Output Shape"] if prev_layer else "N/A"
        layers_info.append({"Layer Name": name, **layer_info})

    print(tabulate(layers_info, headers="keys", tablefmt="grid"))

    # Print total parameters
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nTotal Trainable Parameters: {total_params:,}")

    # Print model summary
    print("\nModel Summary:")
    print("==============")
    print(model)

# Example usage:
num_features = len(feature_columns)  # Replace with actual number of features
num_skus = sku_train.nunique()  # Replace with actual number of unique SKUs
print_model_architecture(salesnn_model, num_features, num_skus)


Model Architecture:
+---------------+--------------+----------------+-----------+---------------+
| Layer Name    | Layer Type   | Output Shape   |   Param # | Input Shape   |
| sku_embedding | Embedding    | (792, 16)      |     12672 | (792,)        |
+---------------+--------------+----------------+-----------+---------------+
| fc1           | Linear       | (128, 61)      |      7936 | (61,)         |
+---------------+--------------+----------------+-----------+---------------+
| fc2           | Linear       | (64, 128)      |      8256 | (128, 61)     |
+---------------+--------------+----------------+-----------+---------------+
| fc3           | Linear       | (32, 64)       |      2080 | (64, 128)     |
+---------------+--------------+----------------+-----------+---------------+
| fc4           | Linear       | (1, 32)        |        33 | (32, 64)      |
+---------------+--------------+----------------+-----------+---------------+
| relu          | ReLU         | N/A        

### Neural Network Training

In [22]:
from torch.nn import L1Loss

def train_sales_nn(salesnn_model, train_loader, test_loader, optimizer, num_epochs=20):
    """
    Train the Sales Neural Network model using Mean Absolute Error.

    Args:
        salesnn_model (nn.Module): The SalesNN model to train.
        train_loader (DataLoader): DataLoader for training data.
        test_loader (DataLoader): DataLoader for test data.
        optimizer (torch.optim.Optimizer): The optimizer for training.
        num_epochs (int, optional): Number of training epochs. Defaults to 20.

    Returns:
        tuple: Lists of training and test MAE for each epoch.
    """
    # Use L1Loss for MAE
    criterion = L1Loss()

    train_maes = []
    test_maes = []

    for epoch in tqdm(range(num_epochs), desc="Training Progress"):
        # Training phase
        salesnn_model.train()
        train_mae = 0
        for X_batch, sku_batch, y_batch in train_loader:
            optimizer.zero_grad()
            y_pred = salesnn_model(X_batch, sku_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            train_mae += loss.item()

        avg_train_mae = train_mae / len(train_loader)
        train_maes.append(avg_train_mae)

        # Evaluation phase
        salesnn_model.eval()
        test_mae = 0
        with torch.no_grad():
            for X_batch, sku_batch, y_batch in test_loader:
                y_pred = salesnn_model(X_batch, sku_batch)
                mae = criterion(y_pred, y_batch)
                test_mae += mae.item()

        avg_test_mae = test_mae / len(test_loader)
        test_maes.append(avg_test_mae)

        print(f"Epoch {epoch+1}/{num_epochs}, Train MAE: {avg_train_mae:.4f}, Test MAE: {avg_test_mae:.4f}")

    return train_maes, test_maes

train_maes, test_maes = train_sales_nn(salesnn_model, train_loader, test_loader, optimizer)

Training Progress:   5%|▌         | 1/20 [00:13<04:10, 13.16s/it]

Epoch 1/20, Train MAE: 25.0522, Test MAE: 59.5736


Training Progress:  10%|█         | 2/20 [00:27<04:14, 14.13s/it]

Epoch 2/20, Train MAE: 15.9073, Test MAE: 40.8609


Training Progress:  15%|█▌        | 3/20 [00:37<03:26, 12.14s/it]

Epoch 3/20, Train MAE: 14.5695, Test MAE: 38.9342


Training Progress:  20%|██        | 4/20 [00:43<02:36,  9.79s/it]

Epoch 4/20, Train MAE: 14.0708, Test MAE: 37.2880


Training Progress:  25%|██▌       | 5/20 [00:52<02:18,  9.21s/it]

Epoch 5/20, Train MAE: 13.6456, Test MAE: 36.0419


Training Progress:  30%|███       | 6/20 [00:58<01:56,  8.35s/it]

Epoch 6/20, Train MAE: 13.3573, Test MAE: 35.1233


Training Progress:  35%|███▌      | 7/20 [01:12<02:11, 10.12s/it]

Epoch 7/20, Train MAE: 13.1650, Test MAE: 34.3752


Training Progress:  40%|████      | 8/20 [01:25<02:11, 10.97s/it]

Epoch 8/20, Train MAE: 13.0305, Test MAE: 33.8463


Training Progress:  45%|████▌     | 9/20 [01:36<02:02, 11.12s/it]

Epoch 9/20, Train MAE: 12.9307, Test MAE: 33.4580


Training Progress:  50%|█████     | 10/20 [01:47<01:49, 11.00s/it]

Epoch 10/20, Train MAE: 12.8534, Test MAE: 33.0716


Training Progress:  55%|█████▌    | 11/20 [01:55<01:29, 10.00s/it]

Epoch 11/20, Train MAE: 12.7954, Test MAE: 32.9347


Training Progress:  60%|██████    | 12/20 [02:02<01:12,  9.10s/it]

Epoch 12/20, Train MAE: 12.7588, Test MAE: 32.7175


Training Progress:  65%|██████▌   | 13/20 [02:10<01:01,  8.72s/it]

Epoch 13/20, Train MAE: 12.7189, Test MAE: 32.6896


Training Progress:  70%|███████   | 14/20 [02:18<00:51,  8.55s/it]

Epoch 14/20, Train MAE: 12.7065, Test MAE: 32.5378


Training Progress:  75%|███████▌  | 15/20 [02:25<00:40,  8.19s/it]

Epoch 15/20, Train MAE: 12.6623, Test MAE: 32.3880


Training Progress:  80%|████████  | 16/20 [02:33<00:32,  8.22s/it]

Epoch 16/20, Train MAE: 12.6490, Test MAE: 32.3044


Training Progress:  85%|████████▌ | 17/20 [02:42<00:24,  8.19s/it]

Epoch 17/20, Train MAE: 12.6274, Test MAE: 32.3972


Training Progress:  90%|█████████ | 18/20 [02:50<00:16,  8.12s/it]

Epoch 18/20, Train MAE: 12.6205, Test MAE: 32.3196


Training Progress:  95%|█████████▌| 19/20 [02:56<00:07,  7.76s/it]

Epoch 19/20, Train MAE: 12.5909, Test MAE: 32.2235


Training Progress: 100%|██████████| 20/20 [03:05<00:00,  9.27s/it]

Epoch 20/20, Train MAE: 12.5824, Test MAE: 32.1652





### MAE Over the epochs for train & test sets

In [26]:
# Plot training and validation loss using Plotly
fig = go.Figure()

fig.add_trace(go.Scatter(x=list(range(1, 20 + 1)), y=train_maes,
                         mode='lines',
                         name='Train Loss'))
fig.add_trace(go.Scatter(x=list(range(1, 20 + 1)), y=test_maes,
                         mode='lines',
                         name='Test Loss'))

fig.update_layout(title='Training and Validation Loss',
                  xaxis_title='Epoch',
                  yaxis_title='Loss',
                  legend_title='Legend',
                  hovermode='x')

fig.show()

### Model Evaluation

In [27]:
# Evaluate model
salesnn_model.eval()
y_pred = []
with torch.no_grad():
    for X_batch, sku_batch, _ in test_loader:
        y_pred.extend(salesnn_model(X_batch, sku_batch).cpu().numpy())

y_pred = np.array(y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")


MAE: 32.51
RMSE: 155.93
MAPE: 90.96%


### Saving the model for later usage

In [28]:
model_save_path = 'drive/MyDrive/Collab_DATA/PolarData/NN_sales_prediction_model.pth'

# Save the entire model
torch.save(salesnn_model, model_save_path)

print(f"Model saved to {model_save_path}")

Model saved to drive/MyDrive/Collab_DATA/PolarData/NN_sales_prediction_model.pth


### Add predictions to the dataframe for model evaluation

In [24]:
# Add predictions to the test dataframe
test_data['predictions'] = y_pred

### Actual vs Predicted Sales

In [29]:
def plot_actual_vs_predicted(y_test, y_pred):
    """
    Create a scatter plot of actual vs predicted sales using Plotly.

    Args:
        y_test (np.array or pd.Series): Actual sales values.
        y_pred (np.array): Predicted sales values.

    Returns:
        None: Displays the plot using Plotly.
    """
    # Create a new Figure object
    fig = go.Figure()

    # Add scatter plot for actual vs predicted sales
    fig.add_trace(go.Scatter(
        x=y_test,
        y=y_pred,
        mode='markers',
        name='Sales',
        marker=dict(color='blue', opacity=0.5),
        hovertemplate='Actual: %{x:.2f}<br>Predicted: %{y:.2f}'
    ))

    # Add diagonal line representing perfect prediction
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())
    fig.add_trace(go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Perfect Prediction',
        line=dict(color='red', dash='dash')
    ))

    # Update layout
    fig.update_layout(
        title='Actual vs Predicted Sales',
        xaxis_title='Actual Sales',
        yaxis_title='Predicted Sales',
        hovermode='closest'
    )

    # Make the aspect ratio equal
    fig.update_yaxes(
        scaleanchor="x",
        scaleratio=1,
    )

    # Display the plot
    fig.show()

plot_actual_vs_predicted(y_test, y_pred)

## Compute statistics on the metrics by sku

In [30]:
# Evaluate performance per SKU
def calculate_metrics(group):
    mae = mean_absolute_error(group['QUANTITY_SOLD'], group['predictions'])
    rmse = np.sqrt(mean_squared_error(group['QUANTITY_SOLD'], group['predictions']))
    mape = np.mean(np.abs((group['QUANTITY_SOLD'] - group['predictions']) / group['QUANTITY_SOLD'])) * 100
    return pd.Series({'MAE': mae, 'RMSE': rmse, 'MAPE': mape})

sku_performance = test_data.groupby('SKU').apply(calculate_metrics)

print("\nTop 5 SKUs by lowest MAPE:")
print(sku_performance.sort_values('MAPE').head())

print("\nBottom 5 SKUs by highest MAPE:")
print(sku_performance.sort_values('MAPE', ascending=False).head())

# Calculate statistics for evaluation metrics
metric_stats = sku_performance.agg(['mean', 'median', 'min', 'max'])
metric_stats.loc['p90'] = sku_performance.quantile(0.9)

print("\nStatistics of Evaluation Metrics:")
print(metric_stats)


Top 5 SKUs by lowest MAPE:
                MAE       RMSE       MAPE
SKU                                      
9e3f0cf2  37.838070  56.466451  11.815064
086b152f  41.808238  50.347231  14.279035
3c5b3289  29.947561  42.290610  14.566434
7ea5a39d  63.132672  72.884780  15.529648
93df17a5  15.557292  17.876496  16.435661

Bottom 5 SKUs by highest MAPE:
                MAE        RMSE         MAPE
SKU                                         
dd6d894e  89.278555  101.641591  7147.445083
9c62ebd2  85.104075   88.459346  4742.049376
8ee51eca  32.635815   32.635815  3263.581467
a4c22298  24.602772   24.669595  2460.277176
0b6fea39  31.672079   31.680659  2356.974316

Statistics of Evaluation Metrics:
                MAE         RMSE         MAPE
mean      33.423351    47.707254   120.406636
median    11.126662    13.886435    51.517224
min        1.407715     1.727398    11.815064
max     2018.501856  2318.280494  7147.445083
p90       45.564242    60.256860   197.719643


### Violin plots on the distribution of the metrics for different skus

In [31]:
def remove_outliers(data, lower_percentile=2.5, upper_percentile=97.5):
    lower = np.percentile(data, lower_percentile)
    upper = np.percentile(data, upper_percentile)
    return data[(data >= lower) & (data <= upper)]

def visualize_and_analyze_metrics(sku_performance: pd.DataFrame):
    """
    Create violin plots for performance metrics and print summary statistics.

    Args:
        sku_performance (pd.DataFrame): DataFrame containing performance metrics (MAE, RMSE, MAPE) for each SKU.
    """
    # Plot statistics with outliers removed and using violin plots
    fig = make_subplots(rows=1, cols=3, subplot_titles=("MAE", "RMSE", "MAPE"))

    for i, metric in enumerate(['MAE', 'RMSE', 'MAPE']):
        # Remove outliers
        cleaned_data = remove_outliers(sku_performance[metric])

        # Create violin plot
        fig.add_trace(go.Violin(y=cleaned_data, name=metric, box_visible=True, meanline_visible=True), row=1, col=i+1)

    fig.update_layout(title_text="Distribution of Evaluation Metrics Across SKUs (Outliers Removed)", height=500)
    fig.show()

    # Print statistics of the cleaned data
    print("\nStatistics of Evaluation Metrics (After Removing Outliers):")
    for metric in ['MAE', 'RMSE', 'MAPE']:
        cleaned_data = remove_outliers(sku_performance[metric])
        stats = cleaned_data.describe()
        print(f"\n{metric}:")
        print(f"Mean: {stats['mean']:.2f}")
        print(f"Median: {stats['50%']:.2f}")
        print(f"Min: {stats['min']:.2f}")
        print(f"Max: {stats['max']:.2f}")
        print(f"90th Percentile: {np.percentile(cleaned_data, 90):.2f}")

visualize_and_analyze_metrics(sku_performance)



Statistics of Evaluation Metrics (After Removing Outliers):

MAE:
Mean: 19.68
Median: 11.13
Min: 2.73
Max: 231.81
90th Percentile: 41.18

RMSE:
Mean: 26.17
Median: 13.89
Min: 3.16
Max: 368.77
90th Percentile: 52.91

MAPE:
Mean: 81.42
Median: 51.52
Min: 18.70
Max: 555.32
90th Percentile: 171.95


### Show Predicted Sales for top 5 SKUs

In [32]:
# Get top 5 most sold SKUs
top_10_skus = train_data.groupby('SKU')['QUANTITY_SOLD'].sum().nlargest(10).index

# Plot sales predictions for top 10 most sold SKUs
for sku in top_10_skus:
    sku_data = test_data[test_data['SKU'] == sku].sort_values('DATE')

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=sku_data['DATE'], y=sku_data['QUANTITY_SOLD'], mode='lines', name='Actual Sales'))
    fig.add_trace(go.Scatter(x=sku_data['DATE'], y=sku_data['predictions'], mode='lines', name='Predicted Sales'))

    fig.update_layout(title=f'Sales Prediction for SKU: {sku}',
                      xaxis_title='Date',
                      yaxis_title='Quantity Sold',
                      legend_title='Legend')
    fig.show()

### Compute MAE with regards to the number of past sales

In [33]:
def analyze_mae_vs_past_sales(train_data: pd.DataFrame, sku_performance: pd.DataFrame) -> pd.DataFrame:
    """
    Analyze the relationship between Mean Absolute Error (MAE) and the number of past sales for each SKU.

    Args:
        train_data (pd.DataFrame): Training dataset containing SKU and sales data.
        sku_performance (pd.DataFrame): DataFrame containing performance metrics for each SKU.

    Returns:
        pd.DataFrame: Cleaned and sorted DataFrame containing Past_Sales_Count and MAE for each SKU.
    """
    # Calculate the number of past sales for each SKU
    past_sales_count = train_data.groupby('SKU').size()

    # Combine past sales count with MAE
    mae_vs_past_sales = pd.DataFrame({
        'Past_Sales_Count': past_sales_count,
        'MAE': sku_performance['MAE']
    })

    # Print diagnostic information
    print("Before cleaning:")
    print(f"Total rows: {len(mae_vs_past_sales)}")
    print(f"Rows with NaN: {mae_vs_past_sales.isna().sum().sum()}")
    print(f"Rows with inf: {np.isinf(mae_vs_past_sales).sum().sum()}")

    # Remove rows with NaN or infinite values
    mae_vs_past_sales_clean = mae_vs_past_sales.replace([np.inf, -np.inf], np.nan).dropna()

    print("\nAfter cleaning:")
    print(f"Total rows: {len(mae_vs_past_sales_clean)}")

    # Sort by Past_Sales_Count for better visualization
    mae_vs_past_sales_clean = mae_vs_past_sales_clean.sort_values('Past_Sales_Count')

    return mae_vs_past_sales_clean

mae_vs_past_sales_clean = analyze_mae_vs_past_sales(train_data, sku_performance)


Before cleaning:
Total rows: 792
Rows with NaN: 0
Rows with inf: 0

After cleaning:
Total rows: 792


### Plot Evolution of the MAE over Past Sales

In [34]:
def analyze_and_visualize_mae_vs_sales_ranges(mae_vs_past_sales_clean: pd.DataFrame):
    """
    Analyze and visualize the relationship between Mean Absolute Error (MAE) and ranges of past sales.

    Args:
        mae_vs_past_sales_clean (pd.DataFrame): Cleaned DataFrame containing Past_Sales_Count and MAE for each SKU.
    """
    # Define ranges for past sales
    ranges = [0, 10, 20, 30, 40, 50, 100, 200, 300, float('inf')]
    range_labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-100', '101-200', '201-300', '300+']

    # Categorize past sales into ranges
    mae_vs_past_sales_clean['Range'] = pd.cut(mae_vs_past_sales_clean['Past_Sales_Count'],
                                              bins=ranges, labels=range_labels, include_lowest=True)

    # Calculate average MAE for each range
    average_mae_by_range = mae_vs_past_sales_clean.groupby('Range')['MAE'].mean().reset_index()

    # Calculate the number of SKUs in each range
    sku_count_by_range = mae_vs_past_sales_clean['Range'].value_counts().sort_index().reset_index()
    sku_count_by_range.columns = ['Range', 'SKU_Count']

    # Merge average MAE and SKU count
    plot_data = pd.merge(average_mae_by_range, sku_count_by_range, on='Range')

    print("\nAverage MAE and SKU Count by Range of Past Sales:")
    print(plot_data)

    # Create a figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add bar plot for average MAE
    fig.add_trace(
        go.Bar(x=plot_data['Range'], y=plot_data['MAE'], name="Average MAE"),
        secondary_y=False,
    )

    # Add line plot for SKU count
    fig.add_trace(
        go.Scatter(x=plot_data['Range'], y=plot_data['SKU_Count'], name="Number of SKUs", mode="lines+markers"),
        secondary_y=True,
    )

    # Update layout
    fig.update_layout(
        title_text="Average MAE and SKU Distribution by Range of Past Sales",
        xaxis_title="Range of Past Sales",
    )

    # Update y-axes
    fig.update_yaxes(title_text="Average MAE", secondary_y=False)
    fig.update_yaxes(title_text="Number of SKUs", secondary_y=True)

    # Show the plot
    fig.show()

analyze_and_visualize_mae_vs_sales_ranges(mae_vs_past_sales_clean)


Average MAE and SKU Count by Range of Past Sales:
     Range         MAE  SKU_Count
0     0-10  209.346199         30
1    11-20  263.345530          9
2    21-30   25.353348         14
3    31-40   20.883586         17
4    41-50   22.177388         34
5   51-100   21.132221         86
6  101-200   26.471245        183
7  201-300   24.808037        196
8     300+   21.671817        223




