<a href="https://colab.research.google.com/github/adadoun/inventoryPlanningRecommendation/blob/main/LgbmSalesPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sales Data Analysis

This notebook trains a LightGBM model to predict the quantity sold per SKU from pre-split train/test CSV files, then analyzes the prediction errors overall and per SKU.

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m855.7 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from category_encoders import TargetEncoder
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
import hashlib
import warnings
warnings.filterwarnings('ignore')

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset CSVs stored
# there can be read by the loading cell below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load train/test datasets.
# The paths are anchored to the Drive mount point ('/content/drive') rather than
# being relative to the working directory, so loading does not depend on the
# kernel's current directory being /content.
DATA_DIR = '/content/drive/MyDrive/Collab_DATA/PolarData'
train_data = pd.read_csv(f'{DATA_DIR}/train_data.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test_data.csv')



## Utility functions for the LGBM model

In [None]:
def train_lightgbm(X_train: np.ndarray, y_train: np.ndarray, params: dict) -> LGBMRegressor:
    """
    Train a LightGBM model with given parameters.

    The model is seeded with ``random_state=42`` unless ``params`` supplies its
    own ``random_state``, in which case the caller's value is used.

    Args:
        X_train (np.ndarray): Training features.
        y_train (np.ndarray): Training target values.
        params (dict): Model hyperparameters, forwarded as keyword arguments
            to ``LGBMRegressor``.

    Returns:
        LGBMRegressor: Trained LightGBM model.
    """
    # Merge instead of passing random_state as a separate keyword: a params
    # dict that already contains 'random_state' would otherwise raise
    # "TypeError: got multiple values for keyword argument 'random_state'".
    model_params = {'random_state': 42, **params}
    model = LGBMRegressor(**model_params)
    model.fit(X_train, y_train)
    return model

def train_lightgbm_with_random_search(X_train: np.ndarray, y_train: np.ndarray) -> LGBMRegressor:
    """
    Train a LightGBM model using RandomizedSearchCV for hyperparameter tuning.

    30 parameter settings are sampled from a fixed grid and scored by mean
    absolute error over a 3-fold ``TimeSeriesSplit``. The best parameters and
    the best MAE are printed.

    NOTE(review): ``TimeSeriesSplit`` splits by row position, so this presumably
    expects the rows of ``X_train``/``y_train`` to be in chronological order —
    confirm against the caller.

    Args:
        X_train (np.ndarray): Training features.
        y_train (np.ndarray): Training target values.

    Returns:
        LGBMRegressor: Best LightGBM model found by RandomizedSearchCV.
    """
    # Define the parameter space
    param_space = {
        'n_estimators': [100, 200, 300, 500, 1000],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [-1, 3, 5, 7, 9],
        'num_leaves': [31, 63, 127, 255],
        'min_child_samples': [5, 10, 20, 50],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 0.5, 1],
        'reg_lambda': [0, 0.1, 0.5, 1]
    }

    # Create the LightGBM model.
    # subsample_freq=1 turns row subsampling on: with LightGBM's default of 0,
    # bagging is disabled and the 'subsample' values searched above would have
    # no effect on the fitted models.
    model = LGBMRegressor(random_state=42, subsample_freq=1)

    # Set up TimeSeriesSplit for time series data
    tscv = TimeSeriesSplit(n_splits=3)

    # Set up RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_space,
        n_iter=30,  # Number of parameter settings that are sampled
        cv=tscv,
        scoring='neg_mean_absolute_error',
        random_state=42,
        verbose=3,
        n_jobs=-1  # Use all available cores
    )

    # Fit RandomizedSearchCV
    random_search.fit(X_train, y_train)

    print("Best parameters found:")
    print(random_search.best_params_)
    # The scorer is negated MAE, so flip the sign back for display
    print(f"Best MAE score: {-random_search.best_score_:.4f}")

    return random_search.best_estimator_

def make_predictions(model: LGBMRegressor, X_test: np.ndarray) -> np.ndarray:
    """
    Run a fitted model on a feature matrix and return its predictions.

    Args:
        model (LGBMRegressor): Trained LightGBM model.
        X_test (np.ndarray): Features to predict on.

    Returns:
        np.ndarray: Whatever ``model.predict(X_test)`` produces.
    """
    predicted_values = model.predict(X_test)
    return predicted_values

def evaluate_model(actual: np.ndarray, predicted: np.ndarray) -> tuple:
    """
    Evaluate model performance using various metrics.

    MAPE is computed only over observations whose actual value is non-zero,
    because the percentage error is undefined (division by zero) for a zero
    actual. If every actual value is zero, MAPE is returned as NaN.

    Args:
        actual (np.ndarray): Actual target values.
        predicted (np.ndarray): Predicted target values.

    Returns:
        tuple: (MAE, RMSE, MAPE) - Mean Absolute Error, Root Mean Square Error, and Mean Absolute Percentage Error.
    """
    # Work on plain float arrays so the element-wise maths is positional
    actual_values = np.asarray(actual, dtype=float)
    predicted_values = np.asarray(predicted, dtype=float)

    mae = mean_absolute_error(actual_values, predicted_values)
    rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))

    # Exclude zero actuals from MAPE: dividing by them would yield inf/NaN and
    # make the aggregate meaningless.
    nonzero = actual_values != 0
    if nonzero.any():
        relative_errors = (actual_values[nonzero] - predicted_values[nonzero]) / actual_values[nonzero]
        mape = np.mean(np.abs(relative_errors)) * 100
    else:
        mape = np.nan
    return mae, rmse, mape

def calculate_metrics(group: pd.DataFrame) -> pd.Series:
    """
    Calculate performance metrics for a group (typically a single SKU).

    MAPE is computed only over rows whose 'QUANTITY_SOLD' is non-zero, because
    the percentage error is undefined (division by zero) for a zero actual.
    If every actual value in the group is zero, MAPE is NaN.

    Args:
        group (pd.DataFrame): DataFrame containing 'QUANTITY_SOLD' and 'predictions' columns.

    Returns:
        pd.Series: Series containing MAE, RMSE, and MAPE for the group.
    """
    actual_values = group['QUANTITY_SOLD'].to_numpy(dtype=float)
    predicted_values = group['predictions'].to_numpy(dtype=float)

    mae = mean_absolute_error(actual_values, predicted_values)
    rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))

    # Exclude zero actuals from MAPE: dividing by them would yield inf/NaN and
    # make the per-group figure meaningless.
    nonzero = actual_values != 0
    if nonzero.any():
        relative_errors = (actual_values[nonzero] - predicted_values[nonzero]) / actual_values[nonzero]
        mape = np.mean(np.abs(relative_errors)) * 100
    else:
        mape = np.nan
    return pd.Series({'MAE': mae, 'RMSE': rmse, 'MAPE': mape})

def remove_outliers(data: np.ndarray, lower_percentile: float = 2.5, upper_percentile: float = 97.5) -> np.ndarray:
    """
    Keep only the values that lie between two percentiles of the data.

    Both percentile bounds are computed from ``data`` itself and are inclusive.

    Args:
        data (np.ndarray): Input data array.
        lower_percentile (float, optional): Percentile below which values are dropped. Defaults to 2.5.
        upper_percentile (float, optional): Percentile above which values are dropped. Defaults to 97.5.

    Returns:
        np.ndarray: The elements of ``data`` within the two bounds, in their original order.
    """
    lower_bound, upper_bound = np.percentile(data, [lower_percentile, upper_percentile])
    within_bounds = (data >= lower_bound) & (data <= upper_bound)
    return data[within_bounds]


## Encode the categorical SKU feature using target encoding

In [None]:
# Target-encode SKU: the encoder is fitted on the training rows (against
# QUANTITY_SOLD) and the fitted mapping is then applied to the test rows.
encoder = TargetEncoder(cols=['SKU'])
train_encoded = train_data.assign(
    SKU_encoded=encoder.fit_transform(train_data[['SKU']], train_data['QUANTITY_SOLD'])['SKU']
)
test_encoded = test_data.assign(
    SKU_encoded=encoder.transform(test_data[['SKU']])['SKU']
)

# Every column except the date, the target and the raw SKU id is a model feature
excluded_columns = ['DATE', 'QUANTITY_SOLD', 'SKU']
feature_columns = [name for name in train_encoded.columns if name not in excluded_columns]

# Feature matrices and targets for the train and test splits
X_train, y_train = train_encoded[feature_columns], train_encoded['QUANTITY_SOLD']
X_test, y_test = test_encoded[feature_columns], test_encoded['QUANTITY_SOLD']

## Train, Evaluate LGBM Model & Display most important features

In [None]:
# Fixed hyperparameters used for this run (no search is performed in this cell)
params = {
    'subsample': 0.8,
    'reg_lambda': 0.1,
    'reg_alpha': 0,
    'num_leaves': 63,
    'n_estimators': 1000,
    'min_child_samples': 10,
    'max_depth': 7,
    'learning_rate': 0.01,
    'colsample_bytree': 0.8,
    'min_split_gain': 0,  # Allow splits with no gain
    'min_child_weight': 1e-3,  # Reduce minimum child weight
    'verbose': -1,
}

# Fit LightGBM on the training split with the parameters above
model = train_lightgbm(X_train, y_train, params)

# Predict the test split and keep the predictions next to the actual values
predictions = make_predictions(model, X_test)
test_encoded['predictions'] = predictions

# Overall error metrics on the test split
mae, rmse, mape = evaluate_model(y_test, predictions)

print("\nOverall Model Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")

# Rank the features by the importance value reported by the fitted model
feature_importance = dict(zip(feature_columns, model.feature_importances_))
sorted_features = sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)

print("\nTop 10 Most Important Features:")
top_features = sorted_features[:10]
for feature, importance in top_features:
    print(f"{feature}: {importance:.4f}")


Overall Model Performance:
MAE: 43.88
RMSE: 182.17
MAPE: 199.27%

Top 10 Most Important Features:
lag_1: 2881.0000
SKU_encoded: 2579.0000
rolling_std_2: 1713.0000
week: 1582.0000
week_cos: 1499.0000
lag_52: 1445.0000
lag_36: 1441.0000
lag_2: 1358.0000
lag_6: 1246.0000
lag_4: 1236.0000


## Compute statistics on the metrics by sku

In [None]:
# One row of MAE / RMSE / MAPE per SKU, computed from the test-set predictions
sku_performance = test_encoded.groupby('SKU').apply(calculate_metrics)

mape_ascending = sku_performance.sort_values('MAPE')
mape_descending = sku_performance.sort_values('MAPE', ascending=False)

print("\nTop 5 SKUs by lowest MAPE:")
print(mape_ascending.head())

print("\nBottom 5 SKUs by highest MAPE:")
print(mape_descending.head())

# Summary of each metric across SKUs, with the 90th percentile as an extra row
summary_functions = ['mean', 'median', 'min', 'max']
metric_stats = sku_performance.agg(summary_functions)
metric_stats.loc['p90'] = sku_performance.quantile(0.9)

print("\nStatistics of Evaluation Metrics:")
print(metric_stats)


Top 5 SKUs by lowest MAPE:
                MAE       RMSE       MAPE
SKU                                      
11a6821d   3.246452   4.662299  13.867859
83071c5a   3.311018   3.906928  15.455410
c68914a5  10.131976  13.591147  16.187653
d6233f0e   4.670722   6.349523  16.487763
26a22cac   4.740106   5.898834  16.801119

Bottom 5 SKUs by highest MAPE:
                 MAE        RMSE          MAPE
SKU                                           
2a73ea2b  906.315148  970.770382  28900.883878
9c62ebd2  448.019656  475.831459  25797.006647
0b6fea39  257.121586  257.413032  19590.263724
f91767a9  712.575182  836.776288  12952.698242
4c70e74d  151.903567  160.139173  12225.413280

Statistics of Evaluation Metrics:
                MAE         RMSE          MAPE
mean      45.181281    60.655929    291.133325
median    12.824872    16.009950     61.320064
min        1.653508     2.162165     13.867859
max     1659.309528  2041.795627  28900.883878
p90       79.985154    99.036460    252.289530


In [None]:
def visualize_and_analyze_metrics(sku_performance: pd.DataFrame):
    """
    Show violin plots of the per-SKU metrics and print their summary statistics.

    Each metric column is first passed through ``remove_outliers``; the plots
    and the printed statistics both describe that filtered data.

    Args:
        sku_performance (pd.DataFrame): DataFrame containing performance metrics (MAE, RMSE, MAPE) for each SKU.
    """
    metric_names = ['MAE', 'RMSE', 'MAPE']

    # Filter each metric once and reuse the result for both the plot and the printout
    cleaned_by_metric = {name: remove_outliers(sku_performance[name]) for name in metric_names}

    # One violin per metric, side by side
    fig = make_subplots(rows=1, cols=3, subplot_titles=("MAE", "RMSE", "MAPE"))
    for column_number, metric in enumerate(metric_names, start=1):
        violin = go.Violin(y=cleaned_by_metric[metric], name=metric, box_visible=True, meanline_visible=True)
        fig.add_trace(violin, row=1, col=column_number)

    fig.update_layout(title_text="Distribution of Evaluation Metrics Across SKUs (Outliers Removed)", height=500)
    fig.show()

    # Text summary of the same filtered data
    print("\nStatistics of Evaluation Metrics (After Removing Outliers):")
    for metric, cleaned_data in cleaned_by_metric.items():
        stats = cleaned_data.describe()
        print(f"\n{metric}:")
        print(f"Mean: {stats['mean']:.2f}")
        print(f"Median: {stats['50%']:.2f}")
        print(f"Min: {stats['min']:.2f}")
        print(f"Max: {stats['max']:.2f}")
        print(f"90th Percentile: {np.percentile(cleaned_data, 90):.2f}")

visualize_and_analyze_metrics(sku_performance)



Statistics of Evaluation Metrics (After Removing Outliers):

MAE:
Mean: 28.15
Median: 12.82
Min: 3.06
Max: 360.96
90th Percentile: 63.11

RMSE:
Mean: 36.59
Median: 16.01
Min: 3.65
Max: 464.26
90th Percentile: 84.55

MAPE:
Mean: 106.23
Median: 61.32
Min: 19.91
Max: 1272.00
90th Percentile: 230.73


## Show Predicted Sales for top 5 SKUs

In [None]:
# The five SKUs with the largest total quantity sold in the training data
total_sold_per_sku = train_data.groupby('SKU')['QUANTITY_SOLD'].sum()
top_5_skus = total_sold_per_sku.nlargest(5).index

# One figure per SKU: actual vs. predicted quantity over the test period
for sku in top_5_skus:
    is_current_sku = test_encoded['SKU'] == sku
    sku_data = test_encoded[is_current_sku].sort_values('DATE')

    actual_trace = go.Scatter(x=sku_data['DATE'], y=sku_data['QUANTITY_SOLD'], mode='lines', name='Actual Sales')
    predicted_trace = go.Scatter(x=sku_data['DATE'], y=sku_data['predictions'], mode='lines', name='Predicted Sales')

    fig = go.Figure()
    fig.add_trace(actual_trace)
    fig.add_trace(predicted_trace)

    fig.update_layout(
        title=f'Sales Prediction for SKU: {sku}',
        xaxis_title='Date',
        yaxis_title='Quantity Sold',
        legend_title='Legend',
    )
    fig.show()

## Compute MAE with regard to the number of past sales

In [None]:
def analyze_mae_vs_past_sales(train_data: pd.DataFrame, sku_performance: pd.DataFrame) -> pd.DataFrame:
    """
    Analyze the relationship between Mean Absolute Error (MAE) and the number of past sales for each SKU.

    "Past sales" is the number of rows each SKU has in ``train_data``. The two
    inputs are combined on the SKU index; SKUs present in only one of them get
    NaN in the missing column and are dropped by the cleaning step. Diagnostic
    row counts are printed before and after cleaning.

    Args:
        train_data (pd.DataFrame): Training dataset containing a 'SKU' column.
        sku_performance (pd.DataFrame): DataFrame indexed by SKU containing an 'MAE' column.

    Returns:
        pd.DataFrame: Cleaned DataFrame containing Past_Sales_Count and MAE for each SKU,
        sorted by Past_Sales_Count.
    """
    # Number of training rows recorded for each SKU
    past_sales_count = train_data.groupby('SKU').size()

    # Combine past sales count with MAE (aligned on the SKU index)
    mae_vs_past_sales = pd.DataFrame({
        'Past_Sales_Count': past_sales_count,
        'MAE': sku_performance['MAE']
    })

    # Count affected rows rather than cells, so the numbers match their labels
    # even when a single row has NaN in both columns.
    rows_with_nan = mae_vs_past_sales.isna().any(axis=1).sum()
    rows_with_inf = np.isinf(mae_vs_past_sales).any(axis=1).sum()

    # Print diagnostic information
    print("Before cleaning:")
    print(f"Total rows: {len(mae_vs_past_sales)}")
    print(f"Rows with NaN: {rows_with_nan}")
    print(f"Rows with inf: {rows_with_inf}")

    # Remove rows with NaN or infinite values
    mae_vs_past_sales_clean = mae_vs_past_sales.replace([np.inf, -np.inf], np.nan).dropna()

    print("\nAfter cleaning:")
    print(f"Total rows: {len(mae_vs_past_sales_clean)}")

    # Sort by Past_Sales_Count for better visualization
    mae_vs_past_sales_clean = mae_vs_past_sales_clean.sort_values('Past_Sales_Count')

    return mae_vs_past_sales_clean

mae_vs_past_sales_clean = analyze_mae_vs_past_sales(train_data, sku_performance)


Before cleaning:
Total rows: 792
Rows with NaN: 0
Rows with inf: 0

After cleaning:
Total rows: 792


## Plot Evolution of the MAE over Past Sales

In [None]:
def analyze_and_visualize_mae_vs_sales_ranges(mae_vs_past_sales_clean: pd.DataFrame):
    """
    Analyze and visualize the relationship between Mean Absolute Error (MAE) and ranges of past sales.

    SKUs are bucketed by ``Past_Sales_Count`` into fixed ranges; for each range
    the average MAE and the number of SKUs are printed and plotted (bars for
    the average MAE, a line on a secondary axis for the SKU count). Ranges
    with no SKU are kept, with a NaN average and a count of 0.

    The input DataFrame is not modified.

    Args:
        mae_vs_past_sales_clean (pd.DataFrame): Cleaned DataFrame containing Past_Sales_Count and MAE for each SKU.
    """
    # Define ranges for past sales
    ranges = [0, 10, 20, 30, 40, 50, 100, 200, 300, float('inf')]
    range_labels = ['0-10', '11-20', '21-30', '31-40', '41-50', '51-100', '101-200', '201-300', '300+']

    # Categorize past sales into ranges. assign() works on a copy, so the
    # caller's DataFrame does not silently gain a 'Range' column.
    binned = mae_vs_past_sales_clean.assign(
        Range=pd.cut(mae_vs_past_sales_clean['Past_Sales_Count'],
                     bins=ranges, labels=range_labels, include_lowest=True)
    )

    # Average MAE and number of SKUs per range. observed=False is passed
    # explicitly so empty ranges are kept regardless of the pandas default.
    plot_data = (
        binned.groupby('Range', observed=False)['MAE']
        .agg(MAE='mean', SKU_Count='size')
        .reset_index()
    )

    print("\nAverage MAE and SKU Count by Range of Past Sales:")
    print(plot_data)

    # Create a figure with secondary y-axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    # Add bar plot for average MAE
    fig.add_trace(
        go.Bar(x=plot_data['Range'], y=plot_data['MAE'], name="Average MAE"),
        secondary_y=False,
    )

    # Add line plot for SKU count
    fig.add_trace(
        go.Scatter(x=plot_data['Range'], y=plot_data['SKU_Count'], name="Number of SKUs", mode="lines+markers"),
        secondary_y=True,
    )

    # Update layout
    fig.update_layout(
        title_text="Average MAE and SKU Distribution by Range of Past Sales",
        xaxis_title="Range of Past Sales",
    )

    # Update y-axes
    fig.update_yaxes(title_text="Average MAE", secondary_y=False)
    fig.update_yaxes(title_text="Number of SKUs", secondary_y=True)

    # Show the plot
    fig.show()

analyze_and_visualize_mae_vs_sales_ranges(mae_vs_past_sales_clean)


Average MAE and SKU Count by Range of Past Sales:
     Range         MAE  SKU_Count
0     0-10  300.617052         30
1    11-20  221.399634          9
2    21-30   98.746068         14
3    31-40   48.433141         17
4    41-50   34.280434         34
5   51-100   42.683814         86
6  101-200   36.094184        183
7  201-300   27.710800        196
8     300+   25.532568        223
