<a href="https://colab.research.google.com/github/adadoun/inventoryPlanningRecommendation/blob/main/DataPreparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview:

This notebook prepares the data used for model training and evaluation.

## Library Import

In [None]:
import pandas as pd
import numpy as np
import hashlib
import pickle

In [None]:
# Mount Google Drive at /content/drive so the data files can be read and written.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime —
# confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the raw sales export into a DataFrame
sales_csv_path = 'drive/MyDrive/Collab_DATA/PolarData/sales_data_csv.csv'
df = pd.read_csv(sales_csv_path)

In [None]:
def hash_sku(sku: str) -> str:
    """
    Map a SKU to a short, deterministic pseudonymous identifier.

    The value is passed through ``str()`` first, so non-string SKUs are accepted.

    Args:
        sku (str): The original SKU value.

    Returns:
        str: The first 8 hexadecimal characters of the MD5 digest of the SKU text.
    """
    sku_bytes = str(sku).encode()
    full_digest = hashlib.md5(sku_bytes).hexdigest()
    # Only a prefix of the 32-character digest is kept
    return full_digest[:8]

def get_top_skus(df: pd.DataFrame, threshold: float = 0.90) -> list:
    """
    Identify the best-selling SKUs that together account for a given share of total sales.

    SKUs are ranked by total ``QUANTITY_SOLD`` (descending) and selected until the
    cumulative share reaches ``threshold``. The SKU that crosses the threshold is
    included, so the selection covers at least ``threshold`` of sales and is never
    empty when there are sales (e.g. a single SKU holding 95% of sales is returned
    for ``threshold=0.90``).

    Args:
        df (pd.DataFrame): DataFrame containing 'SKU' and 'QUANTITY_SOLD' columns.
        threshold (float, optional): The cumulative sales share to cover. Defaults to 0.90.

    Returns:
        list: List of top SKUs, ordered from highest to lowest total quantity sold.
    """
    # A stable sort keeps the ranking of tied SKUs deterministic between runs
    sku_sales = (
        df.groupby('SKU')['QUANTITY_SOLD']
        .sum()
        .sort_values(ascending=False, kind='mergesort')
    )
    total_sales = sku_sales.sum()

    if total_sales > 0:
        # Share of sales already covered by the higher-ranked SKUs; a SKU is kept
        # while that share is still below the threshold.
        covered_before = sku_sales.cumsum().shift(1, fill_value=0) / total_sales
        top_skus = covered_before[covered_before < threshold].index.tolist()
    else:
        # No sales at all (or an empty frame): nothing can be ranked
        top_skus = []

    print(f"Number of SKUs representing {threshold*100}% of sales: {len(top_skus)}")
    print(f"Total number of SKUs: {len(sku_sales)}")
    return top_skus

def prepare_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate raw sales to weekly level and attach each SKU's inventory level.

    The input frame is left untouched: the date parsing happens on a new frame
    instead of overwriting the caller's 'DATE' column.

    Args:
        df (pd.DataFrame): Raw sales data with 'SKU', 'DATE', 'QUANTITY_SOLD'
            and 'CURRENT_LEVEL' columns.

    Returns:
        pd.DataFrame: One row per (SKU, week) with the summed 'QUANTITY_SOLD'
        and the SKU's 'CURRENT_LEVEL', sorted by SKU then DATE.
    """
    # .assign builds a new frame, so the caller's DATE column keeps its original dtype
    sales = df.assign(DATE=pd.to_datetime(df['DATE']))

    # One inventory value per SKU: the first non-null CURRENT_LEVEL in row order
    current_levels = sales.groupby('SKU')['CURRENT_LEVEL'].first().reset_index()

    # 'W-MON' buckets are weeks ending on Monday; DATE becomes the bucket label
    weekly_sales = (
        sales.groupby(['SKU', pd.Grouper(key='DATE', freq='W-MON')])['QUANTITY_SOLD']
        .sum()
        .reset_index()
        .sort_values(['SKU', 'DATE'])
        .merge(current_levels, on='SKU', how='left')
    )
    return weekly_sales

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add calendar and cyclical time features derived from the 'DATE' column.

    Works on a copy, so the caller's frame is not modified (and passing a filtered
    slice does not raise SettingWithCopyWarning).

    Added columns: year, month, week (ISO week number), day_of_week (Monday=0),
    is_weekend (1 for Saturday/Sunday), quarter, plus sine/cosine encodings of
    month (period 12), week (period 53) and day_of_week (period 7).

    Args:
        df (pd.DataFrame): DataFrame with a datetime 'DATE' column.

    Returns:
        pd.DataFrame: Copy of the input with the additional time-based features.
    """
    df = df.copy()
    dates = df['DATE'].dt

    df['year'] = dates.year
    df['month'] = dates.month
    df['week'] = dates.isocalendar().week
    df['day_of_week'] = dates.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['quarter'] = dates.quarter

    # isocalendar() yields a nullable UInt32 column; convert to plain float64 before
    # the trigonometric encoding so week_sin/week_cos are ordinary NumPy floats
    # rather than nullable Float64 values.
    week_number = df['week'].astype('float64')

    # Cyclical encoding of time features
    df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
    df['month_cos'] = np.cos(2 * np.pi * df['month']/12)
    df['week_sin'] = np.sin(2 * np.pi * week_number/53)
    df['week_cos'] = np.cos(2 * np.pi * week_number/53)
    df['day_of_week_sin'] = np.sin(2 * np.pi * df['day_of_week']/7)
    df['day_of_week_cos'] = np.cos(2 * np.pi * df['day_of_week']/7)

    return df

def add_lag_features(df: pd.DataFrame, lags: list = [1, 2, 3, 4, 5, 6, 12, 24, 36, 48, 52]) -> pd.DataFrame:
    """
    Add lagged sales features to the dataset.

    The frame is first ordered by SKU and DATE (as add_rolling_features does), so
    each ``lag_k`` column holds the quantity sold k rows earlier in chronological
    order for the same SKU, whatever the row order of the input. Sorting returns
    a new frame, so the caller's frame is not modified.

    Note: the shift is row-based. If a SKU has no row for some week, ``lag_k``
    refers to k observations back, not k calendar weeks back.

    Args:
        df (pd.DataFrame): Sales data with 'SKU', 'DATE' and 'QUANTITY_SOLD' columns.
        lags (list, optional): List of lag periods to create. Defaults to
            [1, 2, 3, 4, 5, 6, 12, 24, 36, 48, 52]. The list is only read, never modified.

    Returns:
        pd.DataFrame: Frame sorted by SKU and DATE with one ``lag_<k>`` column per lag.
    """
    df = df.sort_values(['SKU', 'DATE'])
    # The grouping is loop-invariant, so build it once
    quantity_by_sku = df['QUANTITY_SOLD'].groupby(df['SKU'])
    for lag in lags:
        df[f'lag_{lag}'] = quantity_by_sku.shift(lag)
    return df

def add_rolling_features(df: pd.DataFrame, windows: list = [1, 2, 3, 4, 5, 6, 12, 24, 36, 48, 52]) -> pd.DataFrame:
    """
    Add trailing rolling mean and standard deviation of sales, computed per SKU.

    For every window size, the statistics are taken over the previous observations
    only (the series is shifted by one row first), with at least one observation
    required.

    Args:
        df (pd.DataFrame): Sales data with 'SKU', 'DATE' and 'QUANTITY_SOLD' columns.
        windows (list, optional): Window sizes for the rolling calculations.
                                  Defaults to [1, 2, 3, 4, 5, 6, 12, 24, 36, 48, 52].

    Returns:
        pd.DataFrame: Frame sorted by SKU and DATE with ``rolling_mean_<w>`` and
        ``rolling_std_<w>`` columns for each window size.
    """
    ordered = df.sort_values(['SKU', 'DATE'])

    def _past_window(sales: pd.Series, size: int):
        # Shift by one row so the current observation is excluded from its own window
        return sales.shift(1).rolling(window=size, min_periods=1)

    for window in windows:
        ordered[f'rolling_mean_{window}'] = ordered.groupby('SKU')['QUANTITY_SOLD'].transform(
            lambda sales: _past_window(sales, window).mean()
        )
        ordered[f'rolling_std_{window}'] = ordered.groupby('SKU')['QUANTITY_SOLD'].transform(
            lambda sales: _past_window(sales, window).std()
        )

    return ordered

def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill gaps in the lag and rolling feature columns.

    Within each SKU (in date order) a missing value takes the last known value;
    whatever is still missing afterwards is set to 0.

    Args:
        df (pd.DataFrame): DataFrame with 'SKU', 'DATE' and ``lag_*`` / ``rolling_*`` columns.

    Returns:
        pd.DataFrame: Sorted copy of the input with those feature columns filled.
    """
    filled = df.sort_values(['SKU', 'DATE']).copy()

    feature_columns = [name for name in filled.columns if name.startswith('lag_')]
    feature_columns += [name for name in filled.columns if name.startswith('rolling_')]

    for name in feature_columns:
        # Forward-fill inside each SKU only, so values never leak between SKUs
        carried_forward = filled.groupby('SKU')[name].ffill()
        filled[name] = carried_forward.fillna(0)

    return filled


## Data Preparation and Filtering

In [None]:
# Pseudonymise the SKUs into a new frame instead of overwriting df['SKU']:
# hashing in place would re-hash the hashes if this cell were run a second time.
sales_hashed = df.assign(SKU=df['SKU'].apply(hash_sku))

# hash_sku keeps only 8 hex characters, so two distinct SKUs could share a hash
# and be merged silently; fail loudly if that ever happens.
assert sales_hashed['SKU'].nunique() == df['SKU'].astype(str).nunique(), \
    "SKU hash collision: distinct SKUs map to the same hashed identifier"

weekly_sales = prepare_data(sales_hashed)

# Get top SKUs
top_skus = get_top_skus(weekly_sales, threshold=0.90)

# Filter for top SKUs
weekly_sales = weekly_sales[weekly_sales['SKU'].isin(top_skus)]

# Treat infinite values as missing, then drop every row with a missing value
weekly_sales = weekly_sales.replace([np.inf, -np.inf], np.nan).dropna()

Number of SKUs representing 90.0% of sales: 815
Total number of SKUs: 3835


## Feature Engineering

In [None]:
# Run the feature-engineering steps in order, then fill the gaps that the
# lag / rolling features leave at the start of each SKU's history.
weekly_sales = (
    weekly_sales
    .pipe(add_time_features)
    .pipe(add_lag_features)
    .pipe(add_rolling_features)
    .pipe(fill_missing_values)
)

## Sanity Checks on the data

In [None]:
# Defensive cleanup: discard any row that still holds a missing value
weekly_sales = weekly_sales.dropna()

print("Shape after handling missing values:", weekly_sales.shape)

# Final verification that neither NaN nor infinite values remain
print("\nFinal check for NaN or infinite values:")
missing_total = weekly_sales.isna().sum().sum()
print(f"Number of missing values: {missing_total}")

# Infinity can only occur in numeric columns, so restrict the check to those
numeric_part = weekly_sales.select_dtypes(include=np.number)
infinite_per_column = np.isinf(numeric_part).sum()
infinite_total = infinite_per_column.sum().sum()
print(f"Number of infinite values: {infinite_total}")

Shape after handling missing values: (177112, 49)

Final check for NaN or infinite values:
Number of missing values: 0
Number of infinite values: 0


## Split the data into train/test
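The last three months of data (counted back from the most recent date in the dataset) are held out as the test set; all earlier weeks form the training set. The same cut-off date is applied to every SKU.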

In [None]:
# Temporal hold-out: rows dated within three months of the latest date form the test set
latest_date = weekly_sales['DATE'].max()
test_start_date = latest_date - pd.DateOffset(months=3)

is_train_period = weekly_sales['DATE'].lt(test_start_date)
is_test_period = weekly_sales['DATE'].ge(test_start_date)

train_data = weekly_sales[is_train_period]
test_data = weekly_sales[is_test_period]

In [None]:
# Report the size of each side of the split
train_shape, test_shape = train_data.shape, test_data.shape
print(f"Shape of train data: {train_shape}")
print(f"Shape of test data: {test_shape}")

Shape of train data: (167164, 49)
Shape of test data: (9948, 49)


## Keep only the SKUs present in both the train and test sets

In [None]:
# SKUs observed on both sides of the temporal split
train_skus = set(train_data['SKU'])
test_skus = set(test_data['SKU'])
common_skus = train_skus.intersection(test_skus)

# Restrict both splits to the shared SKUs
keep_in_train = train_data['SKU'].isin(common_skus)
keep_in_test = test_data['SKU'].isin(common_skus)
train_data = train_data[keep_in_train]
test_data = test_data[keep_in_test]

# After filtering, both splits must cover exactly the same set of SKUs
assert set(train_data['SKU']) == set(test_data['SKU']), "SKUs in train and test data do not match"

print(f"Number of common SKUs: {len(common_skus)}")
print(f"Shape of train data: {train_data.shape}")
print(f"Shape of test data: {test_data.shape}")

Number of common SKUs: 792
Shape of train data: (166552, 49)
Shape of test data: (9862, 49)


## Assign an integer index to each SKU, to be used later as input to the neural network (NN)

In [None]:
# Map every SKU of the training split to a consecutive integer (0, 1, 2, ...)
# in order of first appearance.
sku_to_index = {sku: idx for idx, sku in enumerate(train_data['SKU'].unique())}

# train_data / test_data are filtered slices of weekly_sales; writing a column
# into them in place triggers SettingWithCopyWarning, so build new frames instead.
train_data = train_data.assign(SKU_INDEX=train_data['SKU'].map(sku_to_index))
test_data = test_data.assign(SKU_INDEX=test_data['SKU'].map(sku_to_index))

# A SKU missing from the mapping would come out of .map() as NaN
assert train_data['SKU_INDEX'].notna().all(), "Some train SKUs have no index"
assert test_data['SKU_INDEX'].notna().all(), "Some test SKUs are absent from the training split"

In [None]:
# Write both splits to CSV, without the DataFrame index
exports = {
    'drive/MyDrive/Collab_DATA/PolarData/train_data.csv': train_data,
    'drive/MyDrive/Collab_DATA/PolarData/test_data.csv': test_data,
}
for destination, split_frame in exports.items():
    split_frame.to_csv(destination, index=False)