In [1]:
# !pip install openpyxl
# !pip install seaborn
# !pip install plotly
# !pip install statsmodels

In [2]:
import plotly.io as pio
pio.renderers.default = "notebook_connected"

# ---- Imports ----
import pandas as pd
import numpy as np
import plotly.express as px
import os
import glob

In [3]:

drop_feature = [
    "CompanyName",
    "CustGroupName",
    "CompanyChainName",
    "PRODUCTNAME"
]
date_feature = [
    "InvoiceDate"
]

categorical_feature = [
    "CustGroup",
    "State"
]
cat_numeric_feature = [
    "DATAAREAID",
    "CompanyChain",
    "ItemNumber"
]
numerical_feature = [
    "INVOICEDQUANTITY",
    "QTYInKG/Ltr"
]
preprocessing_only = [
    "SALESORDERORIGINCODE"
]
cols = drop_feature + date_feature + categorical_feature + cat_numeric_feature + numerical_feature + preprocessing_only

In [8]:
input_path_2025 = "./data/Sales_Data_OctDec2025"
input_path_2024 = "./data/Sales_Data_OctDec2024"

# Collect CSV files from both directories
csv_files = (
    glob.glob(os.path.join(input_path_2025, "*.csv")) +
    glob.glob(os.path.join(input_path_2024, "*.csv"))
)

if not csv_files:
    raise ValueError("No CSV files found in both input paths")

# Read and combine
sales_df = pd.concat(
    [pd.read_csv(f, low_memory=False, usecols=cols) for f in csv_files],
    ignore_index=True
)

# Clean State column
sales_df["State"] = (
    sales_df["State"]
    .astype(str)
    .str.strip()
    .str.lower()
    .str.title()
)

print(sales_df.shape)

(9016258, 13)


In [10]:
# Drop only columns that are truly not required
df = sales_df
df = df.drop(columns=[
    "CompanyName",
    "CompanyChainName",
    "PRODUCTNAME",
    "CustGroupName"
])

# Convert InvoiceDate to datetime
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

# Create calendar features
df["Date"] = df["InvoiceDate"].dt.date              # Date (YYYY-MM-DD)
df["DayOfWeek"] = df["InvoiceDate"].dt.day_name()   # Monday, Tuesday, ...
df["Month"] = df["InvoiceDate"].dt.month            # 1–12
df["Year"] = df["InvoiceDate"].dt.year              # YYYY
df['State'] = df['State'].str.strip().str.lower().str.title()
# Define group-by columns
group_cols = [
    "Date",
    "DayOfWeek",
    "Month",
    "Year",
    "DATAAREAID",
    "CustGroup",
    "State",
    "CompanyChain",
    "ItemNumber"
]

# Aggregate numerical features
sales_agg = (
    df
    .groupby(group_cols, dropna=False)
    .agg({
        "INVOICEDQUANTITY": "sum",
        "QTYInKG/Ltr": "sum",
        "SALESORDERORIGINCODE": "nunique"
    })
    .reset_index()
    .rename(columns={
        "SALESORDERORIGINCODE": "order_origin_count"
    })
)

# Final aggregated dataset
sales_agg


Unnamed: 0,Date,DayOfWeek,Month,Year,DATAAREAID,CustGroup,State,CompanyChain,ItemNumber,INVOICEDQUANTITY,QTYInKG/Ltr,order_origin_count
0,2024-10-01,Tuesday,10,2024,1102,CR01,Abu Dhabi,12,400024,-1.0,-1.00,1
1,2024-10-01,Tuesday,10,2024,1102,CR01,Abu Dhabi,12,400710,1.0,1.50,0
2,2024-10-01,Tuesday,10,2024,1102,CR01,Abu Dhabi,12,403993,8.0,8.00,1
3,2024-10-01,Tuesday,10,2024,1102,CR01,Abu Dhabi,15,403263,4.0,2.00,0
4,2024-10-01,Tuesday,10,2024,1102,CR01,Abu Dhabi,15,403308,1.0,0.50,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1222534,2025-12-30,Tuesday,12,2025,1102,EC01,Dubai,343,420726,2.0,0.68,1
1222535,2025-12-30,Tuesday,12,2025,1102,EC01,Dubai,343,422108,2.0,0.60,1
1222536,2025-12-30,Tuesday,12,2025,1102,EC01,Dubai,343,422113,1.0,0.32,1
1222537,2025-12-30,Tuesday,12,2025,1102,EC01,Dubai,343,422518,11.0,11.00,1


In [6]:
df = sales_df
missing_summary = pd.DataFrame({
    "Column": df.columns,
    "Missing_Count": df.isna().sum(),
    "Missing_Percentage": (df.isna().sum() / len(df)) * 100,
    "Non_Missing_Count": df.notna().sum(),
    "Total_Rows": len(df),
    "Dtype": df.dtypes,
    "Unique_Values": df.nunique(),
    "Has_Missing": df.isna().any(),
    "All_Missing": df.isna().all()
})

missing_summary["Missing_Severity"] = missing_summary["Missing_Percentage"].apply(
    lambda p: "No Missing" if p == 0 else "Low" if p < 5 else "Medium" if p < 20 else "High"
)

missing_summary = missing_summary.sort_values("Missing_Percentage", ascending=False)

missing_summary



Unnamed: 0,Column,Missing_Count,Missing_Percentage,Non_Missing_Count,Total_Rows,Dtype,Unique_Values,Has_Missing,All_Missing,Missing_Severity
SALESORDERORIGINCODE,SALESORDERORIGINCODE,1536071,33.195637,3091257,4627328,object,18,True,False,High
State,State,30408,0.657139,4596920,4627328,object,11,True,False,Low
DATAAREAID,DATAAREAID,0,0.0,4627328,4627328,int64,2,False,False,No Missing
CompanyName,CompanyName,0,0.0,4627328,4627328,object,2,False,False,No Missing
InvoiceDate,InvoiceDate,0,0.0,4627328,4627328,object,91,False,False,No Missing
CustGroup,CustGroup,0,0.0,4627328,4627328,object,17,False,False,No Missing
CustGroupName,CustGroupName,0,0.0,4627328,4627328,object,17,False,False,No Missing
CompanyChain,CompanyChain,0,0.0,4627328,4627328,int64,183,False,False,No Missing
CompanyChainName,CompanyChainName,0,0.0,4627328,4627328,object,183,False,False,No Missing
ItemNumber,ItemNumber,0,0.0,4627328,4627328,int64,3791,False,False,No Missing


In [11]:
# Monthly seasonality
sales_df.groupby(["Year", "Month"])["SalesQty"].sum().reset_index()


KeyError: 'Year'

In [23]:
import pandas as pd
import numpy as np

# ================================
# CONFIGURATION
# ================================

drop_feature = [
    "CompanyName",
    "CustGroupName",
    "CompanyChainName",
    "PRODUCTNAME"
]

date_feature = ["InvoiceDate"]

categorical_feature = [
    "CustGroup",
    "State"
]

cat_numeric_feature = [
    "DATAAREAID",
    "CompanyChain",
    "ItemNumber"
]

numerical_feature = [
    "INVOICEDQUANTITY",
    "QTYInKG/Ltr"
]

preprocessing_only = [
    "SALESORDERORIGINCODE"
]

cols = (
    drop_feature
    + date_feature
    + categorical_feature
    + cat_numeric_feature
    + numerical_feature
    + preprocessing_only
)

# ================================
# STEP 1: BASIC CLEANING
# ================================

def clean_sales_data(sales_df: pd.DataFrame) -> pd.DataFrame:
    df = sales_df.copy()

    # Keep only required columns
    df = df[[c for c in cols if c in df.columns]]

    # Drop descriptive name columns
    df = df.drop(columns=[c for c in drop_feature if c in df.columns])

    # Convert date
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")

    # Drop invalid rows
    df = df.dropna(subset=["InvoiceDate", "ItemNumber"])

    # Fill quantity nulls
    df["INVOICEDQUANTITY"] = df["INVOICEDQUANTITY"].fillna(0)
    df["QTYInKG/Ltr"] = df["QTYInKG/Ltr"].fillna(0)
    df["SALESORDERORIGINCODE"] = df["SALESORDERORIGINCODE"].fillna("unknown")

    return df


# ================================
# STEP 2: DATE FEATURE ENGINEERING
# ================================

def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df["Year"] = df["InvoiceDate"].dt.year
    df["Month"] = df["InvoiceDate"].dt.month
    df["DayOfWeek"] = df["InvoiceDate"].dt.dayofweek
    df["WeekOfYear"] = df["InvoiceDate"].dt.isocalendar().week.astype(int)
    df["IsWeekend"] = df["DayOfWeek"].isin([5, 6]).astype(int)

    return df


# ================================
# STEP 3: AGGREGATION (PATTERN BASE)
# ================================

def aggregate_sales(df: pd.DataFrame) -> pd.DataFrame:
    group_cols = [
        "Year",
        "Month",
        "ItemNumber",
        "State",
        "CustGroup"
    ]

    agg_df = (
        df.groupby(group_cols)
        .agg(
            total_qty=("INVOICEDQUANTITY", "sum"),
            avg_qty=("INVOICEDQUANTITY", "mean"),
            order_count=("InvoiceDate", "nunique")
        )
        .reset_index()
    )

    return agg_df


# ================================
# STEP 4: PATTERN METRICS
# ================================

def compute_pattern_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute CV, zero-ratio, and trend metrics per Item
    """

    metrics = (
        df.groupby("ItemNumber")["total_qty"]
        .agg(
            mean_qty="mean",
            std_qty="std",
            zero_ratio=lambda x: (x == 0).mean()
        )
        .reset_index()
    )

    metrics["cv"] = metrics["std_qty"] / metrics["mean_qty"]
    metrics["cv"] = metrics["cv"].replace([np.inf, -np.inf], np.nan)

    return metrics


# ================================
# STEP 5: PATTERN CLASSIFICATION
# ================================

def classify_patterns(metrics_df: pd.DataFrame) -> pd.DataFrame:
    def classify(row):
        if row["zero_ratio"] > 0.3:
            return "Intermittent"
        elif row["cv"] < 0.5:
            return "Stable"
        elif row["cv"] > 1.0:
            return "Volatile"
        else:
            return "Seasonal"

    metrics_df["demand_pattern"] = metrics_df.apply(classify, axis=1)
    return metrics_df


# ================================
# STEP 6: MASTER FUNCTION
# ================================

def find_sales_patterns(sales_df: pd.DataFrame):
    """
    Main entry point
    """

    df = clean_sales_data(sales_df)
    df = add_date_features(df)

    agg_df = aggregate_sales(df)
    metrics_df = compute_pattern_metrics(agg_df)
    pattern_df = classify_patterns(metrics_df)

    return {
        "clean_data": df,
        "aggregated_data": agg_df,
        "pattern_summary": pattern_df
    }


# ================================
# USAGE
# ================================
result = find_sales_patterns(sales_df)

# result["pattern_summary"]  # Item-wise demand pattern
# result["aggregated_data"]  # Forecast-ready dataset
# result["clean_data"]       # Feature-engineered raw data


In [24]:
result["pattern_summary"]

Unnamed: 0,ItemNumber,mean_qty,std_qty,zero_ratio,cv,demand_pattern
0,100008,9.973333,5.696820,0.0,0.571205,Seasonal
1,100009,21.128750,17.759957,0.0,0.840559,Seasonal
2,100011,15154.133333,28070.770762,0.0,1.852351,Volatile
3,100017,27.027083,18.857202,0.0,0.697715,Seasonal
4,100018,152.465769,446.472366,0.0,2.928345,Volatile
...,...,...,...,...,...,...
4776,422873,157.000000,,0.0,,Seasonal
4777,422874,157.000000,,0.0,,Seasonal
4778,422878,15.000000,,0.0,,Seasonal
4779,422886,600.000000,,0.0,,Seasonal


In [21]:
result["aggregated_data"]

Unnamed: 0,Year,Month,ItemNumber,State,CustGroup,total_qty,avg_qty,order_count
0,2024,10,100008,Dubai,EC01,2.550,0.510000,4
1,2024,10,100008,Dubai,RO01,8.935,0.558438,13
2,2024,10,100009,Dubai,EC01,1.530,0.765000,2
3,2024,10,100009,Dubai,RO01,27.030,0.711316,21
4,2024,10,100011,Dubai,EC01,24.910,0.830333,19
...,...,...,...,...,...,...,...,...
47232,2025,12,422872,Doha,EX01,157.000,157.000000,1
47233,2025,12,422873,Doha,EX01,157.000,157.000000,1
47234,2025,12,422874,Doha,EX01,157.000,157.000000,1
47235,2025,12,422878,Dubai,HS01,15.000,15.000000,1


In [22]:
result["clean_data"]  

Unnamed: 0,InvoiceDate,CustGroup,State,DATAAREAID,CompanyChain,ItemNumber,INVOICEDQUANTITY,QTYInKG/Ltr,SALESORDERORIGINCODE,Year,Month,DayOfWeek,WeekOfYear,IsWeekend
0,2025-12-01,CR01,Dubai,1202,101,400016,2.0,3.0,Format I,2025,12,0,49,0
1,2025-12-01,CR01,Dubai,1202,101,400049,2.0,3.0,Format I,2025,12,0,49,0
2,2025-12-01,CR01,Dubai,1202,101,400710,1.0,1.5,Format I,2025,12,0,49,0
3,2025-12-01,CR01,Dubai,1202,101,400141,1.0,1.5,Format I,2025,12,0,49,0
4,2025-12-01,CR01,Dubai,1202,101,400893,1.0,1.5,Format I,2025,12,0,49,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9016253,2024-11-30,HS01,Dubai,1202,57,400049,4.0,6.0,,2024,11,5,48,1
9016254,2024-11-30,HS01,Dubai,1202,57,400016,1.0,1.5,,2024,11,5,48,1
9016255,2024-11-30,HS01,Dubai,1202,57,400893,2.0,3.0,,2024,11,5,48,1
9016256,2024-11-30,HS01,Dubai,1202,57,400710,2.0,3.0,,2024,11,5,48,1


In [47]:
import pandas as pd
import numpy as np
import plotly.express as px

# ================================
# CONFIGURATION
# ================================

drop_feature = [
    "CompanyName",
    "CustGroupName",
    "CompanyChainName",
    "PRODUCTNAME"
]

date_feature = ["InvoiceDate"]

categorical_feature = ["CustGroup", "State"]

cat_numeric_feature = ["DATAAREAID", "CompanyChain", "ItemNumber"]

numerical_feature = ["INVOICEDQUANTITY", "QTYInKG/Ltr"]

cols = drop_feature + date_feature + categorical_feature + cat_numeric_feature + numerical_feature

# ================================
# STEP 1: CLEANING
# ================================

def clean_sales_data(sales_df: pd.DataFrame) -> pd.DataFrame:
    df = sales_df.copy()
    df = df[[c for c in cols if c in df.columns]]
    df = df.drop(columns=[c for c in drop_feature if c in df.columns])

    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")
    df = df.dropna(subset=["InvoiceDate", "ItemNumber"])

    df["INVOICEDQUANTITY"] = df["INVOICEDQUANTITY"].fillna(0)

    return df

# ================================
# STEP 2: DATE FEATURES
# ================================

def add_date_features(df):
    df = df.copy()
    df["Year"] = df["InvoiceDate"].dt.year
    df["Month"] = df["InvoiceDate"].dt.month
    return df

# ================================
# STEP 3: AGGREGATION
# ================================

def aggregate_sales(df):
    return (
        df.groupby(["ItemNumber", "Year", "Month"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )

# ================================
# STEP 4: YoY % CHANGE
# ================================

def enrich_time_series(agg_df, item_number):
    df = agg_df[agg_df["ItemNumber"] == item_number].copy()

    df["YearMonth"] = pd.to_datetime(
        df["Year"].astype(str) + "-" + df["Month"].astype(str) + "-01"
    )

    df = df.sort_values(["Month", "Year"])

    # ---- YoY % Change (Month-wise) ----
    df["yoy_pct"] = (
        df.groupby("Month")["total_qty"]
        .pct_change() * 100
    )

    return df

# ================================
# STEP 5: PLOTS
# ================================

def plot_month_year_comparison(df, item_number):
    fig = px.line(
        df,
        x="Month",
        y="total_qty",
        color="Year",
        markers=True,
        title=f"Month-wise Sales Comparison – Item {item_number}",
    )
    fig.show()


def plot_yoy_change(df, item_number):
    fig = px.bar(
        df,
        x="Month",
        y="yoy_pct",
        color="Year",
        title=f"YoY % Change per Month – Item {item_number}",
    )
    fig.show()

# ================================
# STEP 6: MASTER FUNCTION
# ================================

def analyze_item(sales_df, item_number):
    df = clean_sales_data(sales_df)
    df = add_date_features(df)
    agg_df = aggregate_sales(df)

    ts_df = enrich_time_series(agg_df, item_number)

    plot_month_year_comparison(ts_df, item_number)
    plot_yoy_change(ts_df, item_number)

# ================================
# USAGE
# ================================

selected_item = 400043
analyze_item(sales_df, selected_item)

In [41]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# ================================
# CONFIGURATION
# ================================

drop_feature = [
    "CompanyName",
    "CustGroupName",
    "CompanyChainName",
    "PRODUCTNAME"
]

date_feature = ["InvoiceDate"]

categorical_feature = ["CustGroup", "State"]

cat_numeric_feature = ["DATAAREAID", "CompanyChain", "ItemNumber"]

numerical_feature = ["INVOICEDQUANTITY", "QTYInKG/Ltr"]

cols = drop_feature + date_feature + categorical_feature + cat_numeric_feature + numerical_feature

# ================================
# STEP 1: CLEANING
# ================================

def clean_sales_data(sales_df: pd.DataFrame) -> pd.DataFrame:
    df = sales_df.copy()
    df = df[[c for c in cols if c in df.columns]]
    df = df.drop(columns=[c for c in drop_feature if c in df.columns])

    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")
    df = df.dropna(subset=["InvoiceDate", "ItemNumber"])

    df["INVOICEDQUANTITY"] = df["INVOICEDQUANTITY"].fillna(0)

    return df

# ================================
# STEP 2: DATE FEATURES
# ================================

def add_date_features(df):
    df = df.copy()
    df["Year"] = df["InvoiceDate"].dt.year
    df["Month"] = df["InvoiceDate"].dt.month
    df["Day"] = df["InvoiceDate"].dt.day
    df["DayOfWeek"] = df["InvoiceDate"].dt.dayofweek  # Monday=0, Sunday=6
    df["WeekOfYear"] = df["InvoiceDate"].dt.isocalendar().week
    return df

# ================================
# STEP 3: AGGREGATION
# ================================

def aggregate_sales(df):
    # Month-wise
    agg_month = (
        df.groupby(["ItemNumber", "Year", "Month"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )

    # Day-of-week
    agg_day = (
        df.groupby(["ItemNumber", "Year", "DayOfWeek"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )

    # Week-of-year
    agg_week = (
        df.groupby(["ItemNumber", "Year", "WeekOfYear"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )

    return agg_month, agg_day, agg_week

# ================================
# STEP 4: YoY % CHANGE
# ================================

def enrich_time_series_month(agg_month_df, item_number):
    df = agg_month_df[agg_month_df["ItemNumber"] == item_number].copy()
    df["YearMonth"] = pd.to_datetime(df["Year"].astype(str) + "-" + df["Month"].astype(str) + "-01")
    df = df.sort_values(["Year", "Month"])
    df["yoy_pct"] = df.groupby("Month")["total_qty"].pct_change() * 100
    return df

def enrich_time_series_week(agg_week_df, item_number):
    df = agg_week_df[agg_week_df["ItemNumber"] == item_number].copy()
    df = df.sort_values(["Year", "WeekOfYear"])
    df["yoy_pct"] = df.groupby("WeekOfYear")["total_qty"].pct_change() * 100
    return df

def enrich_time_series_day(agg_day_df, item_number):
    df = agg_day_df[agg_day_df["ItemNumber"] == item_number].copy()
    df = df.sort_values(["Year", "DayOfWeek"])
    df["yoy_pct"] = df.groupby("DayOfWeek")["total_qty"].pct_change() * 100
    return df

# ================================
# STEP 5: DASHBOARD PLOTS
# ================================

def plot_combined_dashboard(ts_month, ts_week, ts_day, item_number):
    """
    Combined dashboard for a single item:
    - Month-wise trend & YoY
    - Week-wise trend & YoY
    - Day-of-Week trend & YoY
    """
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=(
            "Month-wise Sales", "Month-wise YoY %",
            "Week-wise Sales", "Week-wise YoY %",
            "Day-of-Week Sales", "Day-of-Week YoY %"
        )
    )

    # -------- Month-wise --------
    for year in ts_month["Year"].unique():
        df = ts_month[ts_month["Year"] == year]
        fig.add_trace(go.Scatter(
            x=df["Month"], y=df["total_qty"],
            mode="lines+markers", name=f"{year} Sales",
        ), row=1, col=1)
        fig.add_trace(go.Bar(
            x=df["Month"], y=df["yoy_pct"], name=f"{year} YoY %",
        ), row=1, col=2)

    # -------- Week-wise --------
    for year in ts_week["Year"].unique():
        df = ts_week[ts_week["Year"] == year]
        fig.add_trace(go.Scatter(
            x=df["WeekOfYear"], y=df["total_qty"],
            mode="lines+markers", name=f"{year} Sales",
            showlegend=False
        ), row=2, col=1)
        fig.add_trace(go.Bar(
            x=df["WeekOfYear"], y=df["yoy_pct"],
            name=f"{year} YoY %",
            showlegend=False
        ), row=2, col=2)

    # -------- Day-of-Week --------
    day_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    for year in ts_day["Year"].unique():
        df = ts_day[ts_day["Year"] == year]
        fig.add_trace(go.Scatter(
            x=[day_labels[d] for d in df["DayOfWeek"]],
            y=df["total_qty"],
            mode="lines+markers", name=f"{year} Sales",
            showlegend=False
        ), row=3, col=1)
        fig.add_trace(go.Bar(
            x=[day_labels[d] for d in df["DayOfWeek"]],
            y=df["yoy_pct"],
            name=f"{year} YoY %",
            showlegend=False
        ), row=3, col=2)

    fig.update_layout(
        height=900, width=1200,
        title_text=f"Sales Dashboard – Item {item_number}",
        showlegend=True
    )

    fig.show()

# ================================
# STEP 6: MASTER FUNCTION
# ================================

def analyze_item_dashboard(sales_df, item_number):
    df = clean_sales_data(sales_df)
    df = add_date_features(df)
    agg_month, agg_day, agg_week = aggregate_sales(df)

    ts_month = enrich_time_series_month(agg_month, item_number)
    ts_week = enrich_time_series_week(agg_week, item_number)
    ts_day = enrich_time_series_day(agg_day, item_number)

    # Plot combined dashboard
    plot_combined_dashboard(ts_month, ts_week, ts_day, item_number)

# ================================
# USAGE
# ================================

# Example usage
selected_item = 400043
analyze_item_dashboard(sales_df, selected_item)


In [44]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ================================
# CONFIGURATION
# ================================

drop_feature = [
    "CompanyName",
    "CustGroupName",
    "CompanyChainName",
    "PRODUCTNAME"
]

group_cols = [
    "Date",
    "DayOfWeek",
    "Month",
    "Year",
    "DATAAREAID",
    "CustGroup",
    "State",
    "CompanyChain",
    "ItemNumber"
]

numerical_feature = ["INVOICEDQUANTITY", "QTYInKG/Ltr"]

# ================================
# STEP 1: CLEANING
# ================================

def clean_sales_data(sales_df: pd.DataFrame) -> pd.DataFrame:
    df = sales_df.copy()
    df = df.drop(columns=[c for c in drop_feature if c in df.columns])
    # convert InvoiceDate to datetime
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")
    df = df.dropna(subset=["InvoiceDate", "ItemNumber"])
    df["INVOICEDQUANTITY"] = df["INVOICEDQUANTITY"].fillna(0)
    return df


# ================================
# STEP 2: DATE FEATURES
# ================================

def add_date_features(df):
    df = df.copy()
    df["Year"] = df["InvoiceDate"].dt.year
    df["Month"] = df["InvoiceDate"].dt.month
    df["DayOfWeek"] = df["InvoiceDate"].dt.dayofweek  # Monday=0
    df["WeekOfYear"] = df["InvoiceDate"].dt.isocalendar().week
    return df

# ================================
# STEP 3: AGGREGATION
# ================================

def aggregate_sales(df):
    # Month-wise
    agg_month = (
        df.groupby(["ItemNumber", "Year", "Month"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )

    # Week-wise
    agg_week = (
        df.groupby(["ItemNumber", "Year", "WeekOfYear"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )

    # Day-of-week
    agg_day = (
        df.groupby(["ItemNumber", "Year", "DayOfWeek"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )

    return agg_month, agg_week, agg_day

# ================================
# STEP 4: ENRICH TIME SERIES WITH YoY AND ROLLING
# ================================

def enrich_time_series(df, group_col, rolling_window=None):
    df = df.copy()
    df = df.sort_values(["Year", group_col])
    df["yoy_pct"] = df.groupby(group_col)["total_qty"].pct_change() * 100
    if rolling_window:
        df[f"rolling_avg"] = df["total_qty"].rolling(window=rolling_window, min_periods=1).mean()
    else:
        df["rolling_avg"] = df["total_qty"]
    return df

# ================================
# STEP 5: DASHBOARD PLOTS
# ================================

def plot_dashboard(ts_month, ts_week, ts_day, item_number):
    day_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=(
            "Month-wise Sales", "Month-wise YoY %",
            "Week-wise Sales", "Week-wise YoY %",
            "Day-of-Week Sales", "Day-of-Week YoY %"
        )
    )

    # --- Month ---
    for year in ts_month["Year"].unique():
        df = ts_month[ts_month["Year"] == year]
        fig.add_trace(go.Scatter(
            x=df["Month"], y=df["total_qty"], mode="lines+markers", name=f"{year} Sales"
        ), row=1, col=1)
        fig.add_trace(go.Scatter(
            x=df["Month"], y=df["rolling_avg"], mode="lines", line=dict(dash="dash"),
            name=f"{year} 30-day avg", showlegend=True
        ), row=1, col=1)
        fig.add_trace(go.Bar(
            x=df["Month"], y=df["yoy_pct"], name=f"{year} YoY %"
        ), row=1, col=2)

    # --- Week ---
    for year in ts_week["Year"].unique():
        df = ts_week[ts_week["Year"] == year]
        fig.add_trace(go.Scatter(
            x=df["WeekOfYear"], y=df["total_qty"], mode="lines+markers", name=f"{year} Sales", showlegend=False
        ), row=2, col=1)
        fig.add_trace(go.Scatter(
            x=df["WeekOfYear"], y=df["rolling_avg"], mode="lines", line=dict(dash="dash"), name=f"{year} 4-week avg", showlegend=False
        ), row=2, col=1)
        fig.add_trace(go.Bar(
            x=df["WeekOfYear"], y=df["yoy_pct"], name=f"{year} YoY %", showlegend=False
        ), row=2, col=2)

    # --- Day-of-Week ---
    for year in ts_day["Year"].unique():
        df = ts_day[ts_day["Year"] == year]
        fig.add_trace(go.Scatter(
            x=[day_labels[d] for d in df["DayOfWeek"]], y=df["total_qty"], mode="lines+markers", name=f"{year} Sales", showlegend=False
        ), row=3, col=1)
        fig.add_trace(go.Scatter(
            x=[day_labels[d] for d in df["DayOfWeek"]], y=df["rolling_avg"], mode="lines", line=dict(dash="dash"), name=f"{year} 30-day avg", showlegend=False
        ), row=3, col=1)
        fig.add_trace(go.Bar(
            x=[day_labels[d] for d in df["DayOfWeek"]], y=df["yoy_pct"], name=f"{year} YoY %", showlegend=False
        ), row=3, col=2)

    fig.update_layout(height=900, width=1200, title_text=f"Sales Dashboard – Item {item_number}", showlegend=True)
    fig.show()

# ================================
# MASTER FUNCTION
# ================================

def analyze_item_dashboard(sales_df, item_number):
    df = clean_sales_data(sales_df)
    df = add_date_features(df)
    agg_month, agg_week, agg_day = aggregate_sales(df)

    ts_month = enrich_time_series(agg_month, group_col="Month", rolling_window=30)
    ts_week = enrich_time_series(agg_week, group_col="WeekOfYear", rolling_window=4)
    ts_day = enrich_time_series(agg_day, group_col="DayOfWeek", rolling_window=30)

    plot_dashboard(ts_month, ts_week, ts_day, item_number)

# ================================
# USAGE
# ================================

selected_item = 400043
analyze_item_dashboard(sales_df, selected_item)


In [46]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ================================
# CONFIGURATION
# ================================
drop_feature = [
    "CompanyName",
    "CustGroupName",
    "CompanyChainName",
    "PRODUCTNAME"
]

numerical_feature = ["INVOICEDQUANTITY", "QTYInKG/Ltr"]

# ================================
# STEP 1: CLEANING
# ================================
def clean_sales_data(sales_df: pd.DataFrame) -> pd.DataFrame:
    df = sales_df.copy()
    df = df.drop(columns=[c for c in drop_feature if c in df.columns])
    # Convert InvoiceDate to datetime
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")
    df = df.dropna(subset=["InvoiceDate", "ItemNumber"])
    df["INVOICEDQUANTITY"] = df["INVOICEDQUANTITY"].fillna(0)
    return df

# ================================
# STEP 2: DATE FEATURES
# ================================
def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["Year"] = df["InvoiceDate"].dt.year
    df["Month"] = df["InvoiceDate"].dt.month
    df["DayOfWeek"] = df["InvoiceDate"].dt.dayofweek  # Monday=0
    df["WeekOfYear"] = df["InvoiceDate"].dt.isocalendar().week
    return df

# ================================
# STEP 3: AGGREGATION
# ================================
def aggregate_sales(df: pd.DataFrame):
    # Month-wise
    agg_month = (
        df.groupby(["ItemNumber", "Year", "Month"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )
    # Week-wise
    agg_week = (
        df.groupby(["ItemNumber", "Year", "WeekOfYear"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )
    # Day-of-week
    agg_day = (
        df.groupby(["ItemNumber", "Year", "DayOfWeek"])
        .agg(total_qty=("INVOICEDQUANTITY", "sum"))
        .reset_index()
    )
    return agg_month, agg_week, agg_day

# ================================
# STEP 4: ENRICH TIME SERIES WITH YoY AND ROLLING
# ================================
def enrich_time_series(df, time_col, freq="M", rolling_window=None):
    """
    df: aggregated sales data
    time_col: column to sort by ('Month', 'WeekOfYear', 'DayOfWeek')
    freq: 'M' = month, 'W' = week, 'D' = day-of-week
    rolling_window: window size for rolling average
    """
    df = df.copy()
    df = df.sort_values(["ItemNumber", "Year", time_col])

    # YoY % calculation
    if freq == "M":
        df["yoy_pct"] = df.groupby(["ItemNumber", time_col])["total_qty"].pct_change() * 100
    elif freq == "W":
        df["yoy_pct"] = df.groupby(["ItemNumber", time_col])["total_qty"].pct_change() * 100
    else:  # Day-of-week
        df["yoy_pct"] = df.groupby(["ItemNumber", time_col])["total_qty"].pct_change() * 100

    # Rolling average
    if rolling_window:
        df["rolling_avg"] = df.groupby("ItemNumber")["total_qty"].transform(
            lambda x: x.rolling(rolling_window, min_periods=1).mean()
        )
    else:
        df["rolling_avg"] = df["total_qty"]

    return df

# ================================
# STEP 5: DASHBOARD PLOTS
# ================================
def plot_dashboard(ts_month, ts_week, ts_day, item_number):
    day_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=(
            "Month-wise Sales", "Month-wise YoY %",
            "Week-wise Sales", "Week-wise YoY %",
            "Day-of-Week Sales", "Day-of-Week YoY %"
        )
    )

    # --- Month ---
    for year in ts_month["Year"].unique():
        df = ts_month[ts_month["Year"] == year]
        fig.add_trace(go.Scatter(
            x=df["Month"], y=df["total_qty"], mode="lines+markers", name=f"{year} Sales"
        ), row=1, col=1)
        fig.add_trace(go.Scatter(
            x=df["Month"], y=df["rolling_avg"], mode="lines", line=dict(dash="dash"),
            name=f"{year} Rolling Avg", showlegend=True
        ), row=1, col=1)
        fig.add_trace(go.Bar(
            x=df["Month"], y=df["yoy_pct"], name=f"{year} YoY %"
        ), row=1, col=2)

    # --- Week ---
    for year in ts_week["Year"].unique():
        df = ts_week[ts_week["Year"] == year]
        fig.add_trace(go.Scatter(
            x=df["WeekOfYear"], y=df["total_qty"], mode="lines+markers", name=f"{year} Sales", showlegend=False
        ), row=2, col=1)
        fig.add_trace(go.Scatter(
            x=df["WeekOfYear"], y=df["rolling_avg"], mode="lines", line=dict(dash="dash"), name=f"{year} Rolling Avg", showlegend=False
        ), row=2, col=1)
        fig.add_trace(go.Bar(
            x=df["WeekOfYear"], y=df["yoy_pct"], name=f"{year} YoY %", showlegend=False
        ), row=2, col=2)

    # --- Day-of-Week ---
    for year in ts_day["Year"].unique():
        df = ts_day[ts_day["Year"] == year]
        fig.add_trace(go.Scatter(
            x=[day_labels[d] for d in df["DayOfWeek"]], y=df["total_qty"], mode="lines+markers", name=f"{year} Sales", showlegend=False
        ), row=3, col=1)
        fig.add_trace(go.Scatter(
            x=[day_labels[d] for d in df["DayOfWeek"]], y=df["rolling_avg"], mode="lines", line=dict(dash="dash"), name=f"{year} Rolling Avg", showlegend=False
        ), row=3, col=1)
        fig.add_trace(go.Bar(
            x=[day_labels[d] for d in df["DayOfWeek"]], y=df["yoy_pct"], name=f"{year} YoY %", showlegend=False
        ), row=3, col=2)

    fig.update_layout(height=900, width=1200, title_text=f"Sales Dashboard – Item {item_number}", showlegend=True)
    fig.show()

# ================================
# MASTER FUNCTION
# ================================
def analyze_item_dashboard(sales_df, item_number):
    df = clean_sales_data(sales_df)
    df = add_date_features(df)
    
    # Filter by selected item
    df = df[df["ItemNumber"] == item_number]

    # Aggregate
    agg_month, agg_week, agg_day = aggregate_sales(df)

    # Enrich with rolling and YoY
    ts_month = enrich_time_series(agg_month, time_col="Month", freq="M", rolling_window=3)  # 3-month rolling
    ts_week = enrich_time_series(agg_week, time_col="WeekOfYear", freq="W", rolling_window=4)  # 4-week rolling
    ts_day = enrich_time_series(agg_day, time_col="DayOfWeek", freq="D", rolling_window=7)  # 7-day rolling

    # Plot
    plot_dashboard(ts_month, ts_week, ts_day, item_number)

    # Return processed data for inspection/export
    return ts_month, ts_week, ts_day

# ================================
# USAGE
# ================================
# Example: analyze sales for ItemNumber 400043
selected_item = 400043
ts_month, ts_week, ts_day = analyze_item_dashboard(sales_df, selected_item)
