# Initial Exploration

In [None]:
import os
from datetime import datetime

# Data manipulation
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
import numpy as np

# Time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

from belly_rubb.config import REPORTS_DIR, INTERIM_DATA_DIR, RAW_DATA_DIR

pd.set_option('display.max_columns', None)

In [None]:
# Build a single DataFrame from every orders CSV export in the raw data folder.
# The folder is resolved from the project config (RAW_DATA_DIR) instead of a
# path relative to the notebook's working directory.
# NOTE(review): assumes RAW_DATA_DIR points at the same `data/raw` folder the
# previous relative path ('../data/raw/orders') referred to - confirm.
orders_dir = RAW_DATA_DIR / 'orders'

# Directory listings come back in arbitrary order; sorting keeps the row order
# of the combined frame reproducible between runs and machines.
csv_files = sorted(orders_dir.glob('*.csv'))

dataframes = [pd.read_csv(file_path) for file_path in csv_files]

# Combine dataframes into one
df = pd.concat(dataframes, ignore_index=True)
df.head()

In [None]:
# Save aggregated orders data to csv

df.to_csv(INTERIM_DATA_DIR / 'orders.csv', index=False)

In [None]:
print(f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}")

In [None]:
df.columns

In [None]:
df.info()

In [None]:
# Convert date columns to datetime

df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Fulfillment Date'] = pd.to_datetime(df['Fulfillment Date'], format='%m/%d/%Y, %I:%M %p')

# Data Quality Checks
- [x] Investigate missing values
- [x] Identify static columns
- [x] Check for duplicates
- [x] Validate data types
- [x] Spot outliers

## Missing values and static columns

In [None]:
# Identify empty or static columns

for col in df.columns:
    unique_values = set(df[col].dropna().unique())
    if len(unique_values) == 0:
        print(f"Column '{col}' is empty.")
    elif len(unique_values) == 1:
        print(f"Column '{col}' has a single unique value: {unique_values.pop()}")

In [None]:
# List distinct Recipient Country values

df['Recipient Country'].unique()

In [None]:
zz_country = df[df['Recipient Country'] == 'ZZ']
zz_country

In [None]:
len(zz_country)

In [None]:
# View missing value percentages

df.isna().sum().div(len(df))*100

### Conclusions
* `Currency`, `Fulfillment Location`, and `Recipient Region` have a single unique value.
* `Order Shipping Price`, `Order Refunded Amount`, and `Item SKU` are empty.
* `Recipient Country` has 370 rows with the value `ZZ`.
* Geographic details such as `Recipient Address` and `Recipient Postal Code` are missing more than **87%** of their data.

## Null values

In [None]:
# View rows with all null values

df[df.isna().all(axis=1)]

### Conclusions

No row is missing **all** values.

## Duplicates

In [None]:
# Check for duplicates

duplicates = df.duplicated()
df[duplicates]

In [None]:
# Check duplicate 'Armen 59-07' rows

df[(df['Order'] == 'Armen 59-07') & (df['Order Date'] == '2024/08/31')]

In [None]:
# Check duplicate Troy Issac rows

df[(df['Order'] == 'Troy Issac') & (df['Order Date'] == '2024/12/19')]

### Conclusions

* Each order is split into multiple rows, one for each menu item.
* However, menu items are not grouped together. For example, one order can have multiple rows with `CRINKLE FRIES` as the `Item Name`.
* If items are part of separate combos or groupings, they are listed separately.

## Outliers

In [None]:
def calculate_iqr(df: pd.DataFrame, col: str) -> tuple:
    """
    Compute the 1.5 * IQR outlier fences for one column of a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame holding the column.
        col (str): The name of the column to calculate the fences for.

    Returns:
        tuple: ``(lower_bound, upper_bound)``, i.e. ``Q1 - 1.5 * IQR`` and
        ``Q3 + 1.5 * IQR`` where ``IQR = Q3 - Q1``.
    """
    values = df[col]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread


### Order Total

In [None]:
# Identify outliers in Order Total

# IQR fences for the column; `upper_bound` is reused by the following cells.
lower_bound, upper_bound = calculate_iqr(df=df, col='Order Total')

plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Order Total', color='skyblue')

plt.title('Distribution of Order Total', fontsize=16, fontweight='bold')
plt.xlabel('Order Total ($)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# Reference lines: mean / median, then the IQR fences.
plt.axvline(x=df['Order Total'].mean(), color='red', linestyle='--', label='Mean')
plt.axvline(x=df['Order Total'].median(), color='green', linestyle='--', label='Median')
# The lower fence is only drawn when it is positive.
if lower_bound > 0:
    plt.axvline(x=lower_bound, color='orange', linestyle='-.', label='Lower Bound')
plt.axvline(x=upper_bound, color='orange', linestyle='-.', label='Upper Bound')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Print outlier information

num_order_total_outliers = len(df[df['Order Total'] > upper_bound])
print(f"Number of outliers in 'Order Total': {num_order_total_outliers}")
print(f"Percentage of total dataset: {num_order_total_outliers / len(df) * 100:.2f}%")

In [None]:
# View Order Total outliers

order_total_outliers = df[df['Order Total'] > upper_bound].sort_values(by='Order Total', ascending=False)
order_total_outliers

#### Conclusions

* Outliers in `Order Total` are simply large orders.
* There does not seem to be data entry mistakes or suspicious activity.

### Item Price

In [None]:
# Identify outliers in Item Price

# IQR fences for the column; `upper_bound` is reused by the following cells.
lower_bound, upper_bound = calculate_iqr(df=df, col='Item Price')

plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Item Price', color='skyblue')

plt.title('Distribution of Item Price', fontsize=16, fontweight='bold')
plt.xlabel('Item Price ($)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# Reference lines: mean / median, then the IQR fences.
plt.axvline(x=df['Item Price'].mean(), color='red', linestyle='--', label='Mean')
plt.axvline(x=df['Item Price'].median(), color='green', linestyle='--', label='Median')
# The lower fence is only drawn when it is positive.
if lower_bound > 0:
    plt.axvline(x=lower_bound, color='orange', linestyle='-.', label='Lower Bound')
plt.axvline(x=upper_bound, color='orange', linestyle='-.', label='Upper Bound')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# View Item Price outlier information

num_item_price_outliers = len(df[df['Item Price'] > upper_bound])
print(f"Number of outliers in 'Item Price': {num_item_price_outliers}")
print(f"Percentage of total dataset: {num_item_price_outliers / len(df) * 100:.2f}%")

In [None]:
# List Item Price outliers

item_price_outliers = df[df['Item Price'] > upper_bound].sort_values(by='Item Price', ascending=False)
item_price_outliers

In [None]:
# List Item Price outlier Items

item_price_outliers['Item Name'].unique()

In [None]:
# View Item Price outlier rows that have no Item Name

item_price_outliers[item_price_outliers['Item Name'].isna()]

#### Conclusions

* Outliers in `Item Price` are catering packages, platters, bundles, and more expensive combos.
* The outlier with no `Item Name` information appears to be a custom order lacking much information.

### Item Options Total Price

In [None]:
# Identify outliers in Item Options Total Price

# IQR fences for the column; `upper_bound` is reused by the following cells.
lower_bound, upper_bound = calculate_iqr(df=df, col='Item Options Total Price')

plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Item Options Total Price', color='skyblue')

plt.title('Distribution of Item Options Total Price', fontsize=16, fontweight='bold')
plt.xlabel('Item Options Total Price ($)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# Reference lines: mean / median, then the IQR fences.
plt.axvline(x=df['Item Options Total Price'].mean(), color='red', linestyle='--', label='Mean')
plt.axvline(x=df['Item Options Total Price'].median(), color='green', linestyle='--', label='Median')
# The lower fence is only drawn when it is positive.
if lower_bound > 0:
    plt.axvline(x=lower_bound, color='orange', linestyle='-.', label='Lower Bound')
plt.axvline(x=upper_bound, color='orange', linestyle='-.', label='Upper Bound')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
item_options_total_price_outliers = df[df['Item Options Total Price'] > upper_bound].sort_values(by='Item Options Total Price', ascending=False)
item_options_total_price_outliers

In [None]:
# Print number of rows shared by all price outliers

merged_df = pd.merge(left=item_price_outliers, right=item_options_total_price_outliers, left_index=True, right_index=True, how='inner', suffixes=('_item_price', '_item_options_total_price'))
merged_df = pd.merge(left=merged_df, right=order_total_outliers, left_index=True, right_index=True, how='inner', suffixes=('', '_order_total'))

print(f"Number of merged outliers: {len(merged_df)}")

#### Conclusions

Rows which have a high `Item Price` also have a high `Item Options Total Price` and `Order Total`, signaling a relationship between these features.

In [None]:
# Verify results with correlation heatmap

plt.figure(figsize=(12, 6))

plt.title('Correlation Matrix', fontsize=16, fontweight='bold')
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".2f", cmap='coolwarm', cbar=True, center=0)

plt.tight_layout()
plt.show()

# Business Logic Validation

- [x] Investigate tax calculation
- [x] Confirm total calculated correctly from subtotal
- [x] Check for canceled/voided/refunded orders

## Total Calculation

In [None]:
# Inspect rows where Order Subtotal and Order Tax Total don't add up to Order Total

tax_rate_valid = np.isclose((df['Order Subtotal'] + df['Order Tax Total']), df['Order Total'])
df[~tax_rate_valid].head()

It is not immediately clear why `Order Subtotal` and `Order Tax Total` do not add up to `Order Total` in all rows. Possible reasons include added fees or tips, which are not recorded in this dataset.

## Tax Calculation

In [None]:
# Calculate effective tax rates

tax_rate_valid_df = df[tax_rate_valid]
(tax_rate_valid_df['Order Tax Total']/tax_rate_valid_df['Order Subtotal']).round(decimals=2).unique()

In [None]:
# Count occurrences of each tax rate

df['Tax Rate'] = (df['Order Tax Total']/df['Order Subtotal']).round(decimals=2)
df.groupby(by='Tax Rate').size()

In [None]:
# Order with no tax calculated

df[df['Tax Rate'] == 0.00]

In [None]:
df[df['Tax Rate'] == 0.05]

The statewide tax rate in California is **7.25%**. In Los Angeles, the combined sales tax rate (state and local) is **9.50%**. While the majority of orders have an effective rate between 9% and 10%, a small number have a much lower tax rate.

## Canceled/Voided/Refunded orders

In [None]:
df['Fulfillment Status'].unique()

In [None]:
# View cancelled orders

df[df['Fulfillment Status'] == 'Canceled']

In [None]:
# Generate pseudo order id to be able to group orders

df['pseudo_order_id'] = df['Order Name'].str.split(' ').str[0] + '_' + df['Order Date'].astype(str)
df['pseudo_order_id']

In [None]:
# Calculate number of canceled orders

cancelled_orders = df[df['Fulfillment Status'] == 'Canceled']
num_canceled_orders = cancelled_orders['pseudo_order_id'].nunique()

print(f"Number of canceled orders: {num_canceled_orders}")

In [None]:
# View New orders

df[df['Fulfillment Status'] == 'New']

In [None]:
# View orders with no fulfillment status

null_fulfillment_status = df[df['Fulfillment Status'].isnull()]
null_fulfillment_status

In [None]:
null_fulfillment_status.info()

In [None]:
# Investigate Order Name for orders with no fulfillment status

null_fulfillment_status['Order Name'].unique()

In [None]:
# Count fulfillment types including null values

df['Fulfillment Type'].value_counts(dropna=False)

In [None]:
# List Channels of orders with no fulfillment status

null_fulfillment_status['Channels'].unique()

In [None]:
# List refunded orders

df[df['Order Refunded Amount'] > 0]

### Conclusions
* There were a total of **6** cancelled orders.
* **2** orders were listed as **New**. These were probably in progress at the time of capturing the data.
* **1073** rows have no `Fulfillment Status` listed.
    * These orders have primarily numeric `Order Name`.
    * They are also missing `Fulfillment Type`.
    * The single `Channel` for these rows is `BELLY RUBB - BBQ Ribs to Go & Catering`
* There are no refunded orders. This probably means that `Order Refunded Amount` does not apply when `Fulfillment Status` is `Canceled`.

# Customer-Level Insights

- [x] Count unique customers
- [x] Identify repeat vs. new customers
- [x] Explore order frequency (days between orders) for repeat customers
- [x] Check for missing/anonymous customer records

## Unique Customers

In [None]:
# Calculate percentage of records where Order Name is the same as Recipient Name

print(f"Percentage of records where Order Name is the same as Recipient Name: {np.round(len(df[df['Order Name'] == df['Recipient Name']]) / len(df) * 100, decimals=2)}%")

In [None]:
# Investigate records where Order Name is not the same as Recipient Name

order_recipient_name_neq = df[~(df['Order Name'] == df['Recipient Name'])]
order_recipient_name_neq.head()

In [None]:
# List of recipient names

order_recipient_name_neq['Recipient Name'].unique()

In [None]:
# List of order names

order_recipient_name_neq['Order Name'].unique()

In [None]:
order_recipient_name_neq['Channels'].unique()

In [None]:
# Count number of unique recipients

# `nunique()` skips NaN. `len(...unique())` would count the missing
# Recipient Name as one extra "customer", inflating the total by one.
# Re-check any customer count quoted in the write-up after re-running.
print(f"Number of unique customers: {df['Recipient Name'].nunique()}")

### Conclusions
* **822** unique customers have ordered from the restaurant.
* In **73%** of records `Order Name` and `Recipient Name` are equal.
    * This allows us to rely on `Recipient Name` to later investigate repeat customers.
* The rest all have a missing `Recipient Name` and mostly cryptic `Order Name` values.
    * These records are also those with null `Fulfillment Status`.
    * Since they all have a single value for `Channel`, namely `BELLY RUBB - BBQ Ribs To Go & Catering`, it can be assumed that these are manually entered entries.

## Repeat Customers

In [None]:
# Calculate number of repeat orders

order_names = df.groupby(by='pseudo_order_id')['Recipient Name'].unique().str[0]
repeat_counts = order_names.value_counts()
repeat_customers = repeat_counts[repeat_counts > 1]

print(f"Number of repeat customers: {len(repeat_customers)}")
print(f"Percentage of customers who are repeat customers: {(len(repeat_customers) / repeat_counts.shape[0]) * 100:.2f}%")

In [None]:
# Visualize Repeat Customer Frequency Distribution

recipient_freq = repeat_customers.value_counts(normalize=True).sort_index()

plt.figure(figsize=(12, 6))

plt.title('Repeat Customer Frequency Distribution', fontsize=16, fontweight='bold')
plt.bar(recipient_freq.index, recipient_freq.values)

plt.xlabel('Number of Repeat Orders', fontsize=14)
plt.ylabel('Percentage of Customers', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Visualize Cumulative Distribution of Repeat Customers

plt.figure(figsize=(12, 6))

plt.hist(repeat_customers, cumulative=True, density=True, bins=range(2, repeat_customers.max() + 2), edgecolor='black', color='skyblue', alpha=0.7)

plt.title('Cumulative Distribution of Repeat Customers', fontsize=16, fontweight='bold')
plt.xlabel('Number of Repeat Orders', fontsize=14)
plt.ylabel('Density', fontsize=14)

plt.show()

### Conclusions
* Approximately **10.87%** of customers are repeat customers, i.e. customers who have ordered at least twice.
* The distribution of repeat orders is heavily **right-skewed**, as expected.
    * About **60%** of repeat customers have ordered **2** times.
    * About **80%** of repeat customers have ordered **4 or fewer** times.

## Order frequency for repeat customers

In [None]:
def calculate_days_difference(date1: datetime, date2: datetime) -> int:
    """
    Return how many whole days lie between ``date1`` and ``date2``.

    The result is the ``days`` component of ``date2 - date1``, so it is
    negative when ``date2`` is earlier than ``date1``.

    Args:
        date1 (datetime): The starting date.
        date2 (datetime): The ending date.

    Returns:
        int: The ``days`` attribute of the difference ``date2 - date1``.
    """
    elapsed = date2 - date1
    return elapsed.days

In [None]:
# Create a boolean column flagging rows whose Recipient Name belongs to a repeat customer

df['repeat_customer'] = df['Recipient Name'].isin(repeat_customers.index)
df.head()

In [None]:
# Create a dataframe for repeat customers

repeat_customers_df = df[(df['repeat_customer']) & (df['Fulfillment Status'] == 'Completed')]
repeat_customers_df = repeat_customers_df.sort_values(by=['Recipient Name', 'Order Date'])
repeat_customers_df.head()

In [None]:
# Generate dataframe with one row per order for repeat customers

order_df = repeat_customers_df.groupby(by=['pseudo_order_id']).agg({
    'Order Date': 'first',
    'Recipient Name': 'first'
}).sort_values(by=['Recipient Name', 'Order Date']).reset_index()

order_df.head()

In [None]:
# Add column for previous order date

order_df['previous_order_date'] = order_df.groupby(by='Recipient Name')['Order Date'].shift(1)
order_df.head()

In [None]:
# Calculate days since last order per customer

order_df['days_since_last_order'] = order_df['Order Date'].sub(order_df['previous_order_date']).dt.days
avg_days_since_last_order = order_df.groupby(by='Recipient Name')['days_since_last_order'].mean().sort_values()
avg_days_since_last_order

In [None]:
# Visualize distribution of average days between orders
# (one value per customer: the mean gap, in days, between consecutive orders)

plt.figure(figsize=(12, 6))
plt.title("Distribution of Average Days Since Last Order", fontsize=16, fontweight='bold')
plt.hist(avg_days_since_last_order, bins=50, color='skyblue', edgecolor='black', alpha=0.7)

plt.xlabel('Average Days Since Last Order', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

mean_avg_days = avg_days_since_last_order.mean()
median_avg_days = avg_days_since_last_order.median()

# Mark mean and median; the value labels are nudged to the right of each line
# so they do not sit on top of it.
plt.axvline(mean_avg_days, color='red', linestyle='dashed', linewidth=1, label='Mean')
plt.text(mean_avg_days*1.1, plt.ylim()[1]*0.8, np.round(mean_avg_days, 1), color='red', ha='center')
plt.axvline(median_avg_days, color='blue', linestyle='dashed', linewidth=1, label='Median')
plt.text(median_avg_days*1.15, plt.ylim()[1]*0.8, np.round(median_avg_days, 1), color='blue', ha='center')
plt.legend()

plt.tight_layout()
plt.show()

### Conclusions
* The *average number of days* between repeat orders is **54.2 days**.
* The *median number of days* between repeat orders is **35 days**.
* Distribution of average days between orders is also heavily **right-skewed**.

## Missing Customer Records

In [None]:
# Inspect customer record columns

customer_record_cols = ['Recipient Name', 'Recipient Email', 'Recipient Phone', 'Recipient Address', 'Recipient Postal Code', 'Recipient City', 'Recipient Region', 'Recipient Country']

df[customer_record_cols].info()

In [None]:
# Calculate percentage of records missing for each customer record column

np.round(df[customer_record_cols].isna().sum() / len(df) * 100, 2)

In [None]:
# Inspect customer records with missing Recipient Name

df[df['Recipient Name'].isna()].head()

### Conclusions
* **87%** of recipient address information is missing.
* **50%** of recipient emails are missing.
* **26%** of recipient names are missing

# Time-Based Patterns

- [x] Confirm dataset start and end dates
- [x] Explore order volume by day of week
- [x] Explore order volume by hour of day
- [x] Look for seasonality

## Confirm dataset start and end dates

In [None]:
# Verify order date is datetime

print(f"Order Date is datetime: {is_datetime(df['Order Date'])}")
print(f"Fulfillment Date is datetime: {is_datetime(df['Fulfillment Date'])}")

In [None]:
# Create time series DataFrame with each row corresponding to one order and Order Date as index

time_series_df = df.groupby('pseudo_order_id').agg({
    'Order Date': 'first',
    'Order Total': 'first'
}).sort_values(by='Order Date').reset_index().set_index('Order Date')

In [None]:
# Create time series DataFrame with each row corresponding to one order and Fulfillment Date as index

time_series_fulfillment_df = df.groupby('pseudo_order_id').agg({
    'Fulfillment Date': 'first',
    'Order Total': 'first'
}).sort_values(by='Fulfillment Date').reset_index().set_index('Fulfillment Date')

time_series_fulfillment_df.head()

In [None]:
# Inspect time series DataFrame

time_series_df.head()

In [None]:
# Print earliest and latest order date to confirm range

print(f"Earliest order date: {time_series_df.index.min()}")
print(f"Latest order date: {time_series_df.index.max()}")

In [None]:
# Print earliest and latest fulfillment date to confirm range
# (labels name the fulfillment date so this output is not confused with the
# order-date range printed from `time_series_df`)

print(f"Earliest fulfillment date: {time_series_fulfillment_df.index.min()}")
print(f"Latest fulfillment date: {time_series_fulfillment_df.index.max()}")

### Conclusions
* `Fulfillment Date` is missing for orders before **12/5/2023**.
* `Order date` is available for all orders and spans the entire dataset.

## Explore order volume by day of week

In [None]:
# Resample frequency to daily and add day of week column

day_of_week_orders = time_series_df.resample('D').size().reset_index(name='orders_count')
day_of_week_orders['day_of_week'] = day_of_week_orders['Order Date'].dt.day_name()
day_of_week_orders.head()

In [None]:
# Calculate average number of orders per day of week

# Display order for the weekday index (Sunday first).
week_day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

day_of_week_means = day_of_week_orders.groupby(by='day_of_week')['orders_count'].mean().reindex(week_day_order)
# Show all seven days: `.head()` only displays five rows and would hide
# Friday and Saturday.
day_of_week_means

In [None]:
# Calculate median number of orders per day of week

day_of_week_medians = day_of_week_orders.groupby(by='day_of_week')['orders_count'].median().reindex(week_day_order)
# Show all seven days: `.head()` only displays five rows and would hide
# Friday and Saturday.
day_of_week_medians

In [None]:
# Plot sales by day of the week

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), sharey=True)

fig.suptitle('Orders per Day of Week', fontsize=16, fontweight='bold')

day_of_week_means.plot(ax=ax[0])
ax[0].set_title('Mean Orders', fontsize=14, fontweight='bold')
ax[0].set_xlabel('', fontsize=12)

day_of_week_medians.plot(ax=ax[1])
ax[1].set_title('Median Orders', fontsize=14, fontweight='bold')
ax[1].set_xlabel('', fontsize=12)

fig.supylabel('Number of Orders', fontsize=12)
fig.supxlabel('Day of Week', fontsize=12)

ax[0].grid(visible=True, axis='both', linestyle='--', alpha=0.7)
ax[1].grid(visible=True, axis='both', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

### Conclusions
* Trends between *mean* and *median* orders per day of week are similar.
* **Fridays** have the highest number of orders, followed by **Tuesdays**.
* Some orders have been taken on Sundays and Mondays, but this is irrelevant since they are outside business hours.
* **Saturdays**, **Wednesdays**, and **Thursdays** share similar *mean* and *median* order quantities.

## Explore order volume by hour of day

In [None]:
time_series_fulfillment_df.head()

In [None]:
# Resample data to hourly frequency

hour_of_day_orders = time_series_fulfillment_df.resample('h').size().reset_index(name='orders_count')

hour_of_day_orders['hour_of_day'] = hour_of_day_orders['Fulfillment Date'].dt.hour
hour_of_day_orders.head()

In [None]:
# Calculate mean and total orders per hour of day
# (the mean is taken over every hourly bucket produced by the resample above)

hour_of_day_means = hour_of_day_orders.groupby(by='hour_of_day')['orders_count'].mean()
hour_of_day_totals = hour_of_day_orders.groupby('hour_of_day')['orders_count'].sum()

In [None]:
# Plot sales by hour

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

fig.suptitle('Orders per Hour of Day', fontsize=16, fontweight='bold')

hour_of_day_means.plot(ax=ax[0])
ax[0].set_title('Mean Orders', fontsize=14, fontweight='bold')
ax[0].set_ylabel('Mean Orders', fontsize=12)
ax[0].set_xlabel('')

hour_of_day_totals.plot(ax=ax[1])
ax[1].set_title('Total Orders', fontsize=14, fontweight='bold')
ax[1].set_ylabel('Total Orders', fontsize=12)
ax[1].set_xlabel('')

fig.supxlabel('Hour of Day', fontsize=12)

ax[0].grid(visible=True, axis='both', linestyle='--', alpha=0.7)
ax[1].grid(visible=True, axis='both', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

### Conclusions
* `Order Date` does not store timestamp data, therefore `Fulfillment Date` was used to get hourly order data.
* Peak order time is around **7pm**
* Orders **decrease** closer to closing time.

## Seasonality

In [None]:
# Check for weekly seasonality

result = seasonal_decompose(day_of_week_orders['orders_count'], period=7)

fig = result.plot()
fig.set_size_inches(12, 6)
plt.show()

### Conclusions
* Since there are only **22** months' worth of data (fewer than two full years), we cannot reliably check for *monthly (month-of-year) seasonality*.
* Seasonal decomposition confirms that there is a **weekly** seasonal cycle.

# Menu and Product-Level Insights

- [x] Count unique items sold
- [x] Check for item categories (main, side, etc.)
- [x] Rank top-selling items (by quantity, revenue)

## Unique Items Sold

In [None]:
df.head()

In [None]:
# Inspect quantity of unique item names

# `nunique()` skips NaN. `len(...unique())` would count rows with a missing
# Item Name as one extra item. Re-check any item count quoted in the write-up
# after re-running.
print(f"Number of unique item names: {df['Item Name'].nunique()}")

In [None]:
# Get unique number of item and item variation combinations

name_vars_counts = df.groupby(by='Item Name')['Item Variation'].nunique().sort_values(ascending=False).reset_index(name='count')
print(f"Number of unique item and variation combinations: {np.sum(name_vars_counts['count'])}")

In [None]:
# Drop rows with missing Item Name

df = df.dropna(subset='Item Name')
df['Item Name'].isna().any()

* Belly Rubb offers **104** unique items based on `Item Name`.
* **135** unique combinations of items and their variations have been sold.

## Item Aliases

This section explores the various spellings for menu items.

In [None]:
# Import item information for comparison

items_df = pd.read_csv(RAW_DATA_DIR / 'MLW4W4RYAASNM_catalog-2025-08-26-2046.csv')
items_df.head()

In [None]:
from rapidfuzz.fuzz import token_set_ratio

def is_fuzzy_match(item_name: str, catalog_item: str, threshold: int = 85) -> bool:
    """
    Check if two items are a fuzzy match.

    Both names are lower-cased and compared with ``token_set_ratio``; they
    match when the score is strictly greater than ``threshold``.

    Params:
        item_name (str): The name of the item to match.
        catalog_item (str): The proper name of the item in the catalog.
        threshold (int): The matching threshold (default is 85). A score equal
            to the threshold does not count as a match.

    Returns:
        bool: True if the items are a fuzzy match, False otherwise. Non-string
        input (e.g. NaN/None from a column with missing values) is never a
        match.
    """
    # Missing values arrive as float NaN / None when this is applied to a
    # column that still contains nulls; `.lower()` would raise on them.
    if not isinstance(item_name, str) or not isinstance(catalog_item, str):
        return False

    return token_set_ratio(item_name.lower(), catalog_item.lower()) > threshold

In [None]:
# Create category feature

# Keep only the text before the first " (" in Reporting Category.
# The pattern is a raw string: "\(" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
items_df['category'] = items_df['Reporting Category'].str.split(r" \(", expand=True)[0]
items_df

In [None]:
items_df['category'].unique()

### NaN

In [None]:
items_df[items_df['category'].isna()]

#### Pulled Beef Sliders

This item is marked as `unavailable` on Square.

In [None]:
pulled_beef_orders = df[df['Item Name'].apply(lambda x: is_fuzzy_match(x, 'Pulled Beef Sliders (12pcs)', threshold=71))]
pulled_beef_orders['Item Name'].unique()

#### Pear+Gorgonzola Salad (Full Pan)

This item is marked as `unavailable` on Square.

In [None]:
pear_full_orders = df[df['Item Name'].apply(lambda x: is_fuzzy_match(x, 'Pear+Gorgonzola Salad (Full Pan)'))]
pear_full_orders['Item Name'].unique()

In [None]:
df[df['Item Name'].str.lower().str.contains('pear')]['Item Name'].unique()

### Party Packages

In [None]:
# View party package catalog items

party_packages = items_df[items_df['category'] == 'PARTY PACKAGE'].dropna(axis=1)
party_packages

In [None]:
# Get party package orders

party_package_orders = df.dropna(subset='Item Name')
party_package_orders = party_package_orders[party_package_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'party package'))]
party_package_orders['Item Name'].unique()

#### Party Package (10-12 ppl)

In [None]:
# Select 10-12ppl party package orders

party_package_ten = df.dropna(subset='Item Name')

party_package_ten = party_package_ten[party_package_ten['Item Name'].apply(lambda x: is_fuzzy_match(x, 'Party Package (10-12ppl)', threshold=92))]
party_package_ten['Item Name'].unique()

In [None]:
# Inspect differences in order date and price

# One row per spelling of the item name: first/last order date and the first
# recorded price. `item_price` must read 'Item Price' (it previously
# aggregated 'Order Date', so the price column showed a date).
party_package_ten.groupby(by='Item Name').agg(
    min_order_date = ('Order Date', 'min'),
    max_order_date = ('Order Date', 'max'),
    item_price = ('Item Price', 'first')
).sort_values(by='min_order_date')

In [None]:
party_package_ten['Item Variation'].unique()

In [None]:
# Inspect outlier

df[df['Item Name'] == 'Party Package (10-12ppl)']

* There is **one** order of this party package with no space in *(10-12 ppl)*.
    * This is the first and only order with that alias.


#### Party Package (6-8 ppl)

In [None]:
# Get orders of party package (6-8 ppl)

party_package_six = df.dropna(subset='Item Name')

party_package_six = party_package_six[party_package_six['Item Name'].apply(lambda x: is_fuzzy_match(x, 'Party Package (6-8 ppl)', threshold=92))]
party_package_six['Item Name'].unique()

In [None]:
# Inspect order date and price differences between aliases

party_package_six.groupby(by='Item Name').agg(
    min_order_date = ('Order Date', 'min'),
    max_order_date = ('Order Date', 'max'),
    item_price = ('Item Price', 'first')
).sort_values(by='min_order_date')

In [None]:
# Inspect outlier order with an extra space

df[df['Item Name'] == 'Party Package  (6-8 ppl)']

In [None]:
party_package_six['Item Variation'].unique()

* There is an **extra space** in the most recent order of the *Party Package (6-8 ppl)*.
    * Since it is only one order and it is placed through a *Payment Link*, it is likely a manual entry typo.

#### Party Package (4-6 ppl)

In [None]:
# Get party package (4-6 ppl) orders

party_package_four = df.dropna(subset='Item Name')

party_package_four = party_package_four[party_package_four['Item Name'].apply(lambda x: is_fuzzy_match(x, 'Party Package (4-6 ppl)', threshold=92))]
party_package_four['Item Name'].unique()

In [None]:
# View order dates and prices for party package (4-6ppl) orders

party_package_four.groupby(by='Item Name').agg(
    min_order_date = ('Order Date', 'min'),
    max_order_date = ('Order Date', 'max'),
    item_price = ('Item Price', 'first')
).sort_values(by='min_order_date')

In [None]:
# Get outlier order

df[df['Item Name'] == 'PARTY PACKAGE (4-6ppl)']

In [None]:
party_package_four['Item Variation'].unique()

* There is a single order with the alias `PARTY PACKAGE (4-6ppl)`.
    * Otherwise, orders match catalog naming.

### Desserts

In [None]:
# List dessert items

dessert_catalog = items_df[items_df['category'] == 'DESSERTS'].dropna(axis=1)
dessert_catalog

There are only two desserts on the menu, `CHEESECAKE BITES` and `MARSH'n'COOKIE`.

#### Marsh'n'Cookie

In [None]:
marsh_orders = df.dropna(subset='Item Name')

marsh_orders = marsh_orders[marsh_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, "MARSH'n'COOKIE", threshold=70))]
marsh_orders['Item Name'].unique()

In [None]:
marsh_orders['Item Variation'].unique()

In [None]:
marsh_orders['Order Date'].aggregate(['min', 'max'])

#### Cheesecake Bites

In [None]:
# Get cheesecake orders

cheesecake_orders = df.dropna(subset='Item Name')

cheesecake_orders = cheesecake_orders[cheesecake_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'CHEESECAKE', threshold=70))]
cheesecake_orders['Item Name'].unique()

In [None]:
# Inspect variations

cheesecake_orders['Item Variation'].unique()

In [None]:
# Inspect earliest and latest order dates

cheesecake_orders['Order Date'].aggregate(['min', 'max'])

### Sips

In [None]:
sips = items_df[items_df['category'] == 'SIPS']
sips

In [None]:
# Get drink orders

sips_items = sips['Item Name'].values

sips_orders = df[df['Item Name'].isin(sips_items)]
sips_orders['Item Name'].unique()

In [None]:
# Check if there are other spellings of items

for item in sips_items:
    print(df[df['Item Name'].dropna().apply(lambda x: is_fuzzy_match(x, item, threshold=70))]['Item Name'].unique())

Since drink orders map perfectly to the menu catalog, there is no need to create standardization entries for them.

### Dips

In [None]:
# Get menu items in DIPS category

dips_items = items_df[items_df['category'] == 'DIPS']
dips_items

Each dip has two versions, one seemingly standard size, and one *8 OZ* size.

In [None]:
# Orders with sauce or dip in Item Name

df[df['Item Name'].str.lower().str.contains('sauce|dip')]['Item Name'].unique()

* `CHUNKY BLUE CHEESE DIP`, `BLUE CHEESE SAUCE`, `SWEET AND SPICY BBQ SAUCE`, `SIGNATURE BBQ SAUCE`, `HOUSE-MADE CURRY DIP`, `TANGY & SWEET CURRY DIP`, and `BBQ dipping sauce` are not in the menu catalog.
    * `SIGNATURE BBQ SAUCE` is very similar to `SIGNATURE BBQ DIP`.
    * `SWEET AND SPICY BBQ SAUCE` is very similar to `SWEET AND SPICY BBQ DIP`.

#### Curry Dip

The curry dip has been seemingly taken off the menu since it only exists in orders and not the current catalog.

In [None]:
# Select curry dip orders

# Skip rows without an item name (as the cheesecake cell does) so the fuzzy
# matcher is never handed NaN.
curry_dip = df.dropna(subset='Item Name')
curry_dip = curry_dip[curry_dip['Item Name'].apply(lambda x: is_fuzzy_match(x, 'CURRY DIP', threshold=70))]
curry_dip['Item Name'].unique()

In [None]:
# Confirm order dates

curry_dip['Order Date'].aggregate(['min', 'max'])

In [None]:
curry_dip['Item Variation'].unique()

#### Sweet and Spicy BBQ Dip

In [None]:
# Sweet and spicy catalog items

ss_bbq_catalog = items_df[items_df['Item Name'].str.lower().str.contains('spicy')].dropna(axis=1)
ss_bbq_catalog

Similar to the other dip items, the *8 oz* version is **$3.50** while the standard version is **$1.45**.
The *8 oz* version is also labeled a **Sauce**.

In [None]:
# Select sweet and spicy BBQ orders; rows without an item name are skipped so
# the fuzzy matcher is never handed NaN.
ss_bbq_orders = df.dropna(subset='Item Name')
ss_bbq_orders = ss_bbq_orders[ss_bbq_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'sweet and spicy', threshold=70))]
ss_bbq_orders['Item Name'].unique()

In [None]:
ss_bbq_orders['Item Variation'].unique()

#### Signature BBQ Dip

In [None]:
# View signature bbq dip catalog items

bbq_dip_catalog = items_df[items_df['Item Name'].str.lower().str.contains('signature')].dropna(axis=1)
bbq_dip_catalog

In [None]:
# Inspect prices

bbq_dip_catalog[['Item Name', 'Price']]

The only differences between the two entries are: 
* The *8 oz* version costs **$3.50** while the standard costs **$1.45**.
* The *8 oz* version is titled `SIGNATURE BBQ DIP`.

In [None]:
# Get orders of signature bbq dip

# Skip rows without an item name so the fuzzy matcher is never handed NaN.
bbq_dip_orders = df.dropna(subset='Item Name')
bbq_dip_orders = bbq_dip_orders[bbq_dip_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'SIGNATURE BBQ DIP', threshold=70))]
bbq_dip_orders['Item Name'].unique()

In [None]:
# Compare order dates and prices

bbq_dip_orders.groupby(by='Item Name').agg(
    min_order_date = ('Order Date', 'min'),
    max_order_date = ('Order Date', 'max'),
    price = ('Item Price', 'first')
).sort_values(by='min_order_date')

`BBQ dipping sauce` and `SIGNATURE BBQ SAUCE` ARE the same items as `SIGNATURE BBQ DIP`.

In [None]:
bbq_dip_orders['Item Variation'].unique()

#### Pickled Jalapeno Peppers

In [None]:
# Check pickled jalapeno pepper catalog items

peppers_catalog = items_df[items_df['Item Name'].str.lower().str.contains('pickled')]
peppers_catalog

In [None]:
# Select pickled jalapeno orders

# Skip rows without an item name so the fuzzy matcher is never handed NaN.
peppers_orders = df.dropna(subset='Item Name')
peppers_orders = peppers_orders[peppers_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'PICKLED JALAPEÑO PEPPERS', threshold=70))]
peppers_orders['Item Name'].unique()

In [None]:
# Inspect variations

peppers_orders['Item Variation'].unique()

#### Mayonnaise

In [None]:
# Select mayonnaise orders; rows without an item name are skipped so the
# fuzzy matcher is never handed NaN.
mayonnaise_orders = df.dropna(subset='Item Name')
mayonnaise_orders = mayonnaise_orders[mayonnaise_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'MAYONNAISE'))]
mayonnaise_orders['Item Name'].unique()

In [None]:
mayonnaise_orders['Item Variation'].unique()

#### Mustard

In [None]:
# Select mustard orders; rows without an item name are skipped so the fuzzy
# matcher is never handed NaN.
mustard_orders = df.dropna(subset='Item Name')
mustard_orders = mustard_orders[mustard_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'MUSTARD'))]
mustard_orders['Item Name'].unique()

In [None]:
mustard_orders['Item Variation'].unique()

#### Ketchup

In [None]:
# Get ketchup orders

# Skip rows without an item name so the fuzzy matcher is never handed NaN.
ketchup_orders = df.dropna(subset='Item Name')
ketchup_orders = ketchup_orders[ketchup_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'ketchup', threshold=75))]
ketchup_orders['Item Name'].unique()

In [None]:
ketchup_orders['Item Variation'].unique()

#### Creamy Blue Cheese Dip

No larger size of the *Creamy Blue Cheese Dip* is offered.

In [None]:
# Get orders of the creamy blue cheese dip

# Skip rows without an item name so the fuzzy matcher is never handed NaN.
blue_cheese_dip = df.dropna(subset='Item Name')
blue_cheese_dip = blue_cheese_dip[blue_cheese_dip['Item Name'].apply(lambda x: is_fuzzy_match(x, "CREAMY BLUE CHEESE DIP", threshold=75))]
blue_cheese_dip['Item Name'].unique()

In [None]:
# Compare order dates and prices for different blue cheese orders

blue_cheese_dip.groupby(by='Item Name').agg(
    min_order_date = ('Order Date', 'min'),
    max_order_date = ('Order Date', 'max'),
    price = ('Item Price', 'first')
).sort_values(by='min_order_date')

In [None]:
blue_cheese_dip['Item Variation'].unique()

#### Ranch Dip

In [None]:
# Ranch dip menu items

ranch_dip_catalog = items_df[items_df['Item Name'].str.lower().str.contains('ranch')].dropna(axis=1)
ranch_dip_catalog

In [None]:
ranch_dip_catalog[['Item Name', 'Price', 'Shipping Enabled', 'Delivery Enabled', 'Pickup Enabled']]

* The 8 oz version of ranch is called `Classy Ranch Sauce (8 oz)`, vs the regular being `CLASSY RANCH DIP`.
* The 8 oz version costs **$3.50**, vs. the standard **$1.45**.

In [None]:
# Get ranch orders

# Match each named row against both catalog spellings in one pass; rows
# without an item name are skipped so the fuzzy matcher is never handed NaN.
ranch_catalog_names = ['CLASSY RANCH DIP', 'Classy Ranch Sauce (8 oz)']

ranch_orders = df.dropna(subset='Item Name')
ranch_orders = ranch_orders[ranch_orders['Item Name'].apply(
    lambda x: any(is_fuzzy_match(x, catalog_name) for catalog_name in ranch_catalog_names)
)]
ranch_orders

In [None]:
# List unique aliases for ranch orders

ranch_orders['Item Name'].unique()

In [None]:
# Inspect outlier ranch order alias

ranch_orders[ranch_orders['Item Name'] == 'Ranch (8 oz)']

The `Item Price` is the same for ranch orders with the name `Ranch (8 oz)`, which differs from those listed in the catalog. Therefore, it is safe to assume that these refer to the same item.

#### Boom-Boom Sauce

In [None]:
# Get boom-boom sauce items from menu catalog

boom_catalog = dips_items[dips_items['Item Name'].str.contains('BOOM-BOOM SAUCE')].dropna(axis=1)
boom_catalog

In [None]:
# Differences between the two boom-boom sauce items

boom_catalog[['Item Name', 'Price', 'Shipping Enabled', 'Delivery Enabled', 'Pickup Enabled']]

* The **8 OZ** listing is priced higher at **$3.50**.
* `Shipping Enabled` is **True** for the *8 oz* listing, but both `Delivery Enabled` and `Pickup Enabled` are **False**.

In [None]:
# Get orders of boom-boom sauce

# Skip rows without an item name so the fuzzy matcher is never handed NaN.
boom_orders = df.dropna(subset='Item Name')
boom_orders = boom_orders[boom_orders['Item Name'].apply(lambda x: is_fuzzy_match(x, 'BOOM-BOOM SAUCE'))]
boom_orders['Item Name'].unique()

In [None]:
# Check variations

boom_orders['Item Variation'].unique()

In [None]:
# Check order dates

boom_orders['Order Date'].describe()

### Sandwiches

In [None]:
# List catalog items in SANDWICHES category

items_df[items_df['category'] == 'SANDWICHES']

In [None]:
# List unique item names in orders dataset with sandwich in the name

df[df['Item Name'].str.lower().str.contains('sandwich', na=False)]['Item Name'].unique()

There are some inconsistencies between *order* data and *catalog* data:
* `CIABATTA STEAK SANDWICH`, `1/2 POUND STEAK SANDWICH`, `CHIABATTA STEAK SANDWICH`, and `ULTIMATE STEAK SANDWICH` don't exist in the current catalog.
* `Crispy Chicken Sandwich` has no exact match in the catalog because the catalog name is fully uppercase.

#### Pulled Beef Sandwich

In [None]:
# Get orders of pulled beef sandwiches

pulled_beef = df[df['Item Name'].str.contains('PULLED BEEF SANDWICH', na=False)]
pulled_beef.sort_values(by='Order Date')

In [None]:
pulled_beef['Item Variation'].unique()

This item is a **new** addition to the menu and therefore matches the catalog entry.

#### Steak Sandwiches

In [None]:
steak_sandwich_aliases = [
    'CIABATTA STEAK SANDWICH',
    '1/2 POUND STEAK SANDWICH',
    'CHIABATTA STEAK SANDWICH',
    'ULTIMATE STEAK SANDWICH',
    'STEAK SANDWICH']

steak_sandwiches = df[df['Item Name'].isin(steak_sandwich_aliases)]
steak_sandwiches.sort_values(by='Order Date')

In [None]:
# Inspect order dates and prices for steak sandwich item names

steak_sandwiches.groupby(by='Item Name').agg(
    min_order_date = ('Order Date', 'min'),
    max_order_date = ('Order Date', 'max'),
    item_price = ('Item Price', 'first')
).sort_values(by='min_order_date')

* It looks like these 5 aliases for the `STEAK SANDWICH` correspond to each other.
    * The `Item Price` is **$15.45** until it increases for the `CIABATTA STEAK SANDWICH` to **$19.45**.
    * The `Order Date` for the items lines up with no overlapping dates.

In [None]:
# Inspect item variations

steak_sandwiches['Item Variation'].unique()

#### Crispy Chicken Sandwich

In [None]:
# Orders whose name mentions a chicken sandwich, in any letter case;
# na=False already treats rows without an item name as non-matches.
mentions_chicken_sandwich = df['Item Name'].str.lower().str.contains('chicken sandwich', na=False)

chicken_sandwich = df.loc[mentions_chicken_sandwich]
chicken_sandwich.sort_values('Order Date')

In [None]:
# Check outlier alias

df[df['Item Name'] == 'Crispy Chicken Sandwich']

The very first order of the *crispy chicken sandwich* had the outlier alias. The others follow the exact format of the catalog.

### Combos

In [None]:
# Get list of items in catalog with COMBOS category

items_df[items_df['category'] == 'COMBOS']

In [None]:
# Orders whose name contains a combo keyword, plus the explicitly listed
# `WINGS & FRIES`

combo_keywords = ['combo', 'deal', 'bundle']
keyword_pattern = '|'.join(combo_keywords)

has_combo_keyword = df['Item Name'].str.lower().str.contains(keyword_pattern, na=False)
is_wings_and_fries = df['Item Name'].eq('WINGS & FRIES')

orders_combo = df[has_combo_keyword | is_wings_and_fries]
orders_combo['Item Name'].unique()

In [None]:
orders_combo['Item Variation'].unique()

When comparing the list of combos in the menu catalog with orders with the name *combo* in them, there are some discrepancies:
* `BABY BACK RIB COMBO`, `THE BELLY COMBO`, `BBQ RIB COMBO`, `THE BABY BACK COMBO`, `BABYBACK RIB COMBO`, `WINGS COMBO`, `RIBS & WINGS BUNDLE !BEEF RIBS ONLY!`, and `BEEF SANDWICH COMBO` are present in orders but not in the catalog.
* Each combo in the catalog only has a `Regular` variation option, but orders have **two** other possible variations.

#### The Belly Combo

In [None]:
# Rows ordered as THE BELLY COMBO, oldest first
# (an exact-name comparison is already False for rows without an item name)

belly_combo = df.loc[df['Item Name'].eq('THE BELLY COMBO')]
belly_combo.sort_values('Order Date')

In [None]:
belly_combo.describe()

* `THE BELLY COMBO` only existed in *2024*.
* It does not appear like it was updated with a different alias.

In [None]:
belly_combo['Item Variation'].unique()

#### Wings & Fries

In [None]:
# Rows ordered as WINGS & FRIES, oldest first
# (an exact-name comparison is already False for rows without an item name)

wings_and_fries = df.loc[df['Item Name'].eq('WINGS & FRIES')]
wings_and_fries.sort_values('Order Date')

In [None]:
# Rows ordered as WINGS COMBO, oldest first
# (an exact-name comparison is already False for rows without an item name)
wings_combo = df.loc[df['Item Name'].eq('WINGS COMBO')]
wings_combo.sort_values('Order Date')

In [None]:
wings_combo['Item Variation'].unique()

In [None]:
wings_and_fries['Item Variation'].unique()

#### Steak Sandwich Combo

In [None]:
# Rows ordered as STEAK SANDWICH COMBO, oldest first
# (an exact-name comparison is already False for rows without an item name)

steak_sandwich_combo = df.loc[df['Item Name'].eq('STEAK SANDWICH COMBO')]
steak_sandwich_combo.sort_values('Order Date')

In [None]:
steak_sandwich_combo['Item Variation'].unique()

In [None]:
steak_sandwich_combo['Order Date'].min()

In [None]:
# Rows ordered as BEEF SANDWICH COMBO, oldest first
# (an exact-name comparison is already False for rows without an item name)
beef_sandwich_combo = df.loc[df['Item Name'].eq('BEEF SANDWICH COMBO')]
beef_sandwich_combo.sort_values('Order Date')

In [None]:
beef_sandwich_combo['Item Variation']

In [None]:
steak_sandwich_combo['Item Price'].describe()

There is only **one** order of `BEEF SANDWICH COMBO` which has the same `Item Price` as the initial price of the `STEAK SANDWICH COMBO`.

#### Pulled Sandwich Combo

In [None]:
# Rows ordered as PULLED SANDWICH COMBO, oldest first
# (an exact-name comparison is already False for rows without an item name)

pulled_sandwich_combo = df.loc[df['Item Name'].eq('PULLED SANDWICH COMBO')]
pulled_sandwich_combo.sort_values('Order Date')

In [None]:
pulled_sandwich_combo['Item Variation'].unique()

Since this is a newer addition to the menu, *catalog* and *order* data match perfectly.

#### Pork Rack Combo

In [None]:
# Rows ordered as PORK RACK COMBO, oldest first
# (an exact-name comparison is already False for rows without an item name)

pork_rack_combo = df.loc[df['Item Name'].eq('PORK RACK COMBO')]
pork_rack_combo.sort_values('Order Date')

In [None]:
pork_rack_combo['Item Variation'].unique()

Since this is a newer addition to the menu, *catalog* and *order* data match perfectly.

#### Crispy Chicken Combo

In [None]:
# Rows ordered as CRISPY CHICKEN COMBO, oldest first
# (an exact-name comparison is already False for rows without an item name)

crispy_chicken_combo = df.loc[df['Item Name'].eq('CRISPY CHICKEN COMBO')]
crispy_chicken_combo.sort_values('Order Date')

In [None]:
crispy_chicken_combo['Item Variation'].unique()

Since this is a newer addition to the menu, data in *orders* and the *catalog* match perfectly.

#### Ribs & Wings Bundle

In [None]:
# Rows ordered as RIBS & WINGS BUNDLE, oldest first
# (an exact-name comparison is already False for rows without an item name)

ribs_wings_bundle = df.loc[df['Item Name'].eq('RIBS & WINGS BUNDLE')]
ribs_wings_bundle.sort_values('Order Date')

In [None]:
# Inspect variant of ribs & wings bundle

df[df['Item Name'] == 'RIBS & WINGS BUNDLE !BEEF RIBS ONLY!']

Since there is only **one** order of the `RIBS & WINGS BUNDLE !BEEF RIBS ONLY!`, we'll standardize it to a regular order of the bundle.

#### Baby Back Rib Combo

In [None]:
# Use fuzzy matching to find similar item names.
# The helper flag is added with `assign`, which returns a new frame, instead
# of writing a column into the result of `dropna`; pandas may treat that
# result as a slice of `df` and emit SettingWithCopyWarning.
baby_back_combo = df.dropna(subset='Item Name').assign(
    is_baby_back_rib_combo=lambda orders: orders['Item Name'].apply(
        lambda x: is_fuzzy_match(x, 'BABY BACK RIB COMBO')
    )
)
baby_back_combo = baby_back_combo[baby_back_combo['is_baby_back_rib_combo']]

In [None]:
baby_back_combo['Item Name'].unique()

In [None]:
# Compare order dates for three variants of the combo name

baby_back_combo.groupby(by='Item Name').agg(
    min_order_date = ('Order Date', 'min'),
    max_order_date = ('Order Date', 'max'),
    item_price = ('Item Price', 'first')
)

* *Baby Back Rib Combo* items range in date from `2023-12-09` to `2025-04-29`.
* There is a significant gap between `THE BABY BACK COMBO` and `BABYBACK RIB COMBO` implying there may be another alias for that combo.

In [None]:
# Examine orders between THE BABY BACK COMBO and BABYBACK RIB COMBO

min_date = pd.to_datetime('2024-02-28')
max_date = pd.to_datetime('2024-04-09')

between_orders = df[df['Order Date'].between(min_date, max_date, inclusive='neither')]
between_orders.describe()

In [None]:
# Check for combos

between_orders['Item Name'].unique()

In [None]:
# Select BBQ RIB COMBO orders

bbq_rib_combo = df[df['Item Name'] == 'BBQ RIB COMBO'].sort_values(by='Order Date')
bbq_rib_combo

In [None]:
# Inspect order dates and item prices

bbq_rib_combo[['Order Date', 'Item Price']].describe()

In [None]:
# Select THE BELLY COMBO orders

belly_combo = df[df['Item Name'] == 'THE BELLY COMBO'].sort_values(by='Order Date')
belly_combo

In [None]:
# Inspect order dates and item prices

belly_combo[['Order Date', 'Item Price']].describe()

* There are **3** aliases for Baby Back Rib combos: `BABY BACK RIB COMBO`, `BABYBACK RIB COMBO`, and `THE BABY BACK COMBO`.
    * It is **highly likely** that `BBQ RIB COMBO` is another alias because it fills in the gap in orders and is the same price.
* `THE BELLY COMBO` **is not** an alias because those orders coincide with the other *baby back rib combo* orders.

The `Item Description` for the `3-BONE MEAL DEAL` states: *Flying solo, or want to get it all for one? Well, this one’s for you! Get three, juicy, baby back pork rib bones with two sides of your choosing. Includes complementary pickled peppers, signature BBQ sauce dip, and a beverage!*
This implies that the `3-BONE MEAL DEAL` is the latest version of the baby back rib combo.

In [None]:
# Check variations

bbq_rib_combo_aliases = ["3-BONE MEAL DEAL", "BABY BACK RIB COMBO", "THE BABY BACK COMBO", "BABYBACK RIB COMBO", "BBQ RIB COMBO"]
df[df['Item Name'].isin(bbq_rib_combo_aliases)]['Item Variation'].unique()

In [None]:
# Pull the rows whose variation is one of the two outlier values

outlier_variations = ['Crinkle Fries- Truffle Salt, Mac&Cheese', 'Crinkle Fries- Rosemary Pepper']
df[df['Item Variation'].isin(outlier_variations)]

Since there are only two orders with `Item Variation` which seems to specify sides and seasonings, we'll standardize them to `Regular`.

#### Beef Rack Combo

In [None]:
# Get orders of beef rack combo

# Use fuzzy matching to get orders.
# The helper flag is added with `assign`, which returns a new frame, instead
# of writing a column into the result of `dropna`; pandas may treat that
# result as a slice of `df` and emit SettingWithCopyWarning.
beef_rack = df.dropna(subset='Item Name').assign(
    is_beef_rack_combo=lambda orders: orders['Item Name'].apply(
        lambda x: is_fuzzy_match(x, 'BEEF RACK COMBO')
    )
)
beef_rack = beef_rack[beef_rack['is_beef_rack_combo']]

In [None]:
# Check item names

beef_rack['Item Name'].unique()

In [None]:
# Check variations

beef_rack['Item Variation'].unique()

In [None]:
# Check order dates for beef racks

beef_rack['Order Date'].describe()

#### 3-Bone Meal Deal

In [None]:
# Get orders of three bone combo

# Use fuzzy matching to get orders.
# The helper flag is added with `assign`, which returns a new frame, instead
# of writing a column into the result of `dropna`; pandas may treat that
# result as a slice of `df` and emit SettingWithCopyWarning.
three_bone = df.dropna(subset='Item Name').assign(
    is_three_bone_combo=lambda orders: orders['Item Name'].apply(
        lambda x: is_fuzzy_match(x, '3-BONE MEAL DEAL')
    )
)
three_bone = three_bone[three_bone['is_three_bone_combo']]

In [None]:
# Get unique item names for three bone combo

three_bone['Item Name'].unique()

In [None]:
# Check variations

three_bone['Item Variation'].unique()

In [None]:
# Check dates

three_bone['Order Date'].describe()

As discovered while exploring *baby back rib combo* orders, this is the most up-to-date alias for baby back rib combo orders.

### Bites

In [None]:
# Get list of items in catalog for BITES category

items_df[items_df['category'] == 'BITES']

#### Pork Belly Bites

In [None]:
bites = df.dropna(subset='Item Name')
bites = bites[bites['Item Name'].str.lower().str.contains('bite')]

bites

In [None]:
bites['Item Name'].unique()

In [None]:
belly_bites = bites[bites['Item Name'] == 'BELLY BITES']
belly_bites['Item Variation'].unique()

In [None]:
pork_belly_bites = bites[bites['Item Name'] == 'PORK BELLY BITES']
pork_belly_bites

In [None]:
pork_belly_bites['Item Variation'].unique()

In [None]:
items_df[items_df['Item Name'] == 'PORK BELLY BITES']

In [None]:
# Get last order date under the old `BELLY BITES` alias

# `belly_bites` holds the rows named `BELLY BITES` (not `PORK BELLY BITES`),
# so the printed label names that alias.
print(f"Last order of belly bites: {belly_bites['Order Date'].max()}")

In [None]:
belly_bites.sort_values(by='Order Date').tail(1)['Item Variation']

* **Belly bites** was recently updated to `PORK BELLY BITES`.
* `Item Variation` was accordingly updated to only a single value, **Regular**.
    * Previous variation values will be standardized to **regular**.

#### Mozzarella Sticks

In [None]:
# Look for "wedge" items among rows that have an item name.
# The filtered rows go into their own variable; `df` is left untouched so the
# later order-level totals do not depend on whether this cell has run.
wedge_orders = df.dropna(subset='Item Name')
wedge_orders[wedge_orders['Item Name'].str.lower().str.contains('wedge')]

In [None]:
sticks = df.dropna(subset='Item Name')
sticks = sticks[sticks['Item Name'].str.lower().str.contains('stick')]

sticks

In [None]:
sticks['Item Name'].unique()

In [None]:
sticks['Item Variation'].unique()

In [None]:
items_df[items_df['Item Name'].str.lower().str.contains('stick')]

#### Grilled Sweet Corn

In [None]:
corn = df.dropna(subset='Item Name')
corn = corn[corn['Item Name'].str.lower().str.contains('corn')]

corn

In [None]:
corn['Item Name'].unique()

In [None]:
corn['Item Variation'].unique()

In [None]:
items_df[items_df['Item Name'].str.lower().str.contains('corn')]

#### Chicken Wings

In [None]:
wings = df.dropna(subset='Item Name')
wings = wings[wings['Item Name'].str.lower().str.contains('wing')]

wings

In [None]:
# List of items including wings

wings['Item Name'].unique()

In [None]:
# See list of wings in catalog that are in BITES category

items_df[(items_df['Item Name'].str.lower().str.contains('wing')) & (items_df['category'] == 'BITES')]

In [None]:
# Number of orders per kind of wings

bites_wings = ['CHICKEN WINGS', 'FRIED CHICKEN WINGS']
wings = wings[wings['Item Name'].isin(bites_wings)]

wings['Item Name'].value_counts()

In [None]:
# Inspect order dates for different aliases

wings.groupby(by='Item Name')['Order Date'].agg(['count', 'min', 'max'])

In [None]:
# List variations

wings['Item Variation'].unique()

* Chicken wings menu item changed to `FRIED CHICKEN WINGS` in **June 2025**.
* `Item Variation` stays consistent between orders and catalog.

### Sides

In [None]:
# Get list of Sides

items_df[items_df['category'] == 'SIDES']['Item Name'].unique()

#### Pineapple Slaw

In [None]:
# Create slaw dataframe

slaw_df = df.dropna(subset='Item Name')
slaw_df = slaw_df[slaw_df['Item Name'].str.lower().str.contains('slaw')]

slaw_df

In [None]:
# Check item name aliases

slaw_df['Item Name'].unique()

In [None]:
# Check number of orders with each alias

slaw_df['Item Name'].value_counts()

In [None]:
# Inspect alias with one order

slaw_df[slaw_df['Item Name'] == 'Pineapple Coleslaw']

In [None]:
# Check unique variations

slaw_df['Item Variation'].unique()

In [None]:
# Compare with catalog options

items_df[items_df['Item Name'].str.lower().str.contains('slaw')]

In [None]:
# Slaw orders carrying the ounce-based variations

ounce_variations = ['8 oz.', '4 oz.']
slaw_df[slaw_df['Item Variation'].isin(ounce_variations)]

* The alias for pineapple coleslaw changed to **all caps** after the first order.
* Only the first two orders contain the variations `8 oz.` and `4 oz.`.
    * The current variations in the catalog are `Side` and `Full`.

#### Pear Gorgonzola Salad

In [None]:
# Create salad dataframe

salad_df = df.dropna(subset='Item Name')
salad_df = salad_df[salad_df['Item Name'].str.lower().str.contains('salad')]

salad_df

In [None]:
# Get list of aliases

salad_df['Item Name'].unique()

In [None]:
# Get list of variations

salad_df['Item Variation'].unique()

In [None]:
# Confirm with catalog

items_df[items_df['Item Name'] == 'PEAR GORGONZOLA SALAD']['Variation Name'].unique()

#### Baked Baby Potatoes

In [None]:
# Create baked potatoes dataframe

potatoes_df = df.dropna(subset='Item Name')
potatoes_df = potatoes_df[potatoes_df['Item Name'].str.lower().str.contains('potato')]

potatoes_df

In [None]:
# List unique item names

potatoes_df['Item Name'].unique()

In [None]:
# List unique variations

potatoes_df['Item Variation'].unique()

In [None]:
# Confirm possible variations with catalog

items_df[items_df['Item Name'] == 'BAKED BABY POTATOES']['Variation Name'].unique()

#### Mac and Cheese

In [None]:
# Create mac and cheese dataframe

mac_df = df.dropna(subset=['Item Name'])
mac_df = mac_df[mac_df['Item Name'].str.lower().str.contains('mac')]

mac_df

In [None]:
# Validate items

mac_df['Item Name'].unique()

In [None]:
# List possible variations

mac_df['Item Variation'].unique()

In [None]:
# Mac'n'Cheese rows sold under the family-size or half-pan names

pan_item_names = ['Mac’n’Cheese Family size', 'Mac’n’Cheese Half Pan']
mac_pan = mac_df[mac_df['Item Name'].isin(pan_item_names)]

mac_pan

In [None]:
# Number of orders of each variation

mac_df['Item Name'].value_counts()

In [None]:
# Inspect alias

mac_df[mac_df['Item Name'] == 'Artisan Mac and Cheese']

In [None]:
# View order dates per item

mac_df.groupby(by='Item Name')['Order Date'].agg(['min', 'max', 'count']).sort_values(by='min')

#### Fries

In [None]:
items_df[items_df['Item Name'].str.lower().str.contains('fries')]

In [None]:
fries_df = df.dropna(subset=['Item Name'])

fries_df = fries_df[fries_df['Item Name'].str.lower().str.contains('fries')]
fries_df

In [None]:
# List of unique item names for fries

fries_df['Item Name'].unique()

In [None]:
# List variations

fries_df['Item Variation'].unique()

In [None]:
fries_df[['Item Name', 'Item Variation']].value_counts()

In [None]:
# Count number of orders for each unique item name

fries_df['Item Name'].value_counts()

In [None]:
fries_df[fries_df['Item Name'] == 'LOAD YOUR FRIES']

In [None]:
# Inspect min and max order dates for fries

fries_dates = fries_df.groupby(by='Item Name')['Order Date'].agg(['min', 'max', 'count']).reset_index()
fries_dates

In [None]:
# Flag which fries names exist in the catalog and list the non-catalog names
# first (False sorts before True)
fries_dates = (
    fries_dates
    .assign(in_catalog=fries_dates['Item Name'].isin(items_df['Item Name']))
    .sort_values('in_catalog')
)
fries_dates

In [None]:
crinkle_orders = fries_df[fries_df['Item Name'] == 'CRINKLE FRIES']
crinkle_orders

In [None]:
# Select crispy fries orders.
# NOTE(review): the literal below has two spaces between the words; presumably
# that matches the raw order data — confirm against fries_df['Item Name'].unique().
crispy_fries = fries_df[fries_df['Item Name'] == 'CRISPY  FRIES']
crispy_fries

##### Loaded Fries

In [None]:
loaded_orders = fries_df[fries_df['Item Name'] == 'LOADED FRIES']
loaded_orders

In [None]:
loaded_fries = fries_df[fries_df['Item Name'].str.lower().str.contains('loaded')]
loaded_fries['Item Name'].unique()

In [None]:
loaded_fries['Item Variation'].unique()

In [None]:
# Loaded-fries rows whose variation is one of the two seasoning values
seasoning_variations = ['Truffle Salt', 'Rosemary Pepper']
loaded_fries[loaded_fries['Item Variation'].isin(seasoning_variations)]

There were changes made to the fries offered by the restaurant:
* There was one order with Item name `LOAD YOUR FRIES`. This is a mistake.
    * This item is no longer on the menu.
* `French fries` were only listed on one day with **two** orders.
* `Wings & Fries` is a recent addition, from **February 9th, 2025**, and is still part of the menu.
* `LOADED FRIES` are still on the menu and have been since **March 5th, 2024**.
* `CRISPY FRIES` were added to the menu on **May 6th, 2025**.
    * They replaced `CRINKLE FRIES`, which are not in the catalog and have a *maximum order date* of **May 6th, 2025**.

### Ribs

In [None]:
# Get all rib items

rib_df = df.dropna(subset=['Item Name'])
rib_df = rib_df[rib_df['Item Name'].str.lower().str.contains('rib')]
rib_df['Item Name'].unique()

In [None]:
# Check ribs category in catalog

items_df[items_df['category'] == 'RIBS']

#### Pork Ribs

In [None]:
# Get orders of pork ribs

pork_rib_aliases = ['GLAZED BABY BACK PORK RIBS', 'BABY BACK PORK RIBS', '“Get your baby back” Pork Ribs']
pork_ribs_df = rib_df[rib_df['Item Name'].isin(pork_rib_aliases)]
pork_ribs_df

In [None]:
# List variations for pork ribs

pork_ribs_df['Item Variation'].unique()

#### Beef Ribs

In [None]:
# Beef Ribs

beef_rib_aliases = ['BEEF BACK RIBS (Full Rack)', 'BEEF SHORT RIB', 'SHORT RIB PLATTER', 'BEEF SHORT RIB (Full Rack)', 'Beef Short Rib', 'Party Package !BEEF RIBS ONLY! (4-6 ppl)', 'RIBS & WINGS BUNDLE !BEEF RIBS ONLY!', 'BEEF SHORT RIBS (FULL RACK)', 'Beef Back Ribs (Full Rack)', 'BEEF BACK RIBS (FULL RACK)']
beef_rib_df = rib_df[rib_df['Item Name'].isin(beef_rib_aliases)]
beef_rib_df['Item Name'].unique()

In [None]:
# List variations for beef rib orders

beef_rib_df['Item Variation'].unique()

In [None]:
beef_rib_df[beef_rib_df['Item Name'] == 'Beef Short Rib']

In [None]:
# Get beef items from catalog

items_beef = items_df[items_df['Item Name'].str.lower().str.contains('beef')]
items_beef

In [None]:
# Check for orders of dino ribs

dino_df = df.dropna(subset='Item Name')
dino_df = dino_df[dino_df['Item Name'].str.lower().str.contains('dino')]

dino_df

In [None]:
# Check if only dino dinner or if also dino ribs

dino_df['Item Name'].unique()

In [None]:
dino_df['Item Variation'].unique()

## Item Categories

In [None]:
# See list of possible item variations

df['Item Variation'].unique()

In [None]:
# Distinct item names ordered with the `Side` variation (computed once, printed twice)
side_item_names = df.loc[df['Item Variation'] == 'Side', 'Item Name'].unique()

print(f"Number of unique sides offered: {len(side_item_names)}")
print(f"Unique sides: {side_item_names}")

In [None]:
df[df['Item Variation'] == 'Truffle Salt']['Item Name'].value_counts()

In [None]:
df[df['Item Variation'] == 'Rosemary Pepper']['Item Name'].value_counts()

In [None]:
df[df['Item Variation'] == '6 pcs']['Item Name'].value_counts()

In [None]:
# Count order rows for each spelling of Mac & Cheese

mac_and_cheese_variations = ['ARTISAN MAC AND CHEESE', 'MAC&CHEESE', 'Artisan Mac and Cheese']

variant_counts = {
    spelling: int(df['Item Name'].eq(spelling).sum())
    for spelling in mac_and_cheese_variations
}

variant_counts

In [None]:
df[df['Item Name'] == 'Artisan Mac and Cheese']

In [None]:
# See variations of Mac & Cheese ordered

df[df['Item Name'].isin(mac_and_cheese_variations)]['Item Variation'].value_counts()

In [None]:
# See variations of Pineapple Slaw ordered

df[df['Item Name'] == 'PINEAPPLE SLAW']['Item Variation'].value_counts()

In [None]:
df[df['Item Variation'] == 'Full Rack'].head(1)

In [None]:
df[df['Item Variation'] == 'Full Rack']['Item Name'].value_counts()

In [None]:
df[df['Item Variation'] == 'Full'].head(1)

In [None]:
df[df['Item Variation'] == 'Full']['Item Name'].value_counts()

In [None]:
regular_items = df[df['Item Variation'] == 'Regular']
regular_items['Item Name'].value_counts().reset_index()

In [None]:
regular_items['Item Name'].unique()

In [None]:
df[df['Item Name'] == 'BEEF BACK RIBS (FULL RACK)']

In [None]:
df[df['Item Name'] == 'Beef Back Ribs (Full Rack)']

* `Item Name` and `Item Variations` have duplicate entries with variations in spelling.
    * Mac & Cheese has **three** different spellings: `ARTISAN MAC AND CHEESE`, `MAC&CHEESE`, `Artisan Mac and Cheese`
    * A full rack of beef back ribs has **two** different spellings: `Beef Back Ribs (Full Rack)`, and `BEEF BACK RIBS (FULL RACK)`
        * Both orders are from `DOORDASH` so that can't explain the difference.
    * There are **four** different spellings for Baby back pork ribs: `GLAZED BABY BACK PORK RIBS`, `BABY BACK PORK RIBS`, `GET YOUR BABY BACK!`, and `"Get your baby back" Pork Ribs`.
* Side items, such as the Mac & Cheese and slaw can have `Item Variation` set to `Full`, `Side`, or `8oz`.
* An `Item Variation` of `Regular` seems to mean that no variation applies.
    * `Full Rack` relates to orders of pork ribs.
    * `Side`, `Full`, and `8oz` relate to orders of sides.
    * Values with `pcs` relate to items that come in multiples such as Chicken Wings and Mozzarella Sticks
    * Seasoning variations such as `Rosemary Pepper` and `Truffle Salt` relate to orders of fries.


## Top-selling Items

### Quantity

In [None]:
# Inspect quantity of items and their variations ordered

item_variation_counts = df.groupby(by=['Item Name', 'Item Variation']).size().sort_values(ascending=False).reset_index(name='count')

item_variation_counts.to_csv(REPORTS_DIR / 'item_variation_counts.csv', index=False)

In [None]:
# Inspect quantity of items ordered

item_counts = df.groupby(by='Item Name').size().sort_values(ascending=False).reset_index(name='count')

item_counts.to_csv(REPORTS_DIR / 'item_counts.csv', index=False)

In [None]:
# See top 10 performing items

item_counts.head(10)

In [None]:
# See top 10 performing items and their variations

item_variation_counts.head(10)

In [None]:
# See bottom 10 performing items

item_counts.tail(10)

In [None]:
# See bottom 10 performing items and their variations

item_variation_counts.tail(10)

### Revenue

In [None]:
df[df['Item Quantity'] > 1].head()

In [None]:
# List top 10 performing items based on revenue

item_revenues = df.groupby(by='Item Name')['Item Total Price'].sum().sort_values(ascending=False).reset_index(name='total_revenue')
item_revenues.head(10)

In [None]:
# List top 10 performing items and their variations based on revenue

item_combination_revenues = df.groupby(by=['Item Name', 'Item Variation'])['Item Total Price'].sum().sort_values(ascending=False).reset_index(name='total_revenue')
item_combination_revenues.head(10)

In [None]:
# List worst 10 performing items based on revenue

item_revenues.tail(10)

In [None]:
# List items with zero revenue

item_revenues[item_revenues['total_revenue'] == 0]

Condiments provide **zero** revenue.

In [None]:
# List worst 10 performing items and their variations based on revenue

item_combination_revenues[item_combination_revenues['total_revenue'] > 0].tail(10)

* Items ranked by revenue:
    1. `GLAZED BABY BACK PORK RIBS`: **$16,092.10**
    2. `BEEF BACK RIBS (Full Rack)`: **$11,299.39**
    3. `STEAK SANDWICH COMBO`: **$6,297.93**
* Items and variations ranked by revenue:
    1. Full Rack of Glazed Baby Back Pork Ribs: **$12,391.97**
    2. Regular order of Beef Back Ribs (Full Rack): **$11,299.39**
    3. Regular order of Steak Sandwich Combo: **$6,297.93**
* Items ranked lowest by revenue:
    1. `Pickled Jalapenos (8oz)`: **$1.64**
    2. `BLUE CHEESE SAUCE`: **$2.05**
    3. `BBQ dipping sauce`: **$2.55**

# Revenue & Payments

- [x] Total Sales, average order value, median order value.
- [x] Check for chargebacks/refunds and their impact.

## Totals

In [None]:
df.head()

In [None]:
# Collapse item rows to one row per order, taking the first value of each
# order-level column within the order

order_level_columns = ['Order Total', 'Order Refunded Amount']

orders_df = (
    df.groupby('pseudo_order_id')[order_level_columns]
    .first()
    .reset_index()
)

orders_df.head()

In [None]:
# Calculate total sales

print(f"Total Sales: {orders_df['Order Total'].sum():.2f}")

In [None]:
# Summary statistics for Order Total

orders_df['Order Total'].describe()

### Conclusions
* There were a total of **1,432** orders since opening.
* Total Sales = **$90,341.63**
* Average Sale Amount = **$63.08**
* Median Sale Amount = **$50.44**
* Maximum Sale Amount = **$1,158.75**
* Minimum Sale Amount = **$1.00**

## Refunds

In [None]:
# Summary statistics for order refunded amount

orders_df['Order Refunded Amount'].describe()

### Conclusions
* All values in `Order Refunded Amount` are **NaN**.
* This either means that there were no refunded orders, or that this information is not tracked in this dataset.

# Conclusions and Next Steps

***Data Quality Checks***

- Columns `Currency`, `Order Shipping Price`, `Order Refunded Amount`, `Fulfillment Location`, `Recipient Region`, and `Item SKU` carry no valuable information and can be dropped from the dataset.
- Geographic details such as `Address`, `Postal Code`, `City`, `Region` are missing greater than **87%** of entries.

**Business Logic Validation**
* Outliers in `Item Price` are catering packages, platters, bundles, and combos.
* There are **no** refunded orders.
    * There were **6** cancelled orders.
* **1,073** rows are missing Fulfillment data.

**Customer-Level Insights**
* **822** unique customers.
* **11%** of customers are repeat customers (have ordered more than once).
    * **60%** of repeat customers order at most **2** times.
    * The *average number of days* between repeat orders is **54.2 days**.
    * The *median number of days* between repeat orders is **35 days**.

**Time-Based Patterns**
* **Fridays** have the highest number of orders.
    * **Tuesdays** are the second most popular days.
    * Other days of the week have similar *mean* and *median* order quantities.
* Peak order time is around **7pm**.

**Menu and Product-Level Insights**
* **135** unique combinations of items and their variations have been sold.
* Menu item names need to be standardized.
* `Item Variation` applies to both main and side dishes.
    * Can't be used as a *category* feature.
* Baby Back Pork Ribs is the **most ordered** item and provides the **highest revenue**.
* Some items have very low sales and need further exploration.

**Revenue & Payments**
* Total Sales: **$90,341.63**
* Average Order Amount: **$63.08**
* Median Order Amount: **$50.44**