# Initial Exploration

In [None]:
import os
from datetime import datetime

# Data manipulation
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
import numpy as np

# Time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

from belly_rubb.config import REPORTS_DIR

pd.set_option('display.max_columns', None)

## Orders

In [None]:
# Load every orders CSV export and stack them into a single DataFrame
orders_dir = '../data/raw/orders'

# os.listdir() returns entries in arbitrary order; sorting keeps the row order of
# the combined frame (and any later "first row per group" aggregation) reproducible
csv_files = sorted(f for f in os.listdir(orders_dir) if f.endswith('.csv'))

# Read each file into its own frame (without reusing `df` as a loop variable)
dataframes = [pd.read_csv(os.path.join(orders_dir, file)) for file in csv_files]

# Combine dataframes into one
df = pd.concat(dataframes, ignore_index=True)
df.head()

In [None]:
print(f"Rows: {df.shape[0]}\nColumns: {df.shape[1]}")

In [None]:
df.columns

In [None]:
df.info()

In [None]:
# Convert date columns to datetime

df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Fulfillment Date'] = pd.to_datetime(df['Fulfillment Date'], format='%m/%d/%Y, %I:%M %p')

### Data Quality Checks
- [x] Investigate missing values
- [x] Identify static columns
- [x] Check for duplicates
- [x] Validate data types
- [x] Spot outliers

##### Missing values and static columns

In [None]:
# Report columns that contain no data at all or only one distinct non-null value

for col in df.columns:
    distinct = df[col].dropna().unique()

    # Columns with two or more distinct values carry information; nothing to report
    if len(distinct) > 1:
        continue

    if len(distinct) == 0:
        print(f"Column '{col}' is empty.")
    else:
        print(f"Column '{col}' has a single unique value: {distinct[0]}")

In [None]:
# List distinct Recipient Country values

df['Recipient Country'].unique()

In [None]:
zz_country = df[df['Recipient Country'] == 'ZZ']
zz_country

In [None]:
len(zz_country)

In [None]:
# View missing value percentages

df.isna().sum().div(len(df))*100

* `Currency`, `Fulfillment Location`, and `Recipient Region` have a single unique value.
* `Order Shipping Price`, `Order Refunded Amount`, and `Item SKU` are empty.
* `Recipient Country` has 370 rows with the value `ZZ`.
* Geographic details such as `Recipient Address` and `Recipient Postal Code` are missing more than **87%** of their values.

#### Null values

In [None]:
# View rows with all null values

df[df.isna().all(axis=1)]

#### Duplicates

In [None]:
# Check for duplicates

duplicates = df.duplicated()
df[duplicates]

In [None]:
# Check duplicate 'Armen 59-07' rows

df[(df['Order'] == 'Armen 59-07') & (df['Order Date'] == '2024/08/31')]

In [None]:
# Check duplicate Troy Issac rows

df[(df['Order'] == 'Troy Issac') & (df['Order Date'] == '2024/12/19')]

Each order is split into multiple rows, one for each menu item. Identical menu items are not aggregated into a single row, however. For example, one order can have multiple rows with `CRINKLE FRIES` as the `Item Name`. This implies that if items are part of separate combos or groupings, they are listed separately.

#### Outliers

In [None]:
def calculate_iqr(df: pd.DataFrame, col: str, k: float = 1.5) -> tuple:
    """
    Calculate the IQR-based outlier bounds for a given column.

    The bounds are placed `k` interquartile ranges below the first quartile
    and above the third quartile.

    Args:
        df (pd.DataFrame): The DataFrame containing the column.
        col (str): The name of the column to calculate the bounds for.
        k (float): Multiplier applied to the IQR when placing the bounds.
            Defaults to 1.5.

    Returns:
        tuple: A tuple containing the lower and upper bounds for outliers.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr

    return lower_bound, upper_bound


##### Order Total

In [None]:
# Plot the Order Total distribution with its mean, median and IQR outlier bounds

lower_bound, upper_bound = calculate_iqr(df=df, col='Order Total')

plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Order Total', color='skyblue')

plt.title('Distribution of Order Total', fontsize=16, fontweight='bold')
plt.xlabel('Order Total ($)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# Central tendency markers
plt.axvline(x=df['Order Total'].mean(), color='red', linestyle='--', label='Mean')
plt.axvline(x=df['Order Total'].median(), color='green', linestyle='--', label='Median')

# The lower bound is only drawn when it is positive
if lower_bound > 0:
    plt.axvline(x=lower_bound, color='orange', linestyle='-.', label='Lower Bound')
plt.axvline(x=upper_bound, color='orange', linestyle='-.', label='Upper Bound')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Print outlier information

# Compute the cutoff here instead of reading the notebook-wide `upper_bound`,
# which the Item Price and Item Options Total Price cells reassign
order_total_upper_bound = calculate_iqr(df=df, col='Order Total')[1]

num_order_total_outliers = len(df[df['Order Total'] > order_total_upper_bound])
print(f"Number of outliers in 'Order Total': {num_order_total_outliers}")
print(f"Percentage of total dataset: {num_order_total_outliers / len(df) * 100:.2f}%")

In [None]:
# View Order Total outliers

# Compute the cutoff here instead of reading the notebook-wide `upper_bound`,
# which the Item Price and Item Options Total Price cells reassign
order_total_upper_bound = calculate_iqr(df=df, col='Order Total')[1]

order_total_outliers = df[df['Order Total'] > order_total_upper_bound].sort_values(by='Order Total', ascending=False)
order_total_outliers

It appears that outliers in `Order Total` are simply large orders. At a cursory investigation there does not seem to be data entry mistakes or suspicious activity.

##### Item Price

In [None]:
# Plot the Item Price distribution with its mean, median and IQR outlier bounds

lower_bound, upper_bound = calculate_iqr(df=df, col='Item Price')

plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Item Price', color='skyblue')

plt.title('Distribution of Item Price', fontsize=16, fontweight='bold')
plt.xlabel('Item Price ($)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# Central tendency markers
plt.axvline(x=df['Item Price'].mean(), color='red', linestyle='--', label='Mean')
plt.axvline(x=df['Item Price'].median(), color='green', linestyle='--', label='Median')

# The lower bound is only drawn when it is positive
if lower_bound > 0:
    plt.axvline(x=lower_bound, color='orange', linestyle='-.', label='Lower Bound')
plt.axvline(x=upper_bound, color='orange', linestyle='-.', label='Upper Bound')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# View Item Price outlier information

# Compute the cutoff here instead of reading the notebook-wide `upper_bound`,
# which the Order Total and Item Options Total Price cells also assign
item_price_upper_bound = calculate_iqr(df=df, col='Item Price')[1]

num_item_price_outliers = len(df[df['Item Price'] > item_price_upper_bound])
print(f"Number of outliers in 'Item Price': {num_item_price_outliers}")
print(f"Percentage of total dataset: {num_item_price_outliers / len(df) * 100:.2f}%")

In [None]:
# List Item Price outliers

# Compute the cutoff here instead of reading the notebook-wide `upper_bound`,
# which the Order Total and Item Options Total Price cells also assign
item_price_upper_bound = calculate_iqr(df=df, col='Item Price')[1]

item_price_outliers = df[df['Item Price'] > item_price_upper_bound].sort_values(by='Item Price', ascending=False)
item_price_outliers

In [None]:
# List Item Price outlier Items

item_price_outliers['Item Name'].unique()

In [None]:
# View outlier row(s) with no item name

item_price_outliers[item_price_outliers['Item Name'].isna()]

* Outliers in `Item Price` are catering packages, platters, bundles, and more expensive combos.
* The outlier with no `Item Name` appears to be a custom order lacking much information.

##### Item Options Total Price

In [None]:
# Plot the Item Options Total Price distribution with its mean, median and IQR outlier bounds

lower_bound, upper_bound = calculate_iqr(df=df, col='Item Options Total Price')

plt.figure(figsize=(12, 6))
sns.histplot(data=df, x='Item Options Total Price', color='skyblue')

plt.title('Distribution of Item Options Total Price', fontsize=16, fontweight='bold')
plt.xlabel('Item Options Total Price ($)', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

# Central tendency markers
plt.axvline(x=df['Item Options Total Price'].mean(), color='red', linestyle='--', label='Mean')
plt.axvline(x=df['Item Options Total Price'].median(), color='green', linestyle='--', label='Median')

# The lower bound is only drawn when it is positive
if lower_bound > 0:
    plt.axvline(x=lower_bound, color='orange', linestyle='-.', label='Lower Bound')
plt.axvline(x=upper_bound, color='orange', linestyle='-.', label='Upper Bound')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# View Item Options Total Price outliers

# Compute the cutoff here instead of reading the notebook-wide `upper_bound`,
# which is reassigned by each of the distribution-plot cells
item_options_upper_bound = calculate_iqr(df=df, col='Item Options Total Price')[1]

item_options_total_price_outliers = df[df['Item Options Total Price'] > item_options_upper_bound].sort_values(by='Item Options Total Price', ascending=False)
item_options_total_price_outliers

In [None]:
# Print number of rows shared by all price outliers
# Inner-joining on the row index keeps only the rows present in all three outlier frames

merged_df = pd.merge(left=item_price_outliers, right=item_options_total_price_outliers, left_index=True, right_index=True, how='inner', suffixes=('_item_price', '_item_options_total_price'))
merged_df = pd.merge(left=merged_df, right=order_total_outliers, left_index=True, right_index=True, how='inner', suffixes=('', '_order_total'))

print(f"Number of merged outliers: {len(merged_df)}")

Rows which have a high `Item Price` also have a high `Item Options Total Price` and `Order Total`, signaling a relationship between these features.

In [None]:
# Verify results with correlation heatmap

plt.figure(figsize=(12, 6))

plt.title('Correlation Matrix', fontsize=16, fontweight='bold')
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".2f", cmap='coolwarm', cbar=True, center=0)

plt.tight_layout()
plt.show()

### Business Logic Validation

- [x] Investigate tax calculation
- [x] Confirm total calculated correctly from subtotal
- [x] Check for canceled/voided/refunded orders

##### Total Calculation

In [None]:
# Inspect rows where Order Subtotal and Order Tax Total don't add up to Order Total

tax_rate_valid = np.isclose((df['Order Subtotal'] + df['Order Tax Total']), df['Order Total'])
df[~tax_rate_valid].head()

It is not immediately clear why `Order Tax Total` and `Order Subtotal` do not add up to `Order Total` in all rows. Possible reasons include added fees or tips, which are not recorded in this dataset.

#### Tax Calculation

In [None]:
# Calculate effective tax rates

tax_rate_valid_df = df[tax_rate_valid]
(tax_rate_valid_df['Order Tax Total']/tax_rate_valid_df['Order Subtotal']).round(decimals=2).unique()

In [None]:
# Count occurrences of each tax rate

df['Tax Rate'] = (df['Order Tax Total']/df['Order Subtotal']).round(decimals=2)
df.groupby(by='Tax Rate').size()

In [None]:
# Order with no tax calculated

df[df['Tax Rate'] == 0.00]

In [None]:
df[df['Tax Rate'] == 0.05]

The statewide tax rate in California is **7.25%**. In Los Angeles, the combined sales tax rate (state and local) is **9.50%**. While the majority of orders have an effective rate between 9% and 10%, a small number have a much lower tax rate.

#### Canceled/Voided/Refunded orders

In [None]:
df['Fulfillment Status'].unique()

In [None]:
# View cancelled orders

df[df['Fulfillment Status'] == 'Canceled']

In [None]:
# Generate pseudo order id to be able to group orders
# The id is the first space-separated token of Order Name joined to the order date,
# so two orders sharing that first token on the same date collapse into one id

df['pseudo_order_id'] = df['Order Name'].str.split(' ').str[0] + '_' + df['Order Date'].astype(str)
df['pseudo_order_id']

In [None]:
# Calculate number of canceled orders

cancelled_orders = df[df['Fulfillment Status'] == 'Canceled']
num_canceled_orders = cancelled_orders['pseudo_order_id'].nunique()

print(f"Number of canceled orders: {num_canceled_orders}")

In [None]:
# View New orders

df[df['Fulfillment Status'] == 'New']

In [None]:
# View orders with no fulfillment status

null_fulfillment_status = df[df['Fulfillment Status'].isnull()]
null_fulfillment_status

In [None]:
null_fulfillment_status.info()

In [None]:
# Investigate Order Name for orders with no fulfillment status

null_fulfillment_status['Order Name'].unique()

In [None]:
# Count fulfillment types including null values

df['Fulfillment Type'].value_counts(dropna=False)

In [None]:
# List Channels of orders with no fulfillment status

null_fulfillment_status['Channels'].unique()

In [None]:
# List refunded orders

df[df['Order Refunded Amount'] > 0]

* There were a total of **6** cancelled orders.
* **2** orders were listed as **New**. These were probably in progress at the time of capturing the data.
* **1073** rows have no `Fulfillment Status` listed.
    * These orders have primarily numeric `Order Name`.
    * They are also missing `Fulfillment Type`.
    * The single `Channel` for these rows is `BELLY RUBB - BBQ Ribs to Go & Catering`
* There are no refunded orders. This probably means that `Order Refunded Amount` does not apply when `Fulfillment Status` is `Canceled`.

### Customer-Level Insights

- [x] Count unique customers
- [x] Identify repeat vs. new customers
- [x] Explore order frequency (days between orders) for repeat customers
- [x] Check for missing/anonymous customer records

#### Unique Customers

In [None]:
# Calculate percentage of records where Order Name is the same as Recipient Name

print(f"Percentage of records where Order Name is the same as Recipient Name: {np.round(len(df[df['Order Name'] == df['Recipient Name']]) / len(df) * 100, decimals=2)}%")

In [None]:
# Investigate records where Order Name is not the same as Recipient Name

order_recipient_name_neq = df[~(df['Order Name'] == df['Recipient Name'])]
order_recipient_name_neq.head()

In [None]:
# List of recipient names

order_recipient_name_neq['Recipient Name'].unique()

In [None]:
# List of order names

order_recipient_name_neq['Order Name'].unique()

In [None]:
order_recipient_name_neq['Channels'].unique()

In [None]:
# Count number of unique recipients
# nunique() skips missing names; len(unique()) would count NaN as one extra customer

print(f"Number of unique customers: {df['Recipient Name'].nunique()}")

* **822** unique customers have ordered from the restaurant.
* In **73%** of records `Order Name` and `Recipient Name` are equal.
    * This allows us to rely on `Recipient Name` to later investigate repeat customers.
* The remaining records are all missing `Recipient Name`, and most of them have a cryptic `Order Name`.
    * These records are also those with null `Fulfillment Status`.
    * Since they all have a single value for `Channel`, namely `BELLY RUBB - BBQ Ribs To Go & Catering`, it can be assumed that these are manually entered entries.

#### Repeat Customers

In [None]:
# Calculate number of repeat orders

order_names = df.groupby(by='pseudo_order_id')['Recipient Name'].unique().str[0]
repeat_counts = order_names.value_counts()
repeat_customers = repeat_counts[repeat_counts > 1]

print(f"Number of repeat customers: {len(repeat_customers)}")
print(f"Percentage of customers who are repeat customers: {(len(repeat_customers) / repeat_counts.shape[0]) * 100:.2f}%")

In [None]:
# Visualize Repeat Customer Frequency Distribution

# normalize=True yields fractions in [0, 1]; scale to 0-100 so the plotted values
# match the "Percentage of Customers" axis label
recipient_freq = repeat_customers.value_counts(normalize=True).sort_index() * 100

plt.figure(figsize=(12, 6))

plt.title('Repeat Customer Frequency Distribution', fontsize=16, fontweight='bold')
plt.bar(recipient_freq.index, recipient_freq.values)

plt.xlabel('Number of Repeat Orders', fontsize=14)
plt.ylabel('Percentage of Customers', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Visualize Cumulative Distribution of Repeat Customers

plt.figure(figsize=(12, 6))

plt.hist(repeat_customers, cumulative=True, density=True, bins=range(2, repeat_customers.max() + 2), edgecolor='black', color='skyblue', alpha=0.7)

plt.title('Cumulative Distribution of Repeat Customers', fontsize=16, fontweight='bold')
plt.xlabel('Number of Repeat Orders', fontsize=14)
plt.ylabel('Density', fontsize=14)

plt.show()

* Approximately **10.87%** of customers are repeat customers, i.e. customers who have ordered at least twice.
* The distribution of repeat orders is heavily **right-skewed**, as expected.
    * About **60%** of repeat customers have ordered **2** times.
    * About **80%** of repeat customers have ordered **4 or fewer** times.

#### Order frequency for repeat customers

In [None]:
def calculate_days_difference(date1: datetime, date2: datetime) -> int:
    """
    Return how many whole days separate `date1` from `date2`.

    Args:
        date1 (datetime): The starting date.
        date2 (datetime): The ending date.

    Returns:
        int: The `days` component of `date2 - date1`; negative when
            `date2` is earlier than `date1`.
    """
    elapsed = date2 - date1
    return elapsed.days

In [None]:
# Create column to signify if customer is a repeat customer

df['repeat_customer'] = df['Recipient Name'].isin(repeat_customers.index)
df.head()

In [None]:
# Create a dataframe for repeat customers

repeat_customers_df = df[(df['repeat_customer']) & (df['Fulfillment Status'] == 'Completed')]
repeat_customers_df = repeat_customers_df.sort_values(by=['Recipient Name', 'Order Date'])
repeat_customers_df.head()

In [None]:
# Generate dataframe with one row per order for repeat customers

order_df = repeat_customers_df.groupby(by=['pseudo_order_id']).agg({
    'Order Date': 'first',
    'Recipient Name': 'first'
}).sort_values(by=['Recipient Name', 'Order Date']).reset_index()

order_df.head()

In [None]:
# Add column for previous order date

order_df['previous_order_date'] = order_df.groupby(by='Recipient Name')['Order Date'].shift(1)
order_df.head()

In [None]:
# Calculate days since last order per customer

order_df['days_since_last_order'] = order_df['Order Date'].sub(order_df['previous_order_date']).dt.days
avg_days_since_last_order = order_df.groupby(by='Recipient Name')['days_since_last_order'].mean().sort_values()
avg_days_since_last_order

In [None]:
# Visualize distribution of average days between orders

avg_days_counts = avg_days_since_last_order.value_counts()

plt.figure(figsize=(12, 6))
plt.title("Distribution of Average Days Since Last Order", fontsize=16, fontweight='bold')
plt.hist(avg_days_since_last_order, bins=50, color='skyblue', edgecolor='black', alpha=0.7)

plt.xlabel('Average Days Since Last Order', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

mean_avg_days = avg_days_since_last_order.mean()
median_avg_days = avg_days_since_last_order.median()

plt.axvline(mean_avg_days, color='red', linestyle='dashed', linewidth=1, label='Mean')
plt.text(mean_avg_days*1.1, plt.ylim()[1]*0.8, np.round(mean_avg_days, 1), color='red', ha='center')
plt.axvline(median_avg_days, color='blue', linestyle='dashed', linewidth=1, label='Median')
plt.text(median_avg_days*1.15, plt.ylim()[1]*0.8, np.round(median_avg_days, 1), color='blue', ha='center')
plt.legend()

plt.tight_layout()
plt.show()

* The *average number of days* between repeat orders is **54.2 days**.
* The *median number of days* between repeat orders is **35 days**.
* Distribution of average days between orders is also heavily **right-skewed**.

#### Missing Customer Records

In [None]:
# Inspect customer record columns

customer_record_cols = ['Recipient Name', 'Recipient Email', 'Recipient Phone', 'Recipient Address', 'Recipient Postal Code', 'Recipient City', 'Recipient Region', 'Recipient Country']

df[customer_record_cols].info()

In [None]:
# Calculate percentage of records missing for each customer record column

np.round(df[customer_record_cols].isna().sum() / len(df) * 100, 2)

In [None]:
# Inspect customer records with missing Recipient Name

df[df['Recipient Name'].isna()].head()

* **87%** of recipient address information is missing.
* **50%** of recipient emails are missing.
* **26%** of recipient names are missing.

### Time-Based Patterns

- [x] Confirm dataset start and end dates
- [x] Explore order volume by day of week
- [x] Explore order volume by hour of day
- [x] Look for seasonality

#### Confirm dataset start and end dates

In [None]:
# Verify order date is datetime

print(f"Order Date is datetime: {is_datetime(df['Order Date'])}")
print(f"Fulfillment Date is datetime: {is_datetime(df['Fulfillment Date'])}")

In [None]:
# Create time series DataFrame with each row corresponding to one order and Order Date as index

time_series_df = df.groupby('pseudo_order_id').agg({
    'Order Date': 'first',
    'Order Total': 'first'
}).sort_values(by='Order Date').reset_index().set_index('Order Date')

In [None]:
# Create time series DataFrame with each row corresponding to one order and Fulfillment Date as index

time_series_fulfillment_df = df.groupby('pseudo_order_id').agg({
    'Fulfillment Date': 'first',
    'Order Total': 'first'
}).sort_values(by='Fulfillment Date').reset_index().set_index('Fulfillment Date')

time_series_fulfillment_df.head()

In [None]:
# Inspect time series DataFrame

time_series_df.head()

In [None]:
# Print earliest and latest order date to confirm range

print(f"Earliest order date: {time_series_df.index.min()}")
print(f"Latest order date: {time_series_df.index.max()}")

In [None]:
# Print earliest and latest fulfillment date to confirm range

print(f"Earliest fulfillment date: {time_series_fulfillment_df.index.min()}")
print(f"Latest fulfillment date: {time_series_fulfillment_df.index.max()}")

* `Fulfillment Date` is missing for orders before **12/5/2023**.
* `Order date` is available for all orders and spans the entire dataset.

#### Explore order volume by day of week

In [None]:
# Resample frequency to daily and add day of week column

day_of_week_orders = time_series_df.resample('D').size().reset_index(name='orders_count')
day_of_week_orders['day_of_week'] = day_of_week_orders['Order Date'].dt.day_name()
day_of_week_orders.head()

In [None]:
# Calculate average number of orders per day of week

week_day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

day_of_week_means = day_of_week_orders.groupby(by='day_of_week')['orders_count'].mean().reindex(week_day_order)
day_of_week_means.head()

In [None]:
# Calculate median number of orders per day of week

day_of_week_medians = day_of_week_orders.groupby(by='day_of_week')['orders_count'].median().reindex(week_day_order)
day_of_week_medians.head()

In [None]:
# Plot sales by day of the week

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6), sharey=True)

fig.suptitle('Orders per Day of Week', fontsize=16, fontweight='bold')

day_of_week_means.plot(ax=ax[0])
ax[0].set_title('Mean Orders', fontsize=14, fontweight='bold')
ax[0].set_xlabel('', fontsize=12)

day_of_week_medians.plot(ax=ax[1])
ax[1].set_title('Median Orders', fontsize=14, fontweight='bold')
ax[1].set_xlabel('', fontsize=12)

fig.supylabel('Number of Orders', fontsize=12)
fig.supxlabel('Day of Week', fontsize=12)

ax[0].grid(visible=True, axis='both', linestyle='--', alpha=0.7)
ax[1].grid(visible=True, axis='both', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

* Trends between *mean* and *median* orders per day of week are similar.
* **Fridays** have the highest number of orders, followed by **Tuesdays**.
* Some orders have been taken on Sundays and Mondays, but this is irrelevant since they are outside business hours.
* **Saturdays**, **Wednesdays**, and **Thursdays** share similar *mean* and *median* order quantities.

#### Explore order volume by hour of day

In [None]:
time_series_fulfillment_df.head()

In [None]:
# Resample data to hourly frequency

hour_of_day_orders = time_series_fulfillment_df.resample('h').size().reset_index(name='orders_count')

hour_of_day_orders['hour_of_day'] = hour_of_day_orders['Fulfillment Date'].dt.hour
hour_of_day_orders.head()

In [None]:
# Calculate mean and total orders per hour of day

hour_of_day_means = hour_of_day_orders.groupby(by='hour_of_day')['orders_count'].mean()
hour_of_day_totals = hour_of_day_orders.groupby('hour_of_day')['orders_count'].sum()

In [None]:
# Plot sales by hour

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

fig.suptitle('Orders per Hour of Day', fontsize=16, fontweight='bold')

hour_of_day_means.plot(ax=ax[0])
ax[0].set_title('Mean Orders', fontsize=14, fontweight='bold')
ax[0].set_ylabel('Mean Orders', fontsize=12)
ax[0].set_xlabel('')

hour_of_day_totals.plot(ax=ax[1])
ax[1].set_title('Total Orders', fontsize=14, fontweight='bold')
ax[1].set_ylabel('Total Orders', fontsize=12)
ax[1].set_xlabel('')

fig.supxlabel('Hour of Day', fontsize=12)

ax[0].grid(visible=True, axis='both', linestyle='--', alpha=0.7)
ax[1].grid(visible=True, axis='both', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

* `Order Date` does not store timestamp data, therefore `Fulfillment Date` was used to get hourly order data.
* Peak order time is around **7pm**.
* Orders **decrease** closer to closing time.

#### Seasonality

In [None]:
# Check for weekly seasonality

result = seasonal_decompose(day_of_week_orders['orders_count'], period=7)

fig = result.plot()
fig.set_size_inches(12, 6)
plt.show()

* Since there are only **22** months worth of data, we cannot check for *monthly seasonality*.
* Seasonal decomposition confirms that there is a **weekly** seasonal cycle.

### Menu and Product-Level Insights

- [x] Count unique items sold
- [x] Check for item categories (main, side, etc.)
- [x] Rank top-selling items (by quantity, revenue)

#### Unique Items Sold

In [None]:
df.head()

In [None]:
# Inspect quantity of unique item names
# nunique() skips missing names; len(unique()) would count NaN as one extra item

print(f"Number of unique item names: {df['Item Name'].nunique()}")

In [None]:
# Get unique number of item and item variation combinations

name_vars_counts = df.groupby(by='Item Name')['Item Variation'].nunique().sort_values(ascending=False).reset_index(name='count')
print(f"Number of unique item and variation combinations: {np.sum(name_vars_counts['count'])}")

* Belly Rubb offers **104** unique items based on `Item Name`.
* **135** unique combinations of items and their variations have been sold.

#### Item Categories

In [None]:
# See list of possible item variations

df['Item Variation'].unique()

In [None]:
print(f"Number of unique sides offered: {len(df[df['Item Variation'] == 'Side']['Item Name'].unique())}")
print(f"Unique sides: {df[df['Item Variation'] == 'Side']['Item Name'].unique()}")

In [None]:
df[df['Item Variation'] == 'Truffle Salt']['Item Name'].value_counts()

In [None]:
df[df['Item Variation'] == 'Rosemary Pepper']['Item Name'].value_counts()

In [None]:
df[df['Item Variation'] == '6 pcs']['Item Name'].value_counts()

In [None]:
# Count rows for each spelling of Mac & Cheese found in Item Name

mac_and_cheese_variations = ['ARTISAN MAC AND CHEESE', 'MAC&CHEESE', 'Artisan Mac and Cheese']

# Map each spelling to the number of rows whose Item Name matches it exactly
variant_counts = {
    variant: len(df[df['Item Name'] == variant])
    for variant in mac_and_cheese_variations
}

variant_counts

In [None]:
df[df['Item Name'] == 'Artisan Mac and Cheese']

In [None]:
# See variations of Mac & Cheese ordered

df[df['Item Name'].isin(mac_and_cheese_variations)]['Item Variation'].value_counts()

In [None]:
# See variations of Pineapple Slaw ordered

df[df['Item Name'] == 'PINEAPPLE SLAW']['Item Variation'].value_counts()

In [None]:
df[df['Item Variation'] == 'Full Rack'].head(1)

In [None]:
df[df['Item Variation'] == 'Full Rack']['Item Name'].value_counts()

In [None]:
df[df['Item Variation'] == 'Full'].head(1)

In [None]:
df[df['Item Variation'] == 'Full']['Item Name'].value_counts()

In [None]:
regular_items = df[df['Item Variation'] == 'Regular']
regular_items['Item Name'].value_counts().reset_index()

In [None]:
regular_items['Item Name'].unique()

In [None]:
df[df['Item Name'] == 'BEEF BACK RIBS (FULL RACK)']

In [None]:
df[df['Item Name'] == 'Beef Back Ribs (Full Rack)']

* `Item Name` and `Item Variation` have duplicate entries with variations in spelling.
    * Mac & Cheese has **three** different spellings: `ARTISAN MAC AND CHEESE`, `MAC&CHEESE`, `Artisan Mac and Cheese`
    * A full rack of beef back ribs has **two** different spellings: `Beef Back Ribs (Full Rack)`, and `BEEF BACK RIBS (FULL RACK)`
        * Both orders are from `DOORDASH` so that can't explain the difference.
    * There are **four** different spellings for Baby back pork ribs: `GLAZED BABY BACK PORK RIBS`, `BABY BACK PORK RIBS`, `GET YOUR BABY BACK!`, and `"Get your baby back" Pork Ribs`.
* Side items, such as the Mac & Cheese and slaw can have `Item Variation` set to `Full`, `Side`, or `8oz`.
* An `Item Variation` of `Regular` seems to indicate that no variation applies.
    * `Full Rack` relates to orders of pork ribs.
    * `Side`, `Full`, and `8oz` relate to orders of sides.
    * Values with `pcs` relate to items that come in multiples such as Chicken Wings and Mozzarella Sticks
    * Seasoning variations such as `Rosemary Pepper` and `Truffle Salt` relate to orders of fries.


#### Top-selling Items

##### Quantity

In [None]:
# Inspect quantity of items and their variations ordered

item_variation_counts = df.groupby(by=['Item Name', 'Item Variation']).size().sort_values(ascending=False).reset_index(name='count')

item_variation_counts.to_csv(REPORTS_DIR / 'item_variation_counts.csv', index=False)

In [None]:
# Inspect quantity of items ordered

item_counts = df.groupby(by='Item Name').size().sort_values(ascending=False).reset_index(name='count')

item_counts.to_csv(REPORTS_DIR / 'item_counts.csv', index=False)

In [None]:
# See top 10 performing items

item_counts.head(10)

In [None]:
# See top 10 performing items and their variations

item_variation_counts.head(10)

In [None]:
# See bottom 10 performing items

item_counts.tail(10)

In [None]:
# See bottom 10 performing items and their variations

item_variation_counts.tail(10)

##### Revenue

In [None]:
df[df['Item Quantity'] > 1].head()

In [None]:
# List top 10 performing items based on revenue

item_revenues = df.groupby(by='Item Name')['Item Total Price'].sum().sort_values(ascending=False).reset_index(name='total_revenue')
item_revenues.head(10)

In [None]:
# List top 10 performing items and their variations based on revenue

item_combination_revenues = df.groupby(by=['Item Name', 'Item Variation'])['Item Total Price'].sum().sort_values(ascending=False).reset_index(name='total_revenue')
item_combination_revenues.head(10)

In [None]:
# List worst 10 performing items based on revenue

item_revenues.tail(10)

In [None]:
# List items with zero revenue

item_revenues[item_revenues['total_revenue'] == 0]

Condiments provide **zero** revenue.

In [None]:
# List worst 10 performing items and their variations based on revenue

item_combination_revenues[item_combination_revenues['total_revenue'] > 0].tail(10)

* Items ranked by revenue:
    1. `GLAZED BABY BACK PORK RIBS`: **$16,092.10**
    2. `BEEF BACK RIBS (Full Rack)`: **$11,299.39**
    3. `STEAK SANDWICH COMBO`: **$6,297.93**
* Items and variations ranked by revenue:
    1. Full Rack of Glazed Baby Back Pork Ribs: **$12,391.97**
    2. Regular order of Beef Back Ribs (Full Rack): **$11,299.39**
    3. Regular order of Steak Sandwich Combo: **$6,297.93**
* Items ranked lowest by revenue:
    1. `Pickled Jalapenos (8oz)`: **$1.64**
    2. `BLUE CHEESE SAUCE`: **$2.05**
    3. `BBQ dipping sauce`: **$2.55**

### Revenue & Payments

- [x] Total Sales, average order value, median order value.
- [ ] Check for chargebacks/refunds and their impact.

#### Totals

In [None]:
df.head()

In [None]:
# Create dataframe with order-level information

orders_df = df.groupby(by='pseudo_order_id').agg({
    'Order Total': 'first',
    'Order Refunded Amount': 'first'
}).reset_index()

orders_df.head()

In [None]:
# Calculate total sales

print(f"Total Sales: {orders_df['Order Total'].sum():.2f}")

In [None]:
# Summary statistics for Order Total

orders_df['Order Total'].describe()

* There were a total of **1,432** orders since opening.
* Total Sales = **$90,341.63**
* Average Sale Amount = **$63.08**
* Median Sale Amount = **$50.44**
* Maximum Sale Amount = **$1,158.75**
* Minimum Sale Amount = **$1.00**

#### Refunds

In [None]:
# Summary statistics for order refunded amount

orders_df['Order Refunded Amount'].describe()

* All values in `Order Refunded Amount` are **NaN**.
* This either means that there were no refunded orders, or that this information is not tracked in this dataset.

### Conclusions and Next Steps

**Data Quality Checks**

- Columns `Currency`, `Order Shipping Price`, `Order Refunded Amount`, `Fulfillment Location`, `Recipient Region`, and `Item SKU` carry no valuable information and can be dropped from the dataset.
- Geographic details such as `Address`, `Postal Code`, `City`, and `Region` are missing in more than **87%** of entries.

**Business Logic Validation**
* Outliers in `Item Price` are catering packages, platters, bundles, and combos.
* There are **no** refunded orders.
    * There were **6** cancelled orders.
* **1,073** rows are missing Fulfillment data.

**Customer-Level Insights**
* **822** unique customers.
* **11%** of customers are repeat customers (have ordered more than once).
    * **60%** of repeat customers order at most **2** times.
    * The *average number of days* between repeat orders is **54.2 days**.
    * The *median number of days* between repeat orders is **35 days**.

**Time-Based Patterns**
* **Fridays** have the highest number of orders.
    * **Tuesdays** are the second most popular days.
    * Other days of the week have similar *mean* and *median* order quantities.
* Peak order time is around **7pm**.

**Menu and Product-Level Insights**
* **135** unique combinations of items and their variations have been sold.
* Menu item names need to be standardized.
* `Item Variation` applies to both main and side dishes.
    * Can't be used as a *category* feature.
* Baby Back Pork Ribs is the **most ordered** item and provides the **highest revenue**.
* Some items have very low sales and need further exploration.

**Revenue & Payments**
* Total Sales: **$90,341.63**
* Average Order Amount: **$63.08**
* Median Order Amount: **$50.44**