# Initial Exploration

In [None]:
import os
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Remove the cap on displayed columns so wide frames are shown in full
pd.set_option('display.max_columns', None)

## Orders

In [None]:
# Load every orders CSV export and combine them into a single DataFrame
orders_dir = '../data/raw/orders'

# os.listdir returns entries in arbitrary order; sorting keeps the combined
# row order (and every positional output below) reproducible across runs.
csv_files = sorted(f for f in os.listdir(orders_dir) if f.endswith('.csv'))
if not csv_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found in '{orders_dir}'")

# One frame per file; a comprehension avoids reusing the name `df` for the
# per-file frames before it is bound to the combined result.
dataframes = [pd.read_csv(os.path.join(orders_dir, name)) for name in csv_files]

# Combine dataframes into one
df = pd.concat(dataframes, ignore_index=True)
df.head()

In [None]:
# Report the dimensions of the combined orders frame
n_rows, n_cols = df.shape
print(f"Rows: {n_rows}\nColumns: {n_cols}")

In [None]:
# Display the column labels of the combined orders frame
df.columns

In [None]:
# Print a per-column summary (dtype and non-null count) of the orders frame
df.info()

In [None]:
# Parse the date columns into datetime64 values.
# A format of None lets pandas infer the layout (the to_datetime default);
# 'Fulfillment Date' is parsed with an explicit 12-hour-clock format.
date_formats = {
    'Order Date': None,
    'Fulfillment Date': '%m/%d/%Y, %I:%M %p',
}
for date_col, date_format in date_formats.items():
    df[date_col] = pd.to_datetime(df[date_col], format=date_format)

### Data Quality Checks
- [x] Investigate missing values
- [x] Identify static columns
- [x] Check for duplicates
- [x] Validate data types
- [ ] Spot outliers

In [None]:
# Report columns that carry no information: entirely null, or constant

for col in df.columns:
    distinct = set(df[col].dropna().unique())
    n_distinct = len(distinct)
    if n_distinct > 1:
        continue  # the column varies, nothing to report
    if n_distinct == 0:
        print(f"Column '{col}' is empty.")
    else:
        (only_value,) = distinct
        print(f"Column '{col}' has a single unique value: {only_value}")

* `Currency`, `Fulfillment Location`, and `Recipient Region` have a single unique value.
* `Order Shipping Price`, `Order Refunded Amount`, and `Item SKU` are empty.

In [None]:
# Show the rows in which every column is null

all_null_rows = df.isna().all(axis=1)
df.loc[all_null_rows]

#### Duplicates

In [None]:
# Flag rows that exactly repeat an earlier row; the first occurrence is not flagged

duplicates = df.duplicated(keep='first')
df.loc[duplicates]

In [None]:
# Inspect the rows of order 'Armen 59-07' dated 2024/08/31 (duplicate check)
# NOTE(review): the date equality matches exact timestamps only; confirm that
# 'Order Date' carries no time-of-day component.

is_armen_order = df['Order'] == 'Armen 59-07'
is_aug_31 = df['Order Date'] == '2024/08/31'
df.loc[is_armen_order & is_aug_31]

In [None]:
# Inspect the rows of order 'Troy Issac' dated 2024/12/19 (duplicate check)
# NOTE(review): the date equality matches exact timestamps only; confirm that
# 'Order Date' carries no time-of-day component.

is_troy_order = df['Order'] == 'Troy Issac'
is_dec_19 = df['Order Date'] == '2024/12/19'
df.loc[is_troy_order & is_dec_19]

Each order is split into multiple rows, one per menu item. However, identical menu items within an order are not aggregated into a single row: for example, one order can have multiple rows with `CRINKLE FRIES` as the `Item name`. This suggests that items belonging to separate combos or groupings are listed separately.

#### Outliers

In [None]:
def calculate_iqr(df: pd.DataFrame, col: str, multiplier: float = 1.5) -> tuple:
    """
    Compute IQR-based outlier fences for a column.

    The interquartile range (IQR) is the distance between the 25th and 75th
    percentiles. Values below ``q1 - multiplier * IQR`` or above
    ``q3 + multiplier * IQR`` are conventionally treated as outliers.

    Args:
        df (pd.DataFrame): The frame containing the column.
        col (str): The name of the numeric column to compute the fences for.
        multiplier (float): How many IQRs beyond the quartiles the fences
            sit. Defaults to 1.5 (Tukey's rule); 3.0 is a common choice for
            flagging only extreme outliers.

    Returns:
        tuple: ``(lower_bound, upper_bound)`` for the column.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    return lower_bound, upper_bound


In [None]:
# Identify outliers in Order Total

# IQR fences for the column; kept in `lower_bound` / `upper_bound`
lower_bound, upper_bound = calculate_iqr(df=df, col='Order Total')

fig, ax = plt.subplots()
sns.histplot(data=df, x='Order Total', color='skyblue', ax=ax)
ax.set_title('Distribution of Order Total', fontsize=16, fontweight='bold')
ax.set_xlabel('Order Total ($)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)

# Reference lines: central tendency plus the outlier fences
ax.axvline(x=df['Order Total'].mean(), color='red', linestyle='--', label='Mean')
ax.axvline(x=df['Order Total'].median(), color='green', linestyle='--', label='Median')
if lower_bound > 0:
    # A non-positive lower fence is not drawn
    ax.axvline(x=lower_bound, color='orange', linestyle='-.', label='Lower Bound')
ax.axvline(x=upper_bound, color='orange', linestyle='-.', label='Upper Bound')
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Count rows whose Order Total exceeds the upper IQR fence.
# The fence is recomputed here instead of reading the shared `upper_bound`
# name: the Item Price plot cell rebinds that name, so re-running this cell
# out of order would silently count against the wrong column's fence.
_, order_total_upper = calculate_iqr(df=df, col='Order Total')

num_order_total_outliers = int((df['Order Total'] > order_total_upper).sum())
print(f"Number of outliers in 'Order Total': {num_order_total_outliers}")
print(f"Percentage of total dataset: {num_order_total_outliers / len(df) * 100:.2f}%")

In [None]:
# List the Order Total outliers, largest first.
# The fence is recomputed locally because the shared `upper_bound` name is
# rebound by the Item Price plot cell; reading it here would make the result
# depend on which plot cell ran last.
_, order_total_upper = calculate_iqr(df=df, col='Order Total')
df[df['Order Total'] > order_total_upper].sort_values(by='Order Total', ascending=False)

In [None]:
# Identify outliers in Item Price

# IQR fences for the column; kept in `lower_bound` / `upper_bound`
lower_bound, upper_bound = calculate_iqr(df=df, col='Item Price')

fig, ax = plt.subplots()
sns.histplot(data=df, x='Item Price', color='skyblue', ax=ax)
ax.set_title('Distribution of Item Price', fontsize=16, fontweight='bold')
ax.set_xlabel('Item Price ($)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)

# Reference lines: central tendency plus the outlier fences
ax.axvline(x=df['Item Price'].mean(), color='red', linestyle='--', label='Mean')
ax.axvline(x=df['Item Price'].median(), color='green', linestyle='--', label='Median')
if lower_bound > 0:
    # A non-positive lower fence is not drawn
    ax.axvline(x=lower_bound, color='orange', linestyle='-.', label='Lower Bound')
ax.axvline(x=upper_bound, color='orange', linestyle='-.', label='Upper Bound')
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Count rows whose Item Price exceeds the upper IQR fence.
# The fence is recomputed here instead of reading the shared `upper_bound`
# name: the Order Total plot cell assigns the same name, so re-running cells
# out of order would silently count against the wrong column's fence.
_, item_price_upper = calculate_iqr(df=df, col='Item Price')

num_item_price_outliers = int((df['Item Price'] > item_price_upper).sum())
print(f"Number of outliers in 'Item Price': {num_item_price_outliers}")
print(f"Percentage of total dataset: {num_item_price_outliers / len(df) * 100:.2f}%")