# Muesli Data Analysis

## Pre-Setup

### Environment

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="darkgrid")
sns.set_palette("magma")

### Read in data

In [5]:
#df_orders = pd.read_excel("./data/Muesli Project raw data.xlsx",sheet_name="Orders",header=1)
#df_campaign = pd.read_excel("./data/Muesli Project raw data.xlsx", sheet_name="Campaign Data")
#df_order_process = pd.read_excel("./data/Muesli Project raw data.xlsx", sheet_name="Order Process Data")
#df_interndata = pd.read_excel("./data/Muesli Project raw data.xlsx", sheet_name="InternData Study")

### Raw dataframes

In [None]:
df_orders.head(2)

In [None]:
df_campaign.head(2)

In [None]:
df_order_process.head(2)

In [None]:
df_interndata.head(2)

### Data cleaning function

In [10]:
def data_cleaning(df, df_type):
    
    # make col names lower
    df.columns = df.columns.str.lower()

    
    # orders data
    if df_type == "orders":
        # dropping cols
        df = df.drop(["index", "customer name", "origin channel", "category", "sub-category", "product id", "sales", "quantity", "discount", "profit"],axis=1)
        # dropping duplicates
        df = df.drop_duplicates()
        # renaming values in ship mode
        df["ship mode"] = df["ship mode"].str.replace(" Class","")
        df["ship mode"] = df["ship mode"].str.replace("Second","Standard")
        df["ship mode"] = df["ship mode"].str.replace("First","Express")
    
    # campaign data
    elif df_type == "campaign":
        # dropping cols
        df = df.drop("customer name", axis=1)
        # dropping duplicates
        pass

    # order process data    
    elif df_type == "order_process":
        # dropping cols
        df = df.drop("row id", axis=1)
        # dropping duplicates
        df = df.drop_duplicates()
        # dropping 1 duplicate row for id (scanned on truck twice)
        df = df.drop_duplicates(subset=["order id"], keep = "first")
        # dropping column order date, because 100% match with order date in orders data
        df = df.drop("order date", axis=1)
        # dropping column ship mode, because 100% match with ship mode in orders data (assumption: second class shipping = standard)
        df = df.drop("ship mode", axis=1)


    #intern data
    else:
        # dropping cols
        pass
        # dropping duplicates
        df = df.drop_duplicates()
        # dropping column pickup date, because 100% match with on truck scan date
        df = df.drop("pickup date", axis=1)

    

    return df

### Checks for cleaning function

#### Orders data

In [11]:
df_orders_1 = data_cleaning(df_orders,"orders")
df_orders_1

NameError: name 'df_orders' is not defined

In [None]:
df_orders_1.duplicated().value_counts()

In [None]:
df_orders_1["order id"].nunique()

In [None]:
df_orders_1.info()

#### Campaign data

In [None]:
df_campaign_1 = data_cleaning(df_campaign,"campaign")
df_campaign_1

In [None]:
df_campaign_1.duplicated().value_counts()

In [None]:
df_campaign_1["order id"].nunique()

In [None]:
df_campaign_1.info()

#### Order process data

In [None]:
df_order_process_1 = data_cleaning(df_order_process,"order_process")
df_order_process_1

In [None]:
df_order_process_1.duplicated().value_counts()

In [None]:
df_order_process_1["order id"].duplicated().value_counts()

In [None]:
df_order_process_1.drop_duplicates("order id")


In [None]:
duplicates = df_order_process_1[df_order_process_1["order id"].duplicated(keep=False)]
duplicates

In [None]:
df_order_process_1.info()

#### Intern data

In [None]:
df_interndata_1 = data_cleaning(df_interndata,"intern")
df_interndata_1

In [None]:
df_interndata_1.duplicated().value_counts()

In [None]:
df_interndata_1["order id"].duplicated().value_counts()

In [None]:
df_interndata_1.info()

#### Truck scan vs intern scan

In [None]:
#merged_truck = df_order_process_1.merge(df_interndata_1, on="order id", how="outer")
#merged_truck = merged_truck[["order id","on truck scan date","pickup date"]].dropna()
#merged_truck

In [None]:
#merged_truck["diff"] = merged_truck["on truck scan date"] - merged_truck["pickup date"]
#merged_truck

#### Order dates & shipping methods

In [None]:
#merged_op = df_orders_1.merge(df_order_process_1, on="order id", how="outer").dropna()

In [None]:
#merged_op[["order id","order date_x","ship mode_x","order date_y","ship mode_y"]]
#merged_op["date_diff"] = merged_op["order date_x"]-merged_op["order date_y"]
#merged_op["date_diff"].value_counts()

In [None]:
#merged_op

In [None]:
#merged_op[["order id","ship mode_x","ship mode_y"]]

In [None]:
#merged_op["ship mode_x"].value_counts()

In [None]:
#merged_op["ship mode_y"].value_counts()

In [None]:
#merged_op["ship mode_x"] = merged_op["ship mode_x"].str.replace(" Class","")
#merged_op["ship mode_y"] = merged_op["ship mode_y"].str.replace(" Processing","")
#merged_op["ship mode_x"] = merged_op["ship mode_x"].str.replace("Second","Standard")
#merged_op["ship mode_x"] = merged_op["ship mode_x"].str.replace("First","Express")

In [None]:
#merged_op[["order id","ship mode_x","ship mode_y"]]

In [None]:
#merged_op['match'] = merged_op['ship mode_x'] == merged_op['ship mode_y']
#merged_op['match'].value_counts()

## Cleaned dataframes

In [None]:
df_orders_cleaned = data_cleaning(df_orders,"orders")
df_campaign_cleaned = data_cleaning(df_campaign,"campaign")
df_order_process_cleaned = data_cleaning(df_order_process,"order_process")
df_interndata_cleaned = data_cleaning(df_interndata,"intern")

In [None]:
df_orders_cleaned

In [None]:
df_campaign_cleaned

In [None]:
df_order_process_cleaned

In [None]:
df_interndata_cleaned

## Metrics

### o_date_2_processed

In [None]:
merged_metric_1 = df_orders_cleaned.merge(df_interndata_cleaned, on="order id", how="outer").dropna()
merged_metric_1

In [None]:
merged_metric_1["date_diff"] = merged_metric_1["ready to ship date"]-merged_metric_1["order date"]
merged_metric_1

In [None]:
merged_metric_1.info()

In [None]:
merged_metric_1["date_diff_days"] = merged_metric_1["date_diff"].dt.days

In [None]:
merged_metric_1.info()

In [None]:
# 1. Distribution of date_diff
plt.figure(figsize=(8,5))
plt.hist(merged_metric_1["date_diff_days"], edgecolor="black")
plt.title("Distribution of date difference")
plt.xlabel("Days between order and ready to ship")
plt.ylabel("Frequency")
plt.show()

#### WEEKDAYS

In [None]:
# Order weekday (0=Monday, 6=Sunday)
merged_metric_1["order_weekday"] = merged_metric_1["order date"].dt.day_name()

# Ready-to-ship weekday
merged_metric_1["ship_weekday"] = merged_metric_1["ready to ship date"].dt.day_name()

In [None]:
merged_metric_1

In [None]:
orders_by_day = merged_metric_1["order_weekday"].value_counts()
ship_by_day = merged_metric_1["ship_weekday"].value_counts()
avg_delay_by_day = merged_metric_1.groupby("order_weekday")["date_diff_days"].mean()

In [None]:
# 1 Order date by weekdays

orders_by_day = merged_metric_1["order_weekday"].value_counts().reindex(
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
)

orders_by_day.plot(kind="bar", figsize=(8,5), color="skyblue", edgecolor="black")
plt.title("Orders by Weekday")
plt.ylabel("Number of Orders")
plt.show()

In [None]:
# 2 ready to Ship date by Weekdays

ship_by_day = merged_metric_1["ship_weekday"].value_counts().reindex(
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
)

ship_by_day.plot(kind="bar", figsize=(8,5), color="lightgreen", edgecolor="black")
plt.title("Ready-to-Ship by Weekday")
plt.ylabel("Number of Orders")
plt.show()


In [None]:
# 3 Average delay by Weekdays

avg_delay_by_day = merged_metric_1.groupby("order_weekday")["date_diff_days"].mean().reindex(
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
)

avg_delay_by_day.plot(kind="bar", figsize=(8,5), color="orange", edgecolor="black")
plt.title("Average Time by Order Weekday")
plt.ylabel("Average Time (days)")
plt.show()


In [None]:
merged_metric_1["ship mode"] = merged_metric_1["ship mode"].str.replace(" Class","")
merged_metric_1["ship mode"] = merged_metric_1["ship mode"].str.replace("Second","Standard")
merged_metric_1["ship mode"] = merged_metric_1["ship mode"].str.replace("First","Express")

In [None]:
avg_diff_by_mode = merged_metric_1.groupby("ship mode")["date_diff_days"].mean().reset_index().round(2)
avg_diff_by_mode

In [None]:
# actual average delay for Standard
actual_std = (
    merged_metric_1[merged_metric_1["ship mode"]=="Standard"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
)

# expected values
expected_data = {
    "order_weekday": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    "expected_delay": [2,2,2,2,4,4,3]
}
expected_df = pd.DataFrame(expected_data).set_index("order_weekday")

# align both (reindex to weekday order)
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
actual_std = actual_std.reindex(weekday_order)
expected_df = expected_df.reindex(weekday_order)

# combine
comparison = pd.DataFrame({
    "Actual (Standard)": actual_std,
    "Expected (Standard)": expected_df["expected_delay"]
})

# plot
comparison.plot(kind="bar", figsize=(12,6))
plt.title("Standard: Ordered to (ready to be ship) Duration Time: Actual vs Expected")
plt.ylabel("Average Duration Time (days)")
plt.xlabel("Order Weekday")
plt.xticks(rotation=45)
plt.legend(title="Series")
plt.show()


In [None]:
# actual average delay for Express
actual_exp = (
    merged_metric_1[merged_metric_1["ship mode"]=="Express"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
)

# expected values
expected_data = {
    "order_weekday": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    "expected_delay": [2,2,2,2,4,4,3]
}
expected_df = pd.DataFrame(expected_data).set_index("order_weekday")

# align both (reindex to weekday order)
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
actual_exp = actual_exp.reindex(weekday_order)
expected_df = expected_df.reindex(weekday_order)

# combine into comparison table
comparison = pd.DataFrame({
    "Actual (Express)": actual_exp,
    "Expected (Express)": expected_df["expected_delay"]
})

# plot
comparison.plot(kind="bar", figsize=(12,6))
plt.title("Express: ordered to (ready to be ship) Duration Time: Actual vs Expected")
plt.ylabel("Average Time (days)")
plt.xlabel("Order Weekday")
plt.xticks(rotation=45)
plt.legend(title="Series")
plt.show()


In [None]:
# expected values (same for express & standard)
expected_data = {
    "order_weekday": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    "Expected": [2,2,2,2,4,4,3]
}
expected_df = pd.DataFrame(expected_data).set_index("order_weekday")

# weekday order
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]

# actual average delay for Standard
actual_std = (
    merged_metric_1[merged_metric_1["ship mode"]=="Standard"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
    .reindex(weekday_order)
)

# actual average delay for Express
actual_exp = (
    merged_metric_1[merged_metric_1["ship mode"]=="Express"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
    .reindex(weekday_order)
)

# combine into one DataFrame
comparison = pd.DataFrame({
    "Expected": expected_df["Expected"],
    "Actual (Standard)": actual_std,
    "Actual (Express)": actual_exp
})

# plot
comparison.plot(kind="bar", figsize=(12,6))
plt.title("Ordered to Ready-to-Ship Duration: Expected vs Actual (Standard & Express)")
plt.ylabel("Average Duration Time (days)")
plt.xlabel("Order Weekday")
plt.xticks(rotation=0)
plt.legend(title="Series")
plt.show()


In [None]:
# expected values (same for express & standard)
expected_data = {
    "order_weekday": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    "Expected": [2,2,2,2,4,4,3]
}
expected_df = pd.DataFrame(expected_data).set_index("order_weekday")

# weekday order
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]

# actual average delay for Standard
actual_std = (
    merged_metric_1[merged_metric_1["ship mode"]=="Standard"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
    .reindex(weekday_order)
)

# actual average delay for Express
actual_exp = (
    merged_metric_1[merged_metric_1["ship mode"]=="Express"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
    .reindex(weekday_order)
)

# combine into one DataFrame
comparison = pd.DataFrame({
    "Expected": expected_df["Expected"],
    "Actual (Standard)": actual_std,
    "Actual (Express)": actual_exp
})

# plot and add value labels
ax = comparison.plot(kind="bar", figsize=(12,6))
plt.title("Ordered to Ready-to-Ship Duration: Expected vs Actual (Standard & Express)")
plt.ylabel("Average Duration Time (days)")
plt.xlabel("Order Weekday")
plt.xticks(rotation=0)
plt.legend(title="Series")

# add labels to each bar
for container in ax.containers:
    ax.bar_label(container, fmt="%.1f", label_type="edge", padding=2)

plt.show()


### o_processes_2_truck

#### Data

In [None]:
df_processed = df_interndata_cleaned.copy()
df_processed["ready to ship date_weekday"] = df_processed["ready to ship date"].dt.dayofweek
df_processed

In [None]:
df_processed["ready to ship date_weekday"].value_counts()

In [None]:
# ready to ship date only Mo - Fr checked!

In [None]:
df_processed_x_truck = df_order_process_cleaned.merge(df_processed, on="order id",how="inner")
df_processed_x_truck

In [None]:
df_processed_x_truck_smode = df_processed_x_truck.merge(df_orders_cleaned, on="order id", how="inner")
df_processed_x_truck_smode = df_processed_x_truck_smode[["order id", "on truck scan date", "ready to ship date", "ready to ship date_weekday","ship mode","order date"]]
df_processed_x_truck_smode["o_processed_2_truck_actual"] = df_processed_x_truck_smethod["on truck scan date"] - df_processed_x_truck_smethod["ready to ship date"]
df_processed_x_truck_smode["o_processed_2_truck_actual"] = df_processed_x_truck_smode["o_processed_2_truck_actual"].dt.days
df_processed_x_truck_smode

In [None]:
df_processed_x_truck_smode["order date_weekday"] = df_processed_x_truck_smode["order date"].dt.day_name()
df_processed_x_truck_smode

In [None]:
df_processed_x_truck_smode.groupby("ship mode")[["o_processed_2_truck_actual"]].agg(["mean","max","min"])

In [None]:
graph_1 = df_processed_x_truck_smode.groupby("ship mode")[["o_processed_2_truck_actual"]].mean().reset_index()
graph_1

In [None]:
graph_1["o_processed_2_truck_plan"] = {0:0,1:1}
graph_1

In [None]:
# format for plot
graph_1 = pd.melt(graph_1,
    id_vars=["ship mode"],
    value_vars=["o_processed_2_truck_actual", "o_processed_2_truck_plan"],
    var_name="data type",
    value_name="days")

# renaming data type
graph_1["data type"] = graph_1["data type"].map({
    "o_processed_2_truck_actual": "Actual",
    "o_processed_2_truck_plan": "Plan"})

graph_1

In [None]:
graph_2 = df_processed_x_truck_smode.groupby(["ship mode","order date_weekday"])[["o_processed_2_truck_actual"]].mean().reset_index()
graph_2

In [None]:
# no sunday for express?

In [None]:
graph_2["identifyer"]=graph_2["ship mode"]+graph_2["order date_weekday"]
graph_2

In [None]:
plan_values_dict = {"ExpressMonday":1,
                    "ExpressTuesday":0,
                    "ExpressWednesday":1,
                    "ExpressThursday":0,
                    "ExpressFriday":0,
                    "ExpressSaturday":1,
                    "ExpressSunday":1,
                    "StandardMonday":1,
                    "StandardTuesday":2,
                    "StandardWednesday":1,
                    "StandardThursday":3,
                    "StandardFriday":2,
                    "StandardSaturday":1,
                    "StandardSunday":1
                    }

In [None]:
graph_2["o_processed_2_truck_plan"] = graph_2["identifyer"].map(plan_values_dict)
graph_2 = graph_2.drop("identifyer",axis=1)
graph_2

In [None]:
# format for plot
graph_2 = pd.melt(graph_2,
    id_vars=["ship mode","order date_weekday"],
    value_vars=["o_processed_2_truck_actual", "o_processed_2_truck_plan"],
    var_name="data type",
    value_name="days")

# renaming data type
graph_2["data type"] = graph_2["data type"].map({
    "o_processed_2_truck_actual": "Actual",
    "o_processed_2_truck_plan": "Plan"})

graph_2

In [None]:
weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

graph_2["order date_weekday"] = pd.Categorical(graph_2["order date_weekday"],categories=weekday_order, ordered=True)
graph_2

In [None]:
graph_2_express = graph_2[graph_2["ship mode"]=="Express"].sort_values("order date_weekday")
graph_2_standard = graph_2[graph_2["ship mode"]=="Standard"].sort_values("order date_weekday")

graph_2_express

#### Plots

In [None]:
graph = graph_1

plt.figure(figsize=(10, 7))

ax = sns.barplot(data=graph, 
             x= "ship mode",
             y = "days",
             hue="data type",
             hue_order=["Actual","Plan"],
             width=bar_width
            )
ax.set_ylim(-0.1, 3.2)
plt.title('Order ready for shipment 2 leaving warehouse', fontsize=16,pad=20)
plt.xlabel("Shipping type")
plt.ylabel("Days")
plt.legend()

bar_width = 0.7
offset = (bar_width / 4) # This calculates the offset needed to center on the 'Actual' bar

# Calculate and display the uplift (difference) as text above the bars
# Iterate through each ship mode ('Express' and 'Standard')
for i, ship_mode in enumerate(graph["ship mode"].unique()):
    # Get the 'Actual' and 'Plan' values for the current ship mode
    actual_days = graph[(graph["ship mode"] == ship_mode) & (graph["data type"] == "Actual")]["days"].iloc[0]
    plan_days = graph[(graph["ship mode"] == ship_mode) & (graph["data type"] == "Plan")]["days"].iloc[0]

    uplift = actual_days - plan_days
    
    # X-POSITION: Centered over the 'Actual' bar.
    # The first bar group (Express, i=0) is at x=0. The second (Standard, i=1) is at x=1.
    # The 'Actual' bar is typically slightly to the left of the center point (i - offset).
    x_pos = i - offset
    # y-position: slightly above the higher bar for the annotation
    y_pos = actual_days +0.05 # Adjust vertical position as needed

    # Determine text color (Green for improvement (lower days), Red for regression (higher days))
    color = "black"
    
    # Format the uplift text (e.g., "+0.50 days" or "-0.25 days")
    uplift_text = f"{uplift:+.1f} d." # Using + to explicitly show positive or negative

    # Add the uplift text
    ax.text(x_pos, y_pos, uplift_text,
            color=color, ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.xticks()
plt.xticks()
plt.tight_layout()
plt.show(),

In [None]:
graph = graph_2_standard

plt.figure(figsize=(10, 7))

ax = sns.barplot(data=graph, 
             x= "order date_weekday",
             y = "days",
             hue="data type",
             hue_order=["Actual","Plan"],
             width=bar_width,
             errorbar=None
            )
ax.set_ylim(-0.1, 3.2)
plt.title('*Standard* order ready for shipment 2 leaving warehouse', fontsize=16,pad=20)
plt.xlabel("Order weekday")
plt.ylabel("Days")
plt.legend()

bar_width = 0.7
offset = (bar_width / 4) # This calculates the offset needed to center on the 'Actual' bar


# Calculate and display the uplift (difference) as text above the bars
# Iterate through each unique WEEKDAY in the data
for i, weekday in enumerate(graph["order date_weekday"].unique()):
    # Get the 'Actual' and 'Plan' values for the current WEEKDAY
    
    # Filter the graph DataFrame by the current weekday
    weekday_data = graph[graph["order date_weekday"] == weekday]
    
    # Use the filtered data to extract the specific Actual and Plan values
    actual_days = weekday_data[weekday_data["data type"] == "Actual"]["days"].iloc[0]
    plan_days = weekday_data[weekday_data["data type"] == "Plan"]["days"].iloc[0]

    uplift = actual_days - plan_days
    
    # X-POSITION: Centered over the 'Actual' bar.
    # The first bar group (Express, i=0) is at x=0. The second (Standard, i=1) is at x=1.
    # The 'Actual' bar is typically slightly to the left of the center point (i - offset).
    x_pos = i - offset
    # y-position: slightly above the higher bar for the annotation
    y_pos = actual_days +0.05 # Adjust vertical position as needed

    # Determine text color (Green for improvement (lower days), Red for regression (higher days))
    color = "black"
    
    # Format the uplift text (e.g., "+0.50 days" or "-0.25 days")
    uplift_text = f"{uplift:+.1f} d." # Using + to explicitly show positive or negative

    # Add the uplift text
    ax.text(x_pos, y_pos, uplift_text,
            color=color, ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.xticks()
plt.xticks()
plt.tight_layout()
plt.show(),

In [None]:
graph = graph_2_express

plt.figure(figsize=(10, 7))

ax = sns.barplot(data=graph, 
             x= "order date_weekday",
             y = "days",
             hue="data type",
             hue_order=["Actual","Plan"],
             width=bar_width,
             errorbar=None
            )
ax.set_ylim(-0.1, 3.2)
plt.title('*Express* order ready for shipment 2 leaving warehouse', fontsize=16,pad=20)
plt.xlabel("Order weekday")
plt.ylabel("Days")
plt.legend()

bar_width = 0.7
offset = (bar_width / 4) # This calculates the offset needed to center on the 'Actual' bar


# Calculate and display the uplift (difference) as text above the bars
# Iterate through each unique WEEKDAY in the data
for i, weekday in enumerate(graph["order date_weekday"].unique()):
    # Get the 'Actual' and 'Plan' values for the current WEEKDAY
    
    # Filter the graph DataFrame by the current weekday
    weekday_data = graph[graph["order date_weekday"] == weekday]
    
    # Use the filtered data to extract the specific Actual and Plan values
    actual_days = weekday_data[weekday_data["data type"] == "Actual"]["days"].iloc[0]
    plan_days = weekday_data[weekday_data["data type"] == "Plan"]["days"].iloc[0]

    uplift = actual_days - plan_days
    
    # X-POSITION: Centered over the 'Actual' bar.
    # The first bar group (Express, i=0) is at x=0. The second (Standard, i=1) is at x=1.
    # The 'Actual' bar is typically slightly to the left of the center point (i - offset).
    x_pos = i - offset
    # y-position: slightly above the higher bar for the annotation
    y_pos = actual_days +0.05 # Adjust vertical position as needed

    # Determine text color (Green for improvement (lower days), Red for regression (higher days))
    color = "black"
    
    # Format the uplift text (e.g., "+0.50 days" or "-0.25 days")
    uplift_text = f"{uplift:+.1f} d." # Using + to explicitly show positive or negative

    # Add the uplift text
    ax.text(x_pos, y_pos, uplift_text,
            color=color, ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.xticks()
plt.xticks()
plt.tight_layout()
plt.show(),

In [None]:
graph_2_express

### o_truck_2_delivered

#### Merging data

In [None]:
merged_o_truck_2_delivered=pd.merge(df_campaign_cleaned,df_order_process_cleaned, on='order id', how='inner' )

In [None]:
merged_o_truck_2_delivered

In [None]:
merged_o_truck_2_delivered = pd.merge(
    merged_o_truck_2_delivered,
    df_orders_cleaned[['order id', 'ship mode']],  # include key + column you want
    on='order id',
    how='inner'
)

In [None]:
merged_o_truck_2_delivered

#### checking na data

In [None]:
merged_o_truck_2_delivered.isna().sum()

#### calculating time interval between on truck scan date and arrival scan date

In [None]:
merged_o_truck_2_delivered['time interval']=merged_o_truck_2_delivered['arrival scan date']-merged_o_truck_2_delivered['on truck scan date']

In [None]:
merged_o_truck_2_delivered

#### extracting the days for arrival date

In [None]:
merged_o_truck_2_delivered["arrival year"]= merged_o_truck_2_delivered["arrival scan date"].dt.year
merged_o_truck_2_delivered["arrival month"]= merged_o_truck_2_delivered["arrival scan date"].dt.month
merged_o_truck_2_delivered["arrival day"]= merged_o_truck_2_delivered["arrival scan date"].dt.dayofweek

In [None]:
merged_o_truck_2_delivered

In [None]:
merged_o_truck_2_delivered['arrival day'].value_counts()

#### transforming arrival day into weekdays and weekend dates

In [None]:
arrival_day_categories={0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4:'friday', 5:'friday', 6:'saturday', 7:'sunday'}
merged_o_truck_2_delivered['arrival day']=merged_o_truck_2_delivered['arrival day'].map(arrival_day_categories)
merged_o_truck_2_delivered

#### extracting the days for on truck scan date

In [None]:
merged_o_truck_2_delivered["on truck year"]= merged_o_truck_2_delivered["on truck scan date"].dt.year
merged_o_truck_2_delivered["on truck month"]= merged_o_truck_2_delivered["on truck scan date"].dt.month
merged_o_truck_2_delivered["on truck day"]= merged_o_truck_2_delivered["on truck scan date"].dt.dayofweek

In [None]:
merged_o_truck_2_delivered

In [None]:
merged_o_truck_2_delivered.info()

In [None]:
merged_o_truck_2_delivered['actual duration']=merged_o_truck_2_delivered['time interval'].dt.days

In [None]:
merged_o_truck_2_delivered.info()

In [None]:
merged_o_truck_2_delivered['on truck day'].value_counts()

#### transforming on truck scan date into weekdays and weekends date

In [None]:
on_truck_day_categories={0: 'monday', 1: 'tuesday', 2: 'wednesday', 3: 'thursday', 4:'friday', 5:'friday', 6:'saturday', 7:'sunday'}
merged_o_truck_2_delivered['on truck day']=merged_o_truck_2_delivered['on truck day'].map(on_truck_day_categories)
merged_o_truck_2_delivered

In [None]:
merged_o_truck_2_delivered['on truck day'].value_counts()

In [None]:
merged_o_truck_2_delivered['arrival day'].value_counts()

#### calculating the average actual duration of on truck to delivery 

In [None]:
result_on_truck=merged_o_truck_2_delivered.groupby(['ship mode','on truck day'])['actual duration'].mean().round(2).reset_index()

In [None]:
result_on_truck


In [None]:
result_on_truck["expected duration"] = [3, 3, 2, 4, 3, 3, 4, 4]

In [None]:
result_on_truck

In [None]:
result_arrival=merged_o_truck_2_delivered.groupby(['ship mode','arrival day'])['actual duration'].mean().round(2).reset_index()

In [None]:
result_arrival

In [None]:
result_arrival["expected duration"] = [3, 3, 2, 4, 4, 3, 3, 4, 4, 3]

In [None]:
result_arrival

In [None]:
result_on_truck.query("`ship mode` == 'Standard'")

In [None]:
merged_o_truck_2_delivered_weekdays_on_truck = merged_o_truck_2_delivered.query(
    "`on truck day` in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']"
)

In [None]:
merged_o_truck_2_delivered_weekdays_on_truck = merged_o_truck_2_delivered.query(
    "`on truck day` in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']"
)

In [None]:
average_actual_duration_weekdays_on_truck= merged_o_truck_2_delivered_weekdays_on_truck.groupby('ship mode')['actual duration'].mean().round(2)

In [None]:
average_actual_duration_weekdays_on_truck= merged_o_truck_2_delivered_weekdays_on_truck.groupby('ship mode')['actual duration'].mean().round(2)

In [None]:
merged_o_truck_2_delivered_weekdays_arrival = merged_o_truck_2_delivered.query(
    "`arrival day` in ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']"
)

In [None]:
merged_o_truck_2_delivered_weekdays_arrival

In [None]:
merged_o_truck_2_delivered_weekends_arrival= merged_o_truck_2_delivered.query(
    "`arrival day` in ['saturday', 'sunday']")

In [None]:
merged_o_truck_2_delivered_weekends_arrival

#### visualization

In [None]:
# Data including missing days
standard_data = pd.DataFrame({
    "on_truck_day": ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"],
    "actual duration": [4.33, 5.00, 4.83, None, 4.12, None, None],  # actual durations
    "expected duration normal": [3, 4, 4, 3, 3, 3, 3]  # expected durations
})

day_order = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

plt.figure(figsize=(12,6))
sns.set_style("whitegrid")
sns.set_context("talk")

# Barplot for actual duration
bars = sns.barplot(
    x="on_truck_day",
    y="actual duration",
    data=standard_data,
    palette="Blues_d",
    order=day_order,
    alpha=0.8
)

# Overlay expected duration as dashed line with points
plt.plot(
    day_order,
    standard_data["expected duration normal"],
    color="#FF6F61",  # soft red
    marker="o",
    linestyle="--",
    linewidth=2,
    markersize=8,
    label="Expected Duration"
)

# Annotate bars with actual duration
for i, val in enumerate(standard_data["actual duration"]):
    if val is not None:
        bars.text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=12, color='navy')

# Annotate expected durations on points
for i, val in enumerate(standard_data["expected duration normal"]):
    plt.text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=12, color='#FF6F61')

# Labels and title
plt.ylabel("Duration (days)", fontsize=14)
plt.xlabel("On Truck Day", fontsize=14)
plt.title("Actual vs Expected Duration for Standard Ship Mode", fontsize=16, weight='bold')
plt.legend(frameon=True, facecolor='white', edgecolor='black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylim(0, max(standard_data["actual duration"].max(), max(standard_data["expected duration normal"])) + 2)

# Save figure
plt.savefig('Actual vs Expected Duration for Standard Ship Mode for On Truck', dpi=300, bbox_inches='tight')

plt.show()


In [None]:
result_arrival.query("`ship mode` == 'Standard'")

In [None]:
# Data including missing days
standard_data = pd.DataFrame({
    "arrival_day": ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"],
    "actual duration": [4.90, 4.86, 4.54, 3.79, 3.34, None, None],  # actual durations
    "expected duration normal": [3, 4, 4, 3, 3, 3, 3]  # expected durations
})

day_order = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

plt.figure(figsize=(12,6))
sns.set_style("whitegrid")
sns.set_context("talk")

# Barplot for actual duration
bars = sns.barplot(
    x="arrival_day",
    y="actual duration",
    data=standard_data,
    palette="Blues_d",
    order=day_order,
    alpha=0.8
)

# Overlay expected duration as dashed line with points
plt.plot(
    day_order,
    standard_data["expected duration normal"],
    color="#FF6F61",  # soft red
    marker="o",
    linestyle="--",
    linewidth=2,
    markersize=8,
    label="Expected Duration"
)

# Annotate bars with actual duration
for i, val in enumerate(standard_data["actual duration"]):
    if val is not None:
        bars.text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=12, color='navy')

# Annotate expected durations on points
for i, val in enumerate(standard_data["expected duration normal"]):
    plt.text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=12, color='#FF6F61')

# Labels and title
plt.ylabel("Duration (days)", fontsize=14)
plt.xlabel("Arrival Day", fontsize=14)
plt.title("Actual vs Expected Duration for Standard Ship Mode", fontsize=16, weight='bold')
plt.legend(frameon=True, facecolor='white', edgecolor='black')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.ylim(0, max(standard_data["actual duration"].max(), max(standard_data["expected duration normal"])) + 2)

# Save figure
plt.savefig('Actual vs Expected Duration for Standard Ship Mode for Arrival Day', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
result_on_truck.query("`ship mode` == 'Express'")

In [None]:
# Data including all days
data = pd.DataFrame({
    "ship_mode": ["Express"]*7,
    "on_truck_day": ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"],
    "actual duration": [4.88, 5.33, 4.86, None, 3.79, None, None],  # actual durations
    "expected_duration": [3, 2, 4, 4, 3, 3, 3]  # expected durations
})

# Define the full day order
day_order = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

plt.figure(figsize=(12,6))
sns.set_style("whitegrid")
sns.set_context("talk")

# Barplot for actual duration
bars = sns.barplot(
    x="on_truck_day",
    y="actual duration",
    data=data,
    palette="Oranges",
    order=day_order,
    alpha=0.8
)

# Overlay expected duration as line with points
plt.plot(
    day_order,
    data["expected_duration"],
    color="#FF6F61",  # red for expected
    marker="o",
    linestyle="--",
    linewidth=2,
    markersize=8,
    label="Expected Duration"
)

# Annotate actual durations
for i, val in enumerate(data["actual duration"]):
    if val is not None:
        bars.text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=12, color='brown')

# Annotate expected durations
for i, val in enumerate(data["expected_duration"]):
    bars.text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=12, color='#FF6F61')

plt.ylabel("Duration (days)", fontsize=14)
plt.xlabel("On Truck Day", fontsize=14)
plt.title("Actual vs Expected Duration for Express Ship Mode (All Days)", fontsize=16, weight='bold')
plt.legend()
plt.ylim(0, max(data["actual duration"].max(skipna=True), max(data["expected_duration"])) + 2)

# Save figure
plt.savefig('Actual vs Expected Duration for Express Ship Mode for On Truck', dpi=300, bbox_inches='tight')

plt.show()



In [None]:
result_arrival.query("`ship mode` == 'Express'")

In [None]:
# Data including all days
data = pd.DataFrame({
    "ship_mode": ["Express"]*7,
    "arrival_day": ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"],
    "actual duration": [4.64, 4.82, 5.29, 3.60, 3.30, None, None],  # actual durations
    "expected_duration": [3, 2, 4, 4, 3, 3, 3]  # expected durations
})

# Define the full day order
day_order = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

plt.figure(figsize=(12,6))
sns.set_style("whitegrid")
sns.set_context("talk")

# Barplot for actual duration
bars = sns.barplot(
    x="arrival_day",
    y="actual duration",
    data=data,
    palette="Oranges",
    order=day_order,
    alpha=0.8
)

# Overlay expected duration as line with points
plt.plot(
    day_order,
    data["expected_duration"],
    color="#FF6F61",  # red for expected
    marker="o",
    linestyle="--",
    linewidth=2,
    markersize=8,
    label="Expected Duration"
)

# Annotate actual durations
for i, val in enumerate(data["actual duration"]):
    if val is not None:
        bars.text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=12, color='brown')

# Annotate expected durations
for i, val in enumerate(data["expected_duration"]):
    bars.text(i, val + 0.1, f"{val:.1f}", ha='center', va='bottom', fontsize=12, color='#FF6F61')

plt.ylabel("Duration (days)", fontsize=14)
plt.xlabel("Arrival Day", fontsize=14)
plt.title("Actual vs Expected Duration for Express Ship Mode (All Days)", fontsize=16, weight='bold')
plt.legend()
plt.ylim(0, max(data["actual duration"].max(skipna=True), max(data["expected_duration"])) + 2)

# Save figure
plt.savefig('Actual vs Expected Duration for Express Ship Mode for Arrival Day', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
average_actual_duration_weekdays_on_truck

In [None]:
average_actual_duration_weekdays_on_truck

In [None]:
# Dataset
data = {
    'Day Type': ['Weekdays', 'Weekend'],
    'Express': [4.48, None],
    'Standard': [4.64, None]
}
df = pd.DataFrame(data)

# Melt for seaborn
df_melted = df.melt(
    id_vars='Day Type',
    value_vars=['Express', 'Standard'],
    var_name='Ship Mode',
    value_name='Average Duration'
)

# Color palette
palette = {'Express': '#1f77b4', 'Standard': '#ff7f0e'}

# Plot
sns.set_style("whitegrid")
plt.figure(figsize=(8,6))
ax = sns.barplot(
    x='Day Type',
    y='Average Duration',
    hue='Ship Mode',
    data=df_melted,
    palette=palette,
    dodge=True,
    width=0.5
)

# Horizontal line for expected duration
plt.axhline(3, color='red', linestyle='--', linewidth=2, label='Expected Duration (3 days)')

# Add labels above bars with stagger and increased horizontal shift for Standard
stagger_amount = 0.1  # vertical offset to separate labels
horizontal_shift = 0.12  # increased horizontal offset for Standard

for p, (_, row) in zip(ax.patches, df_melted.iterrows()):
    val = row['Average Duration']
    if pd.notna(val):
        x = p.get_x() + p.get_width() / 2.
        y = val + stagger_amount if row['Ship Mode'] == 'Standard' else val
        # Shift Standard further to the right
        if row['Ship Mode'] == 'Standard':
            x += horizontal_shift
        ax.annotate(f"{val:.2f}",
                    (x, y),
                    ha='center', va='bottom',
                    fontsize=12, fontweight='bold')

# Style tweaks
plt.ylabel("Average Delivery Duration (days)", fontsize=14)
plt.xlabel("Day Type", fontsize=14)
plt.title("Average Truck-to-Delivery Duration by Ship Mode", fontsize=16, fontweight='bold')
plt.ylim(0, max(df_melted['Average Duration'].dropna()) + 0.5)

# Legend outside
plt.legend(title='Ship Mode', loc='upper left', bbox_to_anchor=(1,1), fontsize=12, title_fontsize=13)
sns.despine(left=True, bottom=True)

plt.tight_layout()

plt.savefig('Actual_Truck-to-Delivery_Duration_by_Ship_Mode.png', dpi=300, bbox_inches='tight')

plt.show()




### o_date_2_delivered

In [None]:
merged_metric_2 = df_orders_cleaned.merge(df_campaign_cleaned, on="order id", how="outer").dropna()
merged_metric_2

In [None]:
merged_metric_2["date_diff"] = merged_metric_2["arrival scan date"]-merged_metric_2["order date"]
merged_metric_2

In [None]:
merged_metric_2.info()

In [None]:
merged_metric_2["date_diff_days"] = merged_metric_2["date_diff"].dt.days

In [None]:
# 1. Distribution of date_diff
plt.figure(figsize=(8,5))
plt.hist(merged_metric_2["date_diff_days"], edgecolor="black")
plt.title("Distribution of date difference")
plt.xlabel("Days between order and arrival scan date")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Order weekday (0=Monday, 6=Sunday)
merged_metric_2["order_weekday"] = merged_metric_2["order date"].dt.day_name()

# Ready-to-ship weekday
merged_metric_2["arrival_weekday"] = merged_metric_2["arrival scan date"].dt.day_name()


In [None]:
orders_by_day = merged_metric_2["order_weekday"].value_counts()
arrival_by_day = merged_metric_2["arrival_weekday"].value_counts()
avg_delay_by_day = merged_metric_2.groupby("order_weekday")["date_diff_days"].mean()


In [None]:
# 1 Order date by weekdays

orders_by_day = merged_metric_2["order_weekday"].value_counts().reindex(
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
)

orders_by_day.plot(kind="bar", figsize=(8,5), color="skyblue", edgecolor="black")
plt.title("Orders by Weekday")
plt.ylabel("Number of Orders")
plt.show()


In [None]:
# 2 Arrival date by Weekdays

arrival_by_day= merged_metric_2["arrival_weekday"].value_counts().reindex(
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
)

arrival_by_day.plot(kind="bar", figsize=(8,5), color="lightgreen", edgecolor="black")
plt.title("Arrival by Weekday")
plt.ylabel("Number of Orders")
plt.show()

In [None]:
# 3 Average Arrival delay by Weekdays

avg_delay_by_day = merged_metric_2.groupby("order_weekday")["date_diff_days"].mean().reindex(
    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
)

avg_delay_by_day.plot(kind="bar", figsize=(8,5), color="orange", edgecolor="black")
plt.title("Average Arrival Delay by Order Weekday")
plt.ylabel("Average Delay (days)")
plt.show()


In [None]:
# 

merged_metric_2["ship mode"] = merged_metric_2["ship mode"].str.replace(" Class","")
merged_metric_2["ship mode"] = merged_metric_2["ship mode"].str.replace("Second","Standard")
merged_metric_2["ship mode"] = merged_metric_2["ship mode"].str.replace("First","Express")

In [None]:
# 3 Average Arrival delay by Ship mode

avg_delay_by_day = merged_metric_2.groupby("ship mode")["date_diff_days"].mean()#.reindex(
#    ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
#)

avg_delay_by_day.plot(kind="bar", figsize=(8,5), color="orange", edgecolor="black")
plt.title("Average Arrival Delay by Order Weekday")
plt.ylabel("Average Delay (days)")
plt.show()


In [None]:
avg_delay = merged_metric_2.groupby(["order_weekday","ship mode"])["date_diff_days"].mean().round(2)
avg_delay

In [None]:
avg_diff_by_mode_2 = merged_metric_2.groupby("ship mode")["date_diff_days"].mean().reset_index().round(2)
avg_diff_by_mode_2

In [None]:
# 4 Average Arrival delay by Ship mode

avg_delay = merged_metric_2.groupby(["order_weekday","ship mode"])["date_diff_days"].mean().unstack()

# ensure weekday order
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
avg_delay = avg_delay.reindex(weekday_order)

# plot
avg_delay.plot(kind="bar", figsize=(12,6))
plt.title("Average Arrival Time by Weekday and Ship Mode")
plt.ylabel("Average Arrival Time (days)")
plt.xlabel("Order Weekday")
plt.xticks(rotation=45)
plt.legend(title="Ship Mode")
plt.show()


In [None]:
# actual average delay for Standard
actual_std = (
    merged_metric_2[merged_metric_2["ship mode"]=="Standard"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
)

# expected values
expected_data = {
    "order_weekday": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    "expected_delay": [5,7,6,7,8,7,6]
}
expected_df = pd.DataFrame(expected_data).set_index("order_weekday")

# align both (reindex to weekday order)
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
actual_std = actual_std.reindex(weekday_order)
expected_df = expected_df.reindex(weekday_order)

# combine
comparison = pd.DataFrame({
    "Actual (Standard)": actual_std,
    "Expected (Standard)": expected_df["expected_delay"]
})

# plot
comparison.plot(kind="bar", figsize=(12,6))
plt.title("Standard Shipping Duration Time: Actual vs Expected")
plt.ylabel("Average Duration Time (days)")
plt.xlabel("Order Weekday")
plt.xticks(rotation=45)
plt.legend(title="Series")
plt.show()


In [None]:
# actual average delay for Express
actual_exp = (
    merged_metric_2[merged_metric_2["ship mode"]=="Express"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
)

# expected values
expected_data = {
    "order_weekday": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    "expected_delay": [5,4,6,5,6,7,6]
}
expected_df = pd.DataFrame(expected_data).set_index("order_weekday")

# align both (reindex to weekday order)
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
actual_exp = actual_exp.reindex(weekday_order)
expected_df = expected_df.reindex(weekday_order)

# combine into comparison table
comparison = pd.DataFrame({
    "Actual (Express)": actual_exp,
    "Expected (Express)": expected_df["expected_delay"]
})

# plot
comparison.plot(kind="bar", figsize=(12,6))
plt.title("Express Shipping Duration Time: Actual vs Expected")
plt.ylabel("Average Time (days)")
plt.xlabel("Order Weekday")
plt.xticks(rotation=45)
plt.legend(title="Series")
plt.show()


In [None]:

# --- Express actual ---
actual_exp = (
    merged_metric_2[merged_metric_2["ship mode"]=="Express"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
)

# --- Express expected ---
expected_exp = {
    "order_weekday": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    "Expected (Express)": [5,4,6,5,6,7,6]
}
expected_exp_df = pd.DataFrame(expected_exp).set_index("order_weekday")

# --- Standard actual ---
actual_std = (
    merged_metric_2[merged_metric_2["ship mode"]=="Standard"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
)

# --- Standard expected ---
expected_std = {
    "order_weekday": ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"],
    "Expected (Standard)": [5,7,6,7,8,7,6]
}
expected_std_df = pd.DataFrame(expected_std).set_index("order_weekday")

# --- Reindex all to same weekday order ---
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
actual_exp = actual_exp.reindex(weekday_order)
actual_std = actual_std.reindex(weekday_order)
expected_exp_df = expected_exp_df.reindex(weekday_order)
expected_std_df = expected_std_df.reindex(weekday_order)

# --- Combine everything into one DataFrame ---
comparison = pd.DataFrame({
    "Actual (Express)": actual_exp,
    "Expected (Express)": expected_exp_df["Expected (Express)"],
    "Actual (Standard)": actual_std,
    "Expected (Standard)": expected_std_df["Expected (Standard)"]
})

# --- Plot ---
ax = comparison.plot(kind="bar", figsize=(16,10))
plt.title("Shipping Duration Time (Ordered - Arrival): Actual vs Expected (Express & Standard)")
plt.ylabel("Average Duration Time (days)")
plt.xlabel("Order Weekday")
plt.xticks(rotation=0)
plt.legend(title="Series")

# Add values on top of bars
for container in ax.containers:
    ax.bar_label(container, fmt="%.1f", label_type="edge", padding=2)

plt.show()


In [None]:
# weekday order
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
business_days = ["Monday","Tuesday","Wednesday","Thursday","Friday"]
weekend_days = ["Saturday","Sunday"]

# --- Express ---
# actual
actual_exp = (
    merged_metric_2[merged_metric_2["ship mode"]=="Express"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
    .reindex(weekday_order)
)

# expected
expected_exp_data = {
    "order_weekday": weekday_order,
    "expected_delay": [5,4,6,5,6,7,6]
}
expected_exp = pd.DataFrame(expected_exp_data).set_index("order_weekday").reindex(weekday_order)

# combine Express
comparison_exp = pd.DataFrame({
    "Actual (Express)": actual_exp,
    "Expected (Express)": expected_exp["expected_delay"]
})

# --- Standard ---
# actual
actual_std = (
    merged_metric_2[merged_metric_2["ship mode"]=="Standard"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
    .reindex(weekday_order)
)

# expected
expected_std_data = {
    "order_weekday": weekday_order,
    "expected_delay": [5,7,6,7,8,7,6]
}
expected_std = pd.DataFrame(expected_std_data).set_index("order_weekday").reindex(weekday_order)

# combine Standard
comparison_std = pd.DataFrame({
    "Actual (Standard)": actual_std,
    "Expected (Standard)": expected_std["expected_delay"]
})

# --- Split by business days & weekend ---
comparison_exp_business = comparison_exp.loc[business_days]
comparison_exp_weekend = comparison_exp.loc[weekend_days]

comparison_std_business = comparison_std.loc[business_days]
comparison_std_weekend = comparison_std.loc[weekend_days]

# --- Plot business days ---
fig, axes = plt.subplots(1, 2, figsize=(14,6), sharey=True)

comparison_exp_business.plot(kind="bar", ax=axes[0])
axes[0].set_title("Express - Business Days")
axes[0].set_ylabel("Average Duration Time (days)")
axes[0].set_xlabel("Order Weekday")
axes[0].set_xticklabels(comparison_exp_business.index, rotation=45)

comparison_std_business.plot(kind="bar", ax=axes[1])
axes[1].set_title("Standard - Business Days")
axes[1].set_xlabel("Order Weekday")
axes[1].set_xticklabels(comparison_std_business.index, rotation=45)

plt.suptitle("Shipping Duration: Business Days")
plt.tight_layout()
plt.show()

# --- Plot weekends ---
fig, axes = plt.subplots(1, 2, figsize=(10,6), sharey=True)

comparison_exp_weekend.plot(kind="bar", ax=axes[0])
axes[0].set_title("Express - Weekend")
axes[0].set_ylabel("Average Duration Time (days)")
axes[0].set_xlabel("Order Weekday")
axes[0].set_xticklabels(comparison_exp_weekend.index, rotation=0)

comparison_std_weekend.plot(kind="bar", ax=axes[1])
axes[1].set_title("Standard - Weekend")
axes[1].set_xlabel("Order Weekday")
axes[1].set_xticklabels(comparison_std_weekend.index, rotation=0)

plt.suptitle("Shipping Duration: Weekend")
plt.tight_layout()
plt.show()


In [None]:
# weekday order + categories
weekday_order = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]
business_days = ["Monday","Tuesday","Wednesday","Thursday","Friday"]
weekend_days = ["Saturday","Sunday"]

# --- Express ---
actual_exp = (
    merged_metric_2[merged_metric_2["ship mode"]=="Express"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
    .reindex(weekday_order)
)

expected_exp = pd.Series([5,4,6,5,6,7,6], index=weekday_order, name="Expected (Express)")

# --- Standard ---
actual_std = (
    merged_metric_2[merged_metric_2["ship mode"]=="Standard"]
    .groupby("order_weekday")["date_diff_days"]
    .mean()
    .reindex(weekday_order)
)

expected_std = pd.Series([5,7,6,7,8,7,6], index=weekday_order, name="Expected (Standard)")

# --- Combine into one long-format DataFrame ---
df_long = pd.DataFrame({
    "Actual (Express)": actual_exp,
    "Expected (Express)": expected_exp,
    "Actual (Standard)": actual_std,
    "Expected (Standard)": expected_std
}).reset_index().melt(id_vars="index", var_name="Type", value_name="Delay")

df_long.rename(columns={"index": "OrderWeekday"}, inplace=True)

# Add business/weekend label
df_long["DayType"] = df_long["OrderWeekday"].apply(lambda x: "Business" if x in business_days else "Weekend")

# --- Seaborn plot ---
plt.figure(figsize=(14,6))
sns.barplot(
    data=df_long,
    x="OrderWeekday", y="Delay", hue="Type",
    order=weekday_order
)

plt.title("Shipping Duration: Actual vs Expected (Express & Standard)\nBusiness Days vs Weekend Highlight")
plt.ylabel("Average Duration Time (days)")
plt.xlabel("Order Weekday")

# Highlight weekends with background shading
for i, day in enumerate(weekday_order):
    if day in weekend_days:
        plt.axvspan(i-0.5, i+0.5, color="lightgrey", alpha=0.3)

plt.legend(title="Series")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# --- Seaborn plot ---
plt.figure(figsize=(14,6))
ax = sns.barplot(
    data=df_long,
    x="OrderWeekday", y="Delay", hue="Type",
    order=weekday_order
)

plt.title("Shipping Duration: Actual vs Expected (Express & Standard)\nBusiness Days vs Weekend Highlight")
plt.ylabel("Average Duration Time (days)")
plt.xlabel("Order Weekday")

# Highlight weekends with background shading
for i, day in enumerate(weekday_order):
    if day in weekend_days:
        plt.axvspan(i-0.5, i+0.5, color="lightgrey", alpha=0.3)

# --- Add value labels ---
for p in ax.patches:
    height = p.get_height()
    ax.annotate(
        f"{height:.1f}",                       # 1 decimal place
        (p.get_x() + p.get_width() / 2., height),
        ha='center', va='bottom',
        fontsize=9, color="black", xytext=(0, 3), textcoords='offset points'
    )

plt.legend(title="Series")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


### o_processed_2_delivered

#### Data

In [None]:
df_processed_x_delivered = df_campaign_cleaned.merge(df_interndata_cleaned, on="order id",how="inner")
df_processed_x_delivered

#### Plots

In [None]:
#not enough data