In [1]:
# Load required libraries
library(tidyverse)
library(janitor)
library(dplyr)
library(ggplot2)
library(skimr)
library(purrr)
library(lubridate)

# Source helper scripts
source("../../R/apply_factors.R")
source("../../R/analysis_helpers.R")
source("../../R/temporal_helpers.R")

# Load data
tables <- list(
  Orders  = readr::read_csv("../../data/processed/Orders.csv"),
  Returns = readr::read_csv("../../data/processed/Returns.csv"),
  People  = readr::read_csv("../../data/processed/People.csv")
)

# Apply factor transformations
tables <- apply_factors(tables)

# Extract tables
orders  <- tables$Orders
returns <- tables$Returns
people  <- tables$People

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.6
[32m✔[39m [34mforcats  [39m 1.0.1     [32m✔[39m [34mstringr  [39m 1.6.0
[32m✔[39m [34mggplot2  [39m 4.0.1     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.2.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test


[1mRows: [22m[34m51290[39m [1mColumns: [22m[

# Shipping & Logistics

### Is there a clear correlation between shipping cost and time?

In [2]:
orders_with_shipping_time <- orders |>  
    mutate(    
        shipping_time_days = as.numeric(ship_date - order_date)
    )

shipping_cost_time_corr <- orders_with_shipping_time |>  
    summarise(    
        correlation = cor(shipping_cost, shipping_time_days, use = "complete.obs"),
        avg_shipping_cost = mean(shipping_cost),   
        avg_shipping_time = mean(shipping_time_days)  
)

shipping_cost_time_corr

correlation,avg_shipping_cost,avg_shipping_time
<dbl>,<dbl>,<dbl>
-0.1428227,26.37582,3.96937


### How do shipping costs vary across regions and shipping modes?

In [3]:
shipping_cost_by_region_mode <- orders |>  
    group_by(region, ship_mode) |>  
    summarise(
        avg_shipping_cost = mean(shipping_cost),    
        median_shipping_cost = median(shipping_cost),    
        n_orders = n(),    
        .groups = "drop"  
    )
    
shipping_cost_by_region_mode

region,ship_mode,avg_shipping_cost,median_shipping_cost,n_orders
<fct>,<fct>,<dbl>,<dbl>,<int>
Africa,First Class,29.15493,7.82,679
Africa,Standard Class,13.73457,4.03,2659
Africa,Second Class,23.21031,6.12,992
Africa,Same Day,34.23514,7.94,257
Canada,First Class,27.63486,8.25,70
Canada,Standard Class,11.08954,5.195,194
Canada,Second Class,29.54165,8.33,91
Canada,Same Day,21.7769,7.78,29
Caribbean,First Class,32.35589,14.215,231
Caribbean,Standard Class,15.4369,5.802,1002


### Do shipping costs significantly reduce profit for certain categories or regions?

In [4]:
shipping_profit_impact <- orders |>  
    group_by(category, region) |>  
    summarise(
        total_sales = sum(sales),
        total_profit = sum(profit),
        total_shipping_cost = sum(shipping_cost),
        shipping_cost_share = total_shipping_cost / total_sales,
        profit_margin = total_profit / total_sales,
        .groups = "drop"
    ) |>  
    arrange(desc(shipping_cost_share))

shipping_profit_impact

category,region,total_sales,total_profit,total_shipping_cost,shipping_cost_share,profit_margin
<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Office Supplies,Africa,266755.53,28480.053,31095.94,0.11657093,0.10676462
Technology,Oceania,408002.98,54734.022,47207.84,0.11570464,0.13415103
Office Supplies,Caribbean,89575.42,14818.276,10323.46,0.11524876,0.16542793
Furniture,West,252612.74,11504.95,28716.27,0.11367704,0.04554382
Technology,Africa,322367.04,44129.493,36634.49,0.11364217,0.13689207
Technology,North Asia,314038.55,72471.015,35509.12,0.11307249,0.23077108
Office Supplies,East,205516.05,41014.579,23230.42,0.11303458,0.19956873
Office Supplies,North Asia,198554.78,40926.105,22420.98,0.11292088,0.20611997
Office Supplies,Canada,30034.08,7957.53,3388.53,0.11282283,0.26495002
Furniture,Canada,10595.28,2613.24,1194.75,0.11276248,0.2466419


# Fulfillment Timing

### What is the distribution of shipping delays?

In [5]:
shipping_delay_summary <- orders_with_shipping_time |>  
    summarise(
        mean_delay = mean(shipping_time_days),
        median_delay = median(shipping_time_days),
        sd_delay = sd(shipping_time_days)
    )
    
shipping_delay_summary

mean_delay,median_delay,sd_delay
<dbl>,<dbl>,<dbl>
3.96937,4,1.729437


### Are longer shipping delays associated with lower profit or higher return rates?

In [6]:
delay_profit_relationship <- orders_with_shipping_time |>  
    summarise(
        correlation_delay_profit = cor(shipping_time_days, profit,
        use = "complete.obs")
    )
    
delay_profit_relationship

correlation_delay_profit
<dbl>
0.001543151


In [7]:
orders_with_returns <- orders |>  
    left_join(returns, by = "order_id", relationship = "many-to-many") |> 
    mutate(
        returned_flag = ifelse(is.na(returned), 0, 1),
        shipping_time_days = as.numeric(ship_date - order_date)  
    )

delay_buckets <- orders_with_shipping_time |>  mutate(
    delay_bucket = cut(
        shipping_time_days,
        breaks = c(0, 2, 5, 10, Inf),
        labels = c("≤2 days", "2–5 days", "5–10 days", "10+ days")    
        )  
    ) |>  group_by(delay_bucket) |>  
    summarise(
        avg_profit = mean(profit),
        profit_margin = sum(profit) / sum(sales),
        n_orders = n(),
        .groups = "drop"  )

delay_buckets

delay_bucket,avg_profit,profit_margin,n_orders
<fct>,<dbl>,<dbl>,<int>
≤2 days,28.81055,0.113105,8688
2–5 days,28.52172,0.1168212,30690
5–10 days,28.98901,0.1179355,9312
,28.25062,0.113418,2600


In [8]:
delay_return_buckets <- orders_with_returns |>  
    mutate(    
        delay_bucket = cut(      
            shipping_time_days,      
            breaks = c(0, 2, 5, 10, Inf),      
            labels = c("≤2 days", "2–5 days", "5–10 days", "10+ days")    
        )  
    ) |>  
    group_by(delay_bucket) |>  
    summarise(    
        return_rate = mean(returned_flag),  
        n_orders = n(),    
        .groups = "drop"  
    )
    
delay_return_buckets

delay_bucket,return_rate,n_orders
<fct>,<dbl>,<int>
≤2 days,0.06169429,8688
2–5 days,0.05942336,30695
5–10 days,0.05627148,9312
,0.06576923,2600
