In [1]:
# load required libraries
library(tidyverse)
library(janitor)
library(dplyr)
library(ggplot2)
library(skimr)
library(purrr)
library(lubridate)

source("../../R/apply_factors.R")
source("../../R/analysis_helpers.R")
source("../../R/temporal_helpers.R")
tables <- list(
  Orders   = readr::read_csv("../../data/processed/Orders.csv"),
  Returns  = readr::read_csv("../../data/processed/Returns.csv"),
  People   = readr::read_csv("../../data/processed/People.csv")
)
tables <- apply_factors(tables)
orders <- tables$Orders 
returns <- tables$Returns
people <- tables$People

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.6
[32m✔[39m [34mforcats  [39m 1.0.1     [32m✔[39m [34mstringr  [39m 1.6.0
[32m✔[39m [34mggplot2  [39m 4.0.1     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.2.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test


[1mRows: [22m[34m51290[39m [1mColumns: [22m[

# Overall Sales Evolution

### Q1. How do total sales and order volume evolve over time?

In [2]:
sales_over_time <- orders %>%
    mutate(order_month = floor_date(order_date, "month")) %>%
    group_by(order_month) %>%
    summarise(
        total_sales = sum(sales),
        n_orders = n(),
        .groups = "drop"
    ) %>%
    arrange(order_month)

sales_over_time

order_month,total_sales,n_orders
<date>,<dbl>,<int>
2011-01-01,98898.49,433
2011-02-01,91152.16,378
2011-03-01,145729.37,539
2011-04-01,116915.76,564
2011-05-01,146747.84,566
2011-06-01,215207.38,917
2011-07-01,115510.42,495
2011-08-01,207581.49,878
2011-09-01,290214.46,1052
2011-10-01,199071.26,774


### Q2. Are there clear seasonal or cyclical patterns in sales activity?

In [3]:
seasonality <- orders %>%
    mutate(
      year = year(order_date),
      month = month(order_date, label = TRUE)
    ) %>%
    group_by(month) %>%
    summarise(
      avg_monthly_sales = mean(sales),
      avg_order_count = n() / n_distinct(year),
      .groups = "drop"
    ) %>%
    arrange(month)

seasonality

month,avg_monthly_sales,avg_order_count
<ord>,<dbl>,<dbl>
Jan,259.7667,649.75
Feb,250.8023,542.0
Mar,253.8718,758.75
Apr,228.512,764.25
May,241.263,936.75
Jun,238.1761,1332.75
Jul,236.6967,791.5
Aug,254.7919,1269.5
Sep,240.5657,1493.75
Oct,260.1747,1122.5


# Trend Decomposition

### Q3. Do sales trends differ across regions or markets over time?

In [4]:
regional_trends <- orders %>%
    mutate(order_month = floor_date(order_date, "month")) %>%
    group_by(region, order_month) %>%
    summarise(
        total_sales = sum(sales),
        .groups = "drop"
    )

regional_trends

region,order_month,total_sales
<fct>,<date>,<dbl>
Africa,2011-01-01,11740.899
Africa,2011-02-01,8916.597
Africa,2011-03-01,3894.444
Africa,2011-04-01,10482.429
Africa,2011-05-01,6709.398
Africa,2011-06-01,7760.352
Africa,2011-07-01,8115.213
Africa,2011-08-01,16522.782
Africa,2011-09-01,14783.853
Africa,2011-10-01,7642.803


In [5]:
regional_trends_norm <- regional_trends %>%
    group_by(region) %>%
    mutate(
        sales_index = total_sales / mean(total_sales)
    ) %>%
    ungroup()

regional_trends_norm


region,order_month,total_sales,sales_index
<fct>,<date>,<dbl>,<dbl>
Africa,2011-01-01,11740.899,0.7190385
Africa,2011-02-01,8916.597,0.5460721
Africa,2011-03-01,3894.444,0.2385043
Africa,2011-04-01,10482.429,0.6419671
Africa,2011-05-01,6709.398,0.4108983
Africa,2011-06-01,7760.352,0.4752611
Africa,2011-07-01,8115.213,0.4969935
Africa,2011-08-01,16522.782,1.0118916
Africa,2011-09-01,14783.853,0.9053958
Africa,2011-10-01,7642.803,0.4680621


### Q4. Do customer segments exhibit different temporal purchasing patterns?

In [6]:
segment_trends <- orders %>%
    mutate(order_month = floor_date(order_date, "month")) %>%
    group_by(segment, order_month) %>%
    summarise(
        total_sales = sum(sales),
        n_orders = n(),
        .groups = "drop"
    )

segment_trends_norm <- segment_trends %>%
    group_by(segment) %>%
    mutate(
        sales_index = total_sales / mean(total_sales)
    ) %>%
    ungroup()

segment_trends_norm

segment,order_month,total_sales,n_orders,sales_index
<fct>,<date>,<dbl>,<int>,<dbl>
Consumer,2011-01-01,56687.37,252,0.4181031
Consumer,2011-02-01,35260.31,174,0.2600658
Consumer,2011-03-01,70156.34,313,0.5174447
Consumer,2011-04-01,52974.71,255,0.3907200
Consumer,2011-05-01,78396.64,318,0.5782219
Consumer,2011-06-01,111023.49,489,0.8188643
Consumer,2011-07-01,63109.99,263,0.4654738
Consumer,2011-08-01,112959.50,475,0.8331436
Consumer,2011-09-01,169430.00,540,1.2496471
Consumer,2011-10-01,101361.89,383,0.7476042


# Product & Category Dynamics

### Q5. Are certain product categories gaining or losing importance over time?

In [7]:
category_trends <- orders %>%
    mutate(order_month = floor_date(order_date, "month")) %>%
    group_by(order_month, category) %>%
    summarise(
        total_sales = sum(sales),
        .groups = "drop"
    ) %>%
    group_by(order_month) %>%
    mutate(
        category_share = total_sales / sum(total_sales)
    ) %>%
    ungroup()

category_trends

order_month,category,total_sales,category_share
<date>,<fct>,<dbl>,<dbl>
2011-01-01,Furniture,34463.75,0.3484760
2011-01-01,Office Supplies,33526.73,0.3390014
2011-01-01,Technology,30908.01,0.3125226
2011-02-01,Furniture,30641.20,0.3361545
2011-02-01,Office Supplies,22277.00,0.2443935
2011-02-01,Technology,38233.95,0.4194520
2011-03-01,Furniture,44780.14,0.3072829
2011-03-01,Office Supplies,34395.13,0.2360206
2011-03-01,Technology,66554.09,0.4566965
2011-04-01,Furniture,31344.87,0.2680979


# Stability & Volatility

### Q6. Which regions or segments show the highest volatility in sales over time?

In [8]:
sales_volatility <- orders %>%
  mutate(order_month = floor_date(order_date, "month")) %>%
    group_by(region, order_month) %>%
    summarise(
        total_sales = sum(sales),
        .groups = "drop"
    ) %>%
    group_by(region) %>%
    summarise(
        mean_sales = mean(total_sales),
        sd_sales = sd(total_sales),
        cv_sales = sd_sales / mean_sales,
        .groups = "drop"
    ) %>%
    arrange(desc(cv_sales))

sales_volatility

region,mean_sales,sd_sales,cv_sales
<fct>,<dbl>,<dbl>,<dbl>
Canada,1424.004,1480.318,1.0395464
East,14141.276,10525.704,0.7443249
Caribbean,6755.851,4462.477,0.6605351
Central Asia,15683.887,9196.429,0.5863616
EMEA,16795.027,9316.224,0.5547013
West,15113.705,8318.965,0.5504252
Oceania,22920.513,12094.64,0.5276775
Southeast Asia,18425.483,9436.463,0.512142
Africa,16328.609,8311.769,0.5090311
South,33352.23,16165.186,0.4846808


# Return-Aware Trends

### Q7. Do return rates change over time or show seasonal spikes?

In [9]:
returns_over_time <- orders %>%
    left_join(returns, by = "order_id", relationship = "many-to-many") %>%
    mutate(
        order_month = floor_date(order_date, "month"),
        returned = if_else(is.na(returned), 0, 1)
    ) %>%
    group_by(order_month) %>%
    summarise(
        n_orders = n(),
        n_returns = sum(returned),
        return_rate = n_returns / n_orders,
        .groups = "drop"
    ) %>%
    arrange(order_month)

returns_over_time

order_month,n_orders,n_returns,return_rate
<date>,<int>,<dbl>,<dbl>
2011-01-01,433,16,0.0369515
2011-02-01,378,21,0.05555556
2011-03-01,539,13,0.02411874
2011-04-01,564,43,0.07624113
2011-05-01,566,27,0.04770318
2011-06-01,917,77,0.08396947
2011-07-01,495,51,0.1030303
2011-08-01,878,42,0.04783599
2011-09-01,1052,59,0.05608365
2011-10-01,774,56,0.07235142


# Shipping & Logistics

### Q8. Is there a clear correlation between shipping cost and time?

In [10]:
orders_with_shipping_time <- orders %>%
    mutate(
        shipping_time_days = as.numeric(ship_date - order_date)
    )

shipping_cost_time_corr <- orders_with_shipping_time %>%
    summarise(
        correlation = cor(shipping_cost, shipping_time_days, use = "complete.obs"),
        avg_shipping_cost = mean(shipping_cost),
        avg_shipping_time = mean(shipping_time_days)
    )

shipping_cost_time_corr

correlation,avg_shipping_cost,avg_shipping_time
<dbl>,<dbl>,<dbl>
-0.1428227,26.37582,3.96937


### Q9. How do shipping costs vary across regions and shipping modes?

In [11]:
shipping_cost_by_region_mode <- orders %>%
    group_by(region, ship_mode) %>%
    summarise(
        avg_shipping_cost = mean(shipping_cost),
        median_shipping_cost = median(shipping_cost),
        n_orders = n(),
        .groups = "drop"
    )

shipping_cost_by_region_mode

region,ship_mode,avg_shipping_cost,median_shipping_cost,n_orders
<fct>,<fct>,<dbl>,<dbl>,<int>
Africa,First Class,29.15493,7.82,679
Africa,Standard Class,13.73457,4.03,2659
Africa,Second Class,23.21031,6.12,992
Africa,Same Day,34.23514,7.94,257
Canada,First Class,27.63486,8.25,70
Canada,Standard Class,11.08954,5.195,194
Canada,Second Class,29.54165,8.33,91
Canada,Same Day,21.7769,7.78,29
Caribbean,First Class,32.35589,14.215,231
Caribbean,Standard Class,15.4369,5.802,1002


### Q10. Do shipping costs significantly reduce profit for certain categories or regions?

In [12]:
shipping_profit_impact <- orders %>%
    group_by(category, region) %>%
    summarise(
        total_sales = sum(sales),
        total_profit = sum(profit),
        total_shipping_cost = sum(shipping_cost),
        shipping_cost_share = total_shipping_cost / total_sales,
        profit_margin = total_profit / total_sales,
        .groups = "drop"
    ) %>%
    arrange(desc(shipping_cost_share))

shipping_profit_impact

category,region,total_sales,total_profit,total_shipping_cost,shipping_cost_share,profit_margin
<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Office Supplies,Africa,266755.53,28480.053,31095.94,0.11657093,0.10676462
Technology,Oceania,408002.98,54734.022,47207.84,0.11570464,0.13415103
Office Supplies,Caribbean,89575.42,14818.276,10323.46,0.11524876,0.16542793
Furniture,West,252612.74,11504.95,28716.27,0.11367704,0.04554382
Technology,Africa,322367.04,44129.493,36634.49,0.11364217,0.13689207
Technology,North Asia,314038.55,72471.015,35509.12,0.11307249,0.23077108
Office Supplies,East,205516.05,41014.579,23230.42,0.11303458,0.19956873
Office Supplies,North Asia,198554.78,40926.105,22420.98,0.11292088,0.20611997
Office Supplies,Canada,30034.08,7957.53,3388.53,0.11282283,0.26495002
Furniture,Canada,10595.28,2613.24,1194.75,0.11276248,0.2466419


# Fulfillment Timing

### Q11. What is the distribution of shipping delays?

In [13]:
shipping_delay_summary <- orders_with_shipping_time %>%
    summarise(
        mean_delay = mean(shipping_time_days),
        median_delay = median(shipping_time_days),
        sd_delay = sd(shipping_time_days)
    )

shipping_delay_summary

mean_delay,median_delay,sd_delay
<dbl>,<dbl>,<dbl>
3.96937,4,1.729437


### Q12. Are longer shipping delays associated with lower profit or higher return rates?

In [14]:
delay_profit_relationship <- orders_with_shipping_time %>%
  summarise(
    correlation_delay_profit = cor(shipping_time_days, profit, use = "complete.obs")
  )

delay_profit_relationship

correlation_delay_profit
<dbl>
0.001543151


In [15]:
orders_with_returns <- orders %>%
    left_join(returns, by = "order_id", relationship = "many-to-many") %>%
    mutate(
        returned_flag = ifelse(is.na(returned), 0, 1),
        shipping_time_days = as.numeric(ship_date - order_date)
    )

delay_buckets <- orders_with_shipping_time %>%
    mutate(
        delay_bucket = cut(
        shipping_time_days,
        breaks = c(0, 2, 5, 10, Inf),
        labels = c("≤2 days", "2–5 days", "5–10 days", "10+ days")
        )
    ) %>%
    group_by(delay_bucket) %>%
    summarise(
        avg_profit = mean(profit),
        profit_margin = sum(profit) / sum(sales),
        n_orders = n(),
        .groups = "drop"
    )

delay_buckets

delay_bucket,avg_profit,profit_margin,n_orders
<fct>,<dbl>,<dbl>,<int>
≤2 days,28.81055,0.113105,8688
2–5 days,28.52172,0.1168212,30690
5–10 days,28.98901,0.1179355,9312
,28.25062,0.113418,2600


In [16]:
delay_return_buckets <- orders_with_returns %>%
    mutate(
        delay_bucket = cut(
        shipping_time_days,
        breaks = c(0, 2, 5, 10, Inf),
        labels = c("≤2 days", "2–5 days", "5–10 days", "10+ days")
        )
    ) %>%
    group_by(delay_bucket) %>%
    summarise(
        return_rate = mean(returned_flag),
        n_orders = n(),
        .groups = "drop"
    )

delay_return_buckets

delay_bucket,return_rate,n_orders
<fct>,<dbl>,<int>
≤2 days,0.06169429,8688
2–5 days,0.05942336,30695
5–10 days,0.05627148,9312
,0.06576923,2600
