In [None]:
# Load required libraries
library(tidyverse)
library(janitor)
library(dplyr)
library(ggplot2)
library(skimr)
library(purrr)
library(lubridate)

# Source helper scripts
source("../../R/apply_factors.R")
source("../../R/analysis_helpers.R")
source("../../R/temporal_helpers.R")

# Load data
tables <- list(
  Orders  = readr::read_csv("../../data/processed/Orders.csv"),
  Returns = readr::read_csv("../../data/processed/Returns.csv"),
  People  = readr::read_csv("../../data/processed/People.csv")
)

# Apply factor transformations
tables <- apply_factors(tables)

# Extract tables
orders  <- tables$Orders
returns <- tables$Returns
people  <- tables$People

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.6
[32m✔[39m [34mforcats  [39m 1.0.1     [32m✔[39m [34mstringr  [39m 1.6.0
[32m✔[39m [34mggplot2  [39m 4.0.1     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.2.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test


[1mRows: [22m[34m51290[39m [1mColumns: [22m[

# General Return Overview

### Q1. What proportion of orders are returned overall?

In [3]:
total_orders <- orders |>  distinct(order_id) |>  nrow()returned_orders <- returns |>  distinct(order_id) |>  nrow()overall_return_rate <- returned_orders / total_ordersoverall_return_rate


### Q2. How do return rates differ across markets and regions?

In [4]:
orders_returns <- orders |>  select(order_id, market, region) |>  distinct() |>  left_join(    returns |> distinct(order_id) |> mutate(returned = 1),    by = "order_id"  ) |>  mutate(returned = ifelse(is.na(returned), 0, returned))return_rate_by_region <- orders_returns |>  group_by(market, region) |>  summarise(    orders = n(),    return_rate = mean(returned),    .groups = "drop"  ) |>  arrange(desc(return_rate))return_rate_by_region


market,region,orders,return_rate
<fct>,<fct>,<int>,<dbl>
APAC,North Asia,1150,0.13826087
LATAM,North,1329,0.13167795
US,West,1611,0.11731844
EU,Central,2576,0.0632764
EU,South,995,0.06130653
EU,North,1027,0.0593963
APAC,Southeast Asia,1517,0.03625577
LATAM,South,1456,0.03365385
APAC,Oceania,1744,0.03325688
LATAM,Central,1504,0.03324468


# Product & Category Signals

### Q3. Which product categories and sub-categories have the highest/lowest return rates?

In [5]:
return_rate_by_category <- orders |>  select(order_id, category, sub_category) |>  distinct() |>  left_join(    returns |> distinct(order_id) |> mutate(returned = 1),    by = "order_id"  ) |>  mutate(returned = ifelse(is.na(returned), 0, returned)) |>  group_by(category, sub_category) |>  summarise(    orders = n(),    return_rate = mean(returned),    .groups = "drop"  ) |>  arrange(desc(return_rate))return_rate_by_category


category,sub_category,orders,return_rate
<fct>,<fct>,<int>,<dbl>
Furniture,Tables,836,0.07535885
Office Supplies,Fasteners,2304,0.07204861
Office Supplies,Appliances,1686,0.0717675
Technology,Accessories,2889,0.06957425
Office Supplies,Paper,3234,0.06555349
Furniture,Furnishings,2965,0.06441821
Furniture,Chairs,3187,0.06087229
Office Supplies,Envelopes,2310,0.06017316
Technology,Copiers,2120,0.05943396
Office Supplies,Supplies,2281,0.05918457


### Q4. Are high-return products also high-sales products?

In [6]:
product_returns_sales <- orders |>  group_by(product_id, product_name) |>  summarise(    total_sales = sum(sales),    orders = n_distinct(order_id),    .groups = "drop"  ) |>  left_join(    orders |>      select(order_id, product_id) |>      distinct() |>      left_join(        returns |> distinct(order_id) |> mutate(returned = 1),        by = "order_id"      ) |>      mutate(returned = ifelse(is.na(returned), 0, returned)) |>      group_by(product_id) |>      summarise(return_rate = mean(returned), .groups = "drop"),    by = "product_id"  )product_returns_sales |> arrange(desc(return_rate))


product_id,product_name,total_sales,orders,return_rate
<chr>,<chr>,<dbl>,<int>,<dbl>
FUR-BO-10000214,"Ikea Library with Doors, Pine",291.7920,1,1
FUR-BO-10002206,"Bush Saratoga Collection 5-Shelf Bookcase, Hanover Cherry, *Special Order",119.8330,1,1
FUR-CH-10000042,"Harbour Creations Swivel Stool, Black",355.3800,1,1
FUR-FU-10002614,"Tenex Door Stop, Black",131.6700,1,1
FUR-TA-10000022,"Hon Conference Table, Adjustable Height",3694.6800,1,1
FUR-TA-10000519,"Bevis Computer Table, Fully Assembled",692.3280,1,1
FUR-TA-10000591,"Chromcraft Training Table, with Bottom Storage",993.6150,1,1
FUR-TA-10000670,"Chromcraft Coffee Table, Adjustable Height",538.5600,1,1
FUR-TA-10000945,"Bevis Coffee Table, Adjustable Height",640.7400,1,1
FUR-TA-10001327,"Barricks Training Table, Rectangular",334.8222,1,1


# Segment & Behavioral Patterns

### Q5. Do return rates differ across customer segments?

In [7]:
return_rate_by_segment <- orders_returns |>  left_join(    orders |> select(order_id, segment) |> distinct(),    by = "order_id",    relationship = "many-to-many"  ) |>  group_by(segment) |>  summarise(    orders = n(),    return_rate = mean(returned),    .groups = "drop"  ) |>  arrange(desc(return_rate))return_rate_by_segment


segment,orders,return_rate
<fct>,<int>,<dbl>
Corporate,7697,0.04936988
Consumer,13141,0.0464196
Home Office,4706,0.04377391


# Diagnostic Questions

### Q6. Are returns concentrated in a small subset of products or regions?

In [8]:
return_concentration <- orders_returns |>  group_by(region) |>  summarise(    total_returns = sum(returned),    .groups = "drop"  ) |>  arrange(desc(total_returns)) |>  mutate(    cumulative_returns = cumsum(total_returns),    cumulative_share = cumulative_returns / sum(total_returns)  )return_concentration


region,total_returns,cumulative_returns,cumulative_share
<fct>,<dbl>,<dbl>,<dbl>
Central,252,252,0.2141037
North,236,488,0.4146134
West,189,677,0.5751912
North Asia,159,836,0.7102804
South,134,970,0.8241291
Oceania,58,1028,0.873407
Southeast Asia,55,1083,0.9201359
East,45,1128,0.9583687
Caribbean,25,1153,0.9796092
Central Asia,24,1177,1.0


### Q7. Are high return rates associated with high order volume or low order value?

In [None]:
returns_volume_value <- orders_returns |>  left_join(    orders |>      group_by(order_id) |>      summarise(        order_sales = sum(sales),        .groups = "drop"      ),    by = "order_id"  )cor(returns_volume_value$returned, returns_volume_value$order_sales, use = "complete.obs")


# Discount Behavior

### Q8. How do discounts affect profit and order value?

In [9]:
discount_profit_summary <- orders |>  summarise(    avg_discount = mean(discount),    avg_sales = mean(sales),    avg_profit = mean(profit),    correlation_discount_profit = cor(discount, profit, use = "complete.obs"),    correlation_discount_sales = cor(discount, sales, use = "complete.obs")  )discount_profit_summary


avg_discount,avg_sales,avg_profit,correlation_discount_profit,correlation_discount_sales
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.1429075,246.4906,28.64174,-0.316375,-0.08672187


In [10]:
discount_buckets <- orders |>  mutate(    discount_bucket = cut(      discount,      breaks = c(0, 0.1, 0.25, 0.5, 1),      labels = c("0–10%", "10–25%", "25–50%", "50%+"),      include.lowest = TRUE    )  ) |>  group_by(discount_bucket) |>  summarise(    avg_sales = mean(sales),    avg_profit = mean(profit),    profit_margin = sum(profit) / sum(sales),    n_orders = n(),    .groups = "drop"  )discount_buckets


discount_bucket,avg_sales,avg_profit,profit_margin,n_orders
<fct>,<dbl>,<dbl>,<dbl>,<int>
0–10%,265.8225,62.64729,0.23567337,33688
10–25%,285.818,26.63291,0.09318138,6513
25–50%,210.2523,-58.16188,-0.276629,6917
50%+,89.0759,-98.89301,-1.11021065,4172


### Q9. Are discounts applied differently across segments or regions?

In [11]:
discount_by_segment <- orders |>  group_by(segment) |>  summarise(    avg_discount = mean(discount),    share_discounted_orders = mean(discount > 0),    .groups = "drop"  )discount_by_segment


segment,avg_discount,share_discounted_orders
<fct>,<dbl>,<dbl>
Consumer,0.1436022,0.437401
Corporate,0.1429311,0.4359971
Home Office,0.1408971,0.4233116


In [12]:
discount_by_region <- orders |>  group_by(region) |>  summarise(    avg_discount = mean(discount),    share_discounted_orders = mean(discount > 0),    .groups = "drop"  )discount_by_region


region,avg_discount,share_discounted_orders
<fct>,<dbl>,<dbl>
Africa,0.15670373,0.2302158
Canada,0.0,0.0
Caribbean,0.13575148,0.5426036
Central,0.13885131,0.4643339
Central Asia,0.06748047,0.1484375
East,0.14536517,0.4912219
EMEA,0.19608272,0.3199443
North,0.09605643,0.3366771
North Asia,0.04871685,0.1107784
Oceania,0.15316891,0.8557499


### Q10. Which times/products have the most discounts?

In [13]:
discount_by_product <- orders |>  group_by(product_name) |>  summarise(    avg_discount = mean(discount, na.rm = TRUE),    max_discount = max(discount, na.rm = TRUE),    share_discounted_orders = mean(discount > 0),    n_orders = n(),    .groups = "drop"  ) |>  arrange(desc(avg_discount))discount_by_product |> slice_head(n = 20)


product_name,avg_discount,max_discount,share_discounted_orders,n_orders
<chr>,<dbl>,<dbl>,<dbl>,<int>
"Chromcraft Training Table, Adjustable Height",0.8,0.8,1.0,1
Eureka Disposable Bags for Sanitaire Vibra Groomer I Upright Vac,0.8,0.8,1.0,1
GBC Plasticlear Binding Covers,0.7285714,0.8,1.0,7
GBC VeloBinder Electric Binding Machine,0.725,0.8,1.0,4
"Brother MFC-9340CDW LED All-In-One Printer, Copier Scanner",0.7,0.7,1.0,1
"Bush Westfield Collection Bookcases, Dark Cherry Finish, Fully Assembled",0.7,0.7,1.0,1
Cisco 8961 IP Phone Charcoal,0.7,0.7,1.0,1
Epson Perfection V600 Photo Scanner,0.7,0.7,1.0,1
Hewlett-Packard Deskjet F4180 All-in-One Color Ink-jet - Printer / copier / scanner,0.7,0.7,1.0,1
Lexmark MarkNet N8150 Wireless Print Server,0.7,0.7,1.0,2


In [14]:
discount_over_time <- orders |>  add_time_period("order_date", period = "month") |>  group_by(period) |>  summarise(    avg_discount = mean(discount),    share_discounted_orders = mean(discount > 0),    .groups = "drop"  ) |>  arrange(desc(avg_discount))discount_over_time


period,avg_discount,share_discounted_orders
<date>,<dbl>,<dbl>
2011-06-01,0.1739738,0.4961832
2011-07-01,0.1617899,0.5010101
2011-01-01,0.1587945,0.408776
2012-07-01,0.1584279,0.4962064
2011-11-01,0.1570114,0.451273
2014-04-01,0.1560419,0.4643197
2014-12-01,0.1556461,0.4509986
2012-03-01,0.1535982,0.4501511
2014-07-01,0.150425,0.424103
2013-03-01,0.1493264,0.4451697
