In [1]:
# load required libraries
library(tidyverse)
library(janitor)
library(dplyr)
library(ggplot2)
library(skimr)
library(purrr)
library(lubridate)

source("../../R/apply_factors.R")
source("../../R/analysis_helpers.R")
source("../../R/temporal_helpers.R")
tables <- list(
  Orders   = readr::read_csv("../../data/processed/Orders.csv"),
  Returns  = readr::read_csv("../../data/processed/Returns.csv"),
  People   = readr::read_csv("../../data/processed/People.csv")
)
tables <- apply_factors(tables)
orders <- tables$Orders 
returns <- tables$Returns
people <- tables$People

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.6
[32m✔[39m [34mforcats  [39m 1.0.1     [32m✔[39m [34mstringr  [39m 1.6.0
[32m✔[39m [34mggplot2  [39m 4.0.1     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.2.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test


[1mRows: [22m[34m51290[39m [1mColumns: [22m[

# Profitability Overview

### Q1. Which products generate the most/least profit?

In [None]:
product_profit <- orders %>%
    group_by(product_name) %>%
    summarise(
        total_profit = sum(profit),
        total_sales  = sum(sales),
        n_orders     = n(),
        .groups = "drop"
      ) %>%
    arrange(desc(total_profit))

product_profit %>% slice_head(n = 10)
product_profit %>% slice_tail(n = 10)


product_name,total_profit,total_sales,n_orders
<chr>,<dbl>,<dbl>,<int>
Canon imageCLASS 2200 Advanced Copier,25199.928,61599.82,5
"Cisco Smart Phone, Full Size",17238.521,76441.53,38
"Motorola Smart Phone, Full Size",17027.113,73156.3,38
"Hoover Stove, Red",11807.969,31663.78,15
"Sauder Classic Bookcase, Traditional",10672.073,39108.3,29
"Harbour Creations Executive Leather Armchair, Adjustable",10427.326,50121.52,39
"Nokia Smart Phone, Full Size",9938.195,71904.56,47
"Cisco Smart Phone, with Caller ID",9786.641,43127.5,27
"Nokia Smart Phone, with Caller ID",9465.326,47877.79,24
"Belkin Router, USB",8955.018,23470.41,32


product_name,total_profit,total_sales,n_orders
<chr>,<dbl>,<dbl>,<int>
"Lesro Training Table, Rectangular",-2581.283,2711.647,5
"Bevis Wood Table, with Bottom Storage",-2782.588,11134.662,10
Chromcraft Bull-Nose Wood Oval Conference Tables & Bases,-2876.116,9917.64,5
"Rogers Lockers, Blue",-2893.491,28214.589,42
"Bevis Computer Table, Fully Assembled",-3509.564,11177.896,13
"Bevis Round Table, Adjustable Height",-3649.894,5654.796,5
Cubify CubeX 3D Printer Triple Head Print,-3839.99,7999.98,1
"Motorola Smart Phone, Cordless",-4447.038,38931.042,23
Lexmark MX611dhe Monochrome Laser Printer,-4589.973,16829.901,4
Cubify CubeX 3D Printer Double Head Print,-8879.97,11099.963,3


### Q2. How does profitability compare across product categories and sub-categories?

In [None]:
category_profit <- orders %>%
    group_by(category) %>%
    summarise(
        total_profit = sum(profit),
        total_sales  = sum(sales),
        profit_margin = total_profit / total_sales,
        .groups = "drop"
    ) %>%
    arrange(desc(total_profit))

category_profit


category,total_profit,total_sales,profit_margin
<fct>,<dbl>,<dbl>,<dbl>
Technology,663778.7,4744557,0.13990319
Office Supplies,518473.8,3787070,0.13690632
Furniture,286782.3,4110874,0.06976187


### Q3. Are there products or categories with high sales but low or negative profit?

In [None]:
product_sales_profit <- orders %>%
    group_by(product_name) %>%
    summarise(
        total_sales  = sum(sales),
        total_profit = sum(profit),
        profit_margin = total_profit / total_sales,
        n_orders = n(),
        .groups = "drop"
    )

high_sales_low_profit_products <- product_sales_profit %>%
    filter(
        total_sales > quantile(total_sales, 0.75),
        profit_margin <= 0
    ) %>%
    arrange(total_profit)

high_sales_low_profit_products

product_name,total_sales,total_profit,profit_margin,n_orders
<chr>,<dbl>,<dbl>,<dbl>,<int>
Cubify CubeX 3D Printer Double Head Print,11099.963,-8879.970,-0.80000000,3
Lexmark MX611dhe Monochrome Laser Printer,16829.901,-4589.973,-0.27272727,4
"Motorola Smart Phone, Cordless",38931.042,-4447.038,-0.11422859,23
Cubify CubeX 3D Printer Triple Head Print,7999.980,-3839.990,-0.48000000,1
"Bevis Round Table, Adjustable Height",5654.796,-3649.894,-0.64545105,5
"Bevis Computer Table, Fully Assembled",11177.896,-3509.564,-0.31397355,13
"Rogers Lockers, Blue",28214.589,-2893.491,-0.10255300,42
Chromcraft Bull-Nose Wood Oval Conference Tables & Bases,9917.640,-2876.116,-0.29000000,5
"Bevis Wood Table, with Bottom Storage",11134.662,-2782.588,-0.24990323,10
"Bevis Conference Table, Fully Assembled",14886.951,-2443.089,-0.16410945,8


In [None]:
sub_category_sales_profit <- orders %>%
    group_by(sub_category) %>%
    summarise(
        total_sales  = sum(sales),
        total_profit = sum(profit),
        profit_margin = total_profit / total_sales,
        .groups = "drop"
    ) %>%
    arrange(profit_margin)

sub_category_sales_profit


sub_category,total_sales,total_profit,profit_margin
<fct>,<dbl>,<dbl>,<dbl>
Tables,757041.92,-64083.39,-0.08464972
Machines,779060.07,58867.87,0.07556269
Supplies,243074.22,22583.26,0.09290686
Chairs,1501681.76,141973.8,0.0945432
Storage,1127085.86,108461.49,0.09623179
Bookcases,1466572.24,161924.42,0.11041012
Furnishings,385578.26,46967.43,0.12181036
Phones,1706824.14,216717.01,0.12697091
Fasteners,83242.32,11525.42,0.13845631
Appliances,1011064.3,141680.59,0.14013015


# Discount Behavior

### Q4. How do discounts affect profit and order value?

In [None]:
discount_profit_summary <- orders %>%
    summarise(
        avg_discount = mean(discount),
        avg_sales = mean(sales),
        avg_profit = mean(profit),
        correlation_discount_profit = cor(discount, profit, use = "complete.obs"),
        correlation_discount_sales = cor(discount, sales, use = "complete.obs")
    )

discount_profit_summary

avg_discount,avg_sales,avg_profit,correlation_discount_profit,correlation_discount_sales
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.1429075,246.4906,28.64174,-0.316375,-0.08672187


In [None]:
discount_buckets <- orders %>%
    mutate(
        discount_bucket = cut(
            discount,
            breaks = c(0, 0.1, 0.25, 0.5, 1),
            labels = c("0–10%", "10–25%", "25–50%", "50%+"),
            include.lowest = TRUE
        )
    ) %>%
    group_by(discount_bucket) %>%
    summarise(
        avg_sales = mean(sales),
        avg_profit = mean(profit),
        profit_margin = sum(profit) / sum(sales),
        n_orders = n(),
        .groups = "drop"
    )

discount_buckets

discount_bucket,avg_sales,avg_profit,profit_margin,n_orders
<fct>,<dbl>,<dbl>,<dbl>,<int>
0–10%,265.8225,62.64729,0.23567337,33688
10–25%,285.818,26.63291,0.09318138,6513
25–50%,210.2523,-58.16188,-0.276629,6917
50%+,89.0759,-98.89301,-1.11021065,4172


### Q5. Are discounts applied differently across segments or regions?

In [None]:
discount_by_segment <- orders %>%
    group_by(segment) %>%
    summarise(
        avg_discount = mean(discount),
        share_discounted_orders = mean(discount > 0),
        .groups = "drop"
    )

discount_by_segment

segment,avg_discount,median_discount,share_discounted_orders
<fct>,<dbl>,<dbl>,<dbl>
Consumer,0.1436022,0,0.437401
Corporate,0.1429311,0,0.4359971
Home Office,0.1408971,0,0.4233116


In [14]:
discount_by_region <- orders %>%
    group_by(region) %>%
    summarise(
        avg_discount = mean(discount),
        share_discounted_orders = mean(discount > 0),
        .groups = "drop"
    )

discount_by_region

region,avg_discount,share_discounted_orders
<fct>,<dbl>,<dbl>
Africa,0.15670373,0.2302158
Canada,0.0,0.0
Caribbean,0.13575148,0.5426036
Central,0.13885131,0.4643339
Central Asia,0.06748047,0.1484375
East,0.14536517,0.4912219
EMEA,0.19608272,0.3199443
North,0.09605643,0.3366771
North Asia,0.04871685,0.1107784
Oceania,0.15316891,0.8557499


### Q6. Which times/products have the most discounts?

In [20]:
discount_by_product <- orders %>%
    group_by(product_name) %>%
    summarise(
        avg_discount = mean(discount, na.rm = TRUE),
        max_discount = max(discount, na.rm = TRUE),
        share_discounted_orders = mean(discount > 0),
        n_orders = n(),
        .groups = "drop"
    ) %>%
    arrange(desc(avg_discount))

discount_by_product %>% slice_head(n = 20)

product_name,avg_discount,max_discount,share_discounted_orders,n_orders
<chr>,<dbl>,<dbl>,<dbl>,<int>
"Chromcraft Training Table, Adjustable Height",0.8,0.8,1.0,1
Eureka Disposable Bags for Sanitaire Vibra Groomer I Upright Vac,0.8,0.8,1.0,1
GBC Plasticlear Binding Covers,0.7285714,0.8,1.0,7
GBC VeloBinder Electric Binding Machine,0.725,0.8,1.0,4
"Brother MFC-9340CDW LED All-In-One Printer, Copier Scanner",0.7,0.7,1.0,1
"Bush Westfield Collection Bookcases, Dark Cherry Finish, Fully Assembled",0.7,0.7,1.0,1
Cisco 8961 IP Phone Charcoal,0.7,0.7,1.0,1
Epson Perfection V600 Photo Scanner,0.7,0.7,1.0,1
Hewlett-Packard Deskjet F4180 All-in-One Color Ink-jet - Printer / copier / scanner,0.7,0.7,1.0,1
Lexmark MarkNet N8150 Wireless Print Server,0.7,0.7,1.0,2


In [23]:
discount_over_time <- orders %>%
    add_time_period("order_date", period = "month") %>%
    group_by(period) %>%
    summarise(
        avg_discount = mean(discount),
        share_discounted_orders = mean(discount > 0),
        .groups = "drop"
    ) %>%
    arrange(desc(avg_discount))

discount_over_time


period,avg_discount,share_discounted_orders
<date>,<dbl>,<dbl>
2011-06-01,0.1739738,0.4961832
2011-07-01,0.1617899,0.5010101
2011-01-01,0.1587945,0.408776
2012-07-01,0.1584279,0.4962064
2011-11-01,0.1570114,0.451273
2014-04-01,0.1560419,0.4643197
2014-12-01,0.1556461,0.4509986
2012-03-01,0.1535982,0.4501511
2014-07-01,0.150425,0.424103
2013-03-01,0.1493264,0.4451697


# Shipping & Logistics

### Q7. Is there a clear correlation between shipping cost and time?

In [25]:
orders_with_shipping_time <- orders %>%
    mutate(
        shipping_time_days = as.numeric(ship_date - order_date)
    )

shipping_cost_time_corr <- orders_with_shipping_time %>%
    summarise(
        correlation = cor(shipping_cost, shipping_time_days, use = "complete.obs"),
        avg_shipping_cost = mean(shipping_cost),
        avg_shipping_time = mean(shipping_time_days)
    )

shipping_cost_time_corr

correlation,avg_shipping_cost,avg_shipping_time
<dbl>,<dbl>,<dbl>
-0.1428227,26.37582,3.96937


### Q8. How do shipping costs vary across regions and shipping modes?

In [28]:
shipping_cost_by_region_mode <- orders %>%
    group_by(region, ship_mode) %>%
    summarise(
        avg_shipping_cost = mean(shipping_cost),
        median_shipping_cost = median(shipping_cost),
        n_orders = n(),
        .groups = "drop"
    )

shipping_cost_by_region_mode

region,ship_mode,avg_shipping_cost,median_shipping_cost,n_orders
<fct>,<fct>,<dbl>,<dbl>,<int>
Africa,First Class,29.15493,7.82,679
Africa,Standard Class,13.73457,4.03,2659
Africa,Second Class,23.21031,6.12,992
Africa,Same Day,34.23514,7.94,257
Canada,First Class,27.63486,8.25,70
Canada,Standard Class,11.08954,5.195,194
Canada,Second Class,29.54165,8.33,91
Canada,Same Day,21.7769,7.78,29
Caribbean,First Class,32.35589,14.215,231
Caribbean,Standard Class,15.4369,5.802,1002


### Q9. Do shipping costs significantly reduce profit for certain categories or regions?

In [None]:
shipping_profit_impact <- orders %>%
    group_by(category, region) %>%
    summarise(
        total_sales = sum(sales),
        total_profit = sum(profit),
        total_shipping_cost = sum(shipping_cost),
        shipping_cost_share = total_shipping_cost / total_sales,
        profit_margin = total_profit / total_sales,
        .groups = "drop"
    ) %>%
    arrange(desc(shipping_cost_share))

shipping_profit_impact

category,region,total_sales,total_profit,total_shipping_cost,shipping_cost_share,profit_margin
<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Office Supplies,Africa,266755.53,28480.053,31095.94,0.11657093,0.10676462
Technology,Oceania,408002.98,54734.022,47207.84,0.11570464,0.13415103
Office Supplies,Caribbean,89575.42,14818.276,10323.46,0.11524876,0.16542793
Furniture,West,252612.74,11504.95,28716.27,0.11367704,0.04554382
Technology,Africa,322367.04,44129.493,36634.49,0.11364217,0.13689207
Technology,North Asia,314038.55,72471.015,35509.12,0.11307249,0.23077108
Office Supplies,East,205516.05,41014.579,23230.42,0.11303458,0.19956873
Office Supplies,North Asia,198554.78,40926.105,22420.98,0.11292088,0.20611997
Office Supplies,Canada,30034.08,7957.53,3388.53,0.11282283,0.26495002
Furniture,Canada,10595.28,2613.24,1194.75,0.11276248,0.2466419


# Fulfillment Timing

### Q10. What is the distribution of shipping delays?

In [None]:
shipping_delay_summary <- orders_with_shipping_time %>%
    summarise(
        mean_delay = mean(shipping_time_days),
        median_delay = median(shipping_time_days),
        sd_delay = sd(shipping_time_days)
    )

shipping_delay_summary


mean_delay,median_delay,sd_delay
<dbl>,<dbl>,<dbl>
3.96937,4,1.729437


### Q11. Are longer shipping delays associated with lower profit or higher return rates?

In [31]:
delay_profit_relationship <- orders_with_shipping_time %>%
  summarise(
    correlation_delay_profit = cor(shipping_time_days, profit, use = "complete.obs")
  )

delay_profit_relationship

correlation_delay_profit
<dbl>
0.001543151


In [42]:
orders_with_returns <- orders %>%
    left_join(returns, by = "order_id", relationship = "many-to-many") %>%
    mutate(
        returned_flag = ifelse(is.na(returned), 0, 1),
        shipping_time_days = as.numeric(ship_date - order_date)
    )

delay_buckets <- orders_with_shipping_time %>%
    mutate(
        delay_bucket = cut(
        shipping_time_days,
        breaks = c(0, 2, 5, 10, Inf),
        labels = c("≤2 days", "2–5 days", "5–10 days", "10+ days")
        )
    ) %>%
    group_by(delay_bucket) %>%
    summarise(
        avg_profit = mean(profit),
        profit_margin = sum(profit) / sum(sales),
        n_orders = n(),
        .groups = "drop"
    )

delay_buckets

delay_bucket,avg_profit,profit_margin,n_orders
<fct>,<dbl>,<dbl>,<int>
≤2 days,28.81055,0.113105,8688
2–5 days,28.52172,0.1168212,30690
5–10 days,28.98901,0.1179355,9312
,28.25062,0.113418,2600


In [44]:
delay_return_buckets <- orders_with_returns %>%
    mutate(
        delay_bucket = cut(
        shipping_time_days,
        breaks = c(0, 2, 5, 10, Inf),
        labels = c("≤2 days", "2–5 days", "5–10 days", "10+ days")
        )
    ) %>%
    group_by(delay_bucket) %>%
    summarise(
        return_rate = mean(returned_flag),
        n_orders = n(),
        .groups = "drop"
    )

delay_return_buckets


delay_bucket,return_rate,n_orders
<fct>,<dbl>,<int>
≤2 days,0.06169429,8688
2–5 days,0.05942336,30695
5–10 days,0.05627148,9312
,0.06576923,2600


# Order Characteristics

### Q12. How do order quantity and order priority relate to sales and profit?

In [None]:
quantity_sales_profit <- orders %>%
    group_by(quantity) %>%
    summarise(
        avg_sales = mean(sales),
        avg_profit = mean(profit),
        n_orders = n(),
        .groups = "drop"
    )

quantity_sales_profit

quantity,avg_sales,avg_profit,n_orders
<dbl>,<dbl>,<dbl>,<int>
1,73.60785,7.254517,8963
2,143.37787,16.360682,12748
3,208.41175,24.323615,9682
4,293.73946,34.724614,6385
5,369.68976,46.477984,4882
6,433.23664,41.426103,3020
7,490.47346,66.821232,2385
8,549.64786,61.538104,1361
9,605.64236,76.417221,987
10,624.80212,62.280717,276


In [None]:
priority_sales_profit <- orders %>%
    group_by(order_priority) %>%
    summarise(
        avg_sales = mean(sales),
        avg_profit = mean(profit),
        profit_margin = sum(profit) / sum(sales),
        n_orders = n(),
        .groups = "drop"
    )

priority_sales_profit

order_priority,avg_sales,avg_profit,profit_margin,n_orders
<ord>,<dbl>,<dbl>,<dbl>,<int>
Low,234.2516,24.19796,0.103299,2424
Medium,247.3717,29.36173,0.1186948,29433
High,245.6324,27.11912,0.1104053,15501
Critical,250.8229,31.99433,0.1275575,3932
