In [3]:
import pandas as pd

# Load the raw transaction file; the cells below all work from this frame.
raw_csv_path = "online_retail.csv"
df = pd.read_csv(raw_csv_path)

# Report (rows, columns) as loaded, then preview the first records.
raw_shape = df.shape
print("Raw shape:", raw_shape)
df.head()

Raw shape: (456372, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6.0,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8.0,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [5]:
# Show each column's dtype and non-null count to spot missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456372 entries, 0 to 456371
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    456372 non-null  object 
 1   StockCode    456372 non-null  object 
 2   Description  454994 non-null  object 
 3   Quantity     456371 non-null  float64
 4   InvoiceDate  456371 non-null  object 
 5   UnitPrice    456371 non-null  float64
 6   CustomerID   343956 non-null  float64
 7   Country      456371 non-null  object 
dtypes: float64(3), object(5)
memory usage: 27.9+ MB


## Step 1.1 — Parse datetime, create revenue, and build daily KPI table

In this step, we:
1. Parse `InvoiceDate` into a proper datetime
2. Create `date` (day-level) and `line_revenue = Quantity × UnitPrice`
3. Build a **daily KPI table** (orders, active customers, revenue, returns)
4. Export outputs for later notebooks

In [6]:
import numpy as np

# Clean `df`: coerce column types, drop rows that cannot be used, and derive
# the day-level `date` and per-line `line_revenue` columns used by the KPI cells.

# 1) Parse datetime; errors="coerce" turns unparsable or missing values into NaT
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")

# Drop rows with invalid datetime (should be few).
# .copy() makes the filtered result an independent frame, so the column
# assignments below are not flagged as writes to a slice of another frame
# (SettingWithCopyWarning).
before = len(df)
df = df.dropna(subset=["InvoiceDate"]).copy()
print("Dropped invalid InvoiceDate rows:", before - len(df))

# 2) Ensure numeric types; non-numeric values become NaN and are dropped next
df["Quantity"] = pd.to_numeric(df["Quantity"], errors="coerce")
df["UnitPrice"] = pd.to_numeric(df["UnitPrice"], errors="coerce")

before = len(df)
df = df.dropna(subset=["Quantity", "UnitPrice"]).copy()
print("Dropped invalid Quantity/UnitPrice rows:", before - len(df))

# 3) Day-level date (calendar day of the invoice timestamp)
df["date"] = df["InvoiceDate"].dt.date

# 4) Revenue at line level; negative Quantity yields negative revenue
df["line_revenue"] = df["Quantity"] * df["UnitPrice"]

# Preview the columns that matter for the KPI tables
df[["InvoiceNo","InvoiceDate","date","Quantity","UnitPrice","line_revenue","CustomerID","Country"]].head()

Dropped invalid InvoiceDate rows: 1
Dropped invalid Quantity/UnitPrice rows: 0


Unnamed: 0,InvoiceNo,InvoiceDate,date,Quantity,UnitPrice,line_revenue,CustomerID,Country
0,536365,2010-12-01 08:26:00,2010-12-01,6.0,2.55,15.3,17850.0,United Kingdom
1,536365,2010-12-01 08:26:00,2010-12-01,6.0,3.39,20.34,17850.0,United Kingdom
2,536365,2010-12-01 08:26:00,2010-12-01,8.0,2.75,22.0,17850.0,United Kingdom
3,536365,2010-12-01 08:26:00,2010-12-01,6.0,3.39,20.34,17850.0,United Kingdom
4,536365,2010-12-01 08:26:00,2010-12-01,6.0,3.39,20.34,17850.0,United Kingdom


In [7]:
# Quick profile of the cleaned frame: size, covered period, and the share of
# return lines and lines without a CustomerID.
print("Cleaned shape:", df.shape)

invoice_times = df["InvoiceDate"]
print("Date range:", invoice_times.min(), "to", invoice_times.max())

# How common are returns?
return_rows = df["Quantity"].lt(0).sum()
return_share = return_rows / len(df)
print("Return rows (Quantity < 0):", return_rows, f"({return_share:.2%})")

# Missing CustomerID rate
missing_cust = df["CustomerID"].isna().sum()
missing_share = missing_cust / len(df)
print("Missing CustomerID:", missing_cust, f"({missing_share:.2%})")

Cleaned shape: (456371, 10)
Date range: 2010-12-01 08:26:00 to 2011-11-10 16:03:00
Return rows (Quantity < 0): 9482 (2.08%)
Missing CustomerID: 112415 (24.63%)


## Step 1.2 — Daily KPI Definitions (implementation only)

We compute daily KPIs **without making business judgments yet**:
- `orders`: unique invoices per day
- `active_customers`: unique customers per day (excluding missing CustomerID)
- `gross_revenue`: sum(line_revenue) (includes negative revenue from returns)
- `return_revenue`: sum(line_revenue) where Quantity < 0
- `items_sold`: sum(Quantity where Quantity > 0)
- `rows`: number of input rows (invoice lines) per day
- `net_revenue`: placeholder equal to `gross_revenue` until the KPI policy is refined in Notebook 01

In [8]:
# Build the per-day KPI table.
#
# Row-level helper columns let every KPI use a built-in groupby aggregation
# instead of a per-group Python lambda that looks values up in the global `df`
# by index (slow, and dependent on `df` having a unique index).
kpi_lines = df[["date", "InvoiceNo", "CustomerID", "Quantity", "line_revenue"]].assign(
    # Revenue of return lines only (Quantity < 0); every other line contributes 0
    return_line_revenue=lambda d: d["line_revenue"].where(d["Quantity"] < 0, 0.0),
    # Units of sale lines only; negative quantities are clipped to 0
    sold_quantity=lambda d: d["Quantity"].clip(lower=0),
)

daily_kpi = (
    kpi_lines.groupby("date")
      .agg(
          orders=("InvoiceNo", "nunique"),
          # nunique ignores missing values by default, so NaN CustomerIDs are excluded
          active_customers=("CustomerID", "nunique"),
          gross_revenue=("line_revenue", "sum"),
          return_revenue=("return_line_revenue", "sum"),
          items_sold=("sold_quantity", "sum"),
          rows=("InvoiceNo", "size")
      )
      .reset_index()
      .sort_values("date")
)

# For now, keep net_revenue same as gross; we will refine KPI policy in Notebook 01
daily_kpi["net_revenue"] = daily_kpi["gross_revenue"]

print("Daily KPI rows:", len(daily_kpi))
daily_kpi.head()

Daily KPI rows: 280


Unnamed: 0,date,orders,active_customers,gross_revenue,return_revenue,items_sold,rows,net_revenue
0,2010-12-01,143,98,58635.56,-325.23,27007.0,3108,58635.56
1,2010-12-02,167,117,46207.28,-1541.1,31348.0,2109,46207.28
2,2010-12-03,108,55,45620.46,-1323.25,16471.0,2202,45620.46
3,2010-12-05,95,76,31383.95,-391.0,16451.0,2725,31383.95
4,2010-12-06,133,90,53860.18,-970.28,21951.0,3878,53860.18


In [11]:
# Inspect the last days of the table to check the end of the covered period.
daily_kpi.tail()

Unnamed: 0,date,orders,active_customers,gross_revenue,return_revenue,items_sold,rows,net_revenue
275,2011-11-06,104,88,42912.4,-33.94,23309.0,3437,42912.4
276,2011-11-07,115,90,70001.08,-15880.73,31395.0,2099,70001.08
277,2011-11-08,144,122,56647.66,-1052.58,26538.0,4070,56647.66
278,2011-11-09,127,102,62599.43,-3528.14,38269.0,2716,62599.43
279,2011-11-10,141,111,54891.21,-2052.02,32019.0,2528,54891.21


In [9]:
# Build the per-day, per-country KPI table (same KPI definitions as the daily
# table, grouped by date and Country).
#
# Row-level helper columns let every KPI use a built-in groupby aggregation
# instead of a per-group Python lambda that looks values up in the global `df`
# by index (slow, and dependent on `df` having a unique index).
kpi_lines_by_country = df[
    ["date", "Country", "InvoiceNo", "CustomerID", "Quantity", "line_revenue"]
].assign(
    # Revenue of return lines only (Quantity < 0); every other line contributes 0
    return_line_revenue=lambda d: d["line_revenue"].where(d["Quantity"] < 0, 0.0),
    # Units of sale lines only; negative quantities are clipped to 0
    sold_quantity=lambda d: d["Quantity"].clip(lower=0),
)

daily_kpi_by_country = (
    kpi_lines_by_country.groupby(["date", "Country"])
      .agg(
          orders=("InvoiceNo", "nunique"),
          # nunique ignores missing values by default, so NaN CustomerIDs are excluded
          active_customers=("CustomerID", "nunique"),
          gross_revenue=("line_revenue", "sum"),
          return_revenue=("return_line_revenue", "sum"),
          items_sold=("sold_quantity", "sum"),
          rows=("InvoiceNo", "size")
      )
      .reset_index()
      .sort_values(["date", "Country"])
)

# Placeholder: net revenue mirrors gross revenue at this stage
daily_kpi_by_country["net_revenue"] = daily_kpi_by_country["gross_revenue"]

daily_kpi_by_country.head()

Unnamed: 0,date,Country,orders,active_customers,gross_revenue,return_revenue,items_sold,rows,net_revenue
0,2010-12-01,Australia,1,1,358.25,0.0,107.0,14,358.25
1,2010-12-01,EIRE,2,1,555.38,0.0,243.0,21,555.38
2,2010-12-01,France,1,1,855.86,0.0,449.0,20,855.86
3,2010-12-01,Germany,2,2,139.18,-122.3,157.0,29,139.18
4,2010-12-01,Netherlands,1,1,192.6,0.0,97.0,2,192.6


In [10]:
import os

# Write both KPI tables as CSV files under data/ for use outside this notebook.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)

# Target path -> table to write (written without the index column)
exports = {
    "data/daily_kpi.csv": daily_kpi,
    "data/daily_kpi_by_country.csv": daily_kpi_by_country,
}

for csv_path, table in exports.items():
    table.to_csv(csv_path, index=False)

# Confirm only after every file has been written
for csv_path in exports:
    print("Saved: " + csv_path)

Saved: data/daily_kpi.csv
Saved: data/daily_kpi_by_country.csv
