In [81]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:.2f}'.format
data = pd.read_csv("../data/online_retail_clean.csv")
data.head()


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,Total Price,InvoiceYear,InvoiceMonth
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom,83.4,2009,12
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0,2009,12
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0,2009,12
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom,100.8,2009,12
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom,30.0,2009,12


In [84]:
# Analysis #1 — Customer Segmentation

customer_invoice = data.groupby(["Customer ID"]).agg({"Invoice": "nunique", "Total Price": "sum"}).reset_index()
customer_invoice["Segment"] = customer_invoice["Invoice"].apply(lambda x: "Single" if x == 1 else "Repeat")
segment_analysis = customer_invoice.groupby("Segment").agg({"Invoice": "count","Total Price": "sum" })
segment_analysis["Avg Revenue"] = (
    segment_analysis["Total Price"] / segment_analysis["Invoice"]
)
customer_segment = segment_analysis.rename(columns={"Invoice": "Customer Count", "Total Price": "Total Revenue"})
customer_segment

Unnamed: 0_level_0,Customer Count,Total Revenue,Avg Revenue
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Repeat,4255,17175255.35,4036.49
Single,1623,568173.83,350.08


In [141]:
# Analysis #2 — Revenue Concentration
customers_revenue = data.groupby(["Customer ID"]).agg({"Invoice": "nunique", "Total Price": "sum"}).sort_values(["Total Price"], ascending=False).reset_index(drop=True)
cutoff = int(len(customers_revenue) * 0.10)
top_mask = customers_revenue.index < cutoff
customers_revenue["Segment"] = np.where(
    top_mask,
    "Top 10%",
    "Bottom 90%"
)
total_revenue = customers_revenue["Total Price"].sum()
revenue_concentration = customers_revenue.groupby("Segment").agg({"Invoice": "count", "Total Price": "sum"})
revenue_concentration["Revenue Share"] = (revenue_concentration["Total Price"] / total_revenue) * 100
revenue_concentration = revenue_concentration.rename(columns={"Invoice": "Customer Count", "Total Price": "Revenue"})
revenue_concentration = revenue_concentration.sort_values(["Revenue Share"], ascending=False)
revenue_concentration

Unnamed: 0_level_0,Customer Count,Revenue,Revenue Share
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Top 10%,587,11337006.17,63.89
Bottom 90%,5291,6406423.01,36.11


In [207]:
# Analysis #3 — Purchase Frequency & Customer Lifetime Patterns
customers_invoices =  (
    data[["Customer ID", "Invoice", "InvoiceDate"]]
    .drop_duplicates()
    .sort_values(["Customer ID", "InvoiceDate"])
    .reset_index(drop=True)
)
customers_invoices["InvoiceDate"] = pd.to_datetime(customers_invoices['InvoiceDate'])
customers_invoices["Days Between Purchases"] = (
    customers_invoices
    .groupby("Customer ID")["InvoiceDate"]
    .diff()
    .dt.days
)
customers_invoices["Segment"] = customers_invoices["Invoice"].apply(lambda x: "Single" if x.count() == 1 else "Repeat")
customers_invoices

AttributeError: 'int' object has no attribute 'count'