### Detailed EDA and Data Preparation

In [55]:

import os
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()

# Identify the project root (parent of current notebook folder)
project_root = Path.cwd().parent

# Define consistent base folders under the project root
data_root = project_root / "data"
reports_root = project_root / "reports"

# Define subdirectories
raw_dir        = data_root / "raw"
pre_proc_dir   = data_root / "pre-processed"
proc_dir       = data_root / "processed"
prepared_dir   = data_root / "prepared"       
eda_dir        = reports_root / "eda"          

# Create them if they don’t exist
for d in [raw_dir, pre_proc_dir, proc_dir, prepared_dir, eda_dir]:
    d.mkdir(parents=True, exist_ok=True)

# Print confirmation
print("Dirs ready:")
print("  raw_dir       :", raw_dir)
print("  pre_proc_dir  :", pre_proc_dir)
print("  proc_dir      :", proc_dir)
print("  eda_dir       :", eda_dir)
print("  prepared_dir  :", prepared_dir)

Dirs ready:
  raw_dir       : c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\raw
  pre_proc_dir  : c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\pre-processed
  proc_dir      : c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\processed
  eda_dir       : c:\Users\Localadmin\DSI_directory\ds08_online-retail\reports\eda
  prepared_dir  : c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\prepared


In [56]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

In [57]:
#read processed data file
online_retail_df = pd.read_csv("../data/processed/online_retail_processed.csv")

online_retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392391 entries, 0 to 392390
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    392391 non-null  int64  
 1   StockCode    392391 non-null  object 
 2   Description  392391 non-null  object 
 3   Quantity     392391 non-null  int64  
 4   InvoiceDate  392391 non-null  object 
 5   UnitPrice    392391 non-null  float64
 6   CustomerID   392391 non-null  float64
 7   Country      392391 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 23.9+ MB


In [58]:
#check for missing values
online_retail_df.isna().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [59]:
# Convert data types
# change the data type for InvoiceDate
online_retail_df['InvoiceDate'] = pd.to_datetime(online_retail_df['InvoiceDate'])

# change the data type for other relevant columns to str
cols_str = ['InvoiceNo', 'StockCode', 'Description', 'CustomerID', 'Country']
online_retail_df[cols_str] = online_retail_df[cols_str].astype("string")

# verify the datatypes
online_retail_df.dtypes

InvoiceNo      string[python]
StockCode      string[python]
Description    string[python]
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID     string[python]
Country        string[python]
dtype: object

In [60]:
online_retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [61]:
#to handle the '.0' in CustomerIDs (happened after converting it from float to str)

online_retail_df['CustomerID'] = (
    online_retail_df['CustomerID']
      .astype('string')                             # ensure string dtype
      .str.replace(r'\.0$', '', regex=True)         # remove ".0" suffix
      .replace('nan', pd.NA)                        # replace string 'nan' with actual NA
)

In [62]:
#verify the change
online_retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


In [63]:
# Compute Revenue/Spend for each transaction
online_retail_df['Revenue'] = online_retail_df['Quantity'] * online_retail_df['UnitPrice']
online_retail_df.head()

# Cancellation Status- A transaction is "Cancelled" if InvoiceNo starts with 'C'
online_retail_df['Cancellation_Status'] = online_retail_df['InvoiceNo'].astype(str).apply(
    lambda x: 'Cancelled' if x.startswith('C') else 'Not Cancelled'
)

# Domestic vs International Customer
online_retail_df['Customer_Region'] = online_retail_df['Country'].apply(
    lambda x: 'Domestic' if x.strip().upper() == 'UNITED KINGDOM' else 'International'
)

online_retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue,Cancellation_Status,Customer_Region
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,Not Cancelled,Domestic
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,Not Cancelled,Domestic
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,Not Cancelled,Domestic
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,Not Cancelled,Domestic
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,Not Cancelled,Domestic


In [64]:
# Count unique customers per region
cust_region = (
    online_retail_df.groupby('Customer_Region')['CustomerID']
      .nunique()
      .reset_index(name='Customer_Count')
)

# Calculate % of customers
cust_region['Customer_%'] = (
    cust_region['Customer_Count'] / cust_region['Customer_Count'].sum() * 100
)

## Percentage of revenue by region
rev_region = (
    online_retail_df.groupby('Customer_Region')['Revenue']
      .sum()
      .reset_index(name='Total_Revenue')
)

# Calculate % of revenue
rev_region['Revenue_%'] = (
    rev_region['Total_Revenue'] / rev_region['Total_Revenue'].sum() * 100
)

# Merge both summaries
region_summary = cust_region.merge(rev_region, on='Customer_Region')

# Round percentages for readability
region_summary['Customer_%'] = region_summary['Customer_%'].round(2)
region_summary['Revenue_%'] = region_summary['Revenue_%'].round(2)

# Display summary
print(region_summary)

  Customer_Region  Customer_Count  Customer_%  Total_Revenue  Revenue_%
0        Domestic            3920       90.47    7285024.644      82.01
1   International             413        9.53    1598223.230      17.99


In [65]:
#save region summary in reports/eda
region_summary_path = eda_dir / "region_summary.csv"
region_summary.to_csv(region_summary_path, index=False)
print(f"[saved] Region summary to: {region_summary_path}")


[saved] Region summary to: c:\Users\Localadmin\DSI_directory\ds08_online-retail\reports\eda\region_summary.csv


#### Recency, Frequency and Monetary (RFM) Analysis

In [66]:
# Recency of purchase
# Reference date: last date in the dataset
recent_pur_date = online_retail_df['InvoiceDate'].max()

# Compute recency (in days) for each customer
recency_df = online_retail_df.groupby('CustomerID', as_index=False)['InvoiceDate'].max()
recency_df['Recency'] = (recent_pur_date - recency_df['InvoiceDate']).dt.days


# Transaction frequency
frequency_df = online_retail_df.groupby('CustomerID', as_index=False)['InvoiceNo'].nunique()
frequency_df.rename(columns={'InvoiceNo': 'Frequency'}, inplace=True)


# Monetary (total revenue per customer)
monetary_df = online_retail_df.groupby('CustomerID', as_index=False)['Revenue'].sum()
monetary_df.rename(columns={'Revenue': 'Monetary'}, inplace=True)


# Merge RFM components
rfm = recency_df.merge(frequency_df, on='CustomerID').merge(monetary_df, on='CustomerID')


# Combine with customer info (region, country)
rfm = rfm.merge(online_retail_df[['CustomerID', 'Country', 'Customer_Region']].drop_duplicates(),
                on='CustomerID', how='left')


# Display head
print(rfm.head())

  CustomerID         InvoiceDate  Recency  Frequency  Monetary  \
0      12346 2011-01-18 10:01:00      325          1  77183.60   
1      12347 2011-12-07 15:52:00        1          7   4310.00   
2      12348 2011-09-25 13:13:00       74          4   1797.24   
3      12349 2011-11-21 09:51:00       18          1   1757.55   
4      12350 2011-02-02 16:01:00      309          1    334.40   

          Country Customer_Region  
0  United Kingdom        Domestic  
1         Iceland   International  
2         Finland   International  
3           Italy   International  
4          Norway   International  


In [67]:
# remove InvoiceDate from rfm data frame
rfm = rfm.drop(columns=['InvoiceDate'])

In [68]:
# verify the changes
rfm.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,Country,Customer_Region
0,12346,325,1,77183.6,United Kingdom,Domestic
1,12347,1,7,4310.0,Iceland,International
2,12348,74,4,1797.24,Finland,International
3,12349,18,1,1757.55,Italy,International
4,12350,309,1,334.4,Norway,International


In [69]:
rfm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4341 entries, 0 to 4340
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerID       4341 non-null   string 
 1   Recency          4341 non-null   int64  
 2   Frequency        4341 non-null   int64  
 3   Monetary         4341 non-null   float64
 4   Country          4341 non-null   string 
 5   Customer_Region  4341 non-null   object 
dtypes: float64(1), int64(2), object(1), string(2)
memory usage: 203.6+ KB


In [71]:
# save this dataframe into csv file
rfm_path = prepared_dir / "rfm.csv"
rfm.to_csv(rfm_path, index=False)
print(f"[saved] RFM data to: {rfm_path}")


[saved] RFM data to: c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\prepared\rfm.csv


In [77]:
#divide dataset into two markets, domestic and internatioal
rfm_domestic = rfm[rfm['Customer_Region'] == 'Domestic']
rfm_international = rfm[rfm['Customer_Region'] == 'International']

In [79]:
rfm_international.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,Country,Customer_Region
1,12347,1,7,4310.0,Iceland,International
2,12348,74,4,1797.24,Finland,International
3,12349,18,1,1757.55,Italy,International
4,12350,309,1,334.4,Norway,International
5,12352,35,8,2506.04,Norway,International


In [81]:
# save this domestic dataframe into csv file
rfm_domestic_path = prepared_dir / "rfm_domestic.csv"
rfm_domestic.to_csv(rfm_domestic_path, index=False)
print(f"[saved] RFM domestic data to: {rfm_domestic_path}")

# save this domestic dataframe into csv file
rfm_international_path = prepared_dir / "rfm_international.csv"
rfm_international.to_csv(rfm_international_path, index=False)
print(f"[saved] RFM international data to: {rfm_international_path}")



[saved] RFM domestic data to: c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\prepared\rfm_domestic.csv
[saved] RFM international data to: c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\prepared\rfm_international.csv


RFM Analysis for Domestic and International Market

| Metric            | Definition                   | Typical interpretation          |
| ----------------- | ---------------------------- | ------------------------------- |
| **Recency (R)**   | Days since the last purchase | Lower = better (more recent)    |
| **Frequency (F)** | Number of transactions       | Higher = better (more loyal)    |
| **Monetary (M)**  | Total spend                  | Higher = better (more valuable) |


for Domestic Market

In [106]:
#Calculate RFM Scores (1–5 per metric)

#assign scores using quantiles (quintiles) — higher scores mean better customers.

# For convenience, use positive-spending RFM table
rfm_domestic_scoring = rfm_domestic.copy()

# Recency: smaller is better → reverse the scoring (5 = most recent)
rfm_domestic_scoring['R_Score'] = pd.qcut(rfm_domestic_scoring['Recency'], 5, labels=[5,4,3,2,1]).astype(int)

# Frequency: larger is better
rfm_domestic_scoring['F_Score'] = pd.qcut(rfm_domestic_scoring['Frequency'].rank(method='first'), 5, labels=[1,2,3,4,5]).astype(int)

# Monetary: larger is better
rfm_domestic_scoring['M_Score'] = pd.qcut(rfm_domestic_scoring['Monetary'], 5, labels=[1,2,3,4,5]).astype(int)

In [None]:
#Combine (concat) into an Overall RFM Score
#customer with R=5, F=4, M=5 → “545” (high value, frequent, and recent buyer)
rfm_domestic_scoring['RFM_Score'] = (
    rfm_domestic_scoring['R_Score'].astype(str)
    + rfm_domestic_scoring['F_Score'].astype(str)
    + rfm_domestic_scoring['M_Score'].astype(str)
)



In [113]:
#Define RFM Segments

def rfm_domestic_seg(row):
    r, f, m = int(row['R_Score']), int(row['F_Score']), int(row['M_Score'])
    if r >= 4 and f >= 4 and m >= 4:
        return 'Champions'
    elif r >= 4 and f >= 3:
        return 'Loyal Customers'
    elif f <= 2 and m <= 2 and r >= 4:
        return 'Potential Loyalists'
    elif r >= 3 and f <= 2:
        return 'Recent Customers'
    elif r <= 2 and f >= 4:
        return 'At Risk'
    elif r == 1 and f == 1:
        return 'Lost'
    else:
        return 'Others'

rfm_domestic_scoring['Segment'] = rfm_domestic_scoring.apply(rfm_domestic_seg, axis=1)

In [114]:
#dispaly the data
rfm_domestic_scoring

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,Country,Customer_Region,R_Score,F_Score,M_Score,Segment
0,12346,325,1,77183.60,United Kingdom,Domestic,1,1,5,Lost
331,12747,1,11,4196.01,United Kingdom,Domestic,5,5,5,Champions
332,12748,0,209,33053.19,United Kingdom,Domestic,5,5,5,Champions
333,12749,3,5,4090.88,United Kingdom,Domestic,5,4,5,Champions
378,12820,2,4,942.34,United Kingdom,Domestic,5,4,4,Champions
...,...,...,...,...,...,...,...,...,...,...
4336,18280,277,1,180.60,United Kingdom,Domestic,1,2,1,Others
4337,18281,180,1,80.82,United Kingdom,Domestic,1,2,1,Others
4338,18282,7,2,178.05,United Kingdom,Domestic,5,3,1,Loyal Customers
4339,18283,3,16,2045.53,United Kingdom,Domestic,5,5,5,Champions


In [99]:
# save this dataframe into csv file
rfm_domestic_scor_path = prepared_dir / "rfm_domestic_scoring.csv"
rfm_domestic_scoring.to_csv(rfm_domestic_scor_path, index=False)
print(f"[saved] RFM domestic scoring data to: {rfm_domestic_scor_path}")


[saved] RFM domestic scoring data to: c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\prepared\rfm_domestic_scoring.csv


| Segment                 | Typical Characteristics         | Strategic Action              |
| ----------------------- | ------------------------------- | ----------------------------- |
| **Champions**           | Recent, frequent, high spenders | Reward & retain               |
| **Loyal Customers**     | Buy often, decent spend         | Encourage advocacy            |
| **Potential Loyalists** | New but active                  | Nurture relationship          |
| **At Risk**             | Used to buy, not lately         | Win-back campaigns            |
| **Lost**                | Inactive & low value            | Consider dropping from target |
| **Others**              | Middle-tier customers           | Monitor behavior              |


In [116]:
# Ensure R/F/M scores are numeric ints (not category/strings)
for col in ['R_Score', 'F_Score', 'M_Score']:
    if str(rfm_domestic_scoring[col].dtype) == 'category':
        # if categorical, use the underlying codes (+1 to make them 1..5 if ordered 0..4)
        rfm_domestic_scoring[col] = rfm_domestic_scoring[col].cat.codes + 1
    rfm_domestic_scoring[col] = pd.to_numeric(rfm_domestic_scoring[col], errors='coerce').astype('Int64')

# Total customers (or use nunique if you prefer unique IDs)
total_customers = rfm_domestic_scoring['CustomerID'].nunique() if 'CustomerID' in rfm_domestic_scoring.columns else len(rfm_domestic_scoring)

segment_summary = (
    rfm_domestic_scoring
      .groupby('Segment', dropna=False)
      .agg(
          R_Score=('R_Score','mean'),
          F_Score=('F_Score','mean'),
          M_Score=('M_Score','mean'),
          Customer_Count=('CustomerID','nunique') if 'CustomerID' in rfm_domestic_scoring.columns else ('Segment','size')
      )
      .reset_index()
)

segment_summary['Customer_%'] = (segment_summary['Customer_Count'] / total_customers * 100).round(2)
segment_summary[['R_Score','F_Score','M_Score']] = segment_summary[['R_Score','F_Score','M_Score']].astype(float).round(2)

rfm_domestic_seg_summary = segment_summary.sort_values('Customer_Count', ascending=False)
print(rfm_domestic_seg_summary)

               Segment  R_Score  F_Score  M_Score  Customer_Count  Customer_%
4               Others     2.02     2.69     2.65            1415       36.10
1            Champions     4.59     4.70     4.63             870       22.19
3      Loyal Customers     4.43     3.37     2.84             446       11.38
6     Recent Customers     3.25     1.58     2.28             388        9.90
2                 Lost     1.00     1.00     1.67             325        8.29
0              At Risk     1.79     4.23     3.72             264        6.73
5  Potential Loyalists     4.28     1.47     1.44             212        5.41


In [117]:
# save the segments summary in reports/eda
rfm_domestic_seg_summary_path = eda_dir / "rfm_domestic_seg_summary.csv"
rfm_domestic_seg_summary.to_csv(rfm_domestic_seg_summary_path)
print(f"[saved] Domestic Segment summary → {rfm_domestic_seg_summary_path}")


[saved] Domestic Segment summary → c:\Users\Localadmin\DSI_directory\ds08_online-retail\reports\eda\rfm_domestic_seg_summary.csv


for International Market

In [118]:
#Calculate RFM Scores for International Market (1–5 per metric)

#assign scores using quantiles (quintiles) — higher scores mean better customers.

# For convenience, use positive-spending RFM table
rfm_international_scoring = rfm_international.copy()

# Recency: smaller is better → reverse the scoring (5 = most recent)
rfm_international_scoring['R_Score'] = pd.qcut(rfm_international_scoring['Recency'], 5, labels=[5,4,3,2,1]).astype(int)

# Frequency: larger is better
rfm_international_scoring['F_Score'] = pd.qcut(rfm_international_scoring['Frequency'].rank(method='first'), 5, labels=[1,2,3,4,5]).astype(int)

# Monetary: larger is better
rfm_international_scoring['M_Score'] = pd.qcut(rfm_international_scoring['Monetary'], 5, labels=[1,2,3,4,5]).astype(int)
#Combine (concat) into an Overall RFM Score
#customer with R=5, F=4, M=5 → “545” (high value, frequent, and recent buyer)
rfm_international_scoring['RFM_Score'] = (
    rfm_international_scoring['R_Score'].astype(str)
    + rfm_international_scoring['F_Score'].astype(str)
    + rfm_international_scoring['M_Score'].astype(str)
)


In [119]:


#Define RFM Segments

def rfm_international_seg(row):
    r, f, m = int(row['R_Score']), int(row['F_Score']), int(row['M_Score'])
    if r >= 4 and f >= 4 and m >= 4:
        return 'Champions'
    elif r >= 4 and f >= 3:
        return 'Loyal Customers'
    elif f <= 2 and m <= 2 and r >= 4:
        return 'Potential Loyalists'
    elif r >= 3 and f <= 2:
        return 'Recent Customers'
    elif r <= 2 and f >= 4:
        return 'At Risk'
    elif r == 1 and f == 1:
        return 'Lost'
    else:
        return 'Others'

rfm_international_scoring['Segment'] = rfm_international_scoring.apply(rfm_domestic_seg, axis=1)

#dispaly the data
rfm_international_scoring

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,Country,Customer_Region,R_Score,F_Score,M_Score,RFM_Score,Segment
1,12347,1,7,4310.00,Iceland,International,5,5,5,555,Champions
2,12348,74,4,1797.24,Finland,International,2,4,4,244,At Risk
3,12349,18,1,1757.55,Italy,International,4,1,4,414,Recent Customers
4,12350,309,1,334.40,Norway,International,1,1,1,111,Lost
5,12352,35,8,2506.04,Norway,International,3,5,4,354,Others
...,...,...,...,...,...,...,...,...,...,...,...
3727,17444,147,3,2940.04,Canada,International,2,4,4,244,At Risk
3771,17508,280,1,387.31,Greece,International,1,2,1,121,Others
4003,17828,22,4,1820.09,Malta,International,4,4,4,444,Champions
4004,17829,298,1,889.24,United Arab Emirates,International,1,2,3,123,Others


In [120]:
# save this dataframe into csv file
rfm_international_scor_path = prepared_dir / "rfm_international_scoring.csv"
rfm_international_scoring.to_csv(rfm_international_scor_path, index=False)
print(f"[saved] RFM international scoring data to: {rfm_international_scor_path}")


[saved] RFM international scoring data to: c:\Users\Localadmin\DSI_directory\ds08_online-retail\data\prepared\rfm_international_scoring.csv


| Segment                 | Typical Characteristics         | Strategic Action              |
| ----------------------- | ------------------------------- | ----------------------------- |
| **Champions**           | Recent, frequent, high spenders | Reward & retain               |
| **Loyal Customers**     | Buy often, decent spend         | Encourage advocacy            |
| **Potential Loyalists** | New but active                  | Nurture relationship          |
| **At Risk**             | Used to buy, not lately         | Win-back campaigns            |
| **Lost**                | Inactive & low value            | Consider dropping from target |
| **Others**              | Middle-tier customers           | Monitor behavior              |


In [121]:
# Ensure R/F/M scores are numeric ints (not category/strings)
for col in ['R_Score', 'F_Score', 'M_Score']:
    if str(rfm_international_scoring[col].dtype) == 'category':
        # if categorical, use the underlying codes (+1 to make them 1..5 if ordered 0..4)
        rfm_international_scoring[col] = rfm_international_scoring[col].cat.codes + 1
    rfm_international_scoring[col] = pd.to_numeric(rfm_international_scoring[col], errors='coerce').astype('Int64')

# Total customers (or use nunique if you prefer unique IDs)
total_customers = rfm_international_scoring['CustomerID'].nunique() if 'CustomerID' in rfm_international_scoring.columns else len(rfm_international_scoring)

segment_summary = (
    rfm_international_scoring
      .groupby('Segment', dropna=False)
      .agg(
          R_Score=('R_Score','mean'),
          F_Score=('F_Score','mean'),
          M_Score=('M_Score','mean'),
          Customer_Count=('CustomerID','nunique') if 'CustomerID' in rfm_international_scoring.columns else ('Segment','size')
      )
      .reset_index()
)

segment_summary['Customer_%'] = (segment_summary['Customer_Count'] / total_customers * 100).round(2)
segment_summary[['R_Score','F_Score','M_Score']] = segment_summary[['R_Score','F_Score','M_Score']].astype(float).round(2)

rfm_international_seg_summary = segment_summary.sort_values('Customer_Count', ascending=False)
print(rfm_international_seg_summary)

               Segment  R_Score  F_Score  M_Score  Customer_Count  Customer_%
4               Others     2.02     2.72     2.71             164       39.71
1            Champions     4.56     4.69     4.70              87       21.07
3      Loyal Customers     4.48     3.58     2.75              48       11.62
6     Recent Customers     3.24     1.47     2.45              37        8.96
2                 Lost     1.00     1.00     1.48              29        7.02
5  Potential Loyalists     4.38     1.35     1.42              26        6.30
0              At Risk     1.78     4.13     3.61              22        5.33


In [122]:
# save the segments summary in reports/eda
rfm_international_seg_summary_path = eda_dir / "rfm_international_seg_summary.csv"
rfm_international_seg_summary.to_csv(rfm_international_seg_summary_path)
print(f"[saved] International Segment summary → {rfm_international_seg_summary_path}")

[saved] International Segment summary → c:\Users\Localadmin\DSI_directory\ds08_online-retail\reports\eda\rfm_international_seg_summary.csv
