In [11]:
import pandas as pd
import numpy as np

# Load the raw transactions from the local Excel workbook, parsing
# InvoiceDate as datetimes.
# NOTE(review): this is an absolute, machine-specific path — consider moving
# it to a configurable data directory.
raw_path = r'E:\c drive\amazon\data\online_retail\online_retail_II.xlsx'
df = pd.read_excel(raw_path, parse_dates=['InvoiceDate'])

# Clean and prepare.
# dropna() without a subset removes every row that has a missing value in ANY
# column, which already covers 'Customer ID'; the separate Customer ID drop
# this cell used to run afterwards was therefore a no-op and has been removed.
df = df.dropna()

# Keep only rows with a positive quantity and a positive unit price.
# .copy() makes the result an independent frame, so adding TotalPrice below
# is not an assignment onto a filtered slice (no SettingWithCopyWarning).
df = df.loc[(df['Quantity'] > 0) & (df['Price'] > 0)].copy()

# Standardize column names: trim whitespace and replace spaces with
# underscores (e.g. 'Customer ID' -> 'Customer_ID').
df.columns = df.columns.str.strip().str.replace(' ', '_')

# Create TotalPrice: quantity times unit price for each row.
df['TotalPrice'] = df['Quantity'] * df['Price']


In [12]:
# Normalise the column labels: trim surrounding whitespace first, then swap
# every space for an underscore.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.replace(' ', '_')


In [13]:
import pandas as pd
import numpy as np
from datetime import datetime


In [14]:
# Keep only rows with a positive quantity, a positive price and a known
# customer, then (re)compute TotalPrice and make sure InvoiceDate is
# datetime-typed.
# The frame is rebuilt with .assign() on the filtered result instead of
# assigning columns onto a boolean-filtered slice, so no
# SettingWithCopyWarning is raised and the outcome does not depend on
# view-vs-copy behaviour.
valid_rows = (
    (df['Quantity'] > 0)
    & (df['Price'] > 0)
    & df['Customer_ID'].notna()
)
df = df.loc[valid_rows].assign(
    TotalPrice=lambda d: d['Quantity'] * d['Price'],
    InvoiceDate=lambda d: pd.to_datetime(d['InvoiceDate']),
)


In [15]:
# Collapse the transaction rows into one row per customer.
cltv = (
    df.groupby('Customer_ID')
    .agg(
        # Number of distinct invoices for the customer.
        Frequency=('Invoice', 'nunique'),
        # Whole days between the customer's first and last invoice date.
        # The column is called 'Recency' because later cells use that name.
        Recency=('InvoiceDate', lambda dates: (dates.max() - dates.min()).days),
        # Sum of TotalPrice over all of the customer's rows.
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()
)


In [16]:
# Assumed share of revenue kept as profit (10%).
PROFIT_MARGIN = 0.10
# Number of days the daily purchase rate is projected over (one year).
HORIZON_DAYS = 365

# Average Order Value: Monetary divided by the number of distinct invoices.
cltv['AOV'] = cltv['Monetary'] / cltv['Frequency']

# Purchase Frequency: invoices per day over the span stored in 'Recency'.
# A customer whose invoices all fall on one day has Recency == 0, and dividing
# by that produced inf. Zero spans are mapped to NaN first, so PF (and
# therefore CLTV) is explicitly missing for those customers instead of
# infinite.
active_days = cltv['Recency'].replace(0, np.nan)
cltv['PF'] = cltv['Frequency'] / active_days

# Profit Margin Assumption
cltv['ProfitMargin'] = PROFIT_MARGIN

# CLTV = AOV * PF * Customer Lifetime * Profit Margin
cltv['CLTV'] = cltv['AOV'] * cltv['PF'] * HORIZON_DAYS * cltv['ProfitMargin']


In [17]:
# Drop customers whose CLTV is undefined (NaN or +/-inf).
# .copy() makes the result an independent frame so the segment column can be
# added below without SettingWithCopyWarning.
cltv = cltv.replace([np.inf, -np.inf], np.nan)
cltv = cltv.dropna(subset=['CLTV']).copy()

# Compute quartile edges. duplicates='drop' merges identical edges, so fewer
# than four bins can come back when many CLTV values are tied; retbins=True
# returns the edges that were actually used.
cltv_bins, bin_edges = pd.qcut(cltv['CLTV'], q=4, labels=False, retbins=True, duplicates='drop')

# Number of actual bins
n_bins = len(bin_edges) - 1
if n_bins < 1:
    # list('DCBA')[-0:] would silently return all four labels, so fail with a
    # clear message instead of a confusing label-count error further down.
    raise ValueError('CLTV does not contain enough distinct values to form segments')

# Generate labels accordingly (e.g., ['C', 'B', 'A'] for 3 bins)
labels = list('DCBA')[-n_bins:]

# Label the bins using the edges computed above. Calling qcut again with
# q=n_bins would recompute different quantiles (and could drop duplicate
# edges a second time, leaving more labels than bins); pd.cut on the existing
# edges guarantees one label per bin. include_lowest=True keeps the minimum
# CLTV inside the first bin, matching qcut.
cltv['CLTV_Segment'] = pd.cut(cltv['CLTV'], bins=bin_edges, labels=labels, include_lowest=True)


In [18]:
# Per-segment summary of CLTV: customer count plus min / max / mean.
# CLTV_Segment is categorical, and recent pandas versions emit a FutureWarning
# when grouping on a categorical column without an explicit `observed`.
# observed=False keeps the existing behaviour (every category is listed).
print(cltv.groupby('CLTV_Segment', observed=False)['CLTV'].agg(['count', 'min', 'max', 'mean']))


              count         min           max         mean
CLTV_Segment                                              
D               704    5.011508    156.734788    97.949600
C               704  156.980812    290.943134   217.151310
B               703  291.748354    568.985464   409.970081
A               704  569.007572  48284.755000  2035.967778


  print(cltv.groupby('CLTV_Segment')['CLTV'].agg(['count', 'min', 'max', 'mean']))


In [19]:
# Show the first five rows of the per-customer CLTV table.
preview = cltv.head()
print(preview)

   Customer_ID  Frequency  Recency  Monetary          AOV        PF  \
0      12346.0         11      196    372.86    33.896364  0.056122   
1      12347.0          2       37   1323.32   661.660000  0.054054   
3      12349.0          3      181   2671.14   890.380000  0.016575   
5      12352.0          2       16    343.80   171.900000  0.125000   
8      12356.0          3       44   3562.25  1187.416667  0.068182   

   ProfitMargin         CLTV CLTV_Segment  
0           0.1    69.435663            D  
1           0.1  1305.437297            A  
3           0.1   538.655304            B  
5           0.1   784.293750            A  
8           0.1  2955.048295            A  


In [20]:
# Write the CLTV table to a CSV file in the working directory, leaving out
# the index column.
output_file = 'cltv_dataset.csv'
cltv.to_csv(output_file, index=False)
