# Customer Lifetime Value (CLV) Prediction

In [1]:
# Import libraries
import os
from datetime import date, datetime

import lifetimes
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from google.cloud import bigquery

### Import Data

In [7]:
# Import order data
# The BigQuery project that hosts the `thelook_ecommerce` dataset is read from the
# environment so that it is never hard-coded in the notebook.
gcr_project_id = os.getenv('GCR_CLV_PROJECT_ID')
if not gcr_project_id:
    # Fail fast: an unset variable would otherwise be formatted into the query
    # as the literal project name "None" and only fail later inside BigQuery.
    raise RuntimeError("Environment variable 'GCR_CLV_PROJECT_ID' is not set - "
                       "set it to the BigQuery project that hosts the thelook_ecommerce dataset")

# One row per order: the order's user details, creation timestamp, status and
# total value (sum of the sale prices of its items). The CTE carries no ORDER BY
# because only the ordering of the outermost query affects the result.
QUERY = f"""
WITH order_values AS (
    SELECT 
      order_id,
      SUM(sale_price) as order_value
    FROM `{gcr_project_id}.thelook_ecommerce.order_items`
    GROUP BY order_id
)
SELECT 
  orders.order_id,
  orders.user_id,
  users.first_name,
  users.last_name,
  users.email,
  orders.created_at,
  orders.status,
  order_values.order_value
FROM `{gcr_project_id}.thelook_ecommerce.orders` AS orders
    LEFT JOIN `{gcr_project_id}.thelook_ecommerce.users` AS users ON orders.user_id = users.id
    LEFT JOIN order_values on orders.order_id = order_values.order_id
ORDER BY orders.order_id;
"""

client = bigquery.Client()

# Run the query, wait for it to finish and load the result into a dataframe
df = client.query_and_wait(QUERY).to_dataframe()



### Formatting Data - Calculate Frequency, Recency, Customer Age, and Customer Monetary Value.

In [8]:
# Reformat 'created_at' column to exclude time of day.
# Work on a copy so the raw query result in `df` keeps its full timestamps.
df2 = df.copy()
# Vectorised equivalent of calling .date() on every timestamp
# (assumes 'created_at' is a datetime-typed column, as returned for a BigQuery TIMESTAMP - TODO confirm)
df2['created_at'] = df2.created_at.dt.date
df2.head()

Unnamed: 0,order_id,user_id,first_name,last_name,email,created_at,status,order_value
0,1,1,Jenna,Galvan,jennagalvan@example.net,2022-06-11,Processing,17.379999
1,2,3,Mark,Calhoun,markcalhoun@example.org,2024-02-25,Cancelled,44.119999
2,3,3,Mark,Calhoun,markcalhoun@example.org,2023-10-07,Complete,79.989998
3,4,3,Mark,Calhoun,markcalhoun@example.org,2022-11-23,Processing,83.0
4,5,3,Mark,Calhoun,markcalhoun@example.org,2024-06-12,Complete,48.0


In [9]:
# Build the per-customer summary (frequency, recency, T) from the daily order data,
# then attach each customer's mean order value as 'monetary_value'
df_rfm = lifetimes.utils.summary_data_from_transaction_data(
    df2, 'user_id', 'created_at',
    freq='D',
    include_first_transaction=False,
).assign(monetary_value=df2.groupby('user_id')['order_value'].mean())
df_rfm.head()

Unnamed: 0_level_0,frequency,recency,T,monetary_value
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.0,0.0,953.0,17.379999
3,3.0,567.0,788.0,63.777499
4,0.0,0.0,427.0,34.98
5,0.0,0.0,84.0,70.99
6,0.0,0.0,73.0,230.0


### Defining Utility Functions for Evaluating CLV Prediction Model Performance

This section contains the definition of the following functions:

* `get_train_test_rfm` - get training and testing datasets for evaluating a predictive model. The training dataset contains RFM data for shoppers whose first purchase was made during the training period. The testing dataset contains the true RFM and equity data of these shoppers during the testing period.
* `get_pred_equity` - predict the equity, over the prediction period, of the users whose data was used to fit the model.

In [29]:
def get_train_test_rfm(df : pd.DataFrame, 
                        train_period_start : date,
                        train_period_end : date,
                        prediction_period_duration : int=12):
    """Get RFM summary dataframes for the training & testing periods.

    Only users whose first order falls inside the training period are kept.
    The training period covers orders up to and including `train_period_end`;
    the testing period covers the orders after it, up to and including
    `train_period_end` + `prediction_period_duration` months.
    
    Args:
        df - order-level dataframe with 'user_id', 'created_at' and 'order_value' columns;
             every 'created_at' value must expose a `.date()` method. The input is not modified.
        train_period_start - first day (inclusive) of the training period
        train_period_end - last day (inclusive) of the training period
        prediction_period_duration - duration of the prediction period in months
        
    Returns:
        Tuple `(df_train_rfm, df_test_rfm, df_all_rfm)`, all indexed by 'user_id':
            df_train_rfm - frequency / recency / T summary of the training orders plus
                           'monetary_value' (mean order value) and 'revenue' (summed order value)
            df_test_rfm  - summary of the testing orders, computed with the first transaction
                           included, with 'frequency' renamed to 'true_purchases', plus
                           'true_equity' (summed order value); built from testing orders only
            df_all_rfm   - df_train_rfm (summary columns prefixed with 'train_') left-joined with
                           df_test_rfm ('recency' / 'T' prefixed with 'test_'); users without a
                           testing-period order have NaN in the testing columns

    Raises:
        AssertionError - if the latest order date is not later than the end of the prediction period
    """
            
    # Work on a copy and reduce 'created_at' to calendar dates
    orders = df.copy()
    orders['created_at'] = orders.created_at.apply(lambda x : x.date())

    # Calculate the end of the prediction period and make sure the data extends beyond it
    prediction_period_end = train_period_end + relativedelta(months=prediction_period_duration)
    assert orders.created_at.max() > prediction_period_end, "Prediction period ends in the future - we do not have the data to evaluate these predictions : try reducing the prediction period duration or choosing an earlier training period end date"

    # Keep only users whose first order was placed during the training period
    first_order_dates = orders.groupby('user_id')['created_at'].min()
    valid_users = first_order_dates[(first_order_dates >= train_period_start) & (first_order_dates <= train_period_end)].index
    orders = orders[orders['user_id'].isin(valid_users)]

    # Select data from training & testing periods
    df_train = orders[(orders.created_at <= train_period_end)]
    df_test = orders[(orders.created_at > train_period_end) & (orders.created_at <= prediction_period_end)]

    # Get RFM summary data from training period.
    # observation_period_end is passed explicitly so that T is measured up to the end of the
    # training period even when no order happens to fall on that exact day (the library
    # otherwise falls back to the latest order date present in the data).
    df_train_rfm = lifetimes.utils.summary_data_from_transaction_data(df_train, 'user_id', 'created_at',
                                                                      observation_period_end=train_period_end,
                                                                      freq='D', include_first_transaction=False)
    train_order_values = df_train.groupby('user_id')['order_value'].agg(['mean', 'sum'])
    df_train_rfm = pd.merge(df_train_rfm, train_order_values, 
                            how='left', on='user_id').rename(columns={'mean' : 'monetary_value', 'sum' : 'revenue'})
    
    # Get RFM summary data from testing period (same reasoning for observation_period_end).
    # The first transaction is included so that 'frequency' counts every testing-period purchase date.
    df_test_rfm = lifetimes.utils.summary_data_from_transaction_data(df_test, 'user_id', 'created_at',
                                                                     observation_period_end=prediction_period_end,
                                                                     freq='D', include_first_transaction=True)
    test_order_values = df_test.groupby('user_id')['order_value'].agg('sum')
    df_test_rfm = pd.merge(df_test_rfm, test_order_values, 
                           how='left', on='user_id').rename(columns={'frequency' : 'true_purchases', 'order_value':'true_equity'})
    
    # Combine training & testing RFM data; the left join keeps every training user
    df_all_rfm = pd.merge(df_train_rfm.rename(columns={'frequency':'train_frequency', 'recency':'train_recency', 'T':'train_T'}), 
                          df_test_rfm.rename(columns={'recency':'test_recency', 'T':'test_T'}), how='left', left_index=True, right_index=True)

    return df_train_rfm, df_test_rfm, df_all_rfm

In [6]:
def get_pred_equity(model,
                    prediction_period_duration : int=12,
                    discount_rate : float=0.1, freq : str="D"):
    """Predict the equity of the model's shoppers over the prediction period.

    Args:
        model - object exposing `predict_clv(time, discount_rate, freq)`; it must already be fitted
        prediction_period_duration - duration of the prediction period in months
        discount_rate - discount rate forwarded unchanged to `model.predict_clv`
        freq - time unit forwarded unchanged to `model.predict_clv`

    Returns:
        The frame returned by `model.predict_clv`, with its 'clv' column renamed to 'pred_equity'
    """
    clv_frame = model.predict_clv(prediction_period_duration, discount_rate, freq)
    return clv_frame.rename(columns={'clv': 'pred_equity'})

### Gamma-Gamma CLV Prediction Model

In [10]:
# Create class for Gamma-Gamma prediction model
class PredictorGGF:
    """CLV predictor that pairs a `lifetimes.BetaGeoFitter` with a `lifetimes.GammaGammaFitter`.

    Usage: construct with an RFM summary dataframe, call `fit_bgf` and `fit_ggf`,
    then `predict_clv`.

    Attributes:
        df_summary  - the summary dataframe given to the constructor; must contain the
                      'frequency', 'recency', 'T' and 'monetary_value' columns
        correlation - correlation between 'monetary_value' and 'frequency' among
                      customers with a non-zero frequency
        bgf         - the BetaGeoFitter, available after `fit_bgf`
        ggf         - the GammaGammaFitter, available after `fit_ggf`
    """

    def __init__(self, df_summary):
        """Store the summary data and report the frequency / monetary value correlation."""
        self.df_summary = df_summary
        repeat_customers = self.df_summary[self.df_summary.frequency != 0]
        self.correlation = repeat_customers[['monetary_value', 'frequency']].corr().values[0,1]

        print(f"Correlation between shopper frequency & monetary value is : {float(self.correlation):.5f}.")

    def fit_bgf(self, penalty_coef : float=0.01):
        """Fit a BetaGeoFitter on frequency, recency and T.

        Args:
            penalty_coef - penalizer coefficient handed to `lifetimes.BetaGeoFitter`

        Returns:
            The fitted model's `summary`
        """
        self.bgf = lifetimes.BetaGeoFitter(penalty_coef)
        self.bgf.fit(self.df_summary['frequency'],
                     self.df_summary['recency'],
                     self.df_summary['T'])

        print("Beta-Gamma model successfully fitted")
        return self.bgf.summary

    def fit_ggf(self, penalty_coef : float=0.01):
        """Fit a GammaGammaFitter on the customers with a non-zero frequency.

        Args:
            penalty_coef - penalizer coefficient handed to `lifetimes.GammaGammaFitter`

        Returns:
            The fitted model's `summary`

        Raises:
            AssertionError - if the magnitude of the frequency / monetary value
                             correlation is 0.1 or more (or the correlation is NaN)
        """
        # Compare the magnitude: a strong negative relationship between frequency and
        # monetary value is as much of a problem as a strong positive one.
        assert abs(self.correlation) < 0.1, f"Correlation between frequency and monetary value for returning customers is {self.correlation} - this is quite high and may cause poor predictions"

        repeat_customers = self.df_summary[self.df_summary.frequency != 0]
        self.ggf = lifetimes.GammaGammaFitter(penalty_coef)
        self.ggf.fit(repeat_customers['frequency'],
                     repeat_customers['monetary_value'])

        print("Gamma-Gamma model successfully fitted")
        if float(self.ggf.params_['q']) < 1:
            print("Outliers in the data are causing the 'q' parameter for the Gamma-Gamma model to be < 1 therefore model predictions will fail.\nFix this by either removing outliers until you get 'q' > 1, or use raw monetary values to model CLV.")

        return self.ggf.summary
    
    def predict_clv(self, time : int=12, discount_rate : float=0.1, freq : str="D"):
        """Predict Customer Lifetime Value for every customer in `df_summary`.

        Both `fit_bgf` and `fit_ggf` must have been called first.

        Args:
            time (int, optional) - the lifetime expected for the user in months. Default: 12
            discount_rate (float, optional) - the monthly adjusted discount rate. Default: 0.1
            freq (string, optional) - {"D", "H", "M", "W"} for day, hour, month, week. This represents what unit of time your T is measured in.

        Returns:
            DataFrame - the Series returned by `GammaGammaFitter.customer_lifetime_value`,
            converted to a single-column frame with `.to_frame()`
        """

        # Predict customer lifetime value
        clv_preds_df = self.ggf.customer_lifetime_value(
                            self.bgf,
                            self.df_summary['frequency'],
                            self.df_summary['recency'],
                            self.df_summary['T'],
                            self.df_summary['monetary_value'],
                            time=time,
                            discount_rate=discount_rate,
                            freq=freq
                        ).to_frame()
        
        return clv_preds_df

In [11]:
# Fit both models on the full customer summary, then predict CLV
# using predict_clv's default arguments
penalty_val = 0.01
ggf_model = PredictorGGF(df_rfm)
bgf_summary = ggf_model.fit_bgf(penalty_val)
ggf_summary = ggf_model.fit_ggf(penalty_val)
clv_preds = ggf_model.predict_clv()
clv_preds

Correlation between shopper frequency & monetary value is : 0.00726.


In [30]:
# Experiment 1 : Training Period = 1/1/2022 - 30/4/2024, Testing Period = the 8 months that follow
# (30/4/2024 + relativedelta(months=8) puts the end of the testing period on 30/12/2024)
df_train, df_test, df_all = get_train_test_rfm(
    df,
    train_period_start=datetime(2022, 1, 1).date(),
    train_period_end=datetime(2024, 4, 30).date(),
    prediction_period_duration=8,
)

# Fit the BetaGeoFitter and the GammaGammaFitter to the training data
penalty_val = 0.01
ggf_model_exp1 = PredictorGGF(df_train)
bgf_summary = ggf_model_exp1.fit_bgf(penalty_coef=penalty_val)
ggf_summary = ggf_model_exp1.fit_ggf(penalty_coef=penalty_val)

# Predict shopper equity during prediction period.
# NOTE(review): get_pred_equity applies its default discount_rate (0.1), while
# 'true_equity' is an undiscounted sum of order values - confirm this is intended.
pred_equity = get_pred_equity(ggf_model_exp1, prediction_period_duration=8)

# Add predicted equity to the combined training / testing data
df_all = df_all.merge(pred_equity, how='left', left_on='user_id', right_index=True)
df_all

Correlation between shopper frequency & monetary value is : 0.01471.
Beta-Gamma model successfully fitted
Gamma-Gamma model successfully fitted
Outliers in the data are causing the 'q' parameter for the Gamma-Gamma model to be < 1 therefore model predictions will fail.
Fix this by either removing outliers until you get 'q' > 1, or use raw monetary values to model CLV.


Unnamed: 0_level_0,train_frequency,train_recency,train_T,monetary_value,revenue,true_purchases,test_recency,test_T,true_equity,pred_equity
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,0.0,689.0,17.379999,17.379999,,,,,-2.985951
3,2.0,459.0,524.0,69.036666,207.109997,1.0,0.0,201.0,48.000000,25.806836
4,0.0,0.0,163.0,34.980000,34.980000,,,,,-5.271491
8,1.0,109.0,200.0,74.294999,148.589998,1.0,0.0,7.0,121.950001,17.393892
10,2.0,375.0,492.0,36.663333,109.990000,1.0,0.0,177.0,140.779999,14.203999
...,...,...,...,...,...,...,...,...,...,...
99989,0.0,0.0,254.0,44.950001,44.950001,,,,,-4.653530
99990,1.0,25.0,662.0,41.500000,83.000000,,,,,3.474670
99993,0.0,0.0,490.0,33.000000,33.000000,,,,,-3.570680
99995,1.0,77.0,482.0,229.995001,459.990002,,,,,27.335000


### Raw Monetary Value CLV Prediction Model

In [32]:
# Create class for Raw Monetary Value prediction model
import lifetimes.utils
class PredictorRawMonetary:
    """CLV predictor that pairs a `lifetimes.BetaGeoFitter` with the raw 'monetary_value' column.

    Usage: construct with an RFM summary dataframe, call `fit_bgf`, then `predict_clv`.

    Attributes:
        df_summary  - the summary dataframe given to the constructor; must contain the
                      'frequency', 'recency', 'T' and 'monetary_value' columns
        correlation - correlation between 'monetary_value' and 'frequency' among
                      customers with a non-zero frequency (reported for information only)
        bgf         - the BetaGeoFitter, available after `fit_bgf`
    """

    def __init__(self, df_summary):
        """Store the summary data and report the frequency / monetary value correlation."""
        self.df_summary = df_summary
        repeat_customers = self.df_summary[self.df_summary.frequency != 0]
        self.correlation = repeat_customers[['monetary_value', 'frequency']].corr().values[0,1]

        print(f"Correlation between shopper frequency & monetary value is : {float(self.correlation):.5f}.")

    def fit_bgf(self, penalty_coef : float=0.01):
        """Fit a BetaGeoFitter on frequency, recency and T.

        Args:
            penalty_coef - penalizer coefficient handed to `lifetimes.BetaGeoFitter`

        Returns:
            The fitted model's `summary`
        """
        self.bgf = lifetimes.BetaGeoFitter(penalty_coef)
        self.bgf.fit(self.df_summary['frequency'],
                     self.df_summary['recency'],
                     self.df_summary['T'])

        print("Beta-Gamma model successfully fitted")
        return self.bgf.summary
    
    def predict_clv(self, time : int=12, discount_rate : float=0.1, freq : str="D"):
        """Predict Customer Lifetime Value for every customer in `df_summary`.

        `fit_bgf` must have been called first. The 'monetary_value' column is
        passed to the CLV helper as-is.

        Args:
            time (int, optional) - the lifetime expected for the user in months. Default: 12
            discount_rate (float, optional) - the monthly adjusted discount rate. Default: 0.1
            freq (string, optional) - {"D", "H", "M", "W"} for day, hour, month, week. This represents what unit of time your T is measured in.

        Returns:
            DataFrame - the Series returned by `lifetimes.utils._customer_lifetime_value`,
            converted to a single-column frame with `.to_frame()`
        """

        # Predict customer lifetime value.
        # NOTE: `_customer_lifetime_value` is a private helper of `lifetimes.utils`.
        clv_preds_df = lifetimes.utils._customer_lifetime_value(
                            self.bgf,
                            self.df_summary['frequency'],
                            self.df_summary['recency'],
                            self.df_summary['T'],
                            self.df_summary['monetary_value'],
                            time=time,
                            discount_rate=discount_rate,
                            freq=freq
                        ).to_frame()
        
        return clv_preds_df

In [33]:
# Fit the BetaGeoFitter on the full customer summary, then predict CLV from the
# raw monetary values using predict_clv's default arguments
penalty_val = 0.01
rmv_model = PredictorRawMonetary(df_rfm)
bgf_summary = rmv_model.fit_bgf(penalty_val)
clv_preds = rmv_model.predict_clv()
clv_preds

Correlation between shopper frequency & monetary value is : 0.00726.


In [36]:
# Experiment 1 : Training Period = 1/1/2022 - 30/4/2024, Testing Period = the 8 months that follow
# (30/4/2024 + relativedelta(months=8) puts the end of the testing period on 30/12/2024)
df_train, df_test, df_all = get_train_test_rfm(
    df,
    train_period_start=datetime(2022, 1, 1).date(),
    train_period_end=datetime(2024, 4, 30).date(),
    prediction_period_duration=8,
)

# Fit the raw monetary value model (BetaGeoFitter only) to the training data
penalty_val = 0.01
rmv_model_exp1 = PredictorRawMonetary(df_train)
bgf_summary = rmv_model_exp1.fit_bgf(penalty_coef=penalty_val)

# Predict shopper equity during prediction period.
# NOTE(review): get_pred_equity applies its default discount_rate (0.1), while
# 'true_equity' is an undiscounted sum of order values - confirm this is intended.
pred_equity = get_pred_equity(rmv_model_exp1, prediction_period_duration=8)

# Add predicted equity to the combined training / testing data
df_all = df_all.merge(pred_equity, how='left', left_on='user_id', right_index=True)
df_all

Correlation between shopper frequency & monetary value is : 0.01471.
Beta-Gamma model successfully fitted


Unnamed: 0_level_0,train_frequency,train_recency,train_T,monetary_value,revenue,true_purchases,test_recency,test_T,true_equity,pred_equity
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.0,0.0,689.0,17.379999,17.379999,,,,,1.192063
3,2.0,459.0,524.0,69.036666,207.109997,1.0,0.0,201.0,48.000000,23.960483
4,0.0,0.0,163.0,34.980000,34.980000,,,,,4.235652
8,1.0,109.0,200.0,74.294999,148.589998,1.0,0.0,7.0,121.950001,15.029285
10,2.0,375.0,492.0,36.663333,109.990000,1.0,0.0,177.0,140.779999,12.873266
...,...,...,...,...,...,...,...,...,...,...
99989,0.0,0.0,254.0,44.950001,44.950001,,,,,4.804843
99990,1.0,25.0,662.0,41.500000,83.000000,,,,,2.887690
99993,0.0,0.0,490.0,33.000000,33.000000,,,,,2.706649
99995,1.0,77.0,482.0,229.995001,459.990002,,,,,24.450325
