# **Customer Lifetime Value Prediction (Olist Marketplace)**

## Environment Setup & Dependencies

In [4]:
import pandas as pd
import numpy as np
import dill

from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import summary_data_from_transaction_data
from lifetimes.plotting import plot_probability_alive_matrix

## Data Acquisition

In [None]:
# Olist CSV files to load, keyed by the short table name used in this notebook
datasets = dict(
    orders='olist_orders_dataset.csv',
    items='olist_order_items_dataset.csv',
    customers='olist_customers_dataset.csv',
    payments='olist_order_payments_dataset.csv',
)

# Read every table into its own DataFrame (paths are relative to the working directory)
orders, items, customers, payments = (
    pd.read_csv(datasets[table])
    for table in ('orders', 'items', 'customers', 'payments')
)

## Data Integration & Preprocessing

In [6]:
# Source column -> short name used by every later cell.
# NOTE: from here on 'customer_id' holds Olist's customer_unique_id, not the
# per-order customer_id that is only used as a join key below.
column_names = {
    'customer_unique_id': 'customer_id',
    'order_purchase_timestamp': 'order_date',
    'price': 'sales',
    'customer_state': 'state',
    'customer_city': 'city',
}

# Join orders to their items and to the buyer's attributes, keep only the
# columns needed for modelling, rename them, and parse the purchase timestamp.
df = (
    orders
    .merge(items, how='inner', on='order_id')
    .merge(customers, how='inner', on='customer_id')
    .loc[:, list(column_names)]
    .rename(columns=column_names)
    .assign(order_date=lambda frame: pd.to_datetime(frame['order_date']))
)

## RFM Feature Engineering

In [7]:
# The observation window ends at the latest purchase present in the data
analysis_date = df['order_date'].max()

# Per-customer frequency, recency, T and monetary_value derived from the
# transaction log (one output row per customer_id)
rfm_summary = summary_data_from_transaction_data(
    df,
    'customer_id',
    'order_date',
    monetary_value_col='sales',
    observation_period_end=analysis_date,
)

# Attach one state/city per customer (the first value seen for that customer)
geo_metadata = df.groupby('customer_id')[['state', 'city']].agg('first')
rfm_summary = rfm_summary.join(geo_metadata)  # join defaults to how='left'

rfm_summary.head(5)

Unnamed: 0_level_0,frequency,recency,T,monetary_value,state,city
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000366f3b9a7992bf8c76cfdf3221e2,0.0,0.0,116.0,0.0,SP,cajamar
0000b849f77a49e4a4ce2b2a4ca5be3f,0.0,0.0,119.0,0.0,SP,osasco
0000f46a3911fa3c0805444483337064,0.0,0.0,542.0,0.0,SC,sao jose
0000f6ccb0745a6a4b88665a16c9f078,0.0,0.0,326.0,0.0,PA,belem
0004aac84e0df4da2b147fca70cf8255,0.0,0.0,293.0,0.0,SP,sorocaba


## BG/NBD Model: Training & Inference

In [None]:
# Tunable settings: penalizer coefficient passed to the fitters, and the
# forecast horizon handed to the purchase-count prediction below
PENALIZER = 0.01
FORECAST_DAYS = 30

# Fit the BG/NBD model on every customer's frequency / recency / T
bgf = BetaGeoFitter(penalizer_coef=PENALIZER)
bgf.fit(
    frequency=rfm_summary['frequency'],
    recency=rfm_summary['recency'],
    T=rfm_summary['T'],
)

# Expected number of purchases per customer over the next FORECAST_DAYS
rfm_summary['expected_purchases_30d'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    FORECAST_DAYS,
    frequency=rfm_summary['frequency'],
    recency=rfm_summary['recency'],
    T=rfm_summary['T'],
)

## Monetary Modeling & CLV Calculation

In [9]:
# The monetary model is trained only on customers with at least one repeat purchase
returning_customers = rfm_summary.loc[rfm_summary['frequency'].gt(0)]

# Fit the Gamma-Gamma spend model on the returning customers
ggf = GammaGammaFitter(penalizer_coef=PENALIZER)
ggf.fit(
    frequency=returning_customers['frequency'],
    monetary_value=returning_customers['monetary_value'],
)

# Score ALL customers (not just returning ones): 12-month CLV first, then the
# probability of still being active. Column order matters for the exported CSV.
rfm_summary['clv_12m'] = ggf.customer_lifetime_value(
    bgf,
    frequency=rfm_summary['frequency'],
    recency=rfm_summary['recency'],
    T=rfm_summary['T'],
    monetary_value=rfm_summary['monetary_value'],
    time=12,  # horizon in months
    discount_rate=0.01,
)

rfm_summary['prob_alive'] = bgf.conditional_probability_alive(
    frequency=rfm_summary['frequency'],
    recency=rfm_summary['recency'],
    T=rfm_summary['T'],
)

## Model Persistence & Data Export

In [10]:
# Destination for the scored per-customer table
OUTPUT_FILE = 'olist_rfm_summary.csv'

# Fitted models to serialise, keyed by the file each one is written to
artifacts = {'bgf_model.pkl': bgf, 'ggf_model.pkl': ggf}

# Serialise each model with dill.
# NOTE: these are pickle-style files; only load them back from trusted sources.
for path, fitted_model in artifacts.items():
    with open(path, mode='wb') as handle:
        dill.dump(fitted_model, handle)

# Write the summary table, keeping the customer_id index as the first column
rfm_summary.to_csv(OUTPUT_FILE, index=True)

print(f"INFO: Successfully exported {len(artifacts)} models and '{OUTPUT_FILE}'.")

INFO: Successfully exported 2 models and 'olist_rfm_summary.csv'.
