<a href="https://colab.research.google.com/github/cbonnin88/E-commerce-Retention-Engine/blob/main/Customer_Lifetime_Value.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
from google.colab import auth
from google.cloud import bigquery
import polars as pl
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
import scipy.stats as stats
from statsmodels.stats.proportion import proportions_ztest
import pandas as pd

In [2]:
auth.authenticate_user()
project_id = 'poetic-maxim-486609-e5'
client = bigquery.Client(project=project_id)

In [12]:
query = """
SELECT
  user_id,
  COUNT(order_id) AS frequency,
  AVG(sale_price) AS monetary,
  DATE_DIFF(MAX(EXTRACT(DATE FROM created_at)), MIN(EXTRACT(DATE FROM created_at)), DAY) AS tenure,
  SUM(sale_price) AS target_clv
FROM `bigquery-public-data.thelook_ecommerce.order_items`
WHERE status NOT IN ('Cancelled','Returned')
GROUP BY 1
"""

In [13]:
df_ml = pl.from_pandas(client.query(query).to_dataframe())

In [14]:
display(df_ml.head())

user_id,frequency,monetary,tenure,target_clv
i64,i64,f64,i64,f64
20415,10,38.962,1776,389.620002
11973,8,48.91375,642,391.310001
87346,8,70.51375,432,564.110002
93461,8,55.398749,1838,443.189994
9650,8,47.695001,40,381.560005


# **Filter out users with only 1 purchase (LTV prediction is most useful for repeat behavoir)**

In [15]:
# 2.
df_ml = df_ml.filter(pl.col('frequency') >1)

In [16]:
print(f'Dataset Size: {df_ml.shape}')

Dataset Size: (34124, 5)


# **Training The Model**

- I will use a **Random Forest Regressor**

In [17]:
# Defining Features (X) and Target (y)
X = df_ml.select(['frequency','monetary','tenure']).to_numpy()
y = df_ml.select(['target_clv']).to_numpy().ravel()

In [18]:
# Split: 80% to train the 'brain', 20% to test it
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2,random_state=42)

In [20]:
# Initialize and Train
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train,y_train)

Evaluatioin & Visualization

- I compare its **Predictions** against the **Actual** values in our test set.

In [21]:
# Make predictions
y_pred = rf_model.predict(X_test)

In [22]:
# Calculate Metrics
mae = mean_absolute_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

In [23]:
print(f'Mean Absolute Error: €{mae:.2f}')
print(f'R-Squared Score: {r2:.2f}')

Mean Absolute Error: €0.33
R-Squared Score: 1.00


# **Visualize Actual vs Predicted**

In [40]:
fig = px.scatter(
    x=y_test,
    y=y_pred,
    labels={'X':'Actual LVT (€)','y':'Predicted LTV (€)'},
    title='Actual vs. Predicted Customer Lifetime Value',
    opacity=0.5
)

fig.add_shape(
    type='line',
    x0=min(y_test),
    y0=min(y_test),
    x1=max(y_test),
    y1=max(y_test),
    line=dict(color='Red',dash='dash')
)

# **A/B Testing**

In [27]:
# I created a dummy experiment dataset based on 'TheLook' user counts
# I am assuming that we have 10000 in each group

n_control = 10000
n_variant = 10000

In [28]:
# Defining simulated Conversion Rates (CVR)
control_conversions = np.random.binomial(1,0.50,n_control)
variant_conversions = np.random.binomial(1,0.058,n_variant)

In [29]:
# Creating a Polars DataFrame for the results
ab_results = pl.DataFrame({
    'group':['Control'] * n_control + ['Variant'] * n_variant,
    'converted': np.concatenate([control_conversions,variant_conversions])
})

**The Statistical Test (Z-Test for Proportions)**

- I determine if **0.8% lift** is statistically Significant or just random noise

In [31]:
conversions = [control_conversions.sum(),variant_conversions.sum()]
totals = [n_control, n_variant]

z_stat, p_value = proportions_ztest(conversions,totals)

In [33]:
print(f'Control Conversion Rate: {control_conversions.mean():.2%}')
print(f'Variant Conversion Rate: {variant_conversions.mean():.2%}')
print(f'P-Value: {p_value:.4f}')

if p_value < 0.05:
  print('Result: Statistically Significant!, We should rool out the feature.')
else:
  print('Result: Not Significant. Keep the Control.')

Control Conversion Rate: 49.97%
Variant Conversion Rate: 5.81%
P-Value: 0.0000
Result: Statistically Significant!, We should rool out the feature.


# **Visualizing the "Confidence"**

In [34]:
# Calculate standard error for the error bars

ctrl_se = np.sqrt(0.05 * (1-0.05) / n_control)
var_se = np.sqrt(0.058 * (1-0.058) / n_variant)

In [39]:
plot_data = pd.DataFrame({
    'Group':['Control','Variant'],
    'Conversion Rate':[control_conversions.mean(),variant_conversions.mean()],
    'Standard Error':[ctrl_se* 1.96, var_se*1.96] # 95% Confidence Interval
})

In [41]:
fig2 = px.bar(
    plot_data,
    x='Group',
    y='Conversion Rate',
    color='Conversion Rate',
    color_continuous_scale= 'viridis',
    error_y='Standard Error',
    title='A/B Test Results: Control vs. Variant (Viridis Scale)',
    text_auto='.2%'
)

fig2.update_layout(
    template='plotly_white',
    coloraxis_showscale=False,
    yaxis_tickformat='.1%',
    yaxis_title='Conversion Rate (%)',
    hovermode='x unified'
)

fig2.show()