In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, classification_report
import shap 
import matplotlib.pyplot as plt

customer_metrics = pd.read_csv('../data/processed/customer_segments.csv')

In [4]:
def prepare_features_for_clv(data):
  features = data.copy()

  features['last_purchase'] = pd.to_datetime(features['last_purchase'])

  features['recency'] = (features['last_purchase'].max() - features['last_purchase']).dt.days

  features['avg_purchase_interval'] = features['customer_lifespan'] / features['order_count']

  features['purchase_regularity'] = features['order_count'] / features['customer_lifespan']

  model_features = ['recency', 'purchase_frequency', 'avg_order_value', 'avg_purchase_interval',              'purchase_regularity']

  return features[model_features]

X = prepare_features_for_clv(customer_metrics)
Y = customer_metrics['total_spend']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)


In [7]:
def train_clv_model(X_train, Y_train):
  model = RandomForestRegressor(n_estimators=100, random_state=42)
  model.fit(X_train, Y_train)
  return model

clv_model = train_clv_model(X_train_scaled, Y_train)

train_predictions = clv_model.predict(X_train_scaled)
test_predictions = clv_model.predict(X_test_scaled)

print('CLV Model Performance:')
print(f'Training RMSE {np.sqrt(mean_squared_error(Y_train, train_predictions)):.2f}')
print(f'Testing RMSE {np.sqrt(mean_squared_error(Y_test, test_predictions)):.2f}')

CLV Model Performance:
Training RMSE 2734.78
Testing RMSE 10667.76
