# Predicting Customer Lifetime Value¶
Using past purchasing history of customers to build a model that can predict the CLV for new customers

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn.metrics

In [2]:
raw_data = pd.read_csv("customer_spend.csv")
raw_data.head(2)

Unnamed: 0,CUST_ID,New_Customer,Sep-21,Oct-21,Nov-21,Dec-21,Jan-22,Feb-22,Mar-22,Apr-22,Lifetime Value
0,1001,Yes,2649.6,1152.0,921.6,3801.6,1382.4,1728.0,691.2,1267.2,11520
1,1002,No,709.55,308.5,246.8,1018.05,370.2,462.75,185.1,339.35,3085


In [3]:
raw_data.dtypes

CUST_ID             int64
New_Customer       object
Sep-21            float64
Oct-21            float64
Nov-21            float64
Dec-21            float64
Jan-22            float64
Feb-22            float64
Mar-22            float64
Apr-22            float64
Lifetime Value      int64
dtype: object

# Correlation Analysis

In [4]:
cleaned_data = raw_data.drop("CUST_ID", axis=1)
cleaned_data.head(2)

Unnamed: 0,New_Customer,Sep-21,Oct-21,Nov-21,Dec-21,Jan-22,Feb-22,Mar-22,Apr-22,Lifetime Value
0,Yes,2649.6,1152.0,921.6,3801.6,1382.4,1728.0,691.2,1267.2,11520
1,No,709.55,308.5,246.8,1018.05,370.2,462.75,185.1,339.35,3085


In [5]:
#correlation of all columns to CLV, Show 5 Highest correlations.
cleaned_data.corr()['CLV'].nlargest(5,)

KeyError: 'CLV'

In [None]:
#Months are strongest correlation to the target variable (CLV).  Month 1 (73.4%), Month 5, Month 3, etc.
#Perfect correlation = 1, so strongest relationship is the highest value in month data. 
cleaned_data.corr()['CLV'].nlargest(5,).plot()

In [None]:
#corr = cleaned_data.corr()['CLV']
f,ax = plt.subplots(figsize=(10,8))
corr = cleaned_data.corr()
sn.heatmap(corr,mask=np.zeros_like(corr, dtype=bool), cmap=sn.diverging_palette(200,110,as_cmap=True), square=True, ax=ax)

In [None]:
#corr = cleaned_data.corr()['CLV']
sn.pairplot(cleaned_data[['CLV', 'MONTH_2', 'MONTH_1','MONTH_5']])
sn.lmplot(x="CLV", y="MONTH_1", data=cleaned_data);

In [None]:
sn.scatterplot(data = cleaned_data, x = "CLV", y = "Sep-21")
#plot the line of best fit for highest correlation month.
sn.lmplot(x="CLV", y="MONTH_1", data=cleaned_data);
#plot the line of Best fit for lowest correlation month.
sn.lmplot(x="CLV", y="MONTH_6", data=cleaned_data);

# Do Training & Testing Split
To perform supervised machine learning - need to split the data into training (80%) & testing (20%) datasets.

In [9]:
#Need to .Drop the Target column ('CLV') as a predictor.  
#Set Target as 'CLV' column. Training (.8) & testing (.2)

predictors = cleaned_data.drop(["Lifetime Value"], axis=1)
targets = cleaned_data.["LifetimeValue"]
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets, test_size=.2)
print( "Predictor - Training: ", pred_train.shape, "Predictor - testing: ", pred_test.shape)

AttributeError: 'DataFrame' object has no attribute 'LifetimeValue'

# Build a Linear Regression equation for predicting CLV and 
then check its accuracy against the test dataset.

In [None]:
#Build model on training data
model = LinearRegression()
model.fit(pred_train, tar_train)
print("Coefficients: \n", model.coef_)
print("Intercept: ", model.intercept_)


#test on testing data
predictions = model.predict(pred_test)
predictions

print("Accuracy of model:", sklearn.metrics.r2_score(tar_test, predictions).round(2))
#output shows the accuracy of the predicting CLV model is ?%.

# Predicting the CLV for a new customer

In [10]:
#let's say new customer spends = first 6 months €100, 0, €50, €0, €0, €0.
#Use this model to predict this customer's CLV.

new_data = np.array([100,0,0,0,0,0]).reshape(1,-1)
new_pred=model.predict(new_data)
print("The CLV for the new customer who bought only in first 3-6 months: €", new_pred[0].round(2))

NameError: name 'model' is not defined

In [None]:
#customer spends 0 in months 1 to 6

second_data = np.array([300,0, 0,0,0,0]).reshape(1,-1)
second_pred=model.predict(second_data)
print("The CLV for the new customer who bought only €100 in first month: €", second_pred[0].round(2))

In [None]:
#Customer spends 300 in months 3 to 6 only. 
new_data = np.array([0,0,300,300,300,300]).reshape(1,-1)
new_pred=model.predict(new_data)
print("The CLV for the new customer who bought only in last 3-6 months: €", new_pred[0].round(2))

In [None]:
#Customer spends 0 in months 1 to 6. 
new_data = np.array([0,0,0,0,0,0]).reshape(1,-1)
new_pred=model.predict(new_data)
print("The CLV for the new customer who did not buy in first 6 months: €", new_pred[0].round(2))