 ##  Predicting Customer Lifetime Value
 
 - Data downloaded from [Kaggle](https://www.kaggle.com/baetulo/lifetime-value)

In [54]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.model_selection  import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn.metrics

raw_data = pd.read_csv("raw_data.csv")

In [55]:
raw_data.dtypes

product_type          object
user_id              float64
join_date             object
hidden                 int64
product               object
STV                  float64
target               float64
credit_card_level     object
is_lp                  int64
aff_type              object
is_cancelled         float64
country_segment       object
dtype: object

In [56]:
raw_data.count()

product_type         881127
user_id              881127
join_date            881127
hidden               881127
product              881127
STV                  881127
target               881127
credit_card_level    881127
is_lp                881127
aff_type             881125
is_cancelled         807454
country_segment      881127
dtype: int64

In [57]:
raw_data.head()

Unnamed: 0,product_type,user_id,join_date,hidden,product,STV,target,credit_card_level,is_lp,aff_type,is_cancelled,country_segment
0,type_ex,7.0,2018-12-01 00:01:45,0,product_1,8.25,8.25,standard,0,PPL,,US
1,type_ex,20.0,2018-12-01 00:06:05,0,product_2,8.25,8.25,standard,0,PPL,,US
2,type_ex,22.0,2018-12-01 00:06:23,0,product_3,8.25,8.25,prepaid,0,PPL,,US
3,type_ex,26.0,2018-12-01 00:07:12,0,product_2,8.25,8.25,standard,0,PPL,,US
4,type_ex,59.0,2018-12-01 00:15:21,0,product_2,8.25,8.25,standard,0,PPL,,Other Countries


In [58]:
df1 = pd.get_dummies(raw_data["product_type"])

In [59]:
df2 = pd.concat((raw_data, df1), axis=1)

### Correlation

In [60]:
df2.head()

Unnamed: 0,product_type,user_id,join_date,hidden,product,STV,target,credit_card_level,is_lp,aff_type,is_cancelled,country_segment,type_ex,type_p,type_u,type_x
0,type_ex,7.0,2018-12-01 00:01:45,0,product_1,8.25,8.25,standard,0,PPL,,US,1,0,0,0
1,type_ex,20.0,2018-12-01 00:06:05,0,product_2,8.25,8.25,standard,0,PPL,,US,1,0,0,0
2,type_ex,22.0,2018-12-01 00:06:23,0,product_3,8.25,8.25,prepaid,0,PPL,,US,1,0,0,0
3,type_ex,26.0,2018-12-01 00:07:12,0,product_2,8.25,8.25,standard,0,PPL,,US,1,0,0,0
4,type_ex,59.0,2018-12-01 00:15:21,0,product_2,8.25,8.25,standard,0,PPL,,Other Countries,1,0,0,0


In [61]:
cleaned_data = df2[['target','hidden','STV','is_lp','type_ex','type_p','type_u','type_x']]
cleaned_data.corr()['target']

target     1.000000
hidden    -0.185045
STV        0.501814
is_lp      0.012936
type_ex    0.043607
type_p    -0.083589
type_u    -0.009093
type_x     0.068205
Name: target, dtype: float64

### Training and Testing Split

In [62]:
predictors = cleaned_data[['STV','type_ex','type_p','type_u','type_x']]
targets = cleaned_data.target

pred_train, pred_test, tar_train, tar_test  =   train_test_split(predictors, targets, test_size=.1)
print( "Predictor - Training : ", pred_train.shape, "Predictor - Testing : ", pred_test.shape )


Predictor - Training :  (793014, 5) Predictor - Testing :  (88113, 5)


## Build and Test Model
We build a Linear Regression equation for predicting CLV and then check its accuracy by predicting against the test dataset

In [63]:
#Build model on training data
model = LinearRegression()
model.fit(pred_train,tar_train)
print("Coefficients: \n", model.coef_)
print("Intercept:", model.intercept_)

#Test on testing data
predictions = model.predict(pred_test)
predictions

sklearn.metrics.r2_score(tar_test, predictions)

Coefficients: 
 [9.73378241e-01 2.03386842e+10 2.03386842e+10 2.03386842e+10
 2.03386842e+10]
Intercept: -20338684209.342915


0.2550377054943561

It shows a 26% accuracy. This is not a good model for predicting CLV