In [None]:
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import squarify
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from lifetimes.plotting import plot_period_transactions
# Notebook-wide pandas display settings, applied in a single call: no limit on
# displayed columns or rows, 1000-character display width, and floats
# rendered with eight decimal places.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    "display.width", 1000,
    'display.float_format', lambda value: '%.8f' % value,
)
import os


In [None]:
# The CSV location can be overridden with the CLTV_DATA_PATH environment
# variable; when it is unset, the original machine-specific path is used, so
# existing runs behave exactly as before.
data_path = os.environ.get(
    "CLTV_DATA_PATH",
    r"/Users/abdullahcetin/Documents/python /git_folder/Data_set/data.csv",
)
df_ = pd.read_csv(data_path, encoding='unicode_escape')
# Work on a copy so the raw frame stays available, untouched, in `df_`.
df = df_.copy()

def check_df(dataframe):
    """Print a quick structural overview of ``dataframe``.

    Sections are printed in this order: the first 10 rows, the ``info()``
    report, the shape, the ``describe()`` summary at selected percentiles
    (transposed), the per-column null counts, and the column dtypes.

    Args:
        dataframe: the pandas DataFrame to inspect.

    Returns:
        None. Everything is written to standard output.
    """
    print("##################### First 10 Observations #####################")
    print(dataframe.head(10))
    print("##################### Column names #####################")
    # info() writes its own report and returns None, so it is called directly
    # on the argument: wrapping it in print() would add a stray "None", and it
    # must inspect the frame that was passed in, not a notebook-level variable.
    dataframe.info()
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Quantiles #####################")
    print(dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T)
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Types #####################")
    print(dataframe.dtypes)
# Print the overview of the freshly loaded working copy, before any cleaning.
check_df(df)

In [None]:
# Data cleaning
# Drop rows with any missing value, rows whose invoice number contains "C",
# and rows with a non-positive quantity or unit price.
# (presumably "C" marks cancelled invoices; confirm against the data dictionary)
df = df.dropna()
is_cancelled = df["InvoiceNo"].str.contains("C", na=False)
# .copy() yields an independent frame, so the later in-place writes (outlier
# capping, TotalPrice) do not trigger pandas' SettingWithCopyWarning.
df = df[~is_cancelled & (df["Quantity"] > 0) & (df["UnitPrice"] > 0)].copy()
df.describe().T

In [None]:
# Size of the cleaned frame, followed by the number of distinct invoices.
print(df.shape)
df.InvoiceNo.nunique()

In [None]:
# Peek at the first rows of the cleaned data.
df.head()

In [None]:
# correction of outliers.

def outlier_thresholds(dataframe, variable):
    """Return ``(low_limit, up_limit)`` outlier bounds for one column.

    The bounds lie 1.5 times the spread between the 1st and 99th percentiles
    of ``dataframe[variable]`` below the 1st percentile and above the 99th.
    """
    lower_q, upper_q = (dataframe[variable].quantile(q) for q in (0.01, 0.99))
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin


def replace_with_thresholds(dataframe, variable):
    """Cap ``dataframe[variable]`` at its upper outlier threshold, in place.

    Only the upper bound returned by ``outlier_thresholds`` is applied; values
    below the lower bound are left unchanged, as before.

    Args:
        dataframe: the DataFrame to modify (its column is replaced in place).
        variable: name of the column to cap.

    Returns:
        None.
    """
    _, up_limit = outlier_thresholds(dataframe, variable)
    # clip() builds a new column and upcasts when needed, so capping an
    # integer column at a fractional limit no longer relies on the silent
    # int -> float upcast that `.loc[mask, col] = up_limit` performed
    # (deprecated in pandas 2.1).
    dataframe[variable] = dataframe[variable].clip(upper=up_limit)

In [None]:
# Cap the upper tail of both inputs, then look at the summary again.
for capped_column in ("Quantity", "UnitPrice"):
    replace_with_thresholds(df, capped_column)
df.describe().T

In [None]:
# Total price of each row: quantity times unit price.
df["TotalPrice"] = df["Quantity"].mul(df["UnitPrice"])
df.head()

In [None]:

# Total price per invoice (first few invoices only).
df.groupby("InvoiceNo")[["TotalPrice"]].sum().head()

In [None]:
# Parse the whole column in one vectorised call instead of invoking
# pd.to_datetime once per row through .apply().
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

In [None]:
# Latest invoice timestamp in the data.
# NOTE(review): the value previously noted here, '9/9/2011 9:52', looks like
# the maximum of the unparsed strings rather than of the parsed datetimes;
# re-check it after a fresh run.
df.InvoiceDate.max()


In [None]:
# Fixed analysis date: the customer age "T" is measured back from this date
# in the per-customer aggregation that follows.
today_date = dt.datetime(2011, 12, 11)

In [None]:
# Per-customer metrics used for both the RFM scores and the CLTV models:
#   recency   - days between the customer's first and last invoice
#   T         - days between the customer's first invoice and today_date
#   frequency - number of distinct invoices
#   monetary  - sum of TotalPrice
# Named aggregation labels every output column explicitly, instead of
# dropping a column level and renaming the result by position.
cltv_df = df.groupby('CustomerID').agg(
    recency=('InvoiceDate', lambda dates: (dates.max() - dates.min()).days),
    T=('InvoiceDate', lambda dates: (today_date - dates.min()).days),
    frequency=('InvoiceNo', 'nunique'),
    monetary=('TotalPrice', 'sum'),
)
# .copy() makes the filtered result an independent frame, so the columns
# added to it later are not written to a slice (SettingWithCopyWarning).
cltv_df = cltv_df[cltv_df["monetary"] > 0].copy()
cltv_df.head()

In [None]:
# Calculation of RFM scores: each metric is ranked (ties broken by order of
# appearance) and cut into five equal-sized bins.
# Each score is derived from its own metric; previously the frequency and
# monetary scores were both computed from the recency column.
# NOTE(review): `recency` here is the first-to-last-invoice span, so the
# reversed labels give the lowest score to the longest span. Confirm that
# this is the intended recency definition for RFM.
cltv_df['Recency_Score'] = pd.qcut(cltv_df['recency'].rank(method="first"), 5, labels=[5, 4, 3, 2, 1])
cltv_df['Frequency_Score'] = pd.qcut(cltv_df['frequency'].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
cltv_df['Monetary_Score'] = pd.qcut(cltv_df['monetary'].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
# The segment key combines only the recency and frequency scores.
cltv_df['RFM_Score'] = cltv_df['Recency_Score'].astype(str) + cltv_df['Frequency_Score'].astype(str)
cltv_df.head()

In [None]:


# Regex patterns over the two-character RFM_Score (recency score followed by
# frequency score) mapped to segment names.
# The 'need_attention' label no longer carries a leading space, so it can be
# matched and grouped like the other labels.
# NOTE(review): 'cant_loose' and 'potential_loyalsts' are kept exactly as
# spelled, in case these labels are already relied on elsewhere.
seg_map = {
    r'[1-2][1-2]': 'hibernating',
    r'[1-2][3-4]': 'at_Risk',
    r'[1-2]5': 'cant_loose',
    r'3[1-2]': 'about_to_sleep',
    r'33': 'need_attention',
    r'[3-4][4-5]': 'loyal_customers',
    r'41': 'promising',
    r'51': 'new_customers',
    r'[4-5][2-3]': 'potential_loyalsts',
    r'5[4-5]': 'champions'
}
cltv_df['Segment_RFM'] = cltv_df['RFM_Score'].replace(seg_map, regex=True)
cltv_df.head()

In [None]:
# Mean and customer count of each metric within every RFM segment.
cltv_df.groupby("Segment_RFM")[["recency", "frequency", "monetary"]].agg(['mean', 'count'])

In [None]:
# Preparation of the CLTV model inputs.
# Keep only customers with more than one invoice. .copy() makes the result an
# independent frame, so the column writes below do not hit a filtered slice
# (SettingWithCopyWarning).
cltv_df = cltv_df[cltv_df['frequency'] > 1].copy()
# cltv_df = cltv_df[(cltv_df['recency'] > 1)]

# Average spend per invoice (total spend divided by the number of invoices).
cltv_df["monetary"] = cltv_df["monetary"] / cltv_df["frequency"]
# Convert both durations from days to weeks.
cltv_df["recency"] = cltv_df["recency"] / 7
cltv_df["T"] = cltv_df["T"] / 7
# NOTE: this cell overwrites monetary / recency / T in place, so running it
# twice on the same kernel rescales them again; re-run from the aggregation
# cell instead.

check_df(cltv_df)

In [None]:
# Fit the BetaGeoFitter purchase-count model on the per-customer frequency,
# recency and T columns (recency and T were converted to weeks earlier),
# then show the fitted parameter summary.
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(cltv_df['frequency'],
        cltv_df['recency'],
        cltv_df['T'])
bgf.summary

In [None]:
# Diagnostic plot produced by lifetimes for the fitted model.
plot_period_transactions(bgf)
# block=True waits for the figure window when run as a script.
plt.show(block=True)

In [None]:
# Expected number of purchases per customer over the next 4 periods.
# (presumably 4 weeks = one month, since recency and T are in weeks; confirm)
cltv_df["expected_purc_a_month"] = bgf.conditional_expected_number_of_purchases_up_to_time(4, cltv_df['frequency'], cltv_df['recency'], cltv_df['T'])

# Same quantity over 12 periods (presumably three months).
# NOTE(review): this uses `predict` while the line above uses the long method
# name; they appear to be used interchangeably here, so confirm and pick one.
cltv_df["expected_purc_3_month"] = bgf.predict(12, cltv_df['frequency'], cltv_df['recency'], cltv_df['T'])

cltv_df.head()

In [None]:
# Fit the GammaGammaFitter spend model on each customer's invoice count and
# average spend per invoice (`monetary` was divided by `frequency` earlier).
ggf = GammaGammaFitter(penalizer_coef=0.01)

ggf.fit(cltv_df['frequency'], cltv_df['monetary'])

In [None]:
# Model-based expected average profit for each customer, from the fitted
# Gamma-Gamma model.
cltv_df["expected_average_profit"] = ggf.conditional_expected_average_profit(cltv_df['frequency'],
                                                                             cltv_df['monetary']) # per purchase
cltv_df.head()

In [None]:
# Calculation of CLTV: combines the fitted purchase model (bgf) with the
# fitted spend model (ggf) for every customer.
# NOTE(review): `time=3` is presumably a horizon in months and `freq="W"` the
# unit of recency/T; confirm against the installed lifetimes version.
cltv_df["cltv"] = ggf.customer_lifetime_value(bgf,
                                   cltv_df['frequency'],
                                   cltv_df['recency'],
                                   cltv_df['T'],
                                   cltv_df['monetary'],
                                   time=3,
                                   freq="W",  # Frequency period ("Weeks")
                                   discount_rate=0.01)
cltv_df.head()

In [None]:
# Quartile tiers of CLTV: the lowest quarter is "D", the highest is "A".
cltv_df["Segment_Cltv"] = pd.qcut(cltv_df["cltv"], q=4, labels=list("DCBA"))
# Twenty customers with the highest CLTV.
cltv_df.sort_values(by="cltv", ascending=False).head(20)

In [None]:
# Summary statistics of the final per-customer table, one row per column.
cltv_df.describe().T

In [None]:

# Count, mean and sum of the model outputs per CLTV tier, best tier first.
# The aggregations are passed as a list (not a set) so the output column
# order is fixed; iteration order of a set of strings can change between
# kernel restarts.
(
    cltv_df.groupby("Segment_Cltv")[
        ["expected_purc_a_month", "expected_purc_3_month", "expected_average_profit", "cltv"]
    ]
    .agg(["count", "mean", "sum"])
    .sort_values("Segment_Cltv", ascending=False)
)

In [None]:
# cltv_df[(cltv_df["Segment_Cltv"] == "A") & (cltv_df["Segment_RFM"] == "cant_loose")].head()

# Five highest-CLTV customers of tier "A", with their RFM score and segment.
cltv_df.loc[
    cltv_df["Segment_Cltv"] == "A",
    ["RFM_Score", "Segment_RFM", "cltv"],
].sort_values(by="cltv", ascending=False).head()