In [53]:
import pandas as pd
import numpy as np

In [54]:
import plotly.express as px

In [55]:
import plotly.io as pio
# Use plotly's "notebook" renderer as the default for figure display.
# NOTE(review): the heatmap cells further down call .show(renderer='browser'),
# which overrides this default for those figures — confirm which one is intended.
pio.renderers.default = "notebook"

In [56]:
# Load the raw data from the CSV file (path is relative to the working directory).
raw_csv_path = "online_retail.csv"
df = pd.read_csv(raw_csv_path)

In [57]:
# Keep only rows that can be attributed to a customer and look like a real sale:
#   - 'Customer ID' must be present
#   - 'Quantity' > 0 (drops returns)
#   - 'Price' > 0 (drops invalid sales)
df = df.dropna(subset=['Customer ID'])
# Both conditions are applied in one mask; the explicit .copy() makes df an
# independent frame so the column assignments in later cells do not trigger
# SettingWithCopyWarning on a filtered slice.
df = df[(df['Quantity'] > 0) & (df['Price'] > 0)].copy()

In [58]:
# Fix the date columns: parse 'InvoiceDate' into datetimes and derive the
# invoice month as a string (monthly period rendered via astype(str)).
invoice_ts = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'] = invoice_ts
df['InvoiceMonth'] = invoice_ts.dt.to_period('M').astype(str)

In [59]:
# Cohort month = the smallest 'InvoiceMonth' value seen for each customer,
# broadcast back onto every row of that customer via transform.
first_month_per_customer = df.groupby('Customer ID')['InvoiceMonth'].transform('min')
df['CohortMonth'] = first_month_per_customer

In [60]:
# Datetime version of the invoice-month string, used for month arithmetic later.
df['InvoiceMonth_dt'] = df['InvoiceMonth'].pipe(pd.to_datetime)


In [61]:
# Drop the 'Total' column if it is present.
# errors='ignore' keeps this cell runnable on a fresh kernel (nothing in this
# notebook creates 'Total') and safe to re-run; the original in-place drop
# raised KeyError once the column was gone.
df = df.drop(columns='Total', errors='ignore')

In [62]:
# Sanity check: list the columns currently present in df.
print(df.columns)

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country', 'Sales', 'InvoiceMonth',
       'CohortMonth', 'InvoiceMonth_dt'],
      dtype='object')


In [63]:
# Datetime version of the cohort-month string, needed for the month
# arithmetic in the next cell.
# 'InvoiceMonth_dt' is already derived from 'InvoiceMonth' in an earlier cell
# and nothing has changed it since, so it is not recomputed here.
df['CohortMonth_dt'] = pd.to_datetime(df['CohortMonth'])


In [64]:
# Number of whole calendar months between the invoice month and the
# customer's cohort month (0 when they are the same month).
year_gap = df['InvoiceMonth_dt'].dt.year - df['CohortMonth_dt'].dt.year
month_gap = df['InvoiceMonth_dt'].dt.month - df['CohortMonth_dt'].dt.month
df['CohortIndex'] = year_gap * 12 + month_gap

In [65]:
# Aggregate active customers per cohort: count distinct customers for every
# (CohortMonth, CohortIndex) pair and expose the count as 'ActiveCustomers'.
cohort_data = (
    df.groupby(['CohortMonth', 'CohortIndex'])['Customer ID']
    .nunique()
    .reset_index(name='ActiveCustomers')
)

In [66]:
# Calculate total customers per cohort: the number of customers active at
# CohortIndex 0 is taken as the size of the cohort.
is_first_month = cohort_data['CohortIndex'] == 0
total_customer = (
    cohort_data.loc[is_first_month, ['CohortMonth', 'ActiveCustomers']]
    .rename(columns={'ActiveCustomers': 'TotalCustomers'})
)

In [67]:
# Merge the cohort size (TotalCustomers) back onto every row of that cohort.
# - Dropping any existing 'TotalCustomers' first keeps the cell re-runnable:
#   without it a second run yields TotalCustomers_x / TotalCustomers_y and the
#   retention calculation can no longer find 'TotalCustomers'.
# - validate='many_to_one' makes pandas raise MergeError if total_customer
#   ever contains a duplicated CohortMonth, instead of silently duplicating rows.
cohort_data = (
    cohort_data
    .drop(columns='TotalCustomers', errors='ignore')
    .merge(total_customer, on='CohortMonth', validate='many_to_one')
)

In [68]:
# Calculate retention: fraction of a cohort's customers (TotalCustomers) that
# are active at each CohortIndex.
# The CohortMonth x CohortIndex matrix is built by the identical pivot in the
# following cell, so the redundant pivot that used to live here is removed.
cohort_data['RetentionRate'] = cohort_data['ActiveCustomers'] / cohort_data['TotalCustomers']

In [69]:
# Reshape to a matrix: one row per CohortMonth, one column per CohortIndex,
# cells holding the RetentionRate.
cohort_matrix = pd.pivot(
    cohort_data,
    index='CohortMonth',
    columns='CohortIndex',
    values='RetentionRate',
)


In [70]:
# Calculate churn as the complement of retention (1 - retention), plus a
# percentage-scaled copy for plotting.
churn_matrix = cohort_matrix.rsub(1)
churn_matrix_pct = churn_matrix.mul(100)

In [71]:
# Plot the retention heatmap: cells annotated with two decimals, blue scale.
retention_axis_labels = {
    "x": "Months Since First Purchase",
    "y": "CohortMonth",
    "color": "RetentionRate",
}
fig_retention = px.imshow(
    cohort_matrix,
    labels=retention_axis_labels,
    color_continuous_scale='Blues',
    text_auto=".2f",
)

In [72]:
# Set the y-axis autorange to "reversed", add the title, and open the figure
# with the 'browser' renderer.
(
    fig_retention
    .update_yaxes(autorange="reversed")
    .update_layout(title="Cohort Analysis Retention HeatMap")
    .show(renderer='browser')
)

In [73]:
# Plot the churn heatmap from the percentage matrix: cells annotated with one
# decimal, red scale.
churn_axis_labels = {
    "x": "Months Since First Purchase",
    "y": "Cohort Month",
    "color": "Churn%",
}
fig_churn = px.imshow(
    churn_matrix_pct,
    labels=churn_axis_labels,
    color_continuous_scale='Reds',
    text_auto=".1f",
)

In [74]:
# Set the y-axis autorange to "reversed", add the title, and open the figure
# with the 'browser' renderer.
(
    fig_churn
    .update_yaxes(autorange="reversed")
    .update_layout(title="Cohort Analysis Churn Heatmap")
    .show(renderer='browser')
)


In [75]:
# Persist the enriched row-level frame (without the index) for downstream use.
final_csv_path = "Online_retail_Final.csv"
df.to_csv(final_csv_path, index=False)

In [76]:
# Export both cohort matrices (CohortMonth rows x CohortIndex columns).
# The churn file must be written from churn_matrix; the original wrote
# cohort_matrix twice, so CohortMatrix_Churn.csv contained retention values.
cohort_matrix.to_csv("CohortMatrix_Retention.csv")
churn_matrix.to_csv("CohortMatrix_Churn.csv")