In [1]:
import pandas as pd
import plotly.express as px

In [2]:
def plot(fig):
    """Apply the notebook's shared layout settings to ``fig`` and display it.

    Parameters
    ----------
    fig
        Figure-like object exposing ``update_layout(**kwargs)`` and ``show()``
        (in this notebook, the figure returned by ``plotly.express``). Its
        layout is updated through ``update_layout`` before it is shown.

    Returns
    -------
    None
        The figure is rendered via ``fig.show()``; nothing is returned.
    """
    fig.update_layout(
        font=dict(family="Roboto", size=16),
        template='plotly_white'
    )
    fig.show()

# Data Load and Prep

In [3]:
# Load the prepared premium-plans CSV into a DataFrame.
plans_raw = pd.read_csv(filepath_or_buffer='./data/prep/premium_plans_info.csv')

In [24]:
# Keep only plans that were never cancelled (rows with no `cancelled_at` value),
# ordered by creation timestamp.
plans = (
    plans_raw
    .loc[lambda frame: frame['cancelled_at'].isna()]
    .sort_values(by='created_at')
    # `created_at` is text here; its first 7 characters serve as the cohort label.
    .assign(cohort=lambda frame: frame['created_at'].str[:7])
)

plans.head()

Unnamed: 0,plan_id,student_id,created_at,plan_type,cancelled_at,cost,cohort
5405,78584527_Mensal_2015-08-08,78584527,2015-08-08 00:00:00.000000,Mensal,,29.9,2015-08
3577,55313801_Mensal_2015-08-19,55313801,2015-08-19 00:00:00.000000,Mensal,,29.9,2015-08
6917,96396664_Mensal_2015-08-23,96396664,2015-08-23 00:00:00.000000,Mensal,,29.9,2015-08
1416,27891872_Mensal_2015-09-06,27891872,2015-09-06 20:05:51.157000,Mensal,,29.9,2015-09
3570,5523173_Mensal_2015-09-12,5523173,2015-09-12 16:00:31.940000,Mensal,,29.9,2015-09


In [5]:
# Largest `created_at` value among the non-cancelled plans.
plans.loc[:, 'created_at'].max()

'2018-06-08 03:00:37.000000'

# Analysis

In [16]:
# Lifetime revenue per user, then averaged per acquisition cohort.

# One row per student: distinct plans, earliest plan timestamp,
# distinct plan types, and total amount paid across all plans.
user_ltv = (
    plans
    .groupby(['student_id'], as_index=False)
    .agg({'plan_id': 'nunique', 'created_at': 'min', 'plan_type': 'nunique', 'cost': 'sum'})
)

# A user's cohort is the 7-character prefix of their earliest `created_at`.
user_ltv['cohort'] = user_ltv['created_at'].str[:7]

cohort_ltv = (
    user_ltv
    .groupby(['cohort'], as_index=False)
    .agg(
        total_cohort_revenue=('cost', 'sum'),
        user_ltv=('cost', 'mean'),
        total_users=('student_id', 'nunique'),
    )
    # Cast through astype so the column dtype really becomes float; a
    # full-column `.loc[:, col] = ...` assignment may write into the existing
    # integer column in place (pandas >= 2.0) and leave the dtype unchanged.
    .astype({'total_users': float})
)

# Plot only cohorts from 2017-10 onward (string comparison on the cohort label).
df = cohort_ltv.loc[cohort_ltv['cohort'] >= '2017-10']
fig = px.line(df, x='cohort', y='user_ltv')
fig.update_layout(showlegend=False,
                  xaxis_title='Cohort do usuário',
                  yaxis_title='LTV (R$)',
                  title='LTV por cohort'
                 )

plot(fig)

In [26]:
# Median of the per-cohort average LTV for cohorts 2017-11 through 2018-02
# (`between` includes both endpoints by default).
cohort_ltv.loc[
    lambda frame: frame['cohort'].between('2017-11', '2018-02'),
    'user_ltv',
].median()

147.6698628762544