In [1]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
# Read the premium-student records from the JSON export and preview the first rows.
premium_path = './data/BASE A/premium_students.json'
premium = pd.read_json(premium_path)
premium.head(5)

Unnamed: 0,StudentId,RegisteredDate,SubscriptionDate
0,98723802,2017-11-01 00:06:09.988381,2017-11-01 01:20:11.000000
1,86905029,2017-11-01 00:19:31.215160,2017-11-17 21:58:57.163663
2,40935842,2017-11-01 00:23:43.138459,2017-11-01 01:26:17.114303
3,83184096,2017-11-01 00:27:33.721328,2018-05-18 14:08:42.000000
4,12771137,2017-11-01 00:32:57.191732,2017-11-01 00:59:44.849137


In [3]:
# Fraction of missing values in each column (0.0 means the column has no nulls).
premium.isna().mean()

StudentId           0.0
RegisteredDate      0.0
SubscriptionDate    0.0
dtype: float64

In [48]:
# Data prep
# Parse both timestamp columns. The columns are replaced with `premium[col] = ...`
# instead of `premium.loc[:, col] = ...`: on pandas >= 2.0 the `.loc[:, col]` form
# writes into the existing object-dtype column in place, so the column would not
# become datetime64 and the `.dt` accessors below would fail.
# `infer_datetime_format` is omitted because it is deprecated since pandas 2.0.
for date_col in ('RegisteredDate', 'SubscriptionDate'):
    premium[date_col] = pd.to_datetime(premium[date_col])

# Whole days elapsed between registration and subscription (`.dt.days` is the
# day component of the timedelta, so anything under 24h counts as 0).
premium['days_until_subscription'] = (premium['SubscriptionDate'] - premium['RegisteredDate']).dt.days

# Calendar dates (time of day dropped), used as the x values of the histograms.
premium['subscription_date'] = premium['SubscriptionDate'].dt.date
premium['registration_date'] = premium['RegisteredDate'].dt.date

premium.head()

Unnamed: 0,StudentId,RegisteredDate,SubscriptionDate,days_until_subscription,subs_date,reg_date,class,subscription_date,registration_date
0,98723802,2017-11-01 00:06:09.988381,2017-11-01 01:20:11.000000,0,2017-11-01,2017-11-01,0,2017-11-01,2017-11-01
1,86905029,2017-11-01 00:19:31.215160,2017-11-17 21:58:57.163663,16,2017-11-17,2017-11-01,1-30,2017-11-17,2017-11-01
2,40935842,2017-11-01 00:23:43.138459,2017-11-01 01:26:17.114303,0,2017-11-01,2017-11-01,0,2017-11-01,2017-11-01
3,83184096,2017-11-01 00:27:33.721328,2018-05-18 14:08:42.000000,198,2018-05-18,2017-11-01,+30,2018-05-18,2017-11-01
4,12771137,2017-11-01 00:32:57.191732,2017-11-01 00:59:44.849137,0,2017-11-01,2017-11-01,0,2017-11-01,2017-11-01


In [56]:
fig = px.histogram(premium, x=['registration_date', 'subscription_date'], facet_col='variable')
# fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[1]))
fig.update_layout(showlegend=False,
                  yaxis_title='Quantidade de usuários',
                  title='Cadastro e conversão, no período'
                 )
fig.update_xaxes(title='Período')
plot(fig)

In [103]:
def plot(fig):
    fig.update_layout(
        font=dict(family="Roboto", size=16),
        template='plotly_white'
    )
    colors = ['#250541', '#F93D55']
    fig.show()

In [41]:
# Count the distinct students observed for each registration-to-subscription
# delay (in days), then draw the counts as a bar chart.
df = (
    premium
    .groupby(['days_until_subscription'], as_index=False)
    .agg({'StudentId': 'nunique'})
)

fig = px.bar(df, x='days_until_subscription', y='StudentId')
fig.update_layout(
    title='Distribuição do tempo até conversão',
    xaxis_title='Dias entre cadastro e primeira compra',
    yaxis_title='Quantidade de usuários',
)
# One x tick every 30 days.
fig.update_xaxes(tickmode='linear', dtick=30)
plot(fig)

In [96]:
# Label each student with a conversion bucket based on the number of days
# between registration and subscription. The rules are applied in order; rows
# matching none of them keep no label.
delay_days = premium['days_until_subscription']
bucket_rules = (
    (delay_days == 0, '0 dias'),
    (delay_days.between(1, 30), '1 a 30 dias'),
    (delay_days > 30, '+30 dias'),
)
for rule_mask, bucket_label in bucket_rules:
    premium.loc[rule_mask, 'class'] = bucket_label

premium['class'].value_counts()

0 dias         2998
+30 dias       2042
1 a 30 dias    1220
Name: class, dtype: int64

In [102]:
# Per conversion bucket: number of rows and mean delay. Sorting by the mean
# delay orders the buckets from fastest to slowest conversion on the x axis.
df = (
    premium
    .groupby(['class'], as_index=False)
    .agg({'StudentId': 'count', 'days_until_subscription': 'mean'})
    .sort_values(by='days_until_subscription')
)

# Share of each bucket over all bucketed rows, in percent.
df['total'] = df['StudentId'].sum()
df['percentage'] = 100 * df['StudentId'] / df['total']

fig = px.bar(df, x='class', y='percentage')
fig.update_xaxes(type='category', title='Categoria')
# An empty tickvals list hides the y ticks; the value is printed above each bar instead.
fig.update_yaxes(tickvals=[], title='Percentual (%)')
# The bold tag is now closed with </b> (the original template ended in a second <b>).
fig.update_traces(texttemplate='<b>%{y:.1f}%</b>', textposition='outside')
fig.update_layout(
    title='Distribuição dos usuários, por categoria'
)
plot(fig)