In [265]:
import pandas as pd
import plotly.express as px
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from scipy import stats
from sklearn.decomposition import PCA
import numpy as np
pd.set_option('max_columns', None)
pd.set_option('use_inf_as_na', True)

# Common functions

In [385]:
# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2

def find_cluster_size(data, kmax, pca_var=0.95, minibatch=True, batch_size=100):
    
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)
    
    pca = PCA(n_components=pca_var)
    points = pca.fit_transform(data_scaled)
    print('Original Data: {}, PCA: {}'.format(data_scaled.shape[1], points.shape[1]))
    
    wss = []
    cluster_sizes = []
    silhouettes = []
    
    for k in range(1, kmax):
        if k % 5 == 0:
            print(k)
        if minibatch:
            kmeans = MiniBatchKMeans(n_clusters = k, batch_size=batch_size).fit(points)
        else:
            kmeans = KMeans(n_clusters = k).fit(points)

        centroids = kmeans.cluster_centers_
        pred_clusters = kmeans.predict(points)

        wss.append(kmeans.inertia_)
#         if k > 1:
#             silhouettes.append(silhouette_score(points, pred_clusters, metric = 'euclidean'))
    
    cluster_sizes = [f for f in range(1, kmax)]
    
    metrics = pd.DataFrame({'cluster_size':cluster_sizes, 'wss':wss})
    
    return metrics

# User Profile

## Data Load

In [53]:
students_df = pd.read_csv('./data/prep/user_infos.csv').set_index('student_id')

In [54]:
students_df.isnull().mean()

signup_at               0.000000
university_name         0.000000
course_name             0.000000
state                   0.596017
signup_source           0.000000
city                    0.669500
user_client             0.140317
user_origin             0.140317
origin                  0.140317
course_area             0.000000
on_top_20_university    0.000000
region                  0.596017
total_plans             0.970950
ltv                     0.970950
first_purchase          0.970950
last_purchase           0.970950
has_purchased           0.000000
dtype: float64

In [55]:
students_df.loc[:, 'on_top_20_university'] = students_df['on_top_20_university'].astype(int)

In [56]:
student_profile = students_df.loc[:, ['signup_source', 'origin', 'course_area', 'on_top_20_university', 'region']]

In [57]:
student_profile_dummies = pd.get_dummies(student_profile)
student_profile_dummies.head()

Unnamed: 0_level_0,on_top_20_university,signup_source_Email,signup_source_Facebook,signup_source_Google,origin_android,origin_ios,origin_other,origin_website,course_area_Administração,course_area_Biológicas,course_area_Direito,course_area_Engenharia,course_area_Exatas,course_area_Humanas,course_area_Outros,region_Centro-Oeste,region_Nordeste,region_Norte,region_Sudeste,region_Sul
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2774,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
3287,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
3546,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
7251,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7338,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0


## Cluster

### Find cluster size

In [417]:
metrics = find_cluster_size(student_profile_dummies.values, 30, 0.8, False)    

Original Data: 20, PCA: 13
5
10
15
20
25


In [418]:
fig = px.line(metrics, x='cluster_size', y='wss')
fig.show()

Apesar do método nos indicar uma melhor separação em clusters por volta de 8, para fins de simplificar nossa análise, vamos escolher 5 como o número de clusters a analisar, dado uma lógica de atribuição mais genérica.

In [419]:
# import hdbscan

# data = student_profile_dummies
# clusterer = hdbscan.HDBSCAN(min_cluster_size=int(data.shape[0]/15), gen_min_span_tree=True, min_samples=1)

# scaler = StandardScaler()
# data_scaled = scaler.fit_transform(student_profile_dummies.values)
# pca = PCA(n_components=0.95)
# points = pca.fit_transform(data_scaled)

# clusterer.fit(points)
# clusterer.labels_.max()

## Find clusters

In [447]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(student_profile_dummies.values)
pca = PCA(n_components=0.95)
points = pca.fit_transform(data_scaled)
kmeans = KMeans(n_clusters = 5, random_state=42).fit(points)

labels = kmeans.labels_

In [449]:
students_df.head()

Unnamed: 0_level_0,signup_at,university_name,course_name,state,signup_source,city,user_client,user_origin,origin,course_area,on_top_20_university,region,total_plans,ltv,first_purchase,last_purchase,has_purchased
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2774,2017-11-02 22:33:13.199000,UNINORTE,Licenciatura em Biologia,Amazonas,Facebook,Manaus,Website,website,website,Biológicas,0,Norte,,,,,False
3287,2017-11-24 13:19:30.684799,UNIP,Administração,,Google,,Website,website,website,Administração,1,,,,,,False
3546,2014-01-09 07:56:11.830000,ESTÁCIO,Direito,Piauí,Facebook,,,,,Direito,1,Nordeste,,,,,False
7251,2017-04-20 14:54:38.956966,UNIFAVIP,Engenharia Mecânica,,Google,,,,,Engenharia,0,,,,,,False
7338,2016-06-05 12:25:46.003000,ESTÁCIO,Engenharia Civil,Rio de Janeiro,Google,Rio de Janeiro,Website,website,website,Engenharia,1,Sudeste,,,,,False


In [450]:
clusters_df = student_profile_dummies.copy()
clusters_df.loc[:, 'converted'] = students_df['has_purchased'].astype(float)
clusters_df.loc[:, 'ltv'] = students_df['ltv']
clusters_df.loc[:, 'cohort'] = students_df['signup_at'].str[:7]
clusters_df.loc[:, 'cluster'] = labels
clusters_df.loc[:, 'student_id'] = students_df.index

In [451]:
metrics_to_calc = {c:'mean' for c in student_profile_dummies.columns.tolist()}
metrics_to_calc['converted'] = 'mean'
metrics_to_calc['ltv'] = 'mean'
metrics_to_calc['student_id'] = 'count'

clusters_mean = clusters_df\
                    .groupby(['cluster'])\
                    .agg(metrics_to_calc)

clusters_melt = clusters_mean\
                    .stack().to_frame().reset_index()

clusters_melt.columns = ['cluster', 'variable_dummy', 'value']

In [452]:
# clusters_melt.loc[:, 'variable'] = clusters_melt['variable_dummy'].apply(lambda x: '_'.join(x.split('_')[:-1]))
# clusters_melt.loc[:, 'category'] = clusters_melt['variable_dummy'].apply(lambda x: x.split('_')[-1])

In [453]:
clusters_melt.loc[:, 'var_max'] = clusters_melt.groupby(['variable_dummy'])['value'].rank(ascending=False, method='first')
clusters_melt.loc[:, 'max_val'] = clusters_melt.groupby(['variable_dummy'])['value'].transform('max')
clusters_melt.loc[:, 'min_val'] = clusters_melt.groupby(['variable_dummy'])['value'].transform('min')

clusters_melt.loc[:, 'cluster_highlight'] = clusters_melt.apply(lambda x: x['var_max'] <= 1 or x['value'] == x['min_val'], axis=1)

In [454]:
clusters_mean

Unnamed: 0_level_0,on_top_20_university,signup_source_Email,signup_source_Facebook,signup_source_Google,origin_android,origin_ios,origin_other,origin_website,course_area_Administração,course_area_Biológicas,course_area_Direito,course_area_Engenharia,course_area_Exatas,course_area_Humanas,course_area_Outros,region_Centro-Oeste,region_Nordeste,region_Norte,region_Sudeste,region_Sul,converted,ltv,student_id
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,0.487617,0.0,0.0,1.0,0,0.027135,0,0.880769,0.130261,0.195561,0.155501,0.154825,0.144066,0.14393,0.075856,0.018541,0.066856,0.017932,0.157396,0.036067,0.031669,158.859829,14778
1,0.391276,0.0,1.0,0.0,0,0.054584,0,0.731569,0.093429,0.209598,0.166626,0.191754,0.120256,0.141418,0.07692,0.04192,0.118597,0.029133,0.254916,0.081169,0.029052,151.554735,24714
2,0.466559,0.097014,0.471963,0.431023,1,0.0,0,0.0,0.125929,0.239562,0.143224,0.131469,0.11566,0.157276,0.08688,0.013782,0.048642,0.014863,0.108769,0.021078,0.01716,142.111024,7401
3,0.375839,0.052081,0.521879,0.42604,0,0.0,1,0.0,0.114094,0.241342,0.154362,0.150336,0.112483,0.14443,0.082953,0.020134,0.061208,0.018523,0.132081,0.027114,0.015839,124.947458,3725
4,0.360904,1.0,0.0,0.0,0,0.041462,0,0.769559,0.079621,0.193882,0.212002,0.248454,0.113089,0.097527,0.055425,0.034214,0.120337,0.029418,0.221062,0.059902,0.039544,171.175472,9382


In [446]:
for c in clusters_melt['cluster'].unique().tolist():
    df = clusters_melt.loc[(clusters_melt['cluster'] == c) & (clusters_melt['cluster_highlight'])]
    display(df)

Unnamed: 0,cluster,variable_dummy,value,var_max,max_val,min_val,cluster_highlight
0,0,on_top_20_university,0.487617,1.0,0.487617,0.360904,True
1,0,signup_source_Email,0.0,4.0,1.0,0.0,True
2,0,signup_source_Facebook,0.0,4.0,1.0,0.0,True
3,0,signup_source_Google,1.0,1.0,1.0,0.0,True
4,0,origin_android,0.0,2.0,1.0,0.0,True
6,0,origin_other,0.0,2.0,1.0,0.0,True
7,0,origin_website,0.880769,1.0,0.880769,0.0,True
8,0,course_area_Administração,0.130261,1.0,0.130261,0.079621,True
12,0,course_area_Exatas,0.144066,1.0,0.144066,0.112483,True


Unnamed: 0,cluster,variable_dummy,value,var_max,max_val,min_val,cluster_highlight
24,1,signup_source_Email,0.0,5.0,1.0,0.0,True
25,1,signup_source_Facebook,1.0,1.0,1.0,0.0,True
26,1,signup_source_Google,0.0,4.0,1.0,0.0,True
27,1,origin_android,0.0,3.0,1.0,0.0,True
28,1,origin_ios,0.054584,1.0,0.054584,0.0,True
29,1,origin_other,0.0,3.0,1.0,0.0,True
38,1,region_Centro-Oeste,0.04192,1.0,0.04192,0.013782,True
41,1,region_Sudeste,0.254916,1.0,0.254916,0.108769,True
42,1,region_Sul,0.081169,1.0,0.081169,0.021078,True
45,1,student_id,24714.0,1.0,24714.0,3725.0,True


Unnamed: 0,cluster,variable_dummy,value,var_max,max_val,min_val,cluster_highlight
50,2,origin_android,1.0,1.0,1.0,0.0,True
51,2,origin_ios,0.0,4.0,0.054584,0.0,True
52,2,origin_other,0.0,4.0,1.0,0.0,True
53,2,origin_website,0.0,4.0,0.880769,0.0,True
56,2,course_area_Direito,0.143224,5.0,0.212002,0.143224,True
57,2,course_area_Engenharia,0.131469,5.0,0.248454,0.131469,True
59,2,course_area_Humanas,0.157276,1.0,0.157276,0.097527,True
60,2,course_area_Outros,0.08688,1.0,0.08688,0.055425,True
61,2,region_Centro-Oeste,0.013782,5.0,0.04192,0.013782,True
62,2,region_Nordeste,0.048642,5.0,0.120337,0.048642,True


Unnamed: 0,cluster,variable_dummy,value,var_max,max_val,min_val,cluster_highlight
73,3,origin_android,0.0,4.0,1.0,0.0,True
74,3,origin_ios,0.0,5.0,0.054584,0.0,True
75,3,origin_other,1.0,1.0,1.0,0.0,True
76,3,origin_website,0.0,5.0,0.880769,0.0,True
78,3,course_area_Biológicas,0.241342,1.0,0.241342,0.193882,True
81,3,course_area_Exatas,0.112483,5.0,0.144066,0.112483,True
89,3,converted,0.015839,5.0,0.039544,0.015839,True
90,3,ltv,124.947458,5.0,171.175472,124.947458,True
91,3,student_id,3725.0,5.0,24714.0,3725.0,True


Unnamed: 0,cluster,variable_dummy,value,var_max,max_val,min_val,cluster_highlight
92,4,on_top_20_university,0.360904,5.0,0.487617,0.360904,True
93,4,signup_source_Email,1.0,1.0,1.0,0.0,True
94,4,signup_source_Facebook,0.0,5.0,1.0,0.0,True
95,4,signup_source_Google,0.0,5.0,1.0,0.0,True
96,4,origin_android,0.0,5.0,1.0,0.0,True
98,4,origin_other,0.0,5.0,1.0,0.0,True
100,4,course_area_Administração,0.079621,5.0,0.130261,0.079621,True
101,4,course_area_Biológicas,0.193882,5.0,0.241342,0.193882,True
102,4,course_area_Direito,0.212002,1.0,0.212002,0.143224,True
103,4,course_area_Engenharia,0.248454,1.0,0.248454,0.131469,True


In [430]:
clusters_df.head()

Unnamed: 0_level_0,on_top_20_university,signup_source_Email,signup_source_Facebook,signup_source_Google,origin_android,origin_ios,origin_other,origin_website,course_area_Administração,course_area_Biológicas,course_area_Direito,course_area_Engenharia,course_area_Exatas,course_area_Humanas,course_area_Outros,region_Centro-Oeste,region_Nordeste,region_Norte,region_Sudeste,region_Sul,converted,ltv,cohort,cluster,student_id
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2774,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0.0,,2017-11,1,2774
3287,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0.0,,2017-11,4,3287
3546,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0.0,,2014-01,1,3546
7251,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,,2017-04,4,7251
7338,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0.0,,2016-06,4,7338


In [431]:
clusters_cohorts = clusters_df\
                    .groupby(['cluster', 'cohort'], as_index=True)\
                    .agg({'ltv':'mean', 'converted':['mean', 'sum', 'count']})
clusters_cohorts.columns = ['ltv', 'conversion_rate', 'converted', 'students']

clusters_cohorts.loc[:, 'students'] = clusters_cohorts['students'].astype(float)
clusters_cohorts = clusters_cohorts.reset_index()

In [432]:
fig = px.line(clusters_cohorts, x='cohort', y=['students', 'converted', 'ltv', 'conversion_rate',], 
              color='cluster', facet_col='variable', facet_row='cluster')
fig.update_yaxes(matches=None, showticklabels=True)

# User Behaviour

## Data prep

In [467]:
user_behaviour = pd.read_csv('./data/prep/user_activity_summary.csv')#.set_index(['student_id', 'month'])

# Create relative metrics
activity_cols = ['question_events', 'subject_events', 'subject_events']
for c in activity_cols:
    user_behaviour.loc[:, c + '_percent'] = user_behaviour[c]/user_behaviour['total_activities']

events_cols = [c for c in user_behaviour.columns.tolist() if 'events_' in c]
for c in events_cols:
    user_behaviour.loc[:, c + '_percent'] = user_behaviour[c]/user_behaviour['total_events']

days_cols = [c for c in user_behaviour.columns.tolist() if 'days' in c]
days_cols.remove('total_days')
for c in days_cols:
    user_behaviour.loc[:, c + '_percent'] = user_behaviour[c]/user_behaviour['total_days']

user_behaviour.loc[:, 'weekend_use_percent'] = user_behaviour['days_on_weekend']/user_behaviour['total_days']
user_behaviour.loc[:, 'week_use_percent'] = 1 - user_behaviour['weekend_use_percent']

user_behaviour.head()

Unnamed: 0,student_id,month,total_events,first_event,last_event,total_days,unique_origins,days_on_weekend,total_activities,file_events,question_events,session_events,subject_events,file_days_used,question_days_used,session_days_used,subject_days_used,events_on_mobile,events_on_web,used_days_on_mobile,used_days_on_web,events_usage__0-5,events_usage__12-17,events_usage__18-23,events_usage__6-11,question_events_percent,subject_events_percent,events_on_mobile_percent,events_on_web_percent,events_usage__0-5_percent,events_usage__12-17_percent,events_usage__18-23_percent,events_usage__6-11_percent,question_events_percent_percent,subject_events_percent_percent,days_on_weekend_percent,file_days_used_percent,question_days_used_percent,session_days_used_percent,subject_days_used_percent,used_days_on_mobile_percent,used_days_on_web_percent,weekend_use_percent,week_use_percent
0,2774,2017-11,9,2017-11-02 20:36:23.000000,2017-11-15 03:01:47.000000,2,1,0,7,1.0,0.0,2.0,6.0,1.0,0.0,2.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,8.0,0.0,0.0,0.857143,0.0,0.333333,0.111111,0.0,0.888889,0.0,0.0,0.095238,0.0,0.5,0.0,1.0,0.5,0.0,1.0,0.0,1.0
1,3287,2017-11,10,2017-11-24 13:20:41.000000,2017-11-24 16:05:35.000000,1,1,0,9,3.0,0.0,1.0,6.0,1.0,0.0,1.0,1.0,0.0,4.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.666667,0.0,0.4,0.0,1.0,0.0,0.0,0.0,0.066667,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
2,3287,2017-12,3,2017-12-12 02:03:37.000000,2017-12-12 02:10:39.000000,1,1,0,2,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,3546,2015-10,8,2015-10-05 08:56:05.000000,2015-10-14 10:51:29.000000,2,0,1,8,0.0,0.0,0.0,8.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.125,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.5,0.5
4,3546,2015-11,9,2015-11-03 09:58:54.820000,2015-11-17 12:40:02.157000,2,1,0,9,0.0,7.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0,7.0,0.0,2.0,0.0,5.0,0.0,4.0,0.777778,0.222222,0.0,0.777778,0.0,0.555556,0.0,0.444444,0.08642,0.024691,0.0,0.0,1.0,0.0,0.5,0.0,1.0,0.0,1.0


In [504]:
# Remove outliers

activity_df = user_behaviour.drop(columns=['first_event', 'last_event', 'student_id', 'month']).fillna(0)

activity_df.loc[:, 'outlier'] = False

outlier_dict = activity_df.quantile(0.999).to_dict()

for k, v in outlier_dict.items():
    activity_df.loc[activity_df[k] > v, 'outlier'] = True

cluster_activity = activity_df[~activity_df['outlier']].iloc[:, :-1]

cluster_activity.head()

Unnamed: 0,total_events,total_days,unique_origins,days_on_weekend,total_activities,file_events,question_events,session_events,subject_events,file_days_used,question_days_used,session_days_used,subject_days_used,events_on_mobile,events_on_web,used_days_on_mobile,used_days_on_web,events_usage__0-5,events_usage__12-17,events_usage__18-23,events_usage__6-11,question_events_percent,subject_events_percent,events_on_mobile_percent,events_on_web_percent,events_usage__0-5_percent,events_usage__12-17_percent,events_usage__18-23_percent,events_usage__6-11_percent,question_events_percent_percent,subject_events_percent_percent,days_on_weekend_percent,file_days_used_percent,question_days_used_percent,session_days_used_percent,subject_days_used_percent,used_days_on_mobile_percent,used_days_on_web_percent,weekend_use_percent,week_use_percent
0,9,2,1,0,7,1.0,0.0,2.0,6.0,1.0,0.0,2.0,1.0,0.0,3.0,0.0,2.0,1.0,0.0,8.0,0.0,0.0,0.857143,0.0,0.333333,0.111111,0.0,0.888889,0.0,0.0,0.095238,0.0,0.5,0.0,1.0,0.5,0.0,1.0,0.0,1.0
1,10,1,1,0,9,3.0,0.0,1.0,6.0,1.0,0.0,1.0,1.0,0.0,4.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.666667,0.0,0.4,0.0,1.0,0.0,0.0,0.0,0.066667,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
2,3,1,1,0,2,2.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,8,2,0,1,8,0.0,0.0,0.0,8.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.125,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.5,0.5
5,2,2,1,0,2,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


## Find cluster size

In [514]:
metrics = find_cluster_size(cluster_activity.values, 20, 0.8, False)

Original Data: 40, PCA: 9
5
10
15
20


In [515]:
fig = px.line(metrics, x='cluster_size', y='wss')
fig.show()

Vamos focar em 5 clusters.

## Find Clusters

In [516]:
scaler = StandardScaler()
data_scaled = scaler.fit_transform(cluster_activity.values)
pca = PCA(n_components=0.8)
points = pca.fit_transform(data_scaled)
kmeans = KMeans(n_clusters = 5, random_state=42).fit(points)

data_scaled = scaler.transform(activity_df.iloc[:, :-1].values)
points = pca.transform(data_scaled)

labels_behav = kmeans.predict(points)

In [517]:
behav_clusters = activity_df.copy()
behav_clusters.loc[:, 'cluster'] = labels_behav
behav_clusters.loc[:, 'student_id'] = user_behaviour['student_id']
behav_clusters.loc[:, 'month'] = user_behaviour['month']

In [736]:
metrics_to_calc = {c:'mean' for c in cluster_activity.columns.tolist()}
# metrics_to_calc['converted'] = 'mean'
# metrics_to_calc['ltv'] = 'mean'
metrics_to_calc['student_id'] = 'count'

clusters_mean = behav_clusters\
                    .groupby(['cluster'])\
                    .agg(metrics_to_calc)

clusters_melt = behav_clusters\
                    .stack().to_frame().reset_index()

clusters_melt.columns = ['cluster', 'variable_dummy', 'value']

In [737]:
clusters_mean.sort_values(by=['total_events', 'total_days'])

Unnamed: 0_level_0,total_events,total_days,unique_origins,days_on_weekend,total_activities,file_events,question_events,session_events,subject_events,file_days_used,question_days_used,session_days_used,subject_days_used,events_on_mobile,events_on_web,used_days_on_mobile,used_days_on_web,events_usage__0-5,events_usage__12-17,events_usage__18-23,events_usage__6-11,question_events_percent,subject_events_percent,events_on_mobile_percent,events_on_web_percent,events_usage__0-5_percent,events_usage__12-17_percent,events_usage__18-23_percent,events_usage__6-11_percent,question_events_percent_percent,subject_events_percent_percent,days_on_weekend_percent,file_days_used_percent,question_days_used_percent,session_days_used_percent,subject_days_used_percent,used_days_on_mobile_percent,used_days_on_web_percent,weekend_use_percent,week_use_percent,student_id
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
1,6.6576,1.243789,0.487558,0.349574,5.900816,0.971591,6.1e-05,0.756784,4.929163,0.391736,6.1e-05,0.578113,1.025334,0.376834,1.351603,0.15469,0.487763,0.612815,2.506129,2.321001,1.217656,2e-06,0.888383,0.040848,0.144275,0.086036,0.376446,0.35076,0.186758,1.743644e-07,0.213132,0.282277,0.291043,6e-06,0.408913,0.908098,0.104412,0.344961,0.282277,0.717723,48788
2,7.345575,2.194023,1.084011,0.635572,4.571084,4.40355,0.0,2.774491,0.167534,1.563292,0.0,2.049115,0.057522,0.214735,6.963306,0.105962,2.123199,1.372094,2.564314,2.884941,0.524225,0.0,0.017352,0.01904,0.969274,0.189851,0.348334,0.387873,0.073942,0.0,0.001582,0.292084,0.739188,0.0,0.92929,0.023453,0.035107,0.979277,0.292084,0.707916,198533
0,9.678521,2.635741,1.230744,0.707604,5.709519,5.420813,3.8e-05,3.969002,0.288668,1.390069,3.8e-05,2.518663,0.081223,8.294109,1.095745,2.402714,0.366897,1.881413,3.114161,3.886207,0.79674,1e-06,0.02746,0.902409,0.080591,0.191964,0.330048,0.388782,0.089205,2.983908e-08,0.002118,0.265929,0.496758,5e-06,0.955549,0.028101,0.939979,0.107614,0.265929,0.734071,78845
4,16.790216,3.246683,1.157546,0.907131,11.834992,9.162935,1.442371,4.955224,1.229685,1.619818,1.131012,2.619818,0.373134,4.31592,11.146766,0.976783,2.388889,2.893864,5.951907,6.534826,1.409619,0.393554,0.130743,0.203233,0.660279,0.128804,0.380259,0.364229,0.126707,0.2342145,0.018804,0.286774,0.421775,0.578586,0.621331,0.173902,0.251272,0.736422,0.286774,0.713226,2412
3,88.98357,8.849537,1.513692,2.491167,68.953757,68.265431,0.017336,20.029813,0.67099,7.056249,0.016718,8.512374,0.252213,35.227136,53.084579,4.364505,5.394276,17.693638,29.081408,35.971423,6.237101,0.000354,0.014858,0.396812,0.59373,0.19948,0.32769,0.400456,0.072374,6.138343e-06,0.000279,0.283361,0.827952,0.001825,0.959523,0.036323,0.465035,0.636539,0.283361,0.716639,24285


5 tipos de uso:
* Mobile fileview
* Web fileview
* Subject following
* Casual studying
* Super fileview

In [536]:
cluster_labels = {0:'mobile_fileview', 2:'web_fileview', 1:'subject_following', 4:'casual_studying', 3:'super_fileview'}

behav_clusters.loc[:, 'cluster_name'] = behav_clusters['cluster'].apply(lambda x: cluster_labels[x])

behav_clusters.loc[:, 'first_month'] = behav_clusters.groupby(['student_id'])['month'].transform('min')

In [557]:
first_behav = behav_clusters\
                .loc[behav_clusters['month'] == behav_clusters['first_month'], ['cluster_name', 'student_id']]\
                .set_index('student_id')
    
first_behav.columns = ['first_cluster']

first_behav.head()

Unnamed: 0_level_0,first_cluster
student_id,Unnamed: 1_level_1
2774,subject_following
3287,subject_following
3546,subject_following
7251,mobile_fileview
7338,web_fileview


In [552]:
user_behav_cluster = behav_clusters\
                        .groupby(['student_id', 'cluster_name'], as_index=False)\
                        .agg({'month':['count', 'min']})\

user_behav_cluster.columns = ['student_id', 'main_cluster', 'months_as_main', 'main_first_month']

user_behav_cluster = user_behav_cluster.sort_values(by=['student_id', 'months_as_main', 'main_first_month'])

user_behav_cluster.loc[:, 'max_behav'] = user_behav_cluster.groupby(['student_id'])['months_as_main'].rank(method='first', ascending=False)

main_behaviour = user_behav_cluster.loc[user_behav_cluster['max_behav'] == 1].iloc[:, :-1]
main_behaviour.head()

Unnamed: 0,student_id,main_cluster,months_as_main,main_first_month
0,2774,subject_following,1,2017-11
1,3287,subject_following,1,2017-11
7,3546,web_fileview,12,2016-11
8,7251,mobile_fileview,4,2017-04
13,7338,web_fileview,9,2016-11


In [559]:
user_behav = behav_clusters\
                .groupby(['student_id'], as_index=True)\
                .agg({'month':['count', 'min', 'max'], 'cluster_name':'nunique'})\

user_behav.columns = ['total_time', 'first_month', 'last_month', 'unique_clusters']

user_behav = pd.merge(left=user_behav.reset_index(), right=main_behaviour, on=['student_id']).set_index('student_id')

user_behav.loc[:, 'first_cluster'] = first_behav['first_cluster']

user_behav.head()

Unnamed: 0_level_0,total_time,first_month,last_month,unique_clusters,main_cluster,months_as_main,main_first_month,first_cluster
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2774,1,2017-11,2017-11,1,subject_following,1,2017-11,subject_following
3287,2,2017-11,2017-12,2,subject_following,1,2017-11,subject_following
3546,17,2015-10,2018-05,5,web_fileview,12,2016-11,subject_following
7251,5,2017-04,2017-12,2,mobile_fileview,4,2017-04,mobile_fileview
7338,12,2016-11,2018-06,4,web_fileview,9,2016-11,web_fileview


In [560]:
user_behav.to_csv('./data/prep/user_activity_cluster_summary.csv', index=True)

## Data Analysis

In [730]:
def plot(fig):
    fig.update_layout(
        font=dict(family="Roboto", size=16),
        template='plotly_white'
    )
    colors = ['#250541', '#F93D55']
    fig.show()

In [677]:
students_df = pd.read_csv('./data/prep/user_infos.csv').set_index('student_id')
students_df.head()

Unnamed: 0_level_0,signup_at,university_name,course_name,state,signup_source,city,user_client,user_origin,origin,course_area,on_top_20_university,region,total_plans,ltv,first_purchase,last_purchase,revenue_first_purchase,has_purchased
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2774,2017-11-02 22:33:13.199000,UNINORTE,Licenciatura em Biologia,Amazonas,Facebook,Manaus,Website,website,website,Biológicas,False,Norte,,,,,,False
3287,2017-11-24 13:19:30.684799,UNIP,Administração,,Google,,Website,website,website,Administração,True,,,,,,,False
3546,2014-01-09 07:56:11.830000,ESTÁCIO,Direito,Piauí,Facebook,,,,,Direito,True,Nordeste,,,,,,False
7251,2017-04-20 14:54:38.956966,UNIFAVIP,Engenharia Mecânica,,Google,,,,,Engenharia,False,,,,,,,False
7338,2016-06-05 12:25:46.003000,ESTÁCIO,Engenharia Civil,Rio de Janeiro,Google,Rio de Janeiro,Website,website,website,Engenharia,True,Sudeste,,,,,,False


In [678]:
user_behav = pd.read_csv('./data/prep/user_activity_cluster_summary.csv').set_index('student_id')
user_behav.head()

Unnamed: 0_level_0,total_time,first_month,last_month,unique_clusters,main_cluster,months_as_main,main_first_month,first_cluster
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2774,1,2017-11,2017-11,1,subject_following,1,2017-11,subject_following
3287,2,2017-11,2017-12,2,subject_following,1,2017-11,subject_following
3546,17,2015-10,2018-05,5,web_fileview,12,2016-11,subject_following
7251,5,2017-04,2017-12,2,mobile_fileview,4,2017-04,mobile_fileview
7338,12,2016-11,2018-06,4,web_fileview,9,2016-11,web_fileview


In [741]:
summ_df = students_df.loc[:, ['signup_at', 'origin', 'course_area', 'on_top_20_university', 'total_plans', 'signup_source',
                              'region', 'ltv', 'first_purchase', 'has_purchased', 'revenue_first_purchase']]

summ_df = pd.concat([summ_df, user_behav], axis=1).reset_index()

summ_df.loc[:, 'signup_month'] = summ_df['signup_at'].str[:7]
summ_df.loc[:, 'converted'] = summ_df['has_purchased'].astype(int)
summ_df.loc[:, 'students'] = 1
summ_df.head()

Unnamed: 0,student_id,signup_at,origin,course_area,on_top_20_university,total_plans,signup_source,region,ltv,first_purchase,has_purchased,revenue_first_purchase,total_time,first_month,last_month,unique_clusters,main_cluster,months_as_main,main_first_month,first_cluster,signup_month,converted,students
0,2774,2017-11-02 22:33:13.199000,website,Biológicas,False,,Facebook,Norte,,,False,,1,2017-11,2017-11,1,subject_following,1,2017-11,subject_following,2017-11,0,1
1,3287,2017-11-24 13:19:30.684799,website,Administração,True,,Google,,,,False,,2,2017-11,2017-12,2,subject_following,1,2017-11,subject_following,2017-11,0,1
2,3546,2014-01-09 07:56:11.830000,,Direito,True,,Facebook,Nordeste,,,False,,17,2015-10,2018-05,5,web_fileview,12,2016-11,subject_following,2014-01,0,1
3,7251,2017-04-20 14:54:38.956966,,Engenharia,False,,Google,,,,False,,5,2017-04,2017-12,2,mobile_fileview,4,2017-04,mobile_fileview,2017-04,0,1
4,7338,2016-06-05 12:25:46.003000,website,Engenharia,True,,Google,Sudeste,,,False,,12,2016-11,2018-06,4,web_fileview,9,2016-11,web_fileview,2016-06,0,1


### User Features 

In [731]:
df = summ_df\
        .groupby(['course_area', 'has_purchased'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by='student_id', ascending=False)

fig = px.bar(df, y='course_area', x='student_id', facet_col='has_purchased', color='course_area')
fig.update_traces(texttemplate='%{x}', textposition='inside')
fig.update_xaxes(matches=None, title='Total de estudantes')

fig.update_layout(title='Conversão dos usuários, por área', 
                  yaxis_title='Área de interesse',
                  showlegend=False
                 )
plot(fig)

In [761]:
df = summ_df\
        .groupby(['origin', 'has_purchased'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by='student_id', ascending=False)

df.loc[:, 'total'] = df.groupby(['has_purchased'])['student_id'].transform('sum')
df.loc[:, 'percent'] = 100*df['student_id']/df['total']

fig = px.bar(df, y='origin', x='percent', facet_col='has_purchased', color='origin')
fig.update_traces(texttemplate='%{x:.1f}%', textposition='outside')
fig.update_xaxes(matches=None, title='Percentual de estudantes (%)', range=[0,100])

fig.update_layout(title='Conversão de usuários, por dispositivo', 
                  yaxis_title='Dispositivo do cadastro',
                  showlegend=False
                 )
plot(fig)

In [762]:
df = summ_df\
        .groupby(['signup_source', 'has_purchased'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by='student_id', ascending=False)

df.loc[:, 'total'] = df.groupby(['has_purchased'])['student_id'].transform('sum')
df.loc[:, 'percent'] = 100*df['student_id']/df['total']

fig = px.bar(df, y='signup_source', x='percent', facet_col='has_purchased', color='signup_source')
fig.update_traces(texttemplate='%{x:.1f}%', textposition='auto')
fig.update_xaxes(matches=None, title='Percentual dos estudantes (%)')

fig.update_layout(title='Conversão de usuários, por origem do cadastro', 
                  yaxis_title='Origem do cadastro',
                  showlegend=False
                 )
plot(fig)

In [767]:
df = summ_df\
        .groupby(['region', 'has_purchased'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by='student_id', ascending=False)

df.loc[:, 'total'] = df.groupby(['has_purchased'])['student_id'].transform('sum')
df.loc[:, 'percent'] = 100*df['student_id']/df['total']

fig = px.bar(df, y='region', x='percent', facet_col='has_purchased', color='region')
fig.update_traces(texttemplate='%{x:.1f}%', textposition='outside')
fig.update_xaxes(matches=None, title='Percentual dos estudantes (%)', range=[0,80])

fig.update_layout(title='Conversão dos usuários, por região', 
                  yaxis_title='Região do usuário',
                  showlegend=False
                 )
plot(fig)

### User behaviour

In [773]:
df = summ_df\
        .groupby(['first_cluster'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by='student_id', ascending=True)

fig = px.bar(df, y='first_cluster', x='student_id')
fig.update_traces(texttemplate='%{x}', textposition='auto')
fig.update_layout(title='Padrão de comportamento no 1º mês', 
                  xaxis_title='Quantidade de usuários',
                  yaxis_title=' Comportamento no 1º mês'
                 )
plot(fig)

In [781]:
df = summ_df\
        .groupby(['first_cluster', 'has_purchased'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by='student_id', ascending=False)

df.loc[:, 'total'] = df.groupby(['has_purchased'])['student_id'].transform('sum')
df.loc[:, 'percent'] = 100*df['student_id']/df['total']

fig = px.bar(df, y='first_cluster', x='percent', facet_col='has_purchased', color='first_cluster')
fig.update_traces(texttemplate='%{x:.1f}%', textposition='outside')
fig.update_xaxes(matches=None, showticklabels=True, title='Porcentagem de usuários (%)',  range=[0,80])
fig.update_layout(title='Padrão de comportamento por usuário, no 1o mês', 
                  yaxis_title='Perfil de uso',
                  showlegend=False
                 )
plot(fig)

In [786]:
df = summ_df\
        .groupby(['first_month', 'first_cluster'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by='first_month', ascending=False)

fig = px.line(df, x='first_month', y='student_id', color='first_cluster')
fig.update_layout(title='Evolução do comportamento do 1º mês de atividade, por safra', 
                  yaxis_title='Quantidade de usuários',
                  xaxis_title='Mês do usuário',
                  legend_orientation='h',
                  height=700
                 )
plot(fig)

In [787]:
df = summ_df\
        .groupby(['first_cluster', 'course_area'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by='student_id', ascending=False)

df.loc[:, 'total'] = df.groupby(['first_cluster'])['student_id'].transform('sum')
df.loc[:, 'percent'] = 100*df['student_id']/df['total']

fig = px.bar(df, facet_col='first_cluster', x='student_id', y='course_area', 
                   color='course_area', text='percent')

fig.update_traces(texttemplate='%{text:.1f}%', textposition='inside')
fig.update_xaxes(matches=None, title='')

fig.update_layout(title='Distribuição do comportamento do 1o mês, por curso de interesse', 
                  yaxis_title='Curso de interesse',
                  showlegend=False
                 )

fig.for_each_annotation(lambda a: a.update(text=a.text.split('=')[1]))
plot(fig)

### LTV

In [789]:
df = summ_df\
        .loc[summ_df['has_purchased']]\
        .groupby(['main_cluster'], as_index=False)\
        .agg({'student_id':'count', 'revenue_first_purchase':'mean', 
              'ltv':'mean', 'total_plans':'mean'})\
        .sort_values(by='ltv', ascending=True)


fig = px.bar(df, y='main_cluster', x=['revenue_first_purchase', 'ltv'], barmode='group')

fig.update_traces(texttemplate='R$%{x:.1f}', textposition='inside')

fig.update_layout(title='LTV e Receita de primeira compra, por tipo de comportamento mais comum do usuário', 
                  yaxis_title='Comportamento mais comum',
                  xaxis_title='Valor em receita (R$)',
                  showlegend=True
                 )
plot(fig)

In [790]:
df = summ_df\
        .loc[summ_df['has_purchased']]\
        .groupby(['first_cluster'], as_index=False)\
        .agg({'student_id':'count', 'revenue_first_purchase':'mean', 
              'ltv':'mean', 'total_plans':'mean'})\
        .sort_values(by='ltv', ascending=True)


fig = px.bar(df, y='first_cluster', x=['revenue_first_purchase', 'ltv'], barmode='group')

fig.update_traces(texttemplate='R$%{x:.1f}', textposition='inside')

fig.update_layout(title='LTV e Receita de primeira compra, por primeiro comportamento do usuário', 
                  yaxis_title='Comportamento mais comum',
                  xaxis_title='Valor em receita (R$)',
                  showlegend=True
                 )

plot(fig)

In [800]:
df = summ_df\
        .groupby(['has_purchased', 'total_time'], as_index=False)\
        .agg({'student_id':'count'})\
        .sort_values(by=['has_purchased', 'total_time'], ascending=True)

df.loc[:, 'cumsum'] = df.groupby(['has_purchased'])['student_id'].cumsum()
df.loc[:, 'total'] = df.groupby(['has_purchased'])['student_id'].transform('sum')

df.loc[:, 'students_over'] = df['total'] - df['cumsum']

fig = px.histogram(df, x='total_time', y='student_id', histfunc='sum', nbins=50,
                   facet_col='has_purchased')

fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(title='Meses com atividade')
# fig.update_traces(texttemplate='R$%{x:.1f}', textposition='inside')

fig.update_layout(title='Conversão x Tempo de atividade dos usuários', 
                  yaxis_title='Total de usuários',
                  showlegend=True
                 )

plot(fig)
df.head()

Unnamed: 0,has_purchased,total_time,student_id,cumsum,total,students_over
0,False,1,7539,7539,58257,50718
1,False,2,7155,14694,58257,43563
2,False,3,6443,21137,58257,37120
3,False,4,5868,27005,58257,31252
4,False,5,5222,32227,58257,26030
