User Engagement Analysis

In [6]:
import pickle
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [18]:
import os , sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from data_selector import *
from data_visualizer import *
from data_outlier_handler import OutlierHandler

Data Reading

In [19]:
df = pd.read_csv("C:/Users/USER/Desktop/KAIM/TellCo-Data-Analysis/Data/cleaned_Tellco_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60586 entries, 0 to 60585
Data columns (total 55 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        60586 non-null  int64  
 1   Bearer Id                         60586 non-null  int64  
 2   Start                             60586 non-null  object 
 3   Start ms                          60586 non-null  float64
 4   End                               60586 non-null  object 
 5   End ms                            60586 non-null  float64
 6   IMSI                              60586 non-null  int64  
 7   MSISDN/Number                     60586 non-null  int64  
 8   IMEI                              60586 non-null  int64  
 9   Last Location Name                60586 non-null  object 
 10  Avg RTT DL (ms)                   60586 non-null  float64
 11  Avg RTT UL (ms)                   60586 non-null  float64
 12  Avg 

In [20]:
df.columns

Index(['Unnamed: 0', 'Bearer Id', 'Start', 'Start ms', 'End', 'End ms', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last Location Name', 'Avg RTT DL (ms)',
       'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
       'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)',
       '250 Kbps < DL TP < 1 Mbps (%)', 'DL TP > 1 Mbps (%)',
       'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)',
       '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)',
       'Activity Duration DL (ms)', 'Activity Duration UL (ms)', 'Dur. (ms).1',
       'Handset Manufacturer', 'Handset Type', 'Nb of sec with Vol DL < 6250B',
       'Nb of sec with Vol UL < 1250B', 'Social Media DL (Bytes)',
       'Social Media UL (Bytes)', 'Google DL (Bytes)', 'Google UL (Bytes)',
       'Email DL (Bytes)', 'Email UL (Bytes)', 'Youtube DL (Bytes)',
       'Youtube UL (Bytes)', 'Netflix DL (Bytes)', 'Netflix UL (Bytes)',

User Engagement Metrics

In [21]:
user_engagement_df = df[['MSISDN/Number', 'Bearer Id', 'Dur. (ms).1', 'Total Data Volume (Bytes)']]

In [22]:
# aggregating user engagement metrics per user
user_engagement_df = user_engagement_df.groupby(
    'MSISDN/Number').agg({'Bearer Id': 'count', 'Dur. (ms).1': 'sum', 'Total Data Volume (Bytes)': 'sum'})
user_engagement_df = user_engagement_df.rename(
    columns={'Bearer Id': 'xDR Sessions'})
user_engagement_df.head()

Unnamed: 0_level_0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33601007832,1,49878024.0,422320698.0
33601008617,1,18555323.0,871832580.0
33601010682,1,128088011.0,194367933.0
33601011634,1,64180392.0,199050991.0
33601011959,1,86399977.0,332660357.0


In [23]:
# top 10 customers per xDR sessions
user_engagement_df.nlargest(10, 'xDR Sessions')

Unnamed: 0_level_0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33659725664,12,3204990000.0,5590099000.0
33667163239,12,8485604000.0,5499476000.0
33614892860,11,6049279000.0,6348299000.0
33659359429,10,2332679000.0,4224487000.0
33659822913,10,4889931000.0,3299376000.0
33603127838,9,4887847000.0,3737505000.0
33658263267,9,2484771000.0,4927907000.0
33675877202,9,2792791000.0,4602815000.0
33681557919,9,4093421000.0,3791379000.0
33604515716,8,3425932000.0,4139835000.0


In [24]:
# top 10 customers per duration(ms)
user_engagement_df.nlargest(10, 'Dur. (ms).1')

Unnamed: 0_level_0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33667163239,12,8485604000.0,5499476000.0
33614892860,11,6049279000.0,6348299000.0
33625779332,5,5233185000.0,2949550000.0
33659822913,10,4889931000.0,3299376000.0
33603127838,9,4887847000.0,3737505000.0
33662840755,6,4120956000.0,2731476000.0
33681557919,9,4093421000.0,3791379000.0
33668929914,8,4091725000.0,4849671000.0
33626320676,7,3578263000.0,3396601000.0
33604515716,8,3425932000.0,4139835000.0


In [25]:
# top 10 customers per total data traffic
user_engagement_df.nlargest(10, 'Total Data Volume (Bytes)')

Unnamed: 0_level_0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33614892860,11,6049279000.0,6348299000.0
33659725664,12,3204990000.0,5590099000.0
33667163239,12,8485604000.0,5499476000.0
33665140229,7,1366500000.0,4928688000.0
33658263267,9,2484771000.0,4927907000.0
33659546392,8,1983092000.0,4906749000.0
33668929914,8,4091725000.0,4849671000.0
33762333464,8,2732116000.0,4816230000.0
33760413819,7,3331283000.0,4753227000.0
33668047871,8,2933086000.0,4606712000.0


K-means Clustering

In [26]:
# scale data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(user_engagement_df)
scaled_data

array([[-0.3465085 , -0.31556066, -0.43684189],
       [-0.3465085 , -0.5292634 ,  0.53142027],
       [-0.3465085 ,  0.21803597, -0.92785903],
       ...,
       [-0.3465085 , -0.40469995, -0.92882985],
       [-0.3465085 , -0.4384615 , -0.54597594],
       [-0.3465085 , -0.59574729,  0.19282624]], shape=(48011, 3))

In [27]:
normalized_data = normalize(scaled_data)
normalized_data

array([[-0.5408405 , -0.49253622, -0.68183548],
       [-0.41940366, -0.64060479,  0.64321541],
       [-0.34166884,  0.21499067, -0.91489968],
       ...,
       [-0.32360308, -0.37794787, -0.86743095],
       [-0.44351098, -0.56120553, -0.69881784],
       [-0.48418237, -0.83244808,  0.26943947]], shape=(48011, 3))

In [28]:
kmeans = KMeans(n_clusters=3, random_state=1).fit(normalized_data)
kmeans.labels_

array([2, 0, 2, ..., 2, 2, 0], shape=(48011,), dtype=int32)

In [29]:
user_engagement_df.insert(0, 'cluster', kmeans.labels_)
user_engagement_df

Unnamed: 0_level_0,cluster,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
33601007832,2,1,49878024.0,422320698.0
33601008617,0,1,18555323.0,871832580.0
33601010682,2,1,128088011.0,194367933.0
33601011634,2,1,64180392.0,199050991.0
33601011959,2,1,86399977.0,332660357.0
...,...,...,...,...
33789914536,2,1,86399921.0,366089210.0
33789922012,2,1,54339643.0,576824365.0
33789942399,2,1,36812757.0,193917233.0
33789980299,2,1,31864281.0,371655648.0


In [30]:
user_engagement_df['cluster'].value_counts()

cluster
2    24608
0    16013
1     7390
Name: count, dtype: int64

In [32]:
fig = px.scatter(user_engagement_df, x='Total Data Volume (Bytes)', y='Dur. (ms).1',
                 color='cluster', size='xDR Sessions')
fig.show()

In [34]:
cluster0 = user_engagement_df[user_engagement_df["cluster"]==0]
cluster0[['xDR Sessions', 'Dur. (ms).1', 'Total Data Volume (Bytes)']].describe()

Unnamed: 0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
count,16013.0,16013.0,16013.0
mean,1.0,59050480.0,750514600.0
std,0.0,31830970.0,97866300.0
min,1.0,7146574.0,564164000.0
25%,1.0,30207340.0,665464600.0
50%,1.0,57074720.0,750123800.0
75%,1.0,86399930.0,834517600.0
max,1.0,178948900.0,949598300.0


In [35]:
cluster1 = user_engagement_df[user_engagement_df["cluster"]==1]
cluster1[['xDR Sessions', 'Dur. (ms).1', 'Total Data Volume (Bytes)']].describe()

Unnamed: 0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
count,7390.0,7390.0,7390.0
mean,2.701488,288899400.0,1339395000.0
std,1.123663,299055800.0,685473200.0
min,1.0,28248480.0,101047000.0
25%,2.0,172799800.0,870142900.0
50%,2.0,187470700.0,1191361000.0
75%,3.0,345599800.0,1653580000.0
max,12.0,8485604000.0,6348299000.0


In [36]:
cluster2 = user_engagement_df[user_engagement_df["cluster"]==2]
cluster2[['xDR Sessions', 'Dur. (ms).1', 'Total Data Volume (Bytes)']].describe()

Unnamed: 0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
count,24608.0,24608.0,24608.0
mean,1.000041,62368510.0,329025100.0
std,0.006375,34739390.0,148517200.0
min,1.0,7189000.0,38022360.0
25%,1.0,31977750.0,200078400.0
50%,1.0,62705210.0,329889300.0
75%,1.0,86399940.0,458491500.0
max,2.0,374127000.0,617775500.0


Traffic of applications per user

In [37]:
user_app_engagement_df = df[['MSISDN/Number', 'Social Media Data Volume (Bytes)', 'Google Data Volume (Bytes)',
    'Email Data Volume (Bytes)', 'Youtube Data Volume (Bytes)', 'Netflix Data Volume (Bytes)',
    'Gaming Data Volume (Bytes)', 'Other Data Volume (Bytes)']]

In [38]:
# top 10 most engaged users per app
social_media = user_app_engagement_df.nlargest(10, "Social Media Data Volume (Bytes)")['Social Media Data Volume (Bytes)']
google = user_app_engagement_df.nlargest(10, "Google Data Volume (Bytes)")['Google Data Volume (Bytes)']
email = user_app_engagement_df.nlargest(10, "Email Data Volume (Bytes)")['Email Data Volume (Bytes)']
youtube = user_app_engagement_df.nlargest(10, "Youtube Data Volume (Bytes)")['Youtube Data Volume (Bytes)']
netflix = user_app_engagement_df.nlargest(10, "Netflix Data Volume (Bytes)")['Netflix Data Volume (Bytes)']
gaming = user_app_engagement_df.nlargest(10, "Gaming Data Volume (Bytes)")['Gaming Data Volume (Bytes)']
other = user_app_engagement_df.nlargest(10, "Other Data Volume (Bytes)")['Other Data Volume (Bytes)']

In [39]:
apps_sum = user_app_engagement_df.sum().sort_values(ascending=False)
apps_sum

MSISDN/Number                       2.040306e+15
Other Data Volume (Bytes)           2.606939e+13
Gaming Data Volume (Bytes)          2.604257e+13
Netflix Data Volume (Bytes)         1.375769e+12
Youtube Data Volume (Bytes)         1.373273e+12
Google Data Volume (Bytes)          4.731190e+11
Email Data Volume (Bytes)           1.367259e+11
Social Media Data Volume (Bytes)    1.106325e+11
dtype: float64

Optimized value of k for user engagement k-means clustering

In [40]:
def choose_kmeans(df: pd.DataFrame, num: int):
  distortions = []
  inertias = []
  K = range(1, num)
  for k in K:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(df)
    distortions.append(sum(
        np.min(cdist(df, kmeans.cluster_centers_, 'euclidean'), axis=1)) / df.shape[0])
    inertias.append(kmeans.inertia_)

  return (distortions, inertias)

In [41]:
distortions, inertias = choose_kmeans(normalized_data, 20)

In [42]:
fig = make_subplots(
    rows=1, cols=2, subplot_titles=("Distortion", "Inertia")
)
fig.add_trace(go.Scatter(x=np.array(range(1, 20)), y=distortions), row=1, col=1)
fig.add_trace(go.Scatter(x=np.array(range(1, 20)), y=inertias), row=1, col=2)
fig.update_layout(title_text="The Elbow Method", height=500)
fig.show()

In [43]:
kmeans = KMeans(n_clusters=6, random_state=6).fit(normalized_data)
kmeans.labels_

array([5, 1, 0, ..., 5, 5, 4], shape=(48011,), dtype=int32)

In [44]:
user_engagement_df["cluster"]= kmeans.labels_
user_engagement_df

Unnamed: 0_level_0,cluster,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
33601007832,5,1,49878024.0,422320698.0
33601008617,1,1,18555323.0,871832580.0
33601010682,0,1,128088011.0,194367933.0
33601011634,0,1,64180392.0,199050991.0
33601011959,0,1,86399977.0,332660357.0
...,...,...,...,...
33789914536,0,1,86399921.0,366089210.0
33789922012,4,1,54339643.0,576824365.0
33789942399,5,1,36812757.0,193917233.0
33789980299,5,1,31864281.0,371655648.0


In [45]:
fig = px.scatter(user_engagement_df, x='Total Data Volume (Bytes)', y='Dur. (ms).1',
                 color='cluster', size='xDR Sessions')
fig.show()

Saving Data

In [46]:
# save the dataframe
user_engagement_df.to_csv('../data/user_engagement_data.csv')

In [48]:
# save the clustering model
with open("../models/user_engagement.pkl", "wb") as f:
    pickle.dump(kmeans, f)