User Engagement Analysis

In [2]:
import pickle
import numpy as np
import pandas as pd
import math
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, normalize
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [5]:
# Make the project's helper modules importable: '../scripts' is resolved to an
# absolute path (relative to the current working directory) and appended to
# sys.path before the project-local imports below run.
import os , sys
sys.path.append(os.path.abspath(os.path.join('../scripts')))
# NOTE(review): the wildcard imports below hide which names this notebook takes
# from data_selector / data_visualizer; consider importing the needed names explicitly.
from data_selector import *
from data_visualizer import *
from data_outlier_handler import OutlierHandler

Data Reading

In [7]:
# Locate the cleaned dataset relative to the notebook instead of through a
# machine-specific absolute path, so the notebook runs on any checkout.
# Set the TELLCO_DATA_PATH environment variable to load a copy stored elsewhere.
# NOTE(review): assumes the notebook lives one level below the repository root
# (the same layout the '../scripts' import path relies on) — confirm.
DATA_PATH = os.environ.get(
    "TELLCO_DATA_PATH",
    os.path.join("..", "Data", "cleaned_Tellco_data.csv"),
)
df = pd.read_csv(DATA_PATH)
# Summarise column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60586 entries, 0 to 60585
Data columns (total 55 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        60586 non-null  int64  
 1   Bearer Id                         60586 non-null  int64  
 2   Start                             60586 non-null  object 
 3   Start ms                          60586 non-null  float64
 4   End                               60586 non-null  object 
 5   End ms                            60586 non-null  float64
 6   IMSI                              60586 non-null  int64  
 7   MSISDN/Number                     60586 non-null  int64  
 8   IMEI                              60586 non-null  int64  
 9   Last Location Name                60586 non-null  object 
 10  Avg RTT DL (ms)                   60586 non-null  float64
 11  Avg RTT UL (ms)                   60586 non-null  float64
 12  Avg 

In [9]:
# Display the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Bearer Id', 'Start', 'Start ms', 'End', 'End ms', 'IMSI',
       'MSISDN/Number', 'IMEI', 'Last Location Name', 'Avg RTT DL (ms)',
       'Avg RTT UL (ms)', 'Avg Bearer TP DL (kbps)', 'Avg Bearer TP UL (kbps)',
       'TCP DL Retrans. Vol (Bytes)', 'TCP UL Retrans. Vol (Bytes)',
       'DL TP < 50 Kbps (%)', '50 Kbps < DL TP < 250 Kbps (%)',
       '250 Kbps < DL TP < 1 Mbps (%)', 'DL TP > 1 Mbps (%)',
       'UL TP < 10 Kbps (%)', '10 Kbps < UL TP < 50 Kbps (%)',
       '50 Kbps < UL TP < 300 Kbps (%)', 'UL TP > 300 Kbps (%)',
       'Activity Duration DL (ms)', 'Activity Duration UL (ms)', 'Dur. (ms).1',
       'Handset Manufacturer', 'Handset Type', 'Nb of sec with Vol DL < 6250B',
       'Nb of sec with Vol UL < 1250B', 'Social Media DL (Bytes)',
       'Social Media UL (Bytes)', 'Google DL (Bytes)', 'Google UL (Bytes)',
       'Email DL (Bytes)', 'Email UL (Bytes)', 'Youtube DL (Bytes)',
       'Youtube UL (Bytes)', 'Netflix DL (Bytes)', 'Netflix UL (Bytes)',

User Engagement Metrics

In [10]:
# Keep only the subscriber number and the three fields the engagement
# metrics are built from.
user_engagement_df = df[
    [
        'MSISDN/Number',
        'Bearer Id',
        'Dur. (ms).1',
        'Total Data Volume (Bytes)',
    ]
]

In [11]:
# Aggregate the engagement metrics per user (one row per MSISDN/Number):
#   - xDR Sessions: count of 'Bearer Id' entries for the user
#   - Dur. (ms).1: sum of the session durations
#   - Total Data Volume (Bytes): sum of the data volume
# The result is built directly from `df` in a single chain rather than from the
# previous value of `user_engagement_df`, so re-running this cell always gives
# the same result (the earlier form raised a KeyError on a second run because
# 'Bearer Id' had already been renamed).
user_engagement_df = (
    df.groupby('MSISDN/Number')
    .agg({'Bearer Id': 'count', 'Dur. (ms).1': 'sum', 'Total Data Volume (Bytes)': 'sum'})
    .rename(columns={'Bearer Id': 'xDR Sessions'})
)
user_engagement_df.head()

Unnamed: 0_level_0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33601007832,1,49878024.0,422320698.0
33601008617,1,18555323.0,871832580.0
33601010682,1,128088011.0,194367933.0
33601011634,1,64180392.0,199050991.0
33601011959,1,86399977.0,332660357.0


In [12]:
# The ten users with the highest number of xDR sessions
user_engagement_df.nlargest(n=10, columns='xDR Sessions')

Unnamed: 0_level_0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33659725664,12,3204990000.0,5590099000.0
33667163239,12,8485604000.0,5499476000.0
33614892860,11,6049279000.0,6348299000.0
33659359429,10,2332679000.0,4224487000.0
33659822913,10,4889931000.0,3299376000.0
33603127838,9,4887847000.0,3737505000.0
33658263267,9,2484771000.0,4927907000.0
33675877202,9,2792791000.0,4602815000.0
33681557919,9,4093421000.0,3791379000.0
33604515716,8,3425932000.0,4139835000.0


In [13]:
# The ten users with the longest summed session duration (ms)
user_engagement_df.nlargest(n=10, columns='Dur. (ms).1')

Unnamed: 0_level_0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33667163239,12,8485604000.0,5499476000.0
33614892860,11,6049279000.0,6348299000.0
33625779332,5,5233185000.0,2949550000.0
33659822913,10,4889931000.0,3299376000.0
33603127838,9,4887847000.0,3737505000.0
33662840755,6,4120956000.0,2731476000.0
33681557919,9,4093421000.0,3791379000.0
33668929914,8,4091725000.0,4849671000.0
33626320676,7,3578263000.0,3396601000.0
33604515716,8,3425932000.0,4139835000.0


In [14]:
# The ten users with the largest summed data volume (bytes)
user_engagement_df.nlargest(n=10, columns='Total Data Volume (Bytes)')

Unnamed: 0_level_0,xDR Sessions,Dur. (ms).1,Total Data Volume (Bytes)
MSISDN/Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33614892860,11,6049279000.0,6348299000.0
33659725664,12,3204990000.0,5590099000.0
33667163239,12,8485604000.0,5499476000.0
33665140229,7,1366500000.0,4928688000.0
33658263267,9,2484771000.0,4927907000.0
33659546392,8,1983092000.0,4906749000.0
33668929914,8,4091725000.0,4849671000.0
33762333464,8,2732116000.0,4816230000.0
33760413819,7,3331283000.0,4753227000.0
33668047871,8,2933086000.0,4606712000.0
