###  **User Engagement Analysis**
In this task we track user engagement using the following metrics:
* Session frequency
* Session duration
* Total session traffic (download + upload, in MB)


In [4]:
# Import libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [5]:
import os
import sys
import warnings
# Suppress every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Make modules under ../scripts importable. The path is resolved against the
# current working directory and is appended only when absent, so re-running
# this cell does not accumulate duplicate sys.path entries.
scripts_dir = os.path.abspath('../scripts')
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [6]:
# Notebook-wide pandas display settings: no limit on the number of columns
# shown, no truncation of cell text, no wrapping of wide frames across
# several lines, and floats rendered with two decimal places.
display_settings = (
    ("display.max_columns", None),
    ("display.max_colwidth", None),
    ("expand_frame_repr", False),
    ("display.float_format", "{:.2f}".format),
)
for option_name, option_value in display_settings:
    pd.set_option(option_name, option_value)

In [12]:
# Load the cleaned Telecom dataset (path is relative to the notebook's
# working directory).
# NOTE(review): the info() output further down lists an 'Unnamed:_0' column,
# which looks like a saved index read back as data -- confirm whether it is
# needed before relying on column positions.
cleaned_data_path = '../data/cleaned_Telecom_data.csv'
cleaned_df = pd.read_csv(cleaned_data_path)

In [13]:
# Add one "<App> (Bytes)" column per application holding its combined
# download + upload volume, then the overall per-row total.
# The tuple order fixes the order in which the new columns are appended.
applications = ("Social Media", "Google", "Youtube", "Netflix", "Gaming", "Email", "Other")

for app_name in applications:
    downlink = cleaned_df[f"{app_name} DL (Bytes)"]
    uplink = cleaned_df[f"{app_name} UL (Bytes)"]
    cleaned_df[f"{app_name} (Bytes)"] = downlink + uplink

# Overall traffic uses the dataset's own Total UL / Total DL columns rather
# than summing the per-application columns above.
cleaned_df["Total Data (Bytes)"] = cleaned_df["Total UL (Bytes)"] + cleaned_df["Total DL (Bytes)"]

In [14]:
# Swap every space in the column labels for an underscore, then print the
# frame summary (column names, non-null counts, dtypes) to check the result.
underscored_labels = cleaned_df.columns.str.replace(" ", "_")
cleaned_df.columns = underscored_labels

cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149001 entries, 0 to 149000
Data columns (total 53 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Unnamed:_0                      149001 non-null  int64  
 1   Bearer_Id                       149001 non-null  float64
 2   Start                           149001 non-null  object 
 3   Start_ms                        149001 non-null  float64
 4   End                             149001 non-null  object 
 5   End_ms                          149001 non-null  float64
 6   Dur._(ms)                       149001 non-null  float64
 7   IMSI                            149001 non-null  float64
 8   MSISDN/Number                   149001 non-null  float64
 9   IMEI                            149001 non-null  float64
 10  Last_Location_Name              149001 non-null  object 
 11  Avg_RTT_DL_(ms)                 149001 non-null  float64
 12  Avg_RTT_UL_(ms) 

### Task 2
Aggregate the above metrics per customer ID (MSISDN) and report the top 10 customers for each engagement metric.

In [25]:
# Columns needed for the engagement metrics, mapped to the shorter names used
# from here on. The dict order also sets the column order of the new frame.
engagement_column_names = {
    'MSISDN/Number': 'Customer_Id',
    'Bearer_Id': 'Session_Id',
    'Dur._(ms)': 'Duration',
    'Total_Data_(Bytes)': 'Total_Data_Volume',
}

# Work on an independent copy so cleaned_df is left untouched.
user_engagement_df = (
    cleaned_df[list(engagement_column_names)]
    .copy()
    .rename(columns=engagement_column_names)
)

In [27]:

# One row per customer: number of non-null session ids, summed duration and
# summed data volume.
engagement_aggregations = {
    'Session_Id': 'count',
    'Duration': 'sum',
    'Total_Data_Volume': 'sum',
}
sessions_by_customer = user_engagement_df.groupby('Customer_Id')
user_engagement = sessions_by_customer.agg(engagement_aggregations)

# Preview only: these are the first 10 rows in Customer_Id order, not a
# ranking by any engagement metric.
user_engagement.head(10)

Unnamed: 0_level_0,Session_Id,Duration,Total_Data_Volume
Customer_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33601001722.0,1,116720.0,878690574.0
33601001754.0,1,181230.0,156859643.0
33601002511.0,1,134969.0,595966483.0
33601007832.0,1,49878.0,422320698.0
33601008617.0,2,37104.0,1457410944.0
33601010682.0,2,253983.0,615217221.0
33601011634.0,2,128360.0,654723066.0
33601011959.0,1,86399.0,332660357.0
33601014694.0,2,495702.0,990132189.0
33601020306.0,1,124854.0,732463761.0
