In [None]:


import numpy as np
import pandas as pd
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import plotly. graph_objects as go
import matplotlib.pyplot as plt
from sklearn import preprocessing 
from sklearn.cluster import KMeans
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler, normalize

import sys
import os
sys.path.insert(0,os.path.abspath('../scripts'))

import functions as fn
import importlib
importlib.reload(fn)



In [None]:
# Load the cleaned telecom dataset from the data directory and print a
# summary of its columns (DataFrame.info).
cleaned_data = pd.read_csv('../data/clean_telecom_data.csv')
cleaned_data.info()

In [None]:
# Keep only the columns needed for the engagement analysis and give them
# analysis-friendly names.
user_engagement_df = (
    cleaned_data
    .loc[:, ['MSISDN_Number', 'Bearer_Id', 'Dur._(ms)', 'Total_Data_(Bytes)']]
    .rename(columns={
        'MSISDN_Number': 'Customer_Id',
        'Bearer_Id': 'Session_Frequency',
        'Dur._(ms)': 'Duration',
        'Total_Data_(Bytes)': 'Total_Data_Volume',
    })
)

In [None]:
# One row per customer: number of sessions (non-null Bearer_Id entries),
# total session duration and total data volume. Shows the first 10 rows.
user_engagement = user_engagement_df.groupby('Customer_Id').agg(
    Session_Frequency=('Session_Frequency', 'count'),
    Duration=('Duration', 'sum'),
    Total_Data_Volume=('Total_Data_Volume', 'sum'),
)
user_engagement.head(10)

In [None]:
# Top 10 customers for each engagement metric (one independent ranking per metric).
sessions = user_engagement.nlargest(10, "Session_Frequency")['Session_Frequency']
duration = user_engagement.nlargest(10, "Duration")['Duration']
total_data_volume = user_engagement.nlargest(10, "Total_Data_Volume")['Total_Data_Volume']

# The helper module is imported as `fn` (`import functions as fn`); the bare
# name `functions` is never bound, so calling it raised NameError.
# NOTE(review): the positional 1 and 3 are presumably the subplot rows/cols
# expected by mult_hist -- confirm against scripts/functions.py.
fn.mult_hist([sessions, duration, total_data_volume], 1,
             3, "User Metrix", ['Session Frequency', 'Duration', 'Total Data Volume'])

In [None]:
# Check for outliers: draws one box per column of user_engagement on a shared axis.
# NOTE(review): the three metrics presumably live on very different scales
# (session counts vs. durations vs. byte totals), so the smaller ones may be
# hard to read on a shared y axis -- confirm against the rendered figure.

user_engagement.boxplot()

In [None]:
# Standardize the engagement metrics with StandardScaler.
# The metric columns are selected explicitly: a later cell adds a 'Cluster'
# column to user_engagement, and re-running this cell on the whole frame
# would then silently scale the cluster labels as if they were a feature.
engagement_metrics = ['Session_Frequency', 'Duration', 'Total_Data_Volume']
scaler = StandardScaler()
scaled_array = scaler.fit_transform(user_engagement[engagement_metrics])
# Label the preview columns so the scaled values can be matched to their metric.
pd.DataFrame(scaled_array, columns=engagement_metrics).head(5)

In [None]:
# Rescale every sample (row) of the standardized metrics to unit L2 norm;
# norm='l2' and axis=1 are sklearn's defaults, spelled out for the reader.
data_normalized = normalize(scaled_array, norm='l2', axis=1)
normalized_preview = pd.DataFrame(data_normalized)
normalized_preview.head()

In [None]:
# Group customers into three engagement clusters (fixed seed for repeatable labels).
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(data_normalized)
kmeans.labels_

In [None]:
# Attach the cluster labels as the first column.
# DataFrame.insert raises ValueError when the column already exists, so any
# 'Cluster' column left over from a previous run is dropped first; this keeps
# the cell re-runnable without restarting the kernel.
user_engagement = user_engagement.drop(columns='Cluster', errors='ignore')
user_engagement.insert(0, 'Cluster', kmeans.labels_)
user_engagement.head(5)

In [None]:
# Number of customers assigned to each cluster label.
user_engagement['Cluster'].value_counts()

In [None]:
from IPython.display import Image

# Cluster ids are categorical labels, not quantities. Plotly Express maps a
# numeric colour column to a continuous colour scale, so the labels are cast
# to strings (on a copy -- user_engagement itself is left untouched) to get
# one discrete colour and legend entry per cluster.
fig = px.scatter(user_engagement.astype({'Cluster': str}), x='Total_Data_Volume', y="Duration",
                 color='Cluster', size='Session_Frequency')
# Render as a static PNG so the figure is embedded in the saved notebook.
Image(pio.to_image(fig, format='png', width=1200))

In [None]:
# Compute the minimum, maximum, average & total of the non-normalized metrics
# for each cluster. describe() reports no total, so the four requested
# statistics are aggregated explicitly for every cluster in one table.
from IPython.display import display

cluster_summary = user_engagement.groupby("Cluster").agg(["min", "max", "mean", "sum"])
display(cluster_summary)

# Detailed distribution (count, std, quartiles, ...) for cluster 0.
# NOTE: the variable is called `cluster1` but holds the rows labelled 0.
cluster1 = user_engagement[user_engagement["Cluster"] == 0]
cluster1.describe()

In [None]:
# Summary statistics for the customers labelled as cluster 1.
cluster1 = user_engagement.loc[user_engagement["Cluster"].eq(1)]
cluster1.describe()

In [None]:
# Summary statistics for the customers labelled as cluster 2
# (the variable name `cluster1` is reused from the previous cells).
cluster1 = user_engagement.loc[user_engagement["Cluster"].eq(2)]
cluster1.describe()

In [None]:
# Total traffic per customer for every application column; the per-application
# top-10 rankings are derived from this frame.
app_df = cleaned_data.groupby('MSISDN_Number').agg(dict.fromkeys(
    [
        'Gaming_(Bytes)',
        'Youtube_(Bytes)',
        'Netflix_(Bytes)',
        'Google_(Bytes)',
        'Email_(Bytes)',
        'Social_Media_(Bytes)',
        'Other_(Bytes)',
    ],
    'sum',
))
app_df.head()

In [None]:
# Top 10 customers by total traffic for each application (one ranking per app).
gaming = app_df.nlargest(10, "Gaming_(Bytes)")['Gaming_(Bytes)']
youtube = app_df.nlargest(10, "Youtube_(Bytes)")['Youtube_(Bytes)']
netflix = app_df.nlargest(10, "Netflix_(Bytes)")['Netflix_(Bytes)']
google = app_df.nlargest(10, "Google_(Bytes)")['Google_(Bytes)']
email = app_df.nlargest(10, "Email_(Bytes)")['Email_(Bytes)']
social_media = app_df.nlargest(10, "Social_Media_(Bytes)")['Social_Media_(Bytes)']
other = app_df.nlargest(10, "Other_(Bytes)")['Other_(Bytes)']

# The helper module is imported as `fn` (`import functions as fn`); the bare
# name `functions` is never bound, so calling it raised NameError.
# The remaining four applications are plotted in the next cell.
fn.mult_hist([gaming, youtube, netflix], 1,
             3, "User metrix", ["Gaming_(Bytes)", "youtube", "netflix"])

In [None]:
# Plot the top-10 rankings for the remaining four applications.
# The helper module is imported as `fn` (`import functions as fn`); the bare
# name `functions` is never bound, so calling it raised NameError.
fn.mult_hist([google, email, social_media, other], 1,
             4, "User Metrics", ["Google", "Email", "Social Media", "Other"])

In [None]:
# Total bytes per application, summed over all customers.
# NOTE(review): despite its name, `top_3` holds the totals for every
# application column; the three largest are only selected where it is plotted.
top_3 = app_df.sum()
top_3

In [None]:
# Bar chart of the three applications with the highest total traffic.
# No module named `plots` is imported in this notebook, so the original call
# raised NameError; the plotting helpers are loaded as `fn`.
# NOTE(review): assumes plot_bar lives in scripts/functions.py next to
# mult_hist -- confirm before merging.
fn.plot_bar(top_3.nlargest(3), "Top 3 most used Applications", "Applications", "Sum of Top 3 Users")

In [None]:

from scipy.spatial.distance import cdist


def choose_kmeans(df: pd.DataFrame, num: int):
    """Collect elbow-method statistics for K-Means with k = 1 .. num - 1.

    Parameters
    ----------
    df : pd.DataFrame or array-like of shape (n_samples, n_features)
        Feature matrix to cluster. A plain NumPy array works as well; the
        input is converted once with ``np.asarray``.
    num : int
        Exclusive upper bound for k: models are fitted for every k in
        ``range(1, num)``. With ``num <= 1`` nothing is fitted and two empty
        lists are returned.

    Returns
    -------
    tuple[list, list]
        ``(distortions, inertias)``, one entry per k in increasing order.
        A distortion is the mean Euclidean distance from each sample to its
        nearest cluster centre; an inertia is the fitted model's ``inertia_``.
    """
    features = np.asarray(df)
    distortions = []
    inertias = []
    for k in range(1, num):
        model = KMeans(n_clusters=k, random_state=0).fit(features)
        # Distance from every sample to its closest centre. The average is
        # taken with NumPy's vectorized mean rather than the builtin sum(),
        # which would iterate over the array one Python float at a time.
        nearest_centre = cdist(features, model.cluster_centers_, 'euclidean').min(axis=1)
        distortions.append(nearest_centre.mean())
        inertias.append(model.inertia_)

    return (distortions, inertias)

In [None]:
# Fit K-Means for k = 1..14 on the normalized engagement features
# (choose_kmeans iterates over range(1, num), so 15 is an exclusive bound).
distortions, inertias = choose_kmeans(data_normalized, 15)

In [None]:
# Elbow plots: distortion and inertia against the number of clusters.
# The x axis is derived from the number of results instead of repeating the
# hard-coded range(1, 15), so it stays aligned if the upper bound passed to
# choose_kmeans changes (k starts at 1 there).
k_values = np.arange(1, len(distortions) + 1)
fig = make_subplots(
    rows=1, cols=2, subplot_titles=("Distortion", "Inertia")
)
fig.add_trace(go.Scatter(x=k_values, y=distortions), row=1, col=1)
fig.add_trace(go.Scatter(x=k_values, y=inertias), row=1, col=2)
fig.update_layout(title_text="The Elbow Method")
# Render as a static PNG so the figure is embedded in the saved notebook.
Image(pio.to_image(fig, format='png', width=1200))

Inferring from the elbow plots, a cluster count of k = 4 appears optimal: beyond that point, inertia and distortion no longer decrease substantially.

In [None]:
# Refit with the k chosen from the elbow plots and overwrite the stored labels.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(data_normalized)
user_engagement["Cluster"] = kmeans.labels_
user_engagement

In [None]:
# save the model
import pickle

model_path = "../models/User_engagement.pkl"
# open() does not create missing parent directories, so make sure the models
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# NOTE: the model was fitted on `data_normalized`, so anything passed to it
# after loading must go through the same scaling + normalization steps.
with open(model_path, "wb") as f:
    pickle.dump(kmeans, f)

In [None]:

# Persist the per-customer engagement metrics together with their cluster
# labels; the frame's index is written as the first CSV column (to_csv default).
user_engagement.to_csv('../data/user_engagement.csv')