<h1> User Engagement Analysis </h1>

<h2> Importing data and packages </h2>

In [13]:
import pandas as pd
import numpy as np
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib # for dumping regression model to a file.

In [None]:
# Suppressing warning messages

warnings.filterwarnings('ignore')

In [None]:
# Resolve project paths relative to the kernel's working directory.
# NOTE: everything below is derived from os.getcwd(), so the notebook must be
# started from a folder that sits directly under the project root.

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))  # parent of the working directory
scripts_dir = os.path.join(parent_dir, "scripts")  # directory added to sys.path below
models_dir = os.path.join(parent_dir, "models")  # directory for saved models
data_path = os.path.join(parent_dir, "data", "teleco_user_exp_data.csv")  # input CSV file

# Make modules in scripts_dir importable. The membership check keeps the cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if scripts_dir not in sys.path:
    sys.path.insert(1, scripts_dir)

In [4]:
# Import the project's helper classes.
# NOTE(review): these modules presumably live in scripts_dir; if so, the imports
# only resolve after the cell that inserts scripts_dir into sys.path has run.

from data_cleaning_functions import DataCleaner as Cleaner 
from data_summarizing_functions import DataSummarizer as Sumar

# One instance of each helper, shared by the cells that follow.
cleaner = Cleaner()
sumar = Sumar()

In [5]:
# Load the per-user table (including the experience and engagement scores)
# from the CSV file located at data_path.

user_df = pd.read_csv(filepath_or_buffer=data_path)

In [6]:
# Print column names, non-null counts and dtypes as a quick schema check.
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106856 entries, 0 to 106855
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   MSISDN/Number      106856 non-null  float64
 1   xDr_session_count  106856 non-null  int64  
 2   session_dur        106856 non-null  float64
 3   Handset            106856 non-null  object 
 4   Total_DL           106856 non-null  float64
 5   Total_UL           106856 non-null  float64
 6   total_vol          106856 non-null  float64
 7   RTT total          106856 non-null  float64
 8   TP total           106856 non-null  float64
 9   TCP total          106856 non-null  float64
 10  youtube            106856 non-null  float64
 11  netflix            106856 non-null  float64
 12  google             106856 non-null  float64
 13  gaming             106856 non-null  float64
 14  email              106856 non-null  float64
 15  social             106856 non-null  float64
 16  ot

<h2> User Satisfaction Exploration </h2>

In [7]:
# Derive the satisfaction score as the row-wise mean of the engagement and
# experience scores, then display the per-column summary of the updated frame.

user_df["sat_score"] = user_df.loc[:, ["eng_score", "exp_score"]].mean(axis="columns")
sumar.summ_columns(user_df)

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,MSISDN/Number,0,0.0,float64,106856
1,xDr_session_count,0,0.0,int64,18
2,session_dur,0,0.0,float64,74492
3,Handset,0,0.0,object,1394
4,Total_DL,0,0.0,float64,106851
5,Total_UL,0,0.0,float64,106715
6,total_vol,0,0.0,float64,106853
7,RTT total,0,0.0,float64,787
8,TP total,0,0.0,float64,28856
9,TCP total,0,0.0,float64,29986


In [8]:
# Show the top 10 users ranked by satisfaction score (sat_score).

sumar.show_N_per_col(user_df, "MSISDN/Number", ["sat_score"], 10)


Top 10 customers based onsat_score

       MSISDN/Number  sat_score
6437    3.361489e+10  26.364660
76363   3.367588e+10  22.021100
13526   3.362632e+10  19.839741
37052   3.365973e+10  19.702678
92923   3.376054e+10  19.524782
13180   3.362578e+10  19.068834
35436   3.365936e+10  17.802818
30225   3.365826e+10  17.018547
92577   3.376041e+10  16.987683
57241   3.366471e+10  16.488120


<h2> Creating a regression model </h2>

In [18]:
# Build the feature matrix and target for the satisfaction regression.
# x = features / independent variables
# y = target / dependent variable
features = ["xDr_session_count", "session_dur", "total_vol", "RTT total", "TP total", "TCP total"]
x = user_df[features]
y = user_df["sat_score"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the model and keep a reference to it. The return value used to be
# discarded, so `model` below only existed as leftover kernel state and the
# cell failed after a kernel restart.
# NOTE(review): assumes sumar.generateModel returns the fitted estimator —
# confirm against data_summarizing_functions.
model = sumar.generateModel(x_train, y_train)

y_prediction = model.predict(x_test)
y_prediction

# Saving the trained model (left disabled). joblib.dump expects the object to
# persist first and the target file path second.

#joblib.dump(model, os.path.join(models_dir, "user_satisfaction_model.pkl"))
#print("MODEL SAVED SUCCESSFULLY.")

Model created Sucessfully.


array([3.68726512, 3.93127092, 4.16854178, ..., 5.20640853, 4.67365363,
       3.88223749])

In [19]:
# Evaluate the held-out predictions: R^2, mean squared error and its square
# root. The MSE/RMSE lines appear in this cell's recorded output but had been
# dropped from the code; they are restored here with the same message text.

score = r2_score(y_test, y_prediction)
print("r2 socre is ", score)

mse = mean_squared_error(y_test, y_prediction)
print("mean_sqrd_error is==", mse)
print("root_mean_squared error of is==", np.sqrt(mse))

r2 socre is  0.6439936648170462
mean_sqrd_error is== 0.10041563653542979
root_mean_squared error of is== 0.3168842636285838


In [21]:
# Refit an ordinary least-squares model on the complete dataset (no train/test
# split) so that its parameters can be inspected in the following cells.
y2 = y.to_numpy()
x2 = x.to_numpy().reshape(-1, len(features))  # one column per entry in `features`

ols = LinearRegression()
model2 = ols.fit(x2, y2)


In [22]:
# Fitted coefficients of model2, one per column of x2 (same order as `features`).
model2.coef_

array([ 3.91659315e-01,  2.18706953e-03,  2.64819866e-04, -3.16630911e+00,
       -9.22341775e-06, -5.32443629e-01])

In [23]:
# Fitted intercept (constant term) of model2.
model2.intercept_

3.2601769698353547

In [25]:
# R^2 of model2 on the same data it was fitted on (x2, y2) — not a held-out score.
model2.score(x2, y2)

0.640569770394771

<h2> Clustering based on experience and engagement </h2>

In [None]:
# selecting features

selected_features = ["eng_score", "exp_score"]
normalized_df, 