In [107]:
# Import modules for our task

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import seaborn as sns

In [108]:
# Import standard-library helpers and the project's own helper scripts

import os
import sys
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Make ../scripts importable. The relative path is resolved against the current
# working directory, and this line must run before the project imports below.
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from user_overview_script import UserOverviewScript
from df_cleaning import DataFrameCleaning
from df_info import DataFrameInfo
# NOTE(review): wildcard imports hide which names these two modules contribute;
# consider importing the needed names explicitly.
from df_outlier import *
from plots import *

In [109]:
# Notebook-wide pandas display settings: no cap on the number of columns or on
# column width, no wrapping of wide frames, and floats shown with two decimals.
display_options = {
    'display.max_columns': None,
    'display.max_colwidth': None,
    "expand_frame_repr": False,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [110]:
# Load the cleaned Telecom dataset produced by the earlier cleaning step and
# print its schema (column names, non-null counts, dtypes).
# NOTE(review): the schema printed below lists 'Unnamed: 0' and 'Unnamed:_0',
# which look like index columns written out by earlier saves -- confirm, and
# consider saving the CSV with index=False.

cleaned_data_path = '../data/cleaned_Telecom_data.csv'
cleaned_df = pd.read_csv(cleaned_data_path)
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148506 entries, 0 to 148505
Data columns (total 54 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Unnamed: 0                      148506 non-null  int64  
 1   Unnamed:_0                      148506 non-null  int64  
 2   Bearer_Id                       148506 non-null  float64
 3   Start                           148506 non-null  object 
 4   Start_ms                        148506 non-null  float64
 5   End                             148506 non-null  object 
 6   End_ms                          148506 non-null  float64
 7   Dur._(ms)                       148506 non-null  float64
 8   IMSI                            148506 non-null  float64
 9   MSISDN_Number                   148506 non-null  float64
 10  IMEI                            148506 non-null  float64
 11  Last_Location_Name              148506 non-null  object 
 12  Avg_RTT_DL_(ms) 

### **Task 3** - User Experience Analytics
**Task 3.1.** - Aggregate, per customer, the following information (treat missing values and outliers by replacing them with the mean or the mode of the corresponding variable).

In [111]:
# Columns used for the user-experience analysis: the customer identifier,
# round-trip times, bearer throughput, and the handset type.
experience_columns = [
    "MSISDN_Number",
    "Avg_RTT_DL_(ms)",
    "Avg_RTT_UL_(ms)",
    "Avg_Bearer_TP_DL_(kbps)",
    "Avg_Bearer_TP_UL_(kbps)",
    "Handset_Type",
]
# Work on an independent copy so later column edits do not touch cleaned_df.
user_experience_df = cleaned_df[experience_columns].copy()

In [112]:
# Instantiate the UserOverviewScript helper (from ../scripts/user_overview_script)
# with the experience DataFrame; the next cell uses its convert_ms_to_sec method.
# NOTE(review): the helper is handed the same DataFrame object that later cells
# modify in place -- confirm whether it keeps a reference or its own copy.

user_overview_script = UserOverviewScript(user_experience_df)

In [113]:
# Convert both average RTT columns from milliseconds to seconds via the helper.
# The columns keep their "(ms)" names here and are renamed in the next cell.
# Caution: each column is overwritten with its converted values, so re-running
# this cell would convert already-converted data again.

for rtt_column in ('Avg_RTT_DL_(ms)', 'Avg_RTT_UL_(ms)'):
    user_experience_df[rtt_column] = user_overview_script.convert_ms_to_sec(
        user_experience_df[rtt_column])

In [114]:
# Rename the RTT columns so their unit suffix matches the converted values.
# The rename is done in place, so the same DataFrame object is modified.
rtt_column_renames = {
    'Avg_RTT_DL_(ms)': 'Avg_RTT_DL_(sec)',
    'Avg_RTT_UL_(ms)': 'Avg_RTT_UL_(sec)',
}
user_experience_df.rename(columns=rtt_column_renames, inplace=True)

In [115]:
# Re-attach the TCP retransmission volume columns, which were removed during
# data cleaning, by reading them from the original (uncleaned) dataset.
#
# NOTE(review): the assignments below align rows by index label. That is only
# correct if row i of the cleaned CSV is the same session as row i of the
# original CSV. If cleaning dropped or reordered rows, values land on the wrong
# sessions -- confirm (e.g. compare len(df) with len(user_experience_df)) or
# merge on a preserved row key instead.

df = pd.read_csv('../data/Week1_challenge_data_source(CSV).csv')
tcp_retrans_sources = {
    'TCP_DL_Retrans_Vol_(Bytes)': 'TCP DL Retrans. Vol (Bytes)',
    'TCP_UL_Retrans_Vol_(Bytes)': 'TCP UL Retrans. Vol (Bytes)',
}
for target_column, source_column in tcp_retrans_sources.items():
    user_experience_df[target_column] = df[source_column]

In [116]:
user_experience_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148506 entries, 0 to 148505
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   MSISDN_Number               148506 non-null  float64
 1   Avg_RTT_DL_(sec)            148506 non-null  float64
 2   Avg_RTT_UL_(sec)            148506 non-null  float64
 3   Avg_Bearer_TP_DL_(kbps)     148506 non-null  float64
 4   Avg_Bearer_TP_UL_(kbps)     148506 non-null  float64
 5   Handset_Type                148506 non-null  object 
 6   TCP_DL_Retrans_Vol_(Bytes)  61271 non-null   float64
 7   TCP_UL_Retrans_Vol_(Bytes)  52879 non-null   float64
dtypes: float64(7), object(1)
memory usage: 9.1+ MB


In [117]:
# Fill the missing TCP retransmission volumes using the project's
# DataFrameCleaning helper. Per the original note, the helper picks mean or
# median based on each column's skewness -- verify in ../scripts/df_cleaning.
# From here on user_experience_df is the DataFrameCleaning wrapper, not a
# DataFrame; the next cell reads the underlying frame from its .df attribute.

columns = [
    'TCP_DL_Retrans_Vol_(Bytes)',
    'TCP_UL_Retrans_Vol_(Bytes)',
]
user_experience_df = DataFrameCleaning(user_experience_df)
user_experience_df.fill_numerical_column(columns)

Automation in Action...!!!


In [118]:
# Confirm that no nulls remain: take the frame held by the cleaning wrapper,
# wrap it in the project's DataFrameInfo helper, and show per-column null counts.
# After this cell user_experience_df is a DataFrameInfo object.

filled_frame = user_experience_df.df
user_experience_df = DataFrameInfo(filled_frame)
user_experience_df.get_null_counts()

MSISDN_Number                 0
Avg_RTT_DL_(sec)              0
Avg_RTT_UL_(sec)              0
Avg_Bearer_TP_DL_(kbps)       0
Avg_Bearer_TP_UL_(kbps)       0
Handset_Type                  0
TCP_DL_Retrans_Vol_(Bytes)    0
TCP_UL_Retrans_Vol_(Bytes)    0
dtype: int64
