## Exploratory Data Analysis

* Data Cleaning
* Data Transformation
* Data Extraction

In [36]:
# import required packages and modules
import sys
# Put ../scripts/ first on the module search path so modules stored there can
# be imported. The path is relative, so it is resolved against the current
# working directory of the kernel.
sys.path.insert(0,'../scripts/')

import pandas as pd
import numpy as np
from numpy import percentile

import seaborn as sns
import matplotlib.pyplot as plt

# ignore warnings
# NOTE(review): with no category or module filter this suppresses every
# warning for the rest of the session, including deprecation notices.
import warnings
warnings.filterwarnings('ignore')

### Data Cleaning
* Loading the Dataset
* Understanding the Dataset
* Looking for null values in the Dataset
* Handling missing values using different techniques
* Save cleaned Dataset

In [37]:
# project-local helpers: the CSV loading function and the DataDescription class
# (presumably resolved through the ../scripts/ entry added to sys.path -- confirm)

from data_loading import load_data_from_csv
from data_description import DataDescription

In [38]:
# configure pandas display options, then read the source CSV into a dataframe

# show every column and format floats with three decimal places
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', "{:.3f}".format)

# candidate missing-value markers handed to the loader
na_values = ["n/a", "na", 'none', "-", "--", None, '?']

df = load_data_from_csv(
    "../data/telecom_data_source.csv",
    na_values,
)

In [39]:
# create dataframe object from DataDescription class
# This cell rebinds `df` from the loaded dataframe to the DataDescription
# wrapper. The guard keeps the cell safe to re-run: without it, a second run
# would pass the wrapper itself to the constructor instead of a dataframe.
if not isinstance(df, DataDescription):
    df = DataDescription(df)

# display top 5 rows of the dataframe
df.df_head()

                 Bearer Id            Start  Start ms              End  \
0 13114483460844900352.000   4/4/2019 12:01   770.000  4/25/2019 14:35   
1 13114483482878900224.000   4/9/2019 13:04   235.000   4/25/2019 8:15   
2 13114483484080500736.000   4/9/2019 17:42     1.000  4/25/2019 11:58   
3 13114483485442799616.000   4/10/2019 0:31   486.000   4/25/2019 7:36   
4 13114483499480700928.000  4/12/2019 20:10   565.000  4/25/2019 10:40   

   End ms   Dur. (ms)                IMSI   MSISDN/Number               IMEI  \
0 662.000 1823652.000 208201448079117.000 33664962239.000 35521209507511.000   
1 606.000 1365104.000 208201909211140.000 33681854413.000 35794009006359.000   
2 652.000 1361762.000 208200314458056.000 33760627129.000 35281510359387.000   
3 171.000 1321509.000 208201402342131.000 33750343200.000 35356610164913.000   
4 954.000 1089009.000 208201401415120.000 33699795932.000 35407009745539.000   

      Last Location Name  Avg RTT DL (ms)  Avg RTT UL (ms)  \
0  9.1645669

In [40]:
# list the names of all columns in the dataframe
df.df_columns_list()

['Bearer Id',
 'Start',
 'Start ms',
 'End',
 'End ms',
 'Dur. (ms)',
 'IMSI',
 'MSISDN/Number',
 'IMEI',
 'Last Location Name',
 'Avg RTT DL (ms)',
 'Avg RTT UL (ms)',
 'Avg Bearer TP DL (kbps)',
 'Avg Bearer TP UL (kbps)',
 'TCP DL Retrans. Vol (Bytes)',
 'TCP UL Retrans. Vol (Bytes)',
 'DL TP < 50 Kbps (%)',
 '50 Kbps < DL TP < 250 Kbps (%)',
 '250 Kbps < DL TP < 1 Mbps (%)',
 'DL TP > 1 Mbps (%)',
 'UL TP < 10 Kbps (%)',
 '10 Kbps < UL TP < 50 Kbps (%)',
 '50 Kbps < UL TP < 300 Kbps (%)',
 'UL TP > 300 Kbps (%)',
 'HTTP DL (Bytes)',
 'HTTP UL (Bytes)',
 'Activity Duration DL (ms)',
 'Activity Duration UL (ms)',
 'Dur. (ms).1',
 'Handset Manufacturer',
 'Handset Type',
 'Nb of sec with 125000B < Vol DL',
 'Nb of sec with 1250B < Vol UL < 6250B',
 'Nb of sec with 31250B < Vol DL < 125000B',
 'Nb of sec with 37500B < Vol UL',
 'Nb of sec with 6250B < Vol DL < 31250B',
 'Nb of sec with 6250B < Vol UL < 37500B',
 'Nb of sec with Vol DL < 6250B',
 'Nb of sec with Vol UL < 1250B',
 'Socia

In [41]:
# display dataframe's info: index range, per-column non-null counts and dtypes
df.df_detail_info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name                        148848 non-null  object 
 10  Avg RTT DL (ms)     

In [42]:
# display the number of null values in each column
df.df_null_counts()

Bearer Id                                      991
Start                                            1
Start ms                                         1
End                                              1
End ms                                           1
Dur. (ms)                                        1
IMSI                                           570
MSISDN/Number                                 1066
IMEI                                           572
Last Location Name                            1153
Avg RTT DL (ms)                              27829
Avg RTT UL (ms)                              27812
Avg Bearer TP DL (kbps)                          1
Avg Bearer TP UL (kbps)                          1
TCP DL Retrans. Vol (Bytes)                  88146
TCP UL Retrans. Vol (Bytes)                  96649
DL TP < 50 Kbps (%)                            754
50 Kbps < DL TP < 250 Kbps (%)                 754
250 Kbps < DL TP < 1 Mbps (%)                  754
DL TP > 1 Mbps (%)             

In [43]:
# display the skewness of each column
# (the visible output omits the object-typed columns such as Start and End)
df.df_skewness()

Bearer Id                                    0.027
Start ms                                     0.001
End ms                                      -0.001
Dur. (ms)                                    3.953
IMSI                                        41.046
MSISDN/Number                              332.156
IMEI                                         1.071
Avg RTT DL (ms)                             62.908
Avg RTT UL (ms)                             28.457
Avg Bearer TP DL (kbps)                      2.589
Avg Bearer TP UL (kbps)                      4.503
TCP DL Retrans. Vol (Bytes)                 15.952
TCP UL Retrans. Vol (Bytes)                 84.113
DL TP < 50 Kbps (%)                         -2.298
50 Kbps < DL TP < 250 Kbps (%)               3.271
250 Kbps < DL TP < 1 Mbps (%)                4.566
DL TP > 1 Mbps (%)                           5.370
UL TP < 10 Kbps (%)                         -8.985
10 Kbps < UL TP < 50 Kbps (%)               10.944
50 Kbps < UL TP < 300 Kbps (%) 

In [None]:
# displays the size (or shape) of the dataframe
# FIXME(review): this call fails with
#   AttributeError: 'DataDescription' object has no attribute 'df_size'
# and the cell has no execution count. Replace `df_size` with the method that
# DataDescription actually provides for the shape (check the data_description
# module), or add such a method there.
df.df_size()

### Data Transformation
* Loading the Dataset
* Understanding the Dataset
* Looking for null values in the Dataset
* Handling missing values using different techniques
* Save cleaned Dataset

AttributeError: 'DataDescription' object has no attribute 'df_size'

### Data Extraction
* Loading the Dataset
* Understanding the Dataset
* Looking for null values in the Dataset
* Handling missing values using different techniques
* Save cleaned Dataset