## Data Understanding

In [1]:
# Standard library
import gc
import pickle
import sys
import warnings

# Third-party
import numpy as np
import pandas as pd

# Make the project's helper modules under ../scripts importable.
sys.path.insert(1, "../scripts")
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

In [2]:
# Project helper classes, resolved through the ../scripts entry added to sys.path above.
from data_cleaning import DataCleaner
from data_summary import DataSummarizer

# One shared instance of each helper is reused by every later cell.
cleaner, summary = DataCleaner(), DataSummarizer()

In [3]:
# Field descriptions that accompany the dataset
colum_descript = pd.read_excel('../resources/Field Descriptions.xlsx')
# Raw session records
raw_df = pd.read_pickle('../data/telecom_xdr.pkl')
# Copy of the raw records with exact duplicate rows removed (first occurrence kept)
raw_df_droped_dup = raw_df.drop_duplicates(keep='first')

In [4]:
# Name mismatches between data and description:
# description fields that have no identically named column in the data.
colum_descript.loc[~colum_descript['Fields'].isin(raw_df.columns), 'Fields'].tolist()

['bearer id',
 'Dur. (s)',
 'YouTube DL (Bytes)',
 'YouTube UL (Bytes)',
 'Other DL',
 'Other UL']

In [5]:
# Data columns that have no identically named entry in the description sheet.
raw_df.columns[~raw_df.columns.isin(colum_descript['Fields'].tolist())].tolist()

['Bearer Id',
 'Youtube DL (Bytes)',
 'Youtube UL (Bytes)',
 'Other DL (Bytes)',
 'Other UL (Bytes)']

In [6]:
# Match the data's column names to the names used in the description sheet.
# The rename starts from the de-duplicated frame: renaming `raw_df` instead would
# silently discard the drop_duplicates() step performed when the data was loaded.
column_name_fixes = {
    'Bearer Id': 'bearer id',
    'Youtube DL (Bytes)': 'YouTube DL (Bytes)',
    'Youtube UL (Bytes)': 'YouTube UL (Bytes)',
    'Other DL (Bytes)': 'Other DL',
    'Other UL (Bytes)': 'Other UL',
}
raw_df_renamed = raw_df_droped_dup.rename(columns=column_name_fixes)
# The raw frame is no longer needed; release it before the heavier steps.
del raw_df
gc.collect()

1303

## Data Pre-Processing
Data preprocessing is an integral step in Machine Learning: the quality of the data, and of the useful information that can be derived from it, directly affects the ability of our model to learn. It is therefore extremely important that we preprocess our data before feeding it into our model.
#### Handling Null Values

In [7]:
# Original data summary
raw_df_renamed.info()
# A bare `.head()` in the middle of a cell is evaluated and thrown away;
# display() (provided by IPython in notebooks) actually renders the preview.
display(raw_df_renamed.head())
# percent_missing prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
summary.percent_missing(raw_df_renamed)
# Last expression of the cell: rendered as the per-column summary table.
summary.summarise_columns(raw_df_renamed)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   bearer id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name                        148848 non-null  object 
 10  Avg RTT DL (ms)     

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
27,Activity Duration UL (ms),1,0.0,float64,106293
28,Dur. (ms).1,1,0.0,float64,122872
29,Handset Manufacturer,572,0.0,object,171
30,Handset Type,572,0.0,object,1397
39,Social Media DL (Bytes),0,0.0,float64,146856
40,Social Media UL (Bytes),0,0.0,float64,59078
41,Google DL (Bytes),0,0.0,float64,149024
42,Google UL (Bytes),0,0.0,float64,147267
43,Email DL (Bytes),0,0.0,float64,146916
44,Email UL (Bytes),0,0.0,float64,138699


#### Missing values & Data-Types

In [8]:
# Missing values are filled in later rather than dropping sparse columns.
# Columns removed here:
#   - 'Dur. (ms).1': same duration as 'Dur. (ms)', just in micro- instead of milliseconds
#   - 'IMSI', 'IMEI': do not appear to be needed by the later tasks
redundant_columns = ['Dur. (ms).1', 'IMSI', 'IMEI']
raw_df_droped = raw_df_renamed.drop(columns=redundant_columns)
# Free the superseded frame.
del raw_df_renamed
gc.collect()


0

In [9]:
# Inspect the columns with missing values (alongside their descriptions) and the
# complete ones. The last argument is the missing-percentage cut-off: 0 reports
# every column with at least one missing value; 30 would be a stricter alternative.
cutoff_percent = 0
missing, good = summary.columns_missing_most_values(raw_df_droped, colum_descript, cutoff_percent)


 38 columns are missing atleast 1 value, they are:

                                      Fields                                        Description  % Missing
55                          Total UL (Bytes)  Data volume (in Bytes) sent by the MS during t...   0.000667
27                 Activity Duration DL (ms)  Activity Duration for downlink (ms) - excludin...   0.000667
14                   Avg Bearer TP UL (kbps)  Average Bearer Throughput for uplink (kbps) - ...   0.000667
13                   Avg Bearer TP DL (kbps)  Average Bearer Throughput for Downlink (kbps) ...   0.000667
54                          Total DL (Bytes)  Data volume (in Bytes) received by the MS duri...   0.000667
28                 Activity Duration UL (ms)  Activity Duration for uplink (ms) - excluding ...   0.000667
4                                        End         End time of the xDR (last frame timestamp)   0.000667
3                                   Start ms  Milliseconds offset of start time for the xDR 

##### !! There is no single unique column that can be used as a key
##### Hence, we will use the index as unique id for the rows 

In [10]:
# No column is unique per row, so expose the frame's own index as an 'index' column.
raw_df_droped = raw_df_droped.assign(index=raw_df_droped.index)

##### The columns can be grouped into seven different types:
* **Categorical string**:  categorical properties with string values
* **Categorical numeric**:  categorical properties with numeric values
* **Time Series**:  system time stamps of events
* **Numerical percentages**: Performance distributions over speed ranges in %
* **Numerical speed**:  communication speed measured in Kbps or roundtrip time in milli-sec
* **Numerical time-gap**:   Time-gap measures in milli-sec
* **Numerical data-volume**:    Data volume measures in Bytes

In [11]:
# --- Categorical columns -----------------------------------------------------
categorical_string_columns = [
    'Handset Type',
    'Handset Manufacturer',
    'Last Location Name',
]
categorical_numeric_columns = [
    'MSISDN/Number',
    'bearer id',
]

# --- Timestamp columns -------------------------------------------------------
time_series_columns = [
    'Start',
    'End',
]

# --- Throughput distribution columns (percentages) ---------------------------
numeric_percentages = [
    'DL TP < 50 Kbps (%)',
    '50 Kbps < DL TP < 250 Kbps (%)',
    '250 Kbps < DL TP < 1 Mbps (%)',
    'DL TP > 1 Mbps (%)',
    'UL TP < 10 Kbps (%)',
    '10 Kbps < UL TP < 50 Kbps (%)',
    '50 Kbps < UL TP < 300 Kbps (%)',
    'UL TP > 300 Kbps (%)',
]

# --- Averaged speed / round-trip-time columns --------------------------------
numeric_averages = [
    'Avg RTT DL (ms)',
    'Avg RTT UL (ms)',
    'Avg Bearer TP DL (kbps)',
    'Avg Bearer TP UL (kbps)',
]

# --- Duration and time-count columns -----------------------------------------
numeric_time = [
    'Dur. (ms)',
    'Start ms',
    'End ms',
    'Activity Duration DL (ms)',
    'Activity Duration UL (ms)',
    'Nb of sec with 125000B < Vol DL',
    'Nb of sec with 1250B < Vol UL < 6250B',
    'Nb of sec with 31250B < Vol DL < 125000B',
    'Nb of sec with 37500B < Vol UL',
    'Nb of sec with 6250B < Vol DL < 31250B',
    'Nb of sec with 6250B < Vol UL < 37500B',
    'Nb of sec with Vol DL < 6250B',
    'Nb of sec with Vol UL < 1250B',
]

# --- Data-volume columns -----------------------------------------------------
numeric_data = [
    'Social Media DL (Bytes)',
    'Social Media UL (Bytes)',
    'Google DL (Bytes)',
    'Google UL (Bytes)',
    'Email DL (Bytes)',
    'Email UL (Bytes)',
    'YouTube DL (Bytes)',
    'YouTube UL (Bytes)',
    'Netflix DL (Bytes)',
    'Netflix UL (Bytes)',
    'Gaming DL (Bytes)',
    'Gaming UL (Bytes)',
    'HTTP DL (Bytes)',
    'HTTP UL (Bytes)',
    'Other DL',
    'Other UL',
    'Total UL (Bytes)',
    'Total DL (Bytes)',
    'TCP DL Retrans. Vol (Bytes)',
    'TCP UL Retrans. Vol (Bytes)',
]

# Lookup of column group -> column names; the numeric groups are nested under 'num'.
col_types = dict(
    cat_str=categorical_string_columns,
    cat_dig=categorical_numeric_columns,
    time_series=time_series_columns,
    num_dist=numeric_percentages,
    num=dict(
        time=numeric_time,
        avg=numeric_averages,
        data=numeric_data,
    ),
)

#### Fill the missing values according to their type

In [12]:
# percent_missing prints its own report and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
summary.percent_missing(raw_df_droped)

The dataset contains 12.96 % ( 1030249 )missing values.
None


In [13]:
# Fill the missing values, passing the column grouping so each type can be handled accordingly.
raw_df_filled = cleaner.fill_missing(raw_df_droped, col_types)
# percent_missing prints its own report and returns None; calling it directly
# avoids the stray "None" line that print() added.
summary.percent_missing(raw_df_filled)

The dataset contains 0.0 % ( 0 )missing values.
None


#### Convert Bytes to MB & ms to sec

In [16]:
# Millisecond columns handed to ms_to_s.
ms_columns = [
    'Dur. (ms)',
    'Start ms',
    'End ms',
    'Activity Duration DL (ms)',
    'Activity Duration UL (ms)',
]
# The data-volume columns go through convert_bytes_to_megabytes first;
# its result is then passed to ms_to_s together with the millisecond columns.
df_cleaned = cleaner.ms_to_s(
    cleaner.convert_bytes_to_megabytes(raw_df_filled, numeric_data),
    ms_columns,
)

In [17]:
# Old column name -> new name carrying the unit used after conversion.
unit_renames = {
    'Social Media DL (Bytes)': 'Social Media DL (MB)',
    'Social Media UL (Bytes)': 'Social Media UL (MB)',
    'Google DL (Bytes)': 'Google DL (MB)',
    'Google UL (Bytes)': 'Google UL (MB)',
    'Email DL (Bytes)': 'Email DL (MB)',
    'Email UL (Bytes)': 'Email UL (MB)',
    'YouTube DL (Bytes)': 'YouTube DL (MB)',
    'YouTube UL (Bytes)': 'YouTube UL (MB)',
    'Netflix DL (Bytes)': 'Netflix DL (MB)',
    'Netflix UL (Bytes)': 'Netflix UL (MB)',
    'Gaming DL (Bytes)': 'Gaming DL (MB)',
    'Gaming UL (Bytes)': 'Gaming UL (MB)',
    'HTTP DL (Bytes)': 'HTTP DL (MB)',
    'HTTP UL (Bytes)': 'HTTP UL (MB)',
    'Other DL': 'Other DL (MB)',
    'Other UL': 'Other UL (MB)',
    'Total DL (Bytes)': 'Total DL (MB)',
    'Total UL (Bytes)': 'Total UL (MB)',
    'TCP UL Retrans. Vol (Bytes)': 'TCP UL Retrans. Vol (MB)',
    'TCP DL Retrans. Vol (Bytes)': 'TCP DL Retrans. Vol (MB)',
    'Dur. (ms)': 'Duration (sec)',
    'Start ms': 'Start (sec)',
    'End ms': 'End (sec)',
    'Activity Duration DL (ms)': 'Activity Duration DL (sec)',
    'Activity Duration UL (ms)': 'Activity Duration UL (sec)',
}
df_cleaned.rename(columns=unit_renames, inplace=True)

# Column groups restated with the new names so that col_types matches df_cleaned.
numeric_time = [
    'Duration (sec)',
    'Start (sec)',
    'End (sec)',
    'Activity Duration DL (sec)',
    'Activity Duration UL (sec)',
    'Nb of sec with 125000B < Vol DL',
    'Nb of sec with 1250B < Vol UL < 6250B',
    'Nb of sec with 31250B < Vol DL < 125000B',
    'Nb of sec with 37500B < Vol UL',
    'Nb of sec with 6250B < Vol DL < 31250B',
    'Nb of sec with 6250B < Vol UL < 37500B',
    'Nb of sec with Vol DL < 6250B',
    'Nb of sec with Vol UL < 1250B',
]
numeric_data = [
    'Social Media DL (MB)',
    'Social Media UL (MB)',
    'Google DL (MB)',
    'Google UL (MB)',
    'Email DL (MB)',
    'Email UL (MB)',
    'YouTube DL (MB)',
    'YouTube UL (MB)',
    'Netflix DL (MB)',
    'Netflix UL (MB)',
    'Gaming DL (MB)',
    'Gaming UL (MB)',
    'HTTP DL (MB)',
    'HTTP UL (MB)',
    'Other DL (MB)',
    'Other UL (MB)',
    'Total UL (MB)',
    'Total DL (MB)',
    'TCP DL Retrans. Vol (MB)',
    'TCP UL Retrans. Vol (MB)',
]
col_types['num'].update(time=numeric_time, data=numeric_data)

In [18]:
# Persist the results as pickle dumps: the cleaned frame, the field
# descriptions, and the column-type lookup.
for frame, target in (
    (df_cleaned, "../data/df_cleaned.pkl"),
    (colum_descript, "../data/colum_descript.pkl"),
):
    frame.to_pickle(target)

with open('../data/col_types.pickle', 'wb') as col_types_file:
    pickle.dump(col_types, col_types_file, protocol=pickle.HIGHEST_PROTOCOL)