<h1> TeleCo Customers Data Exploration</h1>

<h2> Importing data and packages </h2>

Importing essential libraries

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from pandasql import sqldf
import matplotlib.pyplot as plt
import seaborn as sns
import sys

Suppressing warning messages

In [2]:
warnings.filterwarnings('ignore')

Setting file paths

In [3]:
# Every location is derived from the parent of the current working directory,
# so the results depend on where the kernel was started.
notebook_cwd = os.getcwd()
parent_dir = os.path.abspath(os.path.join(notebook_cwd, os.pardir))

# Folder that is added to the import path below.
scripts_dir = os.path.join(parent_dir, "scripts")

# Full path of the CSV file itself (not just of the data folder).
data_path = os.path.join(parent_dir, "data", "teleco_customers_data.csv")

# Index 1 leaves sys.path[0] untouched while placing scripts_dir ahead of the
# remaining entries.
sys.path.insert(1, scripts_dir)

Importing data cleaning and summarizing scripts

In [4]:
# Project-local helper classes; these imports rely on scripts_dir having been
# added to sys.path in the preceding cell.
from data_cleaning_functions import DataCleaner as Cleaner
from data_summarizing_functions import DataSummarizer as Sumar

# One instance of each helper, reused by the cells that follow.
cleaner, sumar = Cleaner(), Sumar()

Loading the data

In [5]:
# Read the raw CSV located at data_path into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

<h2>Data Cleaning</h2>

Taking a first look at the data

In [6]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
0,1.311448e+19,4/4/2019 12:01,770.0,4/25/2019 14:35,662.0,1823652.0,208201400000000.0,33664960000.0,35521210000000.0,9.16456699548519E+015,...,15854611.0,2501332.0,8198936.0,9656251.0,278082303.0,14344150.0,171744450.0,8814393.0,36749741.0,308879636.0
1,1.311448e+19,4/9/2019 13:04,235.0,4/25/2019 8:15,606.0,1365104.0,208201900000000.0,33681850000.0,35794010000000.0,L77566A,...,20247395.0,19111729.0,18338413.0,17227132.0,608750074.0,1170709.0,526904238.0,15055145.0,53800391.0,653384965.0
2,1.311448e+19,4/9/2019 17:42,1.0,4/25/2019 11:58,652.0,1361762.0,208200300000000.0,33760630000.0,35281510000000.0,D42335A,...,19725661.0,14699576.0,17587794.0,6163408.0,229584621.0,395630.0,410692588.0,4215763.0,27883638.0,279807335.0
3,1.311448e+19,4/10/2019 0:31,486.0,4/25/2019 7:36,171.0,1321509.0,208201400000000.0,33750340000.0,35356610000000.0,T21824A,...,21388122.0,15146643.0,13994646.0,1097942.0,799538153.0,10849722.0,749039933.0,12797283.0,43324218.0,846028530.0
4,1.311448e+19,4/12/2019 20:10,565.0,4/25/2019 10:40,954.0,1089009.0,208201400000000.0,33699800000.0,35407010000000.0,D88865A,...,15259380.0,18962873.0,17124581.0,415218.0,527707248.0,3529801.0,550709500.0,13910322.0,38542814.0,569138589.0


In [7]:
# DataFrame.shape is a (n_rows, n_columns) tuple.
df.shape # checking number of rows and columns

(150001, 55)

In [8]:
# Per-column overview of the raw frame; the rendered result lists, for each
# variable, its missing count, missing percentage, data type and number of
# unique values.
sumar.summ_columns(df) # looking at all available columns

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,Bearer Id,991,1.0,float64,134709
1,Start,1,0.0,object,9998
2,Start ms,1,0.0,float64,1001
3,End,1,0.0,object,6404
4,End ms,1,0.0,float64,1001
5,Dur. (ms),1,0.0,float64,89526
6,IMSI,570,0.0,float64,107266
7,MSISDN/Number,1066,1.0,float64,106857
8,IMEI,572,0.0,float64,107271
9,Last Location Name,1153,1.0,object,45548


**Observation**
<ul>
    <li> There are 150,001 rows and 55 columns of data.</li>
    <li> Some columns are missing up to 87% of their values.</li>
    <li> The columns are of two data types: string and float</li>
    
</ul>

Dimensionality reduction based on missing values: columns with more than 30% missing data will be removed.

In [9]:
# Cut-off, in percent, handed to the cleaner for dropping sparse columns.
MISSING_PERCENT_CUTOFF = 30

df2 = cleaner.reduce_dim_missing(df, MISSING_PERCENT_CUTOFF)

# Shape after the reduction, for comparison with the raw frame.
df2.shape

(150001, 45)

In [10]:
# Columns retained for the analysis, grouped by theme. The concatenation below
# reproduces the original ordering exactly.
session_columns = [
    "Bearer Id",
    "Dur. (ms)",
    "IMSI",
    "MSISDN/Number",
    "IMEI",
    "Activity Duration DL (ms)",
    "Activity Duration UL (ms)",
]
handset_columns = [
    "Handset Manufacturer",
    "Handset Type",
]
traffic_columns = [
    "Social Media DL (Bytes)",
    "Social Media UL (Bytes)",
    "Google DL (Bytes)",
    "Google UL (Bytes)",
    "Email DL (Bytes)",
    "Email UL (Bytes)",
    "Youtube DL (Bytes)",
    "Youtube UL (Bytes)",
    "Netflix DL (Bytes)",
    "Netflix UL (Bytes)",
    "Gaming DL (Bytes)",
    "Gaming UL (Bytes)",
    "Other DL (Bytes)",
    "Other UL (Bytes)",
    "Total UL (Bytes)",
    "Total DL (Bytes)",
]
keep_list = session_columns + handset_columns + traffic_columns

# NOTE(review): the third argument presumably tells remove_cols to KEEP the
# listed columns rather than drop them -- confirm against DataCleaner.remove_cols.
df3 = cleaner.remove_cols(df2, keep_list, True)
sumar.summ_columns(df3)

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,Bearer Id,991,1.0,float64,134709
1,Dur. (ms),1,0.0,float64,89526
2,IMSI,570,0.0,float64,107266
3,MSISDN/Number,1066,1.0,float64,106857
4,IMEI,572,0.0,float64,107271
5,Activity Duration DL (ms),1,0.0,float64,102561
6,Activity Duration UL (ms),1,0.0,float64,106293
7,Handset Manufacturer,572,0.0,object,171
8,Handset Type,572,0.0,object,1397
9,Social Media DL (Bytes),0,0.0,float64,146856


Adjusting the data type of id columns

In [11]:
# Identifier columns: these are labels, not quantities, so they are converted
# to object type.
id_columns = ['Bearer Id', 'IMSI', 'MSISDN/Number', 'IMEI']

# NOTE(review): in the summary rendered after this step the missing counts of
# these columns are 0 although no imputation has run yet -- presumably the
# missing entries were turned into strings; confirm against
# DataCleaner.format_number before grouping on MSISDN/Number.
df3 = cleaner.format_number(df3, id_columns)
sumar.summ_columns(df3)

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,Bearer Id,0,0.0,object,134709
1,Dur. (ms),1,0.0,float64,89526
2,IMSI,0,0.0,object,107266
3,MSISDN/Number,0,0.0,object,106857
4,IMEI,0,0.0,object,107271
5,Activity Duration DL (ms),1,0.0,float64,102561
6,Activity Duration UL (ms),1,0.0,float64,106293
7,Handset Manufacturer,572,0.0,object,171
8,Handset Type,572,0.0,object,1397
9,Social Media DL (Bytes),0,0.0,float64,146856


Missing values of categorical variables will be filled with the mode,
and missing values of numerical variables will be filled with the median.

In [12]:
# Impute in two passes, in this order: mode for the categorical variables,
# then median for the numerical ones.
# NOTE(review): 'Huawei B528S-23A' is counted 20324 times in the top-handset
# table but only 19752 times under the Huawei manufacturer; the gap (572) equals
# the number of missing handset values before this step. This suggests the two
# handset columns were imputed independently of each other -- worth confirming.
for fill_missing in (cleaner.fill_missing_by_mode, cleaner.fill_missing_by_median):
    df3 = fill_missing(df3)

sumar.summ_columns(df3)

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,Bearer Id,0,0.0,object,134709
1,Dur. (ms),0,0.0,float64,89525
2,IMSI,0,0.0,object,107266
3,MSISDN/Number,0,0.0,object,106857
4,IMEI,0,0.0,object,107271
5,Activity Duration DL (ms),0,0.0,float64,102561
6,Activity Duration UL (ms),0,0.0,float64,106293
7,Handset Manufacturer,0,0.0,object,170
8,Handset Type,0,0.0,object,1396
9,Social Media DL (Bytes),0,0.0,float64,146856


Converting Bytes to Megabytes.

In [13]:
# Marker identifying the byte-valued columns; in the summary rendered after
# this step those columns carry "(MB)" in their names instead.
BYTE_COLUMN_MARKER = "(Bytes)"

df3 = cleaner.byte_to_mb(df3, BYTE_COLUMN_MARKER)
sumar.summ_columns(df3)

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,Bearer Id,0,0.0,object,134709
1,Dur. (ms),0,0.0,float64,89525
2,IMSI,0,0.0,object,107266
3,MSISDN/Number,0,0.0,object,106857
4,IMEI,0,0.0,object,107271
5,Activity Duration DL (ms),0,0.0,float64,102561
6,Activity Duration UL (ms),0,0.0,float64,106293
7,Handset Manufacturer,0,0.0,object,170
8,Handset Type,0,0.0,object,1396
9,Social Media DL (MB),0,0.0,float64,146856


<h2>Univariate Analysis</h2>

**SQL query based functions to explore handsets and their manufacturers**

What are the top 10 handsets used by customers?

In [14]:
# Ten most frequent handset types.
# The frame is passed by NAME ("df3") together with globals() -- presumably so
# the SQL-based helper can look the table up in the notebook namespace (TODO
# confirm against DataSummarizer.get_top_n).
sumar.get_top_n("df3", "Handset Type", 10, globals())

Unnamed: 0,Handset Type,user_count
0,Huawei B528S-23A,20324
1,Apple iPhone 6S (A1688),9419
2,Apple iPhone 6 (A1586),9023
3,Apple iPhone 7 (A1778),6326
4,Apple iPhone Se (A1723),5187
5,Apple iPhone 8 (A1905),4993
6,Apple iPhone Xr (A2105),4568
7,Samsung Galaxy S8 (Sm-G950F),4520
8,Apple iPhone X (A1901),3813
9,Samsung Galaxy A5 Sm-A520F,3724


What are the top 3 handset manufacturers?

In [15]:
# Three most frequent handset manufacturers; kept in top_man because the next
# cell reads its "Handset Manufacturer" column.
# As above, the frame is referenced by name ("df3") and resolved via globals().
top_man = sumar.get_top_n("df3", "Handset Manufacturer", 3, globals())
top_man

Unnamed: 0,Handset Manufacturer,user_count
0,Apple,60137
1,Samsung,40839
2,Huawei,34423


What are the top 5 handsets for each of the top 3 handset manufacturers?

In [16]:
# Manufacturer names taken from the top-manufacturer table, as a plain list.
top_manufacturer_names = top_man["Handset Manufacturer"].to_list()

# The frame is again referenced by name ('df3') and resolved through globals().
sumar.manByHandset(top_manufacturer_names, 'df3', globals())

  Handset Manufacturer             Handset Type  num_users
0                Apple  Apple iPhone 6S (A1688)       9419
1                Apple   Apple iPhone 6 (A1586)       9023
2                Apple   Apple iPhone 7 (A1778)       6326
3                Apple  Apple iPhone Se (A1723)       5187
4                Apple   Apple iPhone 8 (A1905)       4993 

  Handset Manufacturer                  Handset Type  num_users
0              Samsung  Samsung Galaxy S8 (Sm-G950F)       4520
1              Samsung    Samsung Galaxy A5 Sm-A520F       3724
2              Samsung   Samsung Galaxy J5 (Sm-J530)       3696
3              Samsung   Samsung Galaxy J3 (Sm-J330)       3484
4              Samsung  Samsung Galaxy S7 (Sm-G930X)       3199 

  Handset Manufacturer                    Handset Type  num_users
0               Huawei                Huawei B528S-23A      19752
1               Huawei                    Huawei E5180       2079
2               Huawei  Huawei P20 Lite Huawei Nova 3E      

**Observation**
<ul>
<li> The most used handset type is Huawei B528S-23A </li>
<li> But most of the handsets used are manufactured by Apple </li>
<li> From among the top three manufacturers, Apple iPhone 6S, Samsung Galaxy S8 (Sm-G950F), and Huawei B528S-23A are 
the most used handsets</li>

</ul>

<h3> Aggregating the data for further exploration </h3>

In [21]:
# Key the data is grouped on.
grouping_lis = ["MSISDN/Number"]

# One row per aggregated column: (source column, metric, output name).
# Keeping the three pieces side by side guarantees the parallel lists derived
# below always have matching lengths and ordering.
aggregation_spec = [
    ("Bearer Id", "count", "xDr_session_count"),
    ("Dur. (ms)", "sum", "session_dur"),
    ("Total DL (MB)", "sum", "Total_DL"),
    ("Total UL (MB)", "sum", "Total_UL"),
    ("Social Media DL (MB)", "sum", "Social_DL"),
    ("Social Media UL (MB)", "sum", "Social_UL"),
    ("Youtube DL (MB)", "sum", "Youtube_DL"),
    ("Youtube UL (MB)", "sum", "Youtube_UL"),
    ("Netflix DL (MB)", "sum", "Netflix_DL"),
    ("Netflix UL (MB)", "sum", "Netflix_UL"),
    ("Google DL (MB)", "sum", "Google_DL"),
    ("Google UL (MB)", "sum", "Google_UL"),
    ("Gaming DL (MB)", "sum", "Gaming_DL"),
    ("Gaming UL (MB)", "sum", "Gaming_UL"),
    ("Email DL (MB)", "sum", "Email_DL"),
    ("Email UL (MB)", "sum", "Email_UL"),
    ("Other DL (MB)", "sum", "Other_DL"),
    ("Other UL (MB)", "sum", "Other_UL"),
]
aggr_lis = [source for source, _, _ in aggregation_spec]
metric_lis = [metric for _, metric, _ in aggregation_spec]
col_names = [output for _, _, output in aggregation_spec]

aggr_df = sumar.find_agg(df3, grouping_lis, aggr_lis, metric_lis, col_names)

# Merge each application's DL/UL pair into one lower-cased column name.
# The return value is ignored, so combineColumns presumably updates aggr_df in
# place (TODO confirm); the meaning of the trailing True flag is not visible here.
for app in ("Youtube", "Netflix", "Google", "Gaming", "Email", "Social", "Other"):
    sumar.combineColumns(aggr_df, f"{app}_DL", f"{app}_UL", app.lower(), True)

In [22]:
# Per-column overview of the aggregated frame (one row per MSISDN/Number group).
sumar.summ_columns(aggr_df)

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,MSISDN/Number,0,0.0,object,106857
1,xDr_session_count,0,0.0,int64,18
2,session_dur,0,0.0,float64,77539
3,Total_DL,0,0.0,float64,106853
4,Total_UL,0,0.0,float64,106782
5,Social_DL,0,0.0,float64,105780
6,Social_UL,0,0.0,float64,66861
7,Youtube_DL,0,0.0,float64,106713
8,Youtube_UL,0,0.0,float64,106672
9,Netflix_DL,0,0.0,float64,106681
