# User Analytics in the Telecommunication Industry 

## Import Libraries

In [5]:
## Import Libraries
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 
from pandas.api.types import is_string_dtype, is_numeric_dtype

## Import Data

In [18]:
# Location of the raw CSV input; a relative path, so it resolves against the
# kernel's current working directory.
CSV_PATH = "../data/raw/rawData.csv"

In [19]:
# taking a csv file path and reading a dataframe

def read_proccessed_data(csv_path):
    """Load a CSV file into a DataFrame.

    Args:
        csv_path: Location of the CSV file, handed straight to ``pd.read_csv``.

    Returns:
        The loaded ``pd.DataFrame``, or ``None`` if the file does not exist.
        Callers should check for ``None`` before using the result.
    """
    try:
        frame = pd.read_csv(csv_path)
    except FileNotFoundError:
        # A missing file is reported on stdout instead of being raised.
        print("file not found")
        return None
    else:
        print("file read as csv")
        return frame

In [20]:
## getting number of columns, row and column information
def get_data_info(xDR_df: pd.DataFrame):
    """Print the frame's dimensions followed by its column summary.

    Args:
        xDR_df: DataFrame to describe.

    Returns:
        The result of ``xDR_df.info()``. ``DataFrame.info`` writes its report
        to stdout and returns ``None``, so this function returns ``None``.
    """
    n_rows, n_cols = xDR_df.shape

    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}")

    # info() prints per-column dtypes and non-null counts as a side effect.
    info_result = xDR_df.info()
    return info_result

In [21]:
## basic statistics of each column and see the data at glance
def get_statistics_info(xDR_df: pd.DataFrame):
    """Return descriptive statistics for every column of the frame.

    Args:
        xDR_df: DataFrame to summarise.

    Returns:
        The DataFrame produced by ``describe(include='all')``, which covers
        non-numeric columns as well as numeric ones.
    """
    summary = xDR_df.describe(include='all')
    return summary

In [22]:
# Load the CSV at CSV_PATH into xDR_df, print its dimensions and column
# summary, then display its descriptive statistics as the cell output.
# NOTE: read_proccessed_data returns None when the file is missing, in which
# case the calls below fail on the None value.

xDR_df = read_proccessed_data(CSV_PATH)
get_data_info(xDR_df)
get_statistics_info(xDR_df)

file read as csv
Number of rows: 150001
Number of columns: 55
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150001 entries, 0 to 150000
Data columns (total 55 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Bearer Id                                 149010 non-null  float64
 1   Start                                     150000 non-null  object 
 2   Start ms                                  150000 non-null  float64
 3   End                                       150000 non-null  object 
 4   End ms                                    150000 non-null  float64
 5   Dur. (ms)                                 150000 non-null  float64
 6   IMSI                                      149431 non-null  float64
 7   MSISDN/Number                             148935 non-null  float64
 8   IMEI                                      149429 non-null  float64
 9   Last Location Name            

Unnamed: 0,Bearer Id,Start,Start ms,End,End ms,Dur. (ms),IMSI,MSISDN/Number,IMEI,Last Location Name,...,Youtube DL (Bytes),Youtube UL (Bytes),Netflix DL (Bytes),Netflix UL (Bytes),Gaming DL (Bytes),Gaming UL (Bytes),Other DL (Bytes),Other UL (Bytes),Total UL (Bytes),Total DL (Bytes)
count,149010.0,150000,150000.0,150000,150000.0,150000.0,149431.0,148935.0,149429.0,148848,...,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150001.0,150000.0,150000.0
unique,,9997,,6403,,,,,,45547,...,,,,,,,,,,
top,,4/26/2019 7:25,,4/25/2019 0:01,,,,,,D41377B,...,,,,,,,,,,
freq,,203,,1150,,,,,,80,...,,,,,,,,,,
mean,1.013887e+19,,499.1882,,498.80088,104608.6,208201600000000.0,41882820000.0,48474550000000.0,,...,11634070.0,11009410.0,11626850.0,11001750.0,422044700.0,8288398.0,421100500.0,8264799.0,41121210.0,454643400.0
std,2.893173e+18,,288.611834,,288.097653,81037.62,21488090000.0,2447443000000.0,22416370000000.0,,...,6710569.0,6345423.0,6725218.0,6359490.0,243967500.0,4782700.0,243205000.0,4769004.0,11276390.0,244142900.0
min,6.917538e+18,,0.0,,0.0,7142.0,204047100000000.0,33601000000.0,440015200000.0,,...,53.0,105.0,42.0,35.0,2516.0,59.0,3290.0,148.0,2866892.0,7114041.0
25%,7.349883e+18,,250.0,,251.0,57440.5,208201400000000.0,33651300000.0,35460710000000.0,,...,5833501.0,5517965.0,5777156.0,5475981.0,210473300.0,4128476.0,210186900.0,4145943.0,33222010.0,243106800.0
50%,7.349883e+18,,499.0,,500.0,86399.0,208201500000000.0,33663710000.0,35722010000000.0,,...,11616020.0,11013450.0,11642220.0,10996380.0,423408100.0,8291208.0,421803000.0,8267071.0,41143310.0,455841100.0
75%,1.304243e+19,,749.0,,750.0,132430.2,208201800000000.0,33683490000.0,86119700000000.0,,...,17448520.0,16515560.0,17470480.0,16507270.0,633174200.0,12431620.0,631691800.0,12384150.0,49034240.0,665705500.0


### Distinct value count per column, to spot columns that hold a single constant value or a different value in every row

In [27]:
# Number of distinct values in each column (NaN counted as a value of its own),
# sorted ascending so the columns with the fewest distinct values come first.
(
    xDR_df.nunique(dropna=False)
    .to_frame(name='Unique Value Count')
    .sort_values(by='Unique Value Count', ascending=True)
)

Unnamed: 0,Unique Value Count
UL TP > 300 Kbps (%),64
50 Kbps < UL TP < 300 Kbps (%),69
250 Kbps < DL TP < 1 Mbps (%),74
50 Kbps < DL TP < 250 Kbps (%),85
10 Kbps < UL TP < 50 Kbps (%),86
DL TP > 1 Mbps (%),86
UL TP < 10 Kbps (%),99
DL TP < 50 Kbps (%),101
Handset Manufacturer,171
Avg RTT UL (ms),723
