## Tellco Telecom Data PreProcessing
- Load the data from sql dumped file
- Understand the Data
- Handle Missing Values
- Deal with Duplicate Data
- Address Outliers
- Save Processed Data

### Import necessary packages

In [1]:
import pandas as pd
import os
import sys

In [2]:

sys.path.append(os.path.abspath(os.path.join('../src')))

# Print the updated sys.path


from data_loader import load_sql_to_dataframe, run_sql_query
from data_cleaning_utils import DataCleaningUtil

cleaning_utils = DataCleaningUtil()

### Load The sql file

In [3]:
df = load_sql_to_dataframe('../data/telecom.sql')
print(df.head())

  df = pd.read_csv(data_io, delimiter='\t', header=None)
2023-12-13 12:07:46,051:logger:Load sql file dumped data to pandas dataframe.


           ("Bearer Id"          "Start" "Start ms"            "End" "End ms"  \
0  1.31144834608449e+19   4/4/2019 12:01        770  4/25/2019 14:35      662   
1  1.31144834828789e+19   4/9/2019 13:04        235   4/25/2019 8:15      606   
2  1.31144834840805e+19   4/9/2019 17:42          1  4/25/2019 11:58      652   
3  1.31144834854428e+19   4/10/2019 0:31        486   4/25/2019 7:36      171   
4  1.31144834994807e+19  4/12/2019 20:10        565  4/25/2019 10:40      954   

  "Dur. (ms)"           "IMSI" "MSISDN/Number"          "IMEI"  \
0     1823652  208201448079117     33664962239  35521209507511   
1     1365104  208201909211140     33681854413  35794009006359   
2     1361762  208200314458056     33760627129  35281510359387   
3     1321509  208201402342131     33750343200  35356610164913   
4     1089009  208201401415120     33699795932  35407009745539   

    "Last Location Name"  ... "Youtube DL (Bytes)" "Youtube UL (Bytes)"  \
0  9.16456699548519E+015  ...           1

In [4]:
df.columns

Index(['("Bearer Id"', '"Start"', '"Start ms"', '"End"', '"End ms"',
       '"Dur. (ms)"', '"IMSI"', '"MSISDN/Number"', '"IMEI"',
       '"Last Location Name"', '"Avg RTT DL (ms)"', '"Avg RTT UL (ms)"',
       '"Avg Bearer TP DL (kbps)"', '"Avg Bearer TP UL (kbps)"',
       '"TCP DL Retrans. Vol (Bytes)"', '"TCP UL Retrans. Vol (Bytes)"',
       '"DL TP < 50 Kbps (%)"', '"50 Kbps < DL TP < 250 Kbps (%)"',
       '"250 Kbps < DL TP < 1 Mbps (%)"', '"DL TP > 1 Mbps (%)"',
       '"UL TP < 10 Kbps (%)"', '"10 Kbps < UL TP < 50 Kbps (%)"',
       '"50 Kbps < UL TP < 300 Kbps (%)"', '"UL TP > 300 Kbps (%)"',
       '"HTTP DL (Bytes)"', '"HTTP UL (Bytes)"', '"Activity Duration DL (ms)"',
       '"Activity Duration UL (ms)"', '"Dur. (ms).1"',
       '"Handset Manufacturer"', '"Handset Type"',
       '"Nb of sec with 125000B < Vol DL"',
       '"Nb of sec with 1250B < Vol UL < 6250B"',
       '"Nb of sec with 31250B < Vol DL < 125000B"',
       '"Nb of sec with 37500B < Vol UL"',
       '"Nb o

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150005 entries, 0 to 150004
Data columns (total 55 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   ("Bearer Id"                                150005 non-null  object 
 1   "Start"                                     150001 non-null  object 
 2   "Start ms"                                  150001 non-null  object 
 3   "End"                                       150001 non-null  object 
 4   "End ms"                                    150001 non-null  object 
 5   "Dur. (ms)"                                 150001 non-null  object 
 6   "IMSI"                                      150001 non-null  object 
 7   "MSISDN/Number"                             150001 non-null  object 
 8   "IMEI"                                      150001 non-null  object 
 9   "Last Location Name"                        150001 non-null  object 
 

In [6]:
df.shape

(150005, 55)

### Understand the Data
- Rename columns to lowercase, replace characters, and remove railing underscores
- Identify the data types (numeric, categorical, text, etc.) of each feature.

In [7]:
df.columns

Index(['("Bearer Id"', '"Start"', '"Start ms"', '"End"', '"End ms"',
       '"Dur. (ms)"', '"IMSI"', '"MSISDN/Number"', '"IMEI"',
       '"Last Location Name"', '"Avg RTT DL (ms)"', '"Avg RTT UL (ms)"',
       '"Avg Bearer TP DL (kbps)"', '"Avg Bearer TP UL (kbps)"',
       '"TCP DL Retrans. Vol (Bytes)"', '"TCP UL Retrans. Vol (Bytes)"',
       '"DL TP < 50 Kbps (%)"', '"50 Kbps < DL TP < 250 Kbps (%)"',
       '"250 Kbps < DL TP < 1 Mbps (%)"', '"DL TP > 1 Mbps (%)"',
       '"UL TP < 10 Kbps (%)"', '"10 Kbps < UL TP < 50 Kbps (%)"',
       '"50 Kbps < UL TP < 300 Kbps (%)"', '"UL TP > 300 Kbps (%)"',
       '"HTTP DL (Bytes)"', '"HTTP UL (Bytes)"', '"Activity Duration DL (ms)"',
       '"Activity Duration UL (ms)"', '"Dur. (ms).1"',
       '"Handset Manufacturer"', '"Handset Type"',
       '"Nb of sec with 125000B < Vol DL"',
       '"Nb of sec with 1250B < Vol UL < 6250B"',
       '"Nb of sec with 31250B < Vol DL < 125000B"',
       '"Nb of sec with 37500B < Vol UL"',
       '"Nb o

In [8]:
df = cleaning_utils.clean_columns_name(df)
print(df.columns)

  .str.replace('(', '_')
  .str.replace(')', '_')
  .str.replace('_+', '_')
  .str.replace('.', '')
2023-12-13 11:18:52,138:logger:Rename columns to lowercase, replace characters, and remove duplicates and trailing underscores


Index(['bearer_id', 'start', 'start_ms', 'end', 'end_ms', 'dur_ms', 'imsi',
       'msisdn_number', 'imei', 'last_location_name', 'avg_rtt_dl_ms',
       'avg_rtt_ul_ms', 'avg_bearer_tp_dl_kbps', 'avg_bearer_tp_ul_kbps',
       'tcp_dl_retrans_vol_bytes', 'tcp_ul_retrans_vol_bytes',
       'dl_tp_<_50_kbps_pct', '50_kbps_<_dl_tp_<_250_kbps_pct',
       '250_kbps_<_dl_tp_<_1_mbps_pct', 'dl_tp_>_1_mbps_pct',
       'ul_tp_<_10_kbps_pct', '10_kbps_<_ul_tp_<_50_kbps_pct',
       '50_kbps_<_ul_tp_<_300_kbps_pct', 'ul_tp_>_300_kbps_pct',
       'http_dl_bytes', 'http_ul_bytes', 'activity_duration_dl_ms',
       'activity_duration_ul_ms', 'dur_ms_1', 'handset_manufacturer',
       'handset_type', 'nb_of_sec_with_125000b_<_vol_dl',
       'nb_of_sec_with_1250b_<_vol_ul_<_6250b',
       'nb_of_sec_with_31250b_<_vol_dl_<_125000b',
       'nb_of_sec_with_37500b_<_vol_ul',
       'nb_of_sec_with_6250b_<_vol_dl_<_31250b',
       'nb_of_sec_with_6250b_<_vol_ul_<_37500b',
       'nb_of_sec_with_vol_d

In [9]:
df.dtypes

bearer_id                                    object
start                                        object
start_ms                                     object
end                                          object
end_ms                                       object
dur_ms                                       object
imsi                                         object
msisdn_number                                object
imei                                         object
last_location_name                           object
avg_rtt_dl_ms                                object
avg_rtt_ul_ms                                object
avg_bearer_tp_dl_kbps                        object
avg_bearer_tp_ul_kbps                        object
tcp_dl_retrans_vol_bytes                     object
tcp_ul_retrans_vol_bytes                     object
dl_tp_<_50_kbps_pct                          object
50_kbps_<_dl_tp_<_250_kbps_pct               object
250_kbps_<_dl_tp_<_1_mbps_pct                object
dl_tp_>_1_mb

In [23]:
# Convert relevant columns to appropriate data types
numeric_columns = [
                    'bearer_id',"start_ms","end_ms",'dur_ms', 'imsi', 'msisdn_number', 'imei', 
                    'avg_rtt_dl_ms', 'avg_rtt_ul_ms','avg_bearer_tp_dl_kbps', 'avg_bearer_tp_ul_kbps',
                    'tcp_dl_retrans_vol_bytes', 'tcp_ul_retrans_vol_bytes', 'dl_tp_<_50_kbps_pct', 
                    '50_kbps_<_dl_tp_<_250_kbps_pct','250_kbps_<_dl_tp_<_1_mbps_pct', 'dl_tp_>_1_mbps_pct',
                    'ul_tp_<_10_kbps_pct', '10_kbps_<_ul_tp_<_50_kbps_pct', '50_kbps_<_ul_tp_<_300_kbps_pct',
                    'ul_tp_>_300_kbps_pct', 'http_dl_bytes', 'http_ul_bytes', 'activity_duration_dl_ms', 
                    'activity_duration_ul_ms', 'dur_ms_1', 'nb_of_sec_with_125000b_<_vol_dl',
                    'nb_of_sec_with_1250b_<_vol_ul_<_6250b', 'nb_of_sec_with_31250b_<_vol_dl_<_125000b',
                    'nb_of_sec_with_37500b_<_vol_ul', 'nb_of_sec_with_6250b_<_vol_dl_<_31250b', 
                    'nb_of_sec_with_6250b_<_vol_ul_<_37500b', 'nb_of_sec_with_vol_dl_<_6250b',
                    'nb_of_sec_with_vol_ul_<_1250b', 'total_ul_bytes', 'total_dl_bytes', "social_media_dl_bytes",
                    "social_media_ul_bytes", "google_dl_bytes", "google_ul_bytes", "email_dl_bytes", 
                    "email_ul_bytes", "youtube_dl_bytes", "youtube_ul_bytes", "netflix_dl_bytes", 
                    "netflix_ul_bytes", "gaming_dl_bytes", "gaming_ul_bytes", "other_dl_bytes",  "other_ul_bytes"                                                        
                ]

string_columns = ['last_location_name', 'handset_manufacturer', 'handset_type']
datetime_columns = ["start", "end"]

# Convert numeric columns to numeric data types
# df = cleaning_utils.change_datatype_to_float(df, numeric_columns)
df = cleaning_utils.change_datatype_to_string(df, string_columns)
# df = cleaning_utils.change_datatype_to_datetime(df, datetime_columns) 
print(df.dtypes)

2023-12-13 11:26:23,389:logger:"None of [Index(['last_location_name', 'handset_manufacturer', 'handset_type'], dtype='object')] are in the [columns]"


("Bearer Id"                                   object
"Start"                                        object
"Start ms"                                     object
"End"                                          object
"End ms"                                       object
"Dur. (ms)"                                    object
"IMSI"                                         object
"MSISDN/Number"                                object
"IMEI"                                         object
"Last Location Name"                           object
"Avg RTT DL (ms)"                              object
"Avg RTT UL (ms)"                              object
"Avg Bearer TP DL (kbps)"                      object
"Avg Bearer TP UL (kbps)"                      object
"TCP DL Retrans. Vol (Bytes)"                  object
"TCP UL Retrans. Vol (Bytes)"                  object
"DL TP < 50 Kbps (%)"                          object
"50 Kbps < DL TP < 250 Kbps (%)"               object
"250 Kbps < DL TP < 1 Mbps (

### Handle Missing Values

In [11]:
# cleaning_utils.get_missing_values_percent(df)
cleaning_utils.get_missing_values_percent(df)

2023-12-13 11:19:16,898:logger:Get missing values percentage from dataframe


Unnamed: 0,missing_percent
bearer_id,0.0
handset_manufacturer,0.002667
handset_type,0.002667
nb_of_sec_with_125000b_<_vol_dl,0.002667
nb_of_sec_with_1250b_<_vol_ul_<_6250b,0.002667
nb_of_sec_with_31250b_<_vol_dl_<_125000b,0.002667
nb_of_sec_with_37500b_<_vol_ul,0.002667
nb_of_sec_with_6250b_<_vol_dl_<_31250b,0.002667
nb_of_sec_with_6250b_<_vol_ul_<_37500b,0.002667
nb_of_sec_with_vol_dl_<_6250b,0.002667


#### Drop columns with high missing percentage (more than 30%)

In [13]:
high_missing_cols = df.columns[df.isnull().mean() >= 0.3]
df.drop(high_missing_cols, axis=1, inplace=True)
print(high_missing_cols)

Index([], dtype='object')


#### Moderate Missing Percentage (between 5% and 30%)

In [149]:
moderate_missing_cols = df.columns[(0.05 <= df.isnull().mean()) & (df.isnull().mean() < 0.3)]
df[moderate_missing_cols] = df[moderate_missing_cols].fillna(method='ffill')
print(moderate_missing_cols)


Index([], dtype='object')


In [150]:
low_missing_cols = df.columns[df.isnull().mean() < 0.05]
print(low_missing_cols)

Index(['bearer_id', 'start', 'start_ms', 'end', 'end_ms', 'dur_ms', 'imsi',
       'msisdn_number', 'imei', 'last_location_name', 'avg_rtt_dl_ms',
       'avg_rtt_ul_ms', 'avg_bearer_tp_dl_kbps', 'avg_bearer_tp_ul_kbps',
       'tcp_dl_retrans_vol_bytes', 'tcp_ul_retrans_vol_bytes',
       'dl_tp_<_50_kbps_pct', '50_kbps_<_dl_tp_<_250_kbps_pct',
       '250_kbps_<_dl_tp_<_1_mbps_pct', 'dl_tp_>_1_mbps_pct',
       'ul_tp_<_10_kbps_pct', '10_kbps_<_ul_tp_<_50_kbps_pct',
       '50_kbps_<_ul_tp_<_300_kbps_pct', 'ul_tp_>_300_kbps_pct',
       'http_dl_bytes', 'http_ul_bytes', 'activity_duration_dl_ms',
       'activity_duration_ul_ms', 'dur_ms_1', 'handset_manufacturer',
       'handset_type', 'nb_of_sec_with_125000b_<_vol_dl',
       'nb_of_sec_with_1250b_<_vol_ul_<_6250b',
       'nb_of_sec_with_31250b_<_vol_dl_<_125000b',
       'nb_of_sec_with_37500b_<_vol_ul',
       'nb_of_sec_with_6250b_<_vol_dl_<_31250b',
       'nb_of_sec_with_6250b_<_vol_ul_<_37500b',
       'nb_of_sec_with_vol_d

In [151]:
numeric_cols = df[low_missing_cols].select_dtypes(include=['float64']).columns

# Handle missing values for numeric columns
for column in numeric_cols:
    df[column].fillna(df[column].mean(), inplace=True)

print(numeric_cols)

Index(['social_media_dl_bytes', 'social_media_ul_bytes', 'google_dl_bytes',
       'google_ul_bytes', 'email_dl_bytes', 'email_ul_bytes',
       'youtube_dl_bytes', 'youtube_ul_bytes', 'netflix_dl_bytes',
       'netflix_ul_bytes', 'gaming_dl_bytes', 'gaming_ul_bytes',
       'other_dl_bytes', 'other_ul_bytes'],
      dtype='object')


In [152]:
string_cols = df[low_missing_cols].select_dtypes(include=['string']).columns

# Handle missing values for string columns
for column in string_cols:
    df[column].fillna('unknown', inplace=True)
    
print(string_cols)

Index(['last_location_name', 'handset_manufacturer', 'handset_type'], dtype='object')


In [153]:
# Step 1: Identify missing values
missing_values = df.isnull().sum()

# Step 2: Display the columns with missing values
columns_with_missing_values = missing_values[missing_values > 0]
print("Columns with Missing Values:\n", columns_with_missing_values)

# Calculate missing percentage for each column
missing_percentage = (df.isnull().sum() / len(df)) * 100

# Sort missing percentage values in descending order
missing_percentage_sorted = missing_percentage.sort_values(ascending=False)

# Display missing percentage for each column (sorted)
print("\nMissing Percentage for Each Column (Sorted):")
print(missing_percentage_sorted)


Columns with Missing Values:
 start                                       5
start_ms                                    4
end                                         5
end_ms                                      4
dur_ms                                      4
imsi                                        4
msisdn_number                               4
imei                                        4
avg_rtt_dl_ms                               4
avg_rtt_ul_ms                               4
avg_bearer_tp_dl_kbps                       4
avg_bearer_tp_ul_kbps                       4
tcp_dl_retrans_vol_bytes                    4
tcp_ul_retrans_vol_bytes                    4
dl_tp_<_50_kbps_pct                         4
50_kbps_<_dl_tp_<_250_kbps_pct              4
250_kbps_<_dl_tp_<_1_mbps_pct               4
dl_tp_>_1_mbps_pct                          4
ul_tp_<_10_kbps_pct                         4
10_kbps_<_ul_tp_<_50_kbps_pct               4
50_kbps_<_ul_tp_<_300_kbps_pct              4
ul_t

In [154]:
# Extract rows with both 'start' and 'end' missing
both_start_end_missing_rows = df[df['start'].isnull() & df['end'].isnull()]

# Calculate and fill missing 'start' and 'end' values
average_start_time = df['start'].mean()

for index, row in both_start_end_missing_rows.iterrows():
    calculated_start = average_start_time
    calculated_end = calculated_start + pd.to_timedelta(row['dur_ms'], unit='ms')
    
    df.at[index, 'start'] = calculated_start
    df.at[index, 'end'] = calculated_end

# Display the updated DataFrame
print("DataFrame after filling missing 'start' and 'end' values:")
print(both_start_end_missing_rows)


ValueError: unit must not be specified if the input is/contains a str

In [None]:
# Step 1: Identify missing values
missing_values = df.isnull().sum()

# Step 2: Display the columns with missing values
columns_with_missing_values = missing_values[missing_values > 0]
print("Columns with Missing Values:\n", columns_with_missing_values)

# Calculate missing percentage for each column
missing_percentage = (df.isnull().sum() / len(df)) * 100

# Sort missing percentage values in descending order
missing_percentage_sorted = missing_percentage.sort_values(ascending=False)

# Display missing percentage for each column (sorted)
print("\nMissing Percentage for Each Column (Sorted):")
print(missing_percentage_sorted)


Columns with Missing Values:
 Series([], dtype: int64)

Missing Percentage for Each Column (Sorted):
bearer_id                         0.0
activity_duration_ul_ms           0.0
handset_manufacturer              0.0
handset_type                      0.0
nb_of_sec_with_vol_dl_<_6250b     0.0
nb_of_sec_with_vol_ul_<_1250b     0.0
social_media_dl_bytes             0.0
social_media_ul_bytes             0.0
google_dl_bytes                   0.0
google_ul_bytes                   0.0
email_dl_bytes                    0.0
email_ul_bytes                    0.0
youtube_dl_bytes                  0.0
youtube_ul_bytes                  0.0
netflix_dl_bytes                  0.0
netflix_ul_bytes                  0.0
gaming_dl_bytes                   0.0
gaming_ul_bytes                   0.0
other_dl_bytes                    0.0
other_ul_bytes                    0.0
total_ul_bytes                    0.0
dur_ms_1                          0.0
activity_duration_dl_ms           0.0
start                    

### Deal with Duplicate Data

In [None]:
# Check for duplicate rows
duplicate_rows = df[df.duplicated()]

# Display duplicate rows, if any
print("Duplicate Rows:", len(duplicate_rows))

Duplicate Rows: 3


In [None]:
# Drop duplicate rows
df.drop_duplicates(inplace=True)

In [None]:
# Check for duplicate rows
duplicate_rows = df[df.duplicated()]

# Display duplicate rows, if any
print("Duplicate Rows:", len(duplicate_rows))

Duplicate Rows: 0


### Address Outliers

In [None]:
def calculate_outlier_limits(data, column):
    # Calculate quartiles
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    
    # Calculate IQR
    IQR = Q3 - Q1
    
    # Calculate lower and upper limits
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    
    return lower_limit, upper_limit

def get_outlier_percentage(dataframe):
    # Filter only columns with float64 type
    float_columns = dataframe.select_dtypes(include='float64').columns

    # Dictionary to store outlier percentages
    outlier_percentages = {}

    # Iterate through float64 columns
    for column in float_columns:
        # Calculate limits using the provided function
        lower_limit, upper_limit = calculate_outlier_limits(dataframe, column)

        # Count the number of outliers for each column
        outliers_count = dataframe[(dataframe[column] < lower_limit) | (dataframe[column] > upper_limit)].shape[0]

        # Calculate outlier percentage
        total_values = len(dataframe[column])
        outlier_percentage = (outliers_count / total_values) * 100

        # Save the result in the dictionary
        outlier_percentages[column] = outlier_percentage

    return outlier_percentages



# Example: Calculate outlier percentages for each column in the DataFrame
outlier_percentages = get_outlier_percentage(df)

# Display columns with their outlier percentages
for column, percentage in outlier_percentages.items():
    print(f"Column: {column}, Outlier Percentage: {percentage:.2f}%")


Column: bearer_id, Outlier Percentage: 0.00%
Column: start_ms, Outlier Percentage: 0.00%
Column: end_ms, Outlier Percentage: 0.00%
Column: dur_ms, Outlier Percentage: 4.79%
Column: imsi, Outlier Percentage: 8.83%
Column: msisdn_number, Outlier Percentage: 16.50%
Column: imei, Outlier Percentage: 0.00%
Column: avg_rtt_dl_ms, Outlier Percentage: 11.31%
Column: avg_rtt_ul_ms, Outlier Percentage: 11.11%
Column: avg_bearer_tp_dl_kbps, Outlier Percentage: 8.82%
Column: avg_bearer_tp_ul_kbps, Outlier Percentage: 14.35%
Column: dl_tp_<_50_kbps_pct, Outlier Percentage: 12.23%
Column: 50_kbps_<_dl_tp_<_250_kbps_pct, Outlier Percentage: 10.15%
Column: 250_kbps_<_dl_tp_<_1_mbps_pct, Outlier Percentage: 19.83%
Column: dl_tp_>_1_mbps_pct, Outlier Percentage: 15.23%
Column: ul_tp_<_10_kbps_pct, Outlier Percentage: 14.75%
Column: 10_kbps_<_ul_tp_<_50_kbps_pct, Outlier Percentage: 21.86%
Column: 50_kbps_<_ul_tp_<_300_kbps_pct, Outlier Percentage: 3.56%
Column: ul_tp_>_300_kbps_pct, Outlier Percentage: 

In [None]:

def calculate_outlier_limits(data, column):
    # Calculate quartiles
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    
    # Calculate IQR
    IQR = Q3 - Q1
    
    # Calculate lower and upper limits
    lower_limit = Q1 - 1.5 * IQR
    upper_limit = Q3 + 1.5 * IQR
    
    return lower_limit, upper_limit

def fix_outliers(dataframe):
    # Filter only columns with float64 type
    float_columns = dataframe.select_dtypes(include='float64').columns

    # Iterate through float64 columns
    for column in float_columns:
        # Calculate limits using the provided function
        lower_limit, upper_limit = calculate_outlier_limits(dataframe, column)

        # Fix outliers by capping values
        dataframe[column] = dataframe[column].clip(lower=lower_limit, upper=upper_limit)

    return dataframe

# Example: Fix outliers for each column in the DataFrame
df = fix_outliers(df)




In [None]:

# Example: Calculate outlier percentages for each column in the DataFrame
outlier_percentages = get_outlier_percentage(df)

# Display columns with their outlier percentages
for column, percentage in outlier_percentages.items():
    print(f"Column: {column}, Outlier Percentage: {percentage:.2f}%")


Column: bearer_id, Outlier Percentage: 0.00%
Column: start_ms, Outlier Percentage: 0.00%
Column: end_ms, Outlier Percentage: 0.00%
Column: dur_ms, Outlier Percentage: 0.00%
Column: imsi, Outlier Percentage: 0.00%
Column: msisdn_number, Outlier Percentage: 0.00%
Column: imei, Outlier Percentage: 0.00%
Column: avg_rtt_dl_ms, Outlier Percentage: 0.00%
Column: avg_rtt_ul_ms, Outlier Percentage: 0.00%
Column: avg_bearer_tp_dl_kbps, Outlier Percentage: 0.00%
Column: avg_bearer_tp_ul_kbps, Outlier Percentage: 0.00%
Column: dl_tp_<_50_kbps_pct, Outlier Percentage: 0.00%
Column: 50_kbps_<_dl_tp_<_250_kbps_pct, Outlier Percentage: 0.00%
Column: 250_kbps_<_dl_tp_<_1_mbps_pct, Outlier Percentage: 0.00%
Column: dl_tp_>_1_mbps_pct, Outlier Percentage: 0.00%
Column: ul_tp_<_10_kbps_pct, Outlier Percentage: 0.00%
Column: 10_kbps_<_ul_tp_<_50_kbps_pct, Outlier Percentage: 0.00%
Column: 50_kbps_<_ul_tp_<_300_kbps_pct, Outlier Percentage: 0.00%
Column: ul_tp_>_300_kbps_pct, Outlier Percentage: 0.00%
Colu

### Save Processed Data

In [None]:
df.to_csv('../data/tellco_data.csv')