## Data Quality Checking

In [None]:
import sys
sys.path.append('../')
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from scripts.db_utils import connect, sql_to_dataframe
from src.data_quality_checks import check_missing_data, check_duplicates, check_data_types, check_numeric_anomalies, get_numeric_columns, get_total_missing_percentage
from src.utils import bytes_to_gigabytes, kilobytes_per_second_to_megabytes_per_second, milliseconds_to_hours, milliseconds_to_minutes, bytes_to_megabytes, milliseconds_to_seconds



In [None]:
# Open the database connection
conn = connect()

query = """ SELECT * FROM public.xdr_data  """

# Load the query result into a DataFrame. The finally clause guarantees the
# connection is closed even when the query raises.
try:
    df = sql_to_dataframe(conn, query)
finally:
    conn.close()

# Preview the first rows to confirm the DataFrame loaded successfully
df.head()

In [None]:
# Summarize the missing values in each column
missing_data_summary = check_missing_data(df)
print(missing_data_summary)

In [None]:
# Compute the total percentage of missing values and print it with two decimals
missing_data_percentage = get_total_missing_percentage(df)
print(f"Total Percentage of Missing Values: {missing_data_percentage:.2f}%")

In [None]:
# Check the dataset for duplicated rows
duplicate_rows = check_duplicates(df)
print(duplicate_rows)

In [None]:
# Check each column for data type issues
dtypes_summary = check_data_types(df)
print(dtypes_summary)

In [None]:
# List all numerical columns
numeric_columns = get_numeric_columns(df)
print(numeric_columns)

In [None]:
# Check every numeric column for anomalies, passing a lower bound of 0 and
# no upper bound to the checker
for numeric_column in numeric_columns:
    numeric_anomalies = check_numeric_anomalies(df, numeric_column, lower_bound=0, upper_bound=None)
    print(numeric_anomalies)

## Data Preprocessing

### Data Cleaning

In [None]:
# Compute the total percentage of missing values before any cleaning is applied
missing_data_percentage = get_total_missing_percentage(df)
print(f"Total Percentage of Missing Values before cleaning: {missing_data_percentage}%")

In [None]:
# Separate numerical and categorical columns.
# Columns that are entirely missing are left out of both lists: such columns
# are dropped from df during cleaning, and a stale name in either list would
# raise a KeyError when the lists are later used to index df.
has_data = df.notna().any()
numerical_columns = [
    col for col in df.select_dtypes(include=[np.number]).columns if has_data[col]
]
categorical_columns = [
    col for col in df.select_dtypes(exclude=[np.number]).columns if has_data[col]
]

In [None]:
# Drop the columns in which every value is missing
df = df.dropna(axis=1, how='all')

In [None]:
# Convert numerical columns to the correct data type
for col in numerical_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
# Impute missing values for numerical columns using mean strategy
num_imputer = SimpleImputer(strategy='mean')
df[numerical_columns] = num_imputer.fit_transform(df[numerical_columns])

In [None]:
# Impute missing values for categorical columns using most frequent strategy (mode)
cat_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_columns] = cat_imputer.fit_transform(df[categorical_columns])

In [None]:
# Calculate total percentage of missing values
missing_data_percentage = get_total_missing_percentage(df)
print(f"Total Percentage of Missing Values after cleaning: {missing_data_percentage}%")

### Selecting Relevant columns

In [None]:
# Derive unit-converted columns from the raw byte / millisecond / kbps columns.

# Combined (DL + UL) volume per application in gigabytes. These columns are
# labelled GB, so they are converted with bytes_to_gigabytes rather than the
# megabyte helper.
for app in ['Social Media', 'Youtube', 'Google', 'Email', 'Netflix', 'Gaming', 'Other']:
    df[f'{app} (GB)'] = (df[f'{app} DL (Bytes)'] + df[f'{app} UL (Bytes)']).apply(bytes_to_gigabytes)
df['Total Data (GB)'] = (df['Total DL (Bytes)'] + df['Total UL (Bytes)']).apply(bytes_to_gigabytes)

# Session duration, round-trip time and throughput in friendlier units.
# NOTE(review): 'Dur. (ms).1' is used as the millisecond duration; confirm it
# is the intended duration column.
df['Dur. (hr)'] = df['Dur. (ms).1'].apply(milliseconds_to_hours)
df['Avg RTT DL (sec)'] = df['Avg RTT DL (ms)'].apply(milliseconds_to_seconds)
df['Avg RTT UL (sec)'] = df['Avg RTT UL (ms)'].apply(milliseconds_to_seconds)
# NOTE(review): the helper is named kilobytes-per-second while the source
# columns are labelled kbps; confirm the units agree.
df['Avg Bearer TP DL (Mbps)'] = df['Avg Bearer TP DL (kbps)'].apply(kilobytes_per_second_to_megabytes_per_second)
df['Avg Bearer TP UL (Mbps)'] = df['Avg Bearer TP UL (kbps)'].apply(kilobytes_per_second_to_megabytes_per_second)

# Per-direction volumes converted from bytes with bytes_to_megabytes
for app in ['Total', 'Social Media', 'Google', 'Email', 'Youtube', 'Netflix', 'Gaming', 'Other']:
    for direction in ['DL', 'UL']:
        df[f'{app} {direction} (Mb)'] = df[f'{app} {direction} (Bytes)'].apply(bytes_to_megabytes)
df['Dur. (sec)'] = df['Dur. (ms).1'].apply(milliseconds_to_seconds)

# Total data volume (DL + UL) for each application
for app in ['Social Media', 'Youtube', 'Email', 'Gaming', 'Netflix', 'Google', 'Other']:
    df[f'{app} Data (Mb)'] = df[f'{app} DL (Mb)'] + df[f'{app} UL (Mb)']
df['Total Data (Mb)'] = df['Total DL (Mb)'] + df['Total UL (Mb)']

In [None]:
# Columns kept for the rest of the analysis
selected_columns = ['Total Data (Mb)', 'IMSI', 'Start',
                    'Handset Manufacturer', 'Handset Type',
                    'Avg Bearer TP UL (Mbps)', 'Avg Bearer TP DL (Mbps)', 'Avg RTT DL (sec)', 'Avg RTT UL (sec)',
                    'Dur. (hr)',
                    'Last Location Name',
                    'Gaming Data (Mb)', 'Netflix Data (Mb)', 'Email Data (Mb)', 'Google Data (Mb)', 'Youtube Data (Mb)', 'Social Media Data (Mb)', 'Other Data (Mb)']

# Take an explicit copy: df_relevant gets columns assigned to it further down,
# and assigning into a plain selection of df triggers SettingWithCopyWarning.
df_relevant = df[selected_columns].copy()

### Data Summary

#### For Categorical Data

In [None]:
# Frequency table for Handset Manufacturer: count and percentage share per value
print("--- Simple Tabulation for Handset Manufacturer ---")
counts_manufacturer = df_relevant['Handset Manufacturer'].value_counts()
total_manufacturer = counts_manufacturer.sum()
percentages_manufacturer = counts_manufacturer.map(
    lambda count: round((count / total_manufacturer) * 100, 2)
)

result_manufacturer = pd.DataFrame({
    'Handset Manufacturer': counts_manufacturer.index,
    'Count': counts_manufacturer.to_numpy(),
    'Percentage': percentages_manufacturer.to_numpy(),
})
result_manufacturer.head(10)

In [None]:
# Frequency table for Handset Type: count and percentage share per value
print("--- Simple Tabulation for Handset Type ---")
counts_type = df_relevant['Handset Type'].value_counts()
total_type = counts_type.sum()
percentages_type = counts_type.map(
    lambda count: round((count / total_type) * 100, 2)
)

result_type = pd.DataFrame({
    'Handset Type': counts_type.index,
    'Count': counts_type.to_numpy(),
    'Percentage': percentages_type.to_numpy(),
})
# Keep the ten most frequent handset types for plotting
top_10_result = result_type.head(10)

In [None]:
# Sort by count in ascending order so the largest bar ends up at the top
# (barh draws the first row at the bottom)
top_10_result = top_10_result.sort_values(by='Count', ascending=True)

# Create a horizontal bar chart.
# 'deepblue' is not a color name matplotlib recognizes and raises ValueError;
# 'darkblue' is a valid named color.
plt.figure(figsize=(10, 6))
plt.barh(top_10_result['Handset Type'], top_10_result['Count'], color='darkblue')
plt.title('Top 10 Handset Types (Count)')
plt.tight_layout()
plt.show()

In [None]:
# Frequency table for Last Location Name: count and percentage share per value
print("--- Simple Tabulation for Last Location Name ---")
counts_location = df_relevant['Last Location Name'].value_counts()
total_location = counts_location.sum()
percentages_location = counts_location.map(
    lambda count: round((count / total_location) * 100, 2)
)

result_location = pd.DataFrame({
    'Last Location Name': counts_location.index,
    'Count': counts_location.to_numpy(),
    'Percentage': percentages_location.to_numpy(),
})

result_location.head(10)

In [None]:
# Parse 'Start' as datetimes so the comparison is chronological
date_column = pd.to_datetime(df['Start'])

# Earliest and latest start timestamps
min_date, max_date = date_column.min(), date_column.max()

print("Minimum Date:", min_date)
print("Maximum Date:", max_date)

In [None]:
# Function to list the top handset types for a given manufacturer
def top_handsets_for_manufacturer(manufacturer, top_n=5, data=None):
    """Return the most frequent handset types of one manufacturer.

    Args:
        manufacturer: Value matched exactly against 'Handset Manufacturer'.
        top_n: Number of handset types to return (default 5).
        data: DataFrame with 'Handset Manufacturer' and 'Handset Type'
            columns. When omitted, the notebook-level ``df_relevant`` is used.

    Returns:
        DataFrame with columns 'Handset Type', 'Count' and 'Percentage',
        ordered by descending count. 'Percentage' is each type's share of
        the returned rows only, not of all the manufacturer's handsets.
        The frame is empty when the manufacturer has no rows.
    """
    if data is None:
        data = df_relevant

    is_manufacturer = data['Handset Manufacturer'] == manufacturer
    top_handsets = data.loc[is_manufacturer, 'Handset Type'].value_counts().head(top_n)
    top_handsets_percentage = (top_handsets / top_handsets.sum()) * 100
    return pd.DataFrame({
        'Handset Type': top_handsets.index,
        'Count': top_handsets.values,
        'Percentage': top_handsets_percentage.values,
    })

In [None]:
# Top handset types for Samsung, with the count and type columns pulled out
manufacturer = 'Samsung'  # Manufacturer to look up
top_handsets_df = top_handsets_for_manufacturer(manufacturer)
samsung_handsets = top_handsets_df.head(5)
samsung_handsets_count = samsung_handsets['Count']
samsung_handsets_type = samsung_handsets['Handset Type']

In [None]:
# Order the rows by ascending count
samsung_top_5_result = samsung_handsets.sort_values('Count')

# Draw the horizontal bar chart through the Axes interface
fig, ax = plt.subplots(figsize=(6, 3))
ax.barh(samsung_top_5_result['Handset Type'], samsung_top_5_result['Count'], color='green')
ax.set_title('Top 5 Handset Types of Samsung')
fig.tight_layout()
plt.show()

In [None]:
# Top handset types for Apple, with the count and type columns pulled out
manufacturer = 'Apple'  # Manufacturer to look up
top_handsets_df = top_handsets_for_manufacturer(manufacturer)
apple_handsets = top_handsets_df.head(5)
apple_handsets_count = apple_handsets['Count']
apple_handsets_type = apple_handsets['Handset Type']

In [None]:
# Order the rows by ascending count
apple_top_5_result = apple_handsets.sort_values('Count')

# Draw the horizontal bar chart through the Axes interface
fig, ax = plt.subplots(figsize=(6, 3))
ax.barh(apple_top_5_result['Handset Type'], apple_top_5_result['Count'], color='brown')
ax.set_title('Top 5 Handset Types of Apple')
fig.tight_layout()
plt.show()

In [None]:
# Top handset types for Huawei, with the count and type columns pulled out
manufacturer = 'Huawei'  # Manufacturer to look up
top_handsets_df = top_handsets_for_manufacturer(manufacturer)
huawei_handsets = top_handsets_df.head(5)
huawei_handsets_count = huawei_handsets['Count']
huawei_handsets_type = huawei_handsets['Handset Type']

In [None]:
# Order the rows by ascending count
huawei_top_5_result = huawei_handsets.sort_values('Count')

# Draw the horizontal bar chart through the Axes interface
fig, ax = plt.subplots(figsize=(6, 3))
ax.barh(huawei_top_5_result['Handset Type'], huawei_top_5_result['Count'], color='grey')
ax.set_title('Top 5 Handset Types of Huawei')
fig.tight_layout()
plt.show()

#### For Numerical data

In [None]:
# Numerical variables covered by the descriptive summary
numerical_columns = [
    'Total Data (Mb)',
    'Avg Bearer TP UL (Mbps)',
    'Avg Bearer TP DL (Mbps)',
    'Avg RTT DL (sec)',
    'Avg RTT UL (sec)',
    'Dur. (hr)',
    'Gaming Data (Mb)',
    'Netflix Data (Mb)',
    'Email Data (Mb)',
    'Google Data (Mb)',
    'Youtube Data (Mb)',
    'Social Media Data (Mb)',
    'Other Data (Mb)',
]

In [None]:
# Descriptive statistics for the selected numerical columns
df_descriptions = df_relevant[numerical_columns].describe()
df_descriptions

In [None]:
# Mode of each numerical column; mode() can return several rows, so only the
# first row is kept
mode_values = df_relevant[numerical_columns].mode().iloc[0]
mode_values.head(20)

In [None]:
# Range (max minus min) of each numerical column
range_values = df_relevant[numerical_columns].max() - df_relevant[numerical_columns].min()
range_values.head(20)

In [None]:
# Skewness of each numerical column
skewness_values = df_relevant[numerical_columns].skew()
skewness_values.head(20)

In [None]:
# Column-wise sum of each numerical column
sum_values = df_relevant[numerical_columns].sum()
sum_values.head(20)

#### Aggregating User Behavior Data

In [None]:
# Step 1: group the rows by user (IMSI)
grouped_data = df.groupby('IMSI')

# Step 2: count 'Bearer Id' entries per user and total the duration and data
# volume columns
summed_columns = [
    'Dur. (hr)',
    'Total Data (Mb)',
    'Social Media Data (Mb)',
    'Google Data (Mb)',
    'Email Data (Mb)',
    'Youtube Data (Mb)',
    'Netflix Data (Mb)',
    'Gaming Data (Mb)',
    'Other Data (Mb)',
]
aggregations = {'Bearer Id': 'count'}
aggregations.update(dict.fromkeys(summed_columns, 'sum'))

# The 'Bearer Id' count is reported as the number of sessions
user_behavior_summary = grouped_data.agg(aggregations).rename(
    columns={'Bearer Id': 'Number of Sessions'}
)

# Display the aggregated user behavior data
user_behavior_summary.head(10)

In [None]:
# Number of users: keep one row per IMSI, then count the non-null IMSI values
unique_rows = df.drop_duplicates(subset=['IMSI'])
users_count = unique_rows['IMSI'].count()
print(users_count)

In [None]:
# Work on an independent copy so the assignments below never write into a
# selection of df (which would trigger SettingWithCopyWarning)
df_relevant = df_relevant.copy()

# Convert 'Start' to datetime. Plain column assignment replaces the column and
# its dtype; `.loc[:, 'Start'] = ...` writes into the existing column in place
# on recent pandas and can leave it as object dtype, which breaks `.dt` below.
df_relevant['Start'] = pd.to_datetime(df_relevant['Start'])

# Keep only the date part and store it as datetime for plotting
df_relevant['Date'] = pd.to_datetime(df_relevant['Start'].dt.date)

In [None]:
# Categorical and identifier columns to leave out
columns_to_exclude = ['Last Location Name', 'Handset Manufacturer', 'Handset Type', 'IMSI']

# New DataFrame with the listed columns removed (df_relevant itself is unchanged)
new_df = df_relevant.drop(columns=columns_to_exclude)

In [None]:
# Distinct dates present in df_relevant, with missing values removed before
# the datetime conversion
unique_dates = df_relevant['Date'].drop_duplicates().reset_index(drop=True)
unique_dates = pd.to_datetime(unique_dates.dropna())
print(unique_dates)

In [None]:
# Sum every numeric column of df_relevant per date.
# The result has one row per date: a 'Date' column followed by one column per
# numeric column, each holding that day's sum. Grouping keeps every sum under
# its own column name; building rows as [date] + sums against df_relevant's
# column order would put the date and every sum under the wrong header.
if 'Date' in df_relevant.columns:
    # Rows without a date cannot be assigned to a day, so they are excluded
    dated_rows = df_relevant.dropna(subset=['Date'])
    date_keys = pd.to_datetime(dated_rows['Date'])
    unique_dates = date_keys.drop_duplicates().reset_index(drop=True)

    # Select numeric columns only; 'Date' itself is the grouping key
    numeric_part = dated_rows.drop(columns=['Date']).select_dtypes(include=[int, float])

    # sort=False keeps the dates in order of first appearance
    new_df = numeric_part.groupby(date_keys, sort=False).sum().reset_index()

    # A bare expression inside an if block is not displayed by the notebook,
    # so print the preview explicitly
    print(new_df.head(30))
else:
    print("No 'Date' column found in df_relevant.")

In [None]:
def plot_bar_chart(x_values, y_values, x_label='X-axis', y_label='Y-axis', title='Bar Chart'):
    """
    Draw and show a vertical bar chart of y_values against x_values.

    Args:
    x_values (list or array-like): Bar positions or categories on the x-axis.
    y_values (list or array-like): Bar heights.
    x_label (str): Text for the x-axis label (default 'X-axis').
    y_label (str): Text for the y-axis label (default 'Y-axis').
    title (str): Chart title (default 'Bar Chart').
    """
    figure, axes = plt.subplots(figsize=(10, 6))
    axes.bar(x_values, y_values, width=0.5)

    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)

    # Tilt the x tick labels by 45 degrees
    plt.setp(axes.get_xticklabels(), rotation=45)
    axes.grid(True)

    figure.tight_layout()
    plt.show()


In [None]:
# Work on an independent copy so the assignments below never write into a
# selection of df (which would trigger SettingWithCopyWarning)
df_relevant = df_relevant.copy()

# Convert 'Start' to datetime format
df_relevant['Start'] = pd.to_datetime(df_relevant['Start'])

# Extract the date part only and save it to a new column 'Date'
df_relevant['Date'] = df_relevant['Start'].dt.date

print(df_relevant.columns)

# Group by 'Date' and sum 'Total Data (Mb)' for each date.
# 'Date' is created on df_relevant, not on df, so df_relevant must be grouped;
# grouping df by 'Date' raises a KeyError.
grouped_data = df_relevant.groupby('Date')['Total Data (Mb)'].sum().reset_index()

# New DataFrame holding the date and its aggregated 'Total Data (Mb)'
new_df = grouped_data[['Date', 'Total Data (Mb)']].copy()

# Plot the daily totals with a tick for every date
plt.figure(figsize=(12, 6))
plt.plot(new_df['Date'], new_df['Total Data (Mb)'])
plt.xlabel('Date')
plt.ylabel('Total Data (Mb)')
plt.title('Total Data (Mb) Variation by Date')
plt.xticks(new_df['Date'], rotation=45)  # Set x-ticks to all date values and rotate labels
plt.tight_layout()  # Adjust layout to prevent clipping of labels
plt.grid(True)  # Add gridlines for better readability
plt.show()

#### Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics of df_relevant as the starting point of the EDA
df_relevant.describe()