In [1]:
import sys
import os

# Add the 'scripts' directory to the system path
scripts_path = os.path.abspath(os.path.join('..', 'scripts'))
sys.path.append(scripts_path)


In [2]:
# Import your modules
from data_cleaning import clean_data
from data_extraction import extract_relevant_columns
from data_formatting import format_data
from data_transformation import transform_data



In [3]:
from db import load_data


In [4]:
from execute_query import execute_query

In [6]:
# Load data from the database into a DataFrame
data = load_data()

# Check if data is loaded
if data is not None:
    print("Data loaded successfully.")
    print(data.head())  # Display the first few rows of the data

Data loaded successfully.
      Bearer Id            Start  Start ms              End  End ms  \
0  1.311448e+19   4/4/2019 12:01     770.0  4/25/2019 14:35   662.0   
1  1.311448e+19   4/9/2019 13:04     235.0   4/25/2019 8:15   606.0   
2  1.311448e+19   4/9/2019 17:42       1.0  4/25/2019 11:58   652.0   
3  1.311448e+19   4/10/2019 0:31     486.0   4/25/2019 7:36   171.0   
4  1.311448e+19  4/12/2019 20:10     565.0  4/25/2019 10:40   954.0   

   Dur. (ms)          IMSI  MSISDN/Number          IMEI  \
0  1823652.0  2.082014e+14   3.366496e+10  3.552121e+13   
1  1365104.0  2.082019e+14   3.368185e+10  3.579401e+13   
2  1361762.0  2.082003e+14   3.376063e+10  3.528151e+13   
3  1321509.0  2.082014e+14   3.375034e+10  3.535661e+13   
4  1089009.0  2.082014e+14   3.369980e+10  3.540701e+13   

      Last Location Name  ...  Youtube DL (Bytes)  Youtube UL (Bytes)  \
0  9.16456699548519E+015  ...          15854611.0           2501332.0   
1                L77566A  ...          2024739

In [5]:
# Top 10 handsets used by customers
query = """
    SELECT "Handset Type", COUNT(*) as handset_count
FROM xdr_data
GROUP BY "Handset Type"
ORDER BY handset_count DESC
LIMIT 10;


"""
handset_data = execute_query(query)
print(handset_data)

                   Handset Type  handset_count
0              Huawei B528S-23A          19752
1       Apple iPhone 6S (A1688)           9419
2        Apple iPhone 6 (A1586)           9023
3                     undefined           8987
4        Apple iPhone 7 (A1778)           6326
5       Apple iPhone Se (A1723)           5187
6        Apple iPhone 8 (A1905)           4993
7       Apple iPhone Xr (A2105)           4568
8  Samsung Galaxy S8 (Sm-G950F)           4520
9        Apple iPhone X (A1901)           3813


In [7]:
# Top 5 handset manufacturers
query = """
    SELECT "Handset Manufacturer", COUNT(*) as manufacturer_count
    FROM xdr_data
    GROUP BY "Handset Manufacturer"
    ORDER BY manufacturer_count DESC
    LIMIT 5;

"""
top_3_manufacturers_list = execute_query(query)
print(top_3_manufacturers_list)

            Handset Manufacturer  manufacturer_count
0                          Apple               59565
1                        Samsung               40839
2                         Huawei               34423
3                      undefined                8987
4  Sony Mobile Communications Ab                 980


In [8]:
# Top 5 handsets per top 3 manufacturers
query = """

    SELECT "Handset Type", COUNT("Handset Type") as handset_count
    FROM xdr_data
    WHERE "Handset Manufacturer" IN (
        SELECT "Handset Manufacturer" 
        FROM xdr_data
        GROUP BY "Handset Manufacturer"
        ORDER BY COUNT(*) DESC
        LIMIT 3
    )
    GROUP BY "Handset Type"
    ORDER BY handset_count DESC
LIMIT 5; """

top_5_handsets_per_manufacturer = execute_query(query)
print(top_5_handsets_per_manufacturer)

              Handset Type  handset_count
0         Huawei B528S-23A          19752
1  Apple iPhone 6S (A1688)           9419
2   Apple iPhone 6 (A1586)           9023
3   Apple iPhone 7 (A1778)           6326
4  Apple iPhone Se (A1723)           5187


In [9]:
# User behavior on applications (number of sessions, total data volume, etc.)
query = """
    SELECT 
    "MSISDN/Number" AS user_id,                  -- Assuming MSISDN/Number is the user identifier
    COUNT("Bearer Id") AS session_count,        -- Assuming Bearer Id represents a unique session
    SUM("Dur. (ms)") AS total_duration,         -- Duration of sessions in milliseconds
    SUM("Total DL (Bytes)") AS total_download,  -- Total download data in bytes
    SUM("Total UL (Bytes)") AS total_upload,    -- Total upload data in bytes
    SUM("Total DL (Bytes)") + SUM("Total UL (Bytes)") AS total_data_volume,  -- Total data volume
    "Handset Type"                              -- Assuming Handset Type is equivalent to application
FROM xdr_data
GROUP BY "MSISDN/Number", "Handset Type";
 """

user_behaviour = execute_query(query)
print(user_behaviour)

             user_id  session_count  total_duration  total_download  \
0       3.360100e+10              1        116720.0    8.426375e+08   
1       3.360100e+10              1        181230.0    1.207552e+08   
2       3.360100e+10              1        134969.0    5.566597e+08   
3       3.360101e+10              1         49878.0    4.019932e+08   
4       3.360101e+10              2         37104.0    1.363130e+09   
...              ...            ...             ...             ...   
106960           NaN              1         15094.0    7.856484e+08   
106961           NaN              0         17477.0    7.266285e+08   
106962           NaN              1         11921.0    1.459396e+08   
106963           NaN              2        173817.0    1.523894e+09   
106964           NaN             11      37649798.0    2.587348e+11   

        total_upload  total_data_volume                    Handset Type  
0       3.605311e+07       8.786906e+08  Huawei P20 Lite Huawei Nova 3E  