In [1]:
import pandas as pd
from google.cloud import bigquery
from IPython.display import display
from IPython.display import Markdown

import matplotlib.pyplot as plt
import seaborn as sns

### Import function: Interactive SQL Query to Pandas DataFrame Converter

In [2]:
# Import the project helpers used to run BigQuery SQL from this notebook
from query_functions import query_df
from query_functions import run_query # runs the statement only: no DataFrame, no printed result (INSERT, UPDATE, DELETE, ...)

### Datasets and Tables

In [3]:
# Table identifiers interpolated into the FROM clauses of the queries below.

# Catalan mobile coverage measurements (2015-2017), cleaned
mobile_data_cleaned = "bq-analyst-230590.project_cat_mobile_coverage_2015_2017.mobile_data_2015_2017_cleaned"

# Per capita income by Catalan province (2015-2017)
percapita_income = "bq-analyst-230590.project_cat_mobile_coverage_2015_2017.cat_percapita_income_by_province_2015_2017"

# Population and population density by Catalan province (2015-2017)
pop_density = "bq-analyst-230590.project_cat_mobile_coverage_2015_2017.cat_pop_by_province_2015_2017"

### Preview datasets

Mobile Data Cleaned

In [4]:
# Preview up to 100 rows of the cleaned mobile coverage table.
# There is no ORDER BY, so which 100 rows come back is not guaranteed.
query = f"""
    SELECT *
    FROM `{mobile_data_cleaned}`
    LIMIT 100
    """

# Execute the query
query_df(query)

Unnamed: 0,date,hour,lat,long,signal,network,operator,status,description,net,...,precission,provider,activity,postal_code,town_name,position_geom,province,year,month,hour_24h
0,2017-03-16,22:57:36,41.28913,2.07148,13,EE,EE,0,STATE_IN_SERVICE,Undefined net,...,26.0,Fused,TILTING,081691,el Prat de Llobregat,POINT(2.07148 41.28913),Barcelona,2017,3,22
1,2015-09-08,20:22:13,41.67372,2.79184,15,EE,EE,2,STATE_EMERGENCY_ONLY,2G,...,27.0,GPS,IN_VEHICLE,170237,Blanes,POINT(2.79184 41.67372),Girona,2015,9,20
2,2015-09-10,06:01:47,41.78536,2.75066,7,EE,EE,2,STATE_EMERGENCY_ONLY,2G,...,20.0,GPS,IN_VEHICLE,171030,Maçanet de la Selva,POINT(2.75066 41.78536),Girona,2015,9,6
3,2015-09-10,06:05:52,41.74997,2.70201,10,EE,EE,2,STATE_EMERGENCY_ONLY,2G,...,17.0,GPS,IN_VEHICLE,171030,Maçanet de la Selva,POINT(2.70201 41.74997),Girona,2015,9,6
4,2015-10-30,17:35:57,41.37722,2.17944,7,O2,O2,2,STATE_EMERGENCY_ONLY,4G,...,14.0,GPS,STILL,080193,Barcelona,POINT(2.17944 41.37722),Barcelona,2015,10,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2015-05-03,17:48:51,41.43642,2.20425,14,Movistar,ONO,0,STATE_IN_SERVICE,3G,...,12.0,GPS,UNKNOWN,080193,Barcelona,POINT(2.20425 41.43642),Barcelona,2015,5,17
96,2015-05-06,17:07:38,41.40189,2.20570,18,Movistar,ONO,2,STATE_EMERGENCY_ONLY,2G,...,42.0,Fused,STILL,080193,Barcelona,POINT(2.2057 41.40189),Barcelona,2015,5,17
97,2015-03-20,10:21:49,41.40603,2.17706,7,Movistar,ONO,0,STATE_IN_SERVICE,2G,...,13.0,GPS,ON_FOOT,080193,Barcelona,POINT(2.17706 41.40603),Barcelona,2015,3,10
98,2015-04-10,16:41:02,41.37369,2.15205,15,Movistar,ONO,0,STATE_IN_SERVICE,3G,...,8.0,GPS,ON_FOOT,080193,Barcelona,POINT(2.15205 41.37369),Barcelona,2015,4,16


In [5]:
# List the column names of the mobile coverage table.
# NOTE: this re-runs the preview `query` defined in the previous cell instead of
# reusing that cell's result, so the cell depends on `query` still holding that SQL.
mobdata = query_df(query)
mobdata.columns

Index(['date', 'hour', 'lat', 'long', 'signal', 'network', 'operator',
       'status', 'description', 'net', 'speed', 'satellites', 'precission',
       'provider', 'activity', 'postal_code', 'town_name', 'position_geom',
       'province', 'year', 'month', 'hour_24h'],
      dtype='object')

In [6]:
# Count the total number of rows in the cleaned mobile coverage table
query = f"""
    SELECT COUNT(*) total_rows
    FROM `{mobile_data_cleaned}`
    """

# Execute the query
query_df(query)

Unnamed: 0,total_rows
0,10643906


Per Capita Income

In [7]:
# Show every row of the per capita income table (no LIMIT: the table is small)
query = f"""
    SELECT *
    FROM `{percapita_income}`
    """

# Execute the query
query_df(query)

Unnamed: 0,year,province,per_capita_income
0,2015,Girona,25200
1,2015,Lleida,20136
2,2015,Tarragona,22486
3,2015,Barcelona,27214
4,2016,Barcelona,27913
5,2016,Girona,25598
6,2016,Tarragona,23130
7,2016,Lleida,20713
8,2017,Barcelona,28481
9,2017,Lleida,21091


Pop Density

In [8]:
# Preview up to 100 rows of the population / density table
query = f"""
    SELECT *
    FROM `{pop_density}`
    LIMIT 100
    """
# Execute the query
query_df(query)

Unnamed: 0,province,year,population,sq_km,density_per_sq_km
0,Girona,2017,766705,5908,129.8
1,Girona,2016,766273,5908,129.7
2,Girona,2015,765783,5908,129.6
3,Lleida,2016,742099,12172,61.0
4,Lleida,2015,742138,12172,61.0
5,Lleida,2017,741884,12172,61.0
6,Barcelona,2017,5652301,7726,731.6
7,Barcelona,2016,5635085,7726,729.4
8,Barcelona,2015,5618162,7726,727.2
9,Tarragona,2016,810947,6303,128.7


### Execute queries:

**1. Filtering and Sorting**

**2. Aggregation**

**3. Join Operations**

**4. Subqueries**

**5. Grouping and Aggregation**

**6. Complex Filtering**

**7. Data Validation**

**8. Case Statements**

**9. Statistical Aggregate Functions**

**10. Window Functions**

#### 1. Filtering and Sorting

1. Retrieve records from the '{mobile_2015_2017}' table for the year 2016 and sort them in descending order based on the 'signal' column.

In [9]:
# Table used: mobile_data_cleaned
# 2016 measurements sorted by signal, strongest first, capped at 1000 rows.

query = f"""
    SELECT
        date,
        signal,
        network,
        operator,
        net,
        speed,
        activity
    FROM `{mobile_data_cleaned}`
    WHERE year = 2016
    ORDER BY signal DESC
    LIMIT 1000
    """
# Execute the query
query_df(query)

Unnamed: 0,date,signal,network,operator,net,speed,activity
0,2016-07-31,99,Orange,Orange,Undefined net,4.6,ON_FOOT
1,2016-06-22,99,Orange,Jazztel,3G,0.1,ON_FOOT
2,2016-10-03,99,Movistar,Movistar,2G,1.0,TILTING
3,2016-03-10,99,Movistar,Tuenti,3G,8.1,IN_VEHICLE
4,2016-09-29,99,Movistar,Movistar,2G,2.9,ON_FOOT
...,...,...,...,...,...,...,...
995,2016-09-28,99,Movistar,Movistar,3G,61.4,IN_VEHICLE
996,2016-10-01,99,Movistar,Movistar,2G,32.1,STILL
997,2016-08-09,99,Movistar,Movistar,3G,0.0,TILTING
998,2016-10-05,99,Movistar,Movistar,2G,27.7,TILTING


2. Retrieve records from the '{mobile_2015_2017}' table for the year 2016 where the signal strength is above a threshold of 50 and 'net' is not 'Undefined net'.

In [10]:
# Table used: mobile_data_cleaned
# 2016 measurements with signal above 50 whose net type is known (rows labelled
# 'Undefined net' are excluded), strongest first, capped at 1000 rows.

query = f"""
    SELECT 
        date,
        operator,
        net,
        signal
    FROM `{mobile_data_cleaned}`
    WHERE 
        year = 2016
        AND signal > 50
        AND NET != 'Undefined net'
    ORDER BY signal DESC
    LIMIT 1000
    """

# Execute the query
query_df(query)  

Unnamed: 0,date,operator,net,signal
0,2016-10-03,Movistar,2G,99
1,2016-06-22,Jazztel,3G,99
2,2016-06-23,Jazztel,3G,99
3,2016-10-05,Movistar,2G,99
4,2016-06-22,Jazztel,3G,99
...,...,...,...,...
995,2016-02-09,Orange,4G,65
996,2016-03-09,Orange,4G,65
997,2016-10-29,Vodafone,3G,65
998,2016-05-28,Jazztel,3G,65


3. Retrieve records from the '{mobile_2015_2017}' table for the year 2017 where 'net' is '4G', 'description' is 'STATE_IN_SERVICE', and 'speed' is greater than 100.

In [11]:
# Table used: mobile_data_cleaned
# 2017 measurements on 4G, in service, with speed above 100.
# `ORDER BY 6` sorts by the 6th selected column (speed), fastest first; capped at 1000 rows.

query = f"""
    SELECT
      date,
      operator,
      net,
      description,
      signal,
      speed
    FROM `{mobile_data_cleaned}`
    WHERE
      year = 2017
      AND net = '4G'
      AND description = 'STATE_IN_SERVICE'
      AND speed > 100.0
    ORDER BY 6 DESC
    LIMIT 1000;
    """
# Execute the query
query_df(query)    

Unnamed: 0,date,operator,net,description,signal,speed
0,2017-07-24,Movistar,4G,STATE_IN_SERVICE,26,253.1
1,2017-05-28,Vodafone,4G,STATE_IN_SERVICE,23,244.6
2,2017-02-22,Orange,4G,STATE_IN_SERVICE,14,242.6
3,2017-03-31,Orange,4G,STATE_IN_SERVICE,19,239.6
4,2017-05-10,Masmovil,4G,STATE_IN_SERVICE,13,229.5
...,...,...,...,...,...,...
570,2017-07-15,Vodafone,4G,STATE_IN_SERVICE,9,100.3
571,2017-05-14,Vodafone,4G,STATE_IN_SERVICE,7,100.3
572,2017-07-09,Orange,4G,STATE_IN_SERVICE,16,100.3
573,2017-07-25,Orange,4G,STATE_IN_SERVICE,12,100.2


#### 2. Aggregation

1. Calculate the average, maximum and standard deviation 'speed' of mobile data in the 'mobile 2015_2017' table for the year 2017.

In [12]:
# Table used: mobile_data_cleaned
# Average, maximum and population standard deviation (STDDEV_POP) of speed for 2017.
# The average and standard deviation are rounded to 2 decimals.

query = f"""
    SELECT 
        ROUND(AVG(speed),2) avg_speed_2017,
        MAX(speed) max_speed_2017,
        ROUND(STDDEV_POP(speed),2) std_speed_2017
    FROM `{mobile_data_cleaned}`
    WHERE year = 2017
    """
# Execute the query
query_df(query)   

Unnamed: 0,avg_speed_2017,max_speed_2017,std_speed_2017
0,25.18,254.9,34.43


2. Calculate the average signal strength (signal) for each month in the year 2017 and order the results in descending order.

In [13]:
# Table used: mobile_data_cleaned
# Average signal per calendar month of 2017, highest average first.
# GROUP BY 1 / ORDER BY 2 refer to select-list positions: month and avg_signal_2017.

query = f"""
    SELECT 
        EXTRACT(MONTH FROM date) month,
        ROUND(AVG(signal),2) avg_signal_2017
    FROM `{mobile_data_cleaned}`
    WHERE year = 2017
    GROUP BY 1
    ORDER BY 2 DESC
    """

# Execute the query
query_df(query)  

Unnamed: 0,month,avg_signal_2017
0,2,13.27
1,1,13.1
2,3,12.9
3,4,12.86
4,5,12.72
5,7,12.61
6,6,12.55
7,8,12.46
8,12,12.33
9,11,12.25


3. Calculate the percentage of records where the activity is 'IN_VEHICLE' and net is NOT 'Undefined net' in the '{mobile_2015_2017}' table for the year 2017.

In [14]:
# Table used: mobile_data_cleaned
# For 2017 IN_VEHICLE records with a known net type, compute each net type's share:
# the per-net count from the first subquery divided by the overall count from the second.
# The comma between the two subqueries joins every per-net row with the single total row.

query = f"""
SELECT
    net,
    ROUND((in_vehicle_2017_count / total_count * 100), 2) AS in_vehicle_2017_perc
FROM
    
    # The first subquery (in_vehicle_subquery) calculates the count of records GROUPED BY 'net' where the activity is
    # 'IN_VEHICLE' and the 'net' is not 'Undefined net' for the year 2017.
     
    (SELECT 
        net,
        COUNT(*) AS in_vehicle_2017_count
    FROM `{mobile_data_cleaned}`
    WHERE year = 2017
        AND activity = 'IN_VEHICLE'
        AND net != 'Undefined net'
    GROUP BY 1) in_vehicle_subquery,
    
    # The second subquery (total_subquery) calculates the total count of records where the activity is 'IN_VEHICLE' 
    # and the 'net' is not 'Undefined net' for the year 2017.
    
    (SELECT COUNT(*) AS total_count
     FROM `{mobile_data_cleaned}`
     WHERE year = 2017
        AND activity = 'IN_VEHICLE'
        AND net != 'Undefined net') total_subquery;

    """
# Execute the query
query_df(query)   

Unnamed: 0,net,in_vehicle_2017_perc
0,4G,57.82
1,3G,26.2
2,2G,15.98


#### 3. Join Operations

1. Join the '{mobile_2015_2017}' table with the '{pop_density}' table to retrieve records from both tables for the year 2016.

In [15]:
# Tables used: mobile_data_cleaned (m), pop_density (p)
# 2016 average signal per province next to that province's area, population and density.
# Rows are matched on province AND year; the p.* columns are listed in GROUP BY
# because they are selected without aggregation. Densest province first.

query = f"""
    SELECT 
      m.province,
      p.sq_km,
      p.population,
      p.density_per_sq_km,
      ROUND(AVG(m.signal),2) avg_signal
    FROM `{mobile_data_cleaned}` m
    JOIN `{pop_density}` p 
      ON m.province = p.province AND m.year = p.year
    WHERE m.year = 2016
    GROUP BY m.province, p.sq_km, p.population, p.density_per_sq_km
    ORDER BY p.density_per_sq_km DESC;
    """
# Execute the query
query_df(query) 

Unnamed: 0,province,sq_km,population,density_per_sq_km,avg_signal
0,Barcelona,7726,5635085,729.4,13.38
1,Girona,5908,766273,129.7,11.94
2,Tarragona,6303,810947,128.7,12.7
3,Lleida,12172,742099,61.0,12.83


2. Calculate the average signal strength (signal) for each province in the year 2016. Join this data with the "{pop_density}" table to find the population density for each province in 2016.


In [16]:
# Tables used: mobile_data_cleaned (m), pop_density (p)
# 2016 average signal per province together with that province's population density,
# matched on province AND year and listed from densest to least dense.

query = f"""
    SELECT 
      m.province,
      ROUND(AVG(m.signal),2) avg_signal_2016,
      p.density_per_sq_km
    FROM `{mobile_data_cleaned}` m
    JOIN `{pop_density}` p 
      ON m.province = p.province AND m.year = p.year
    WHERE m.year = 2016
    GROUP BY m.province, p.density_per_sq_km
    ORDER BY p.density_per_sq_km DESC;
    """
# Execute the query
query_df(query) 

Unnamed: 0,province,avg_signal_2016,density_per_sq_km
0,Barcelona,13.38,729.4
1,Girona,11.94,129.7
2,Tarragona,12.7,128.7
3,Lleida,12.83,61.0


3. Find the province with the best 4G network coverage (highest number of records with '4G' in the 'net' column) in the year 2015.


In [17]:
# Table used: mobile_data_cleaned
# Province with the most 4G records in 2015: count rows per province where net = '4G',
# sort by the count descending and keep only the first row.

query = f"""
    SELECT 
      province,
      net,
      COUNT(*) records_count
    FROM `{mobile_data_cleaned}`
    WHERE net = '4G' AND year = 2015
    GROUP BY province, net
    ORDER BY COUNT(*) DESC
    LIMIT 1;
    """
# Execute the query
query_df(query) 

Unnamed: 0,province,net,records_count
0,Barcelona,4G,919541


4. Find the province with the highest average signal strength (signal) in the year 2017. Then, join this result with the "{percapita_income}" table to get the per capita income for that province in 2017.


In [18]:
# Tables used: mobile_data_cleaned (m), percapita_income (c)
# Province with the highest 2017 average signal, with its 2017 per capita income.
# Rows are matched on province AND year; LIMIT 1 keeps the top province only.
# NOTE: `C.per_capita_income` uses the alias `c` in upper case (the query ran as recorded below).

query = f"""
    SELECT 
      m.province,
      ROUND(AVG(m.signal),2) avg_signal_2017,
      C.per_capita_income
    FROM `{mobile_data_cleaned}` m
    JOIN `{percapita_income}` c
      ON m.province = c.province AND m.year = c.year
    WHERE m.year = 2017
    GROUP BY m.province, c.per_capita_income
    ORDER BY avg_signal_2017 DESC
    LIMIT 1;
    """
# Execute the query
query_df(query)  

Unnamed: 0,province,avg_signal_2017,per_capita_income
0,Tarragona,12.96,23534


5. Calculate the correlation coefficient between the average signal strength (signal) and population density for all provinces.


In [19]:
# Tables used: mobile_data_cleaned (m), pop_density (p)
# Correlation between average signal and population density.
# The CTE first reduces the measurements to one average per (year, province), so each
# province-year pair contributes a single point to CORR. No year filter is applied.

query = f"""
WITH avg_signal_table AS (
    SELECT
        m.year,
        m.province,
        AVG(m.signal) avg_signal
    FROM
        `{mobile_data_cleaned}` m
    GROUP BY m.year, m.province
)

SELECT 
    CORR(t.avg_signal, p.density_per_sq_km) corr_signal_density
FROM avg_signal_table t
JOIN `{pop_density}` p 
  ON t.province = p.province AND t.year = p.year
;
"""
# Execute the query
query_df(query) 

Unnamed: 0,corr_signal_density
0,0.707663


    -  This value suggests a moderately strong positive linear correlation between the average signal strength (avg_signal) and population density (density_per_sq_km) for the given provinces and years in the dataset. A correlation coefficient of 0.71 indicates that as population density increases, the average signal strength tends to increase as well.

6. Calculate the correlation coefficient between the average signal strength (signal) and per capita income across all provinces and years (2015-2017).


In [20]:
# Tables used: mobile_data_cleaned (m), percapita_income (c)
# Correlation between average signal and per capita income.
# The CTE reduces the measurements to one average per (year, province).
# No year filter is applied: every (year, province) pair present in both tables
# contributes one point to CORR.

query = f"""
WITH avg_signal_table AS (
    SELECT
        m.year,
        m.province,
        AVG(m.signal) avg_signal
    FROM
        `{mobile_data_cleaned}` m
    GROUP BY m.year, m.province
)

SELECT 
    CORR(t.avg_signal, c.per_capita_income) corr_signal_income
FROM avg_signal_table t
JOIN `{percapita_income}` c 
  ON t.province = c.province AND t.year = c.year
;
"""
# Execute the query
query_df(query)  

Unnamed: 0,corr_signal_income
0,0.35073


    - This value suggests a positive linear correlation too, but it is weaker than the correlation between average signal strength (avg_signal) and population density (density_per_sq_km) that was calculated in the previous query.

7. Group the data in the '{mobile_2015_2017}' table by net type (e.g., 2G, 3G, 4G) and calculate the average signal strength for each type. Join this data with the "{percapita_income}" table to compare the per capita income by network type in the year 2017.


In [21]:
# Tables used: percapita_income (via the income_2017 CTE), mobile_data_cleaned (m)
# Per net type in 2017: average signal and average per capita income.
# The CTE keeps only the 2017 income rows; joining on province AND year therefore
# restricts the mobile records to 2017 as well. AVG(t.per_capita_income) is taken over
# the joined measurement rows, so provinces with more records weigh more.

query = f"""
WITH income_2017 AS (
    SELECT
        c.year,
        c.province,
        c.per_capita_income
    FROM
        `{percapita_income}` c
    WHERE
        c.year = 2017
)

SELECT
    m.net,
    ROUND(AVG(m.signal), 2) avg_signal,
    ROUND(AVG(t.per_capita_income),2) avg_percapita_income
FROM
    `income_2017` t
JOIN `{mobile_data_cleaned}` m 
ON t.province = m.province AND t.year = m.year
GROUP BY 1
ORDER BY 2 DESC;
"""
# Execute the query
query_df(query)

Unnamed: 0,net,avg_signal,avg_percapita_income
0,3G,13.38,26749.22
1,4G,12.69,27398.25
2,2G,12.46,26149.48
3,Undefined net,12.14,26969.51


#### 4. Subqueries

Write a SQL subquery to find the provinces in the 'pop density' table where the population in 2016 is higher than the average population for all provinces.

In [22]:
# Table used: pop_density
# Provinces whose 2016 population exceeds the 2016 average population across provinces.
# The subquery is restricted to the same year as the outer query so the benchmark is a
# like-for-like 2016 average, not one blended with other years' rows (or with any
# extra rows added to the table elsewhere in the notebook).

query = f"""
    SELECT
      province,
      population
    FROM `{pop_density}`
    WHERE
      year = 2016
      AND population > (
        SELECT AVG(population)
        FROM `{pop_density}`
        WHERE year = 2016
      );
    """
# Execute the query
query_df(query)

Unnamed: 0,province,population
0,Barcelona,5635085


#### 5. Grouping and Aggregation

1. Group the 'mobile 2015_2017' table by 'net' type (e.g., 2G, 3G, 4G) and calculate the average 'signal' strength and the maximum 'speed' for each net type in the year 2017.

In [23]:
# Table used: mobile_data_cleaned
# Average signal (rounded to 2 decimals) and maximum speed per net type for 2017.
# GROUP BY 1 refers to the first selected column, net.

query = f"""
    SELECT
      net,
      ROUND(AVG(signal),2) avg_signal,
      MAX(speed) max_speed
    FROM `{mobile_data_cleaned}`
    WHERE
      year = 2017
    GROUP BY 1;
    """
# Execute the query
query_df(query) 

Unnamed: 0,net,avg_signal,max_speed
0,Undefined net,12.14,254.1
1,4G,12.69,254.8
2,3G,13.38,254.9
3,2G,12.46,254.7


#### 6. Complex Filtering

1. Retrieve records from the '{mobile_2015_2017}' table for the year 2017 where 'net' is '4G', 'description' is 'STATE_IN_SERVICE' (i.e. connected), and 'speed' is greater than 100.

In [24]:
# Table used: mobile_data_cleaned
# 2017 measurements on 4G, in service, with speed above 100, fastest first
# (`ORDER BY 6` is the 6th selected column, speed); capped at 1000 rows.
# NOTE(review): this SQL is identical to the query under "1. Filtering and Sorting", item 3.

query = f"""
    SELECT
      date,
      operator,
      net,
      description,
      signal,
      speed
    FROM `{mobile_data_cleaned}`
    WHERE
      year = 2017
      AND net = '4G'
      AND description = 'STATE_IN_SERVICE'
      AND speed > 100.0
    ORDER BY 6 DESC
    LIMIT 1000;
    """
# Execute the query
query_df(query) 

Unnamed: 0,date,operator,net,description,signal,speed
0,2017-07-24,Movistar,4G,STATE_IN_SERVICE,26,253.1
1,2017-05-28,Vodafone,4G,STATE_IN_SERVICE,23,244.6
2,2017-02-22,Orange,4G,STATE_IN_SERVICE,14,242.6
3,2017-03-31,Orange,4G,STATE_IN_SERVICE,19,239.6
4,2017-05-10,Masmovil,4G,STATE_IN_SERVICE,13,229.5
...,...,...,...,...,...,...
570,2017-07-15,Vodafone,4G,STATE_IN_SERVICE,9,100.3
571,2017-05-14,Vodafone,4G,STATE_IN_SERVICE,7,100.3
572,2017-07-09,Orange,4G,STATE_IN_SERVICE,16,100.3
573,2017-07-25,Orange,4G,STATE_IN_SERVICE,12,100.2


#### 7. Data Validation

1. Identify and list all records in the '{mobile_2015_2017}' table with missing values in the 'provider' column.

In [25]:
# Table used: mobile_data_cleaned
# Data validation: list up to 1000 rows whose provider is NULL.
# An empty result means no row is missing a provider.

query = f"""
    SELECT
      *
    FROM `{mobile_data_cleaned}`
    WHERE
      provider IS NULL
    LIMIT 1000;
    """
# Execute the query
query_df(query)

Unnamed: 0,date,hour,lat,long,signal,network,operator,status,description,net,...,precission,provider,activity,postal_code,town_name,position_geom,province,year,month,hour_24h


#### 8. Case Statements

1. Classify each record in the '{mobile_2015_2017}' table by its 'signal' value and store the label in a new 'signal_eval' column: 'Above Avg' when the 'signal' is above 14, 'On Avg' when it is between 13 and 14, and 'Under Avg' when it is below 13.

In [26]:
# Table used: mobile_data_cleaned
# Maximum, minimum and average signal over the whole table; the average is the
# reference for the 'Under Avg' / 'On Avg' / 'Above Avg' thresholds used below.

query = f"""
    SELECT
      max(signal) max_signal,
      min(signal) min_signal,
      avg(signal) avg_signal
    FROM `{mobile_data_cleaned}`
    """
# Execute the query
query_df(query)   

Unnamed: 0,max_signal,min_signal,avg_signal
0,99,0,13.216373


    - Create a new column to store the updated value

In [27]:
# Table used: mobile_data_cleaned
# Add a STRING column that will hold the signal classification.
# IF NOT EXISTS makes the cell safe to re-run: without it the statement fails when the
# column is already there (e.g. a previous run never reached the DROP COLUMN cell).

query = f"""
    ALTER TABLE `{mobile_data_cleaned}`
    ADD COLUMN IF NOT EXISTS signal_eval STRING;
    """

# Execute the statement (run_query returns no DataFrame)
run_query(query)

Query successfully executed, and the table has been updated.


    - Update values

In [28]:
# Table used: mobile_data_cleaned
# Label every row by its signal value:
#   signal < 13      -> 'Under Avg'
#   13 <= signal <= 14 -> 'On Avg'
#   signal > 14      -> 'Above Avg'
# The CASE has no ELSE branch, so a row matching none of the conditions keeps NULL.
# `WHERE signal_eval IS NULL` limits the update to rows that have no label yet.
query = f"""
    UPDATE `{mobile_data_cleaned}`
    SET signal_eval =
        CASE
            WHEN signal < 13 THEN 'Under Avg'
            WHEN signal BETWEEN 13 AND 14 THEN 'On Avg'
            WHEN signal > 14 THEN 'Above Avg'
            END
        WHERE signal_eval IS NULL;
    """

# Execute the statement; run_query only runs it and does not return a DataFrame
run_query(query)    

Query successfully executed, and the table has been updated.


    - Check output

In [34]:
# Table used: mobile_data_cleaned
# Spot-check the new labels: show signal next to signal_eval for up to 100 rows.
query = f"""
    SELECT
      signal,
      signal_eval
    FROM `{mobile_data_cleaned}`
    LIMIT 100
    """
# Execute the query
query_df(query) 

Unnamed: 0,signal,signal_eval
0,16,Above Avg
1,8,Under Avg
2,8,Under Avg
3,5,Under Avg
4,12,Under Avg
...,...,...
95,15,Above Avg
96,15,Above Avg
97,16,Above Avg
98,14,On Avg


    - Delete 'signal_eval' column

In [35]:
# Table used: mobile_data_cleaned
# Remove the temporary signal_eval column so the table returns to its original schema.
# IF EXISTS keeps the cell from failing when the column has already been dropped,
# which makes the clean-up step safe to re-run.
query = f"""
    ALTER TABLE `{mobile_data_cleaned}`
    DROP COLUMN IF EXISTS signal_eval;
    """

# Execute the statement (run_query returns no DataFrame)
run_query(query)

Query successfully executed, and the table has been updated.


#### 9. Statistical Aggregate Functions

1. Calculate the Pearson correlation coefficient between different features pairs in the '{mobile_2015_2017}' table for all years in the dataset.

In [None]:
# Table used: mobile_data_cleaned
# Pearson correlation (CORR) for several column pairs over the whole table, one output
# column per pair. postal_code is cast to INT64 so it can be used as a numeric input.
# NOTE(review): CAST presumably fails if any postal_code is not a numeric string;
# confirm, or consider SAFE_CAST. Output aliases mix the spellings "precision_" and
# "precission_" (the source column is named `precission`).
# This cell has no recorded output in the saved notebook.

query = f"""
    SELECT
      CORR(signal, speed) AS signal_speed,
      CORR(signal, hour_24h) AS signal_hour,
      CORR(signal, CAST(postal_code AS INT64)) AS signal_postalcode,
      CORR(lat, long) AS lat_long,
      CORR(precission, satellites) AS precision_satellites,
      CORR(precission, speed) AS precision_speed,
      CORR(precission, signal) AS precision_signal,
      CORR(precission, CAST(postal_code AS INT64)) AS precission_postalcode,
      CORR(satellites, CAST(postal_code AS INT64)) AS satellites_postalcode
    FROM `{mobile_data_cleaned}`
    """
# Execute the query
query_df(query)  

2. Calculate the covariance between 'signal' strength and 'speed' in the '{mobile_2015_2017}' table.

In [36]:
# Table used: mobile_data_cleaned
# Population covariance (COVAR_POP) of signal and speed over the whole table, with
# their correlation (CORR) alongside so the two measures can be compared.

query = f"""
    SELECT
      COVAR_POP(signal, speed) AS COVAR_signal_speed,
      CORR(signal, speed) AS CORR_signal_speed
    FROM `{mobile_data_cleaned}`
    """
# Execute the query
query_df(query)    

Unnamed: 0,COVAR_signal_speed,CORR_signal_speed
0,-0.954143,-0.003648


    - Covariance measures the degree to which two variables change together. Here the covariance is negative (-0.95), which on its own would suggest that "signal" tends to decrease as "speed" increases, and vice versa. However, covariance is not standardized, so its magnitude is hard to interpret. The correlation coefficient, which unlike covariance is a standardized measure within the range of -1 to 1, is approximately -0.004: extremely close to zero, indicating that there is no substantial linear relationship between these two variables. In essence, although the covariance is negative, the correlation coefficient shows the absence of a meaningful linear connection between "signal" and "speed" in the dataset.

3. Calculate the standard deviation of 'signal' strength in the '{mobile_2015_2017}' table for the year 2017.

In [37]:
# Table used: mobile_data_cleaned
# Population standard deviation of signal for 2017, with min, max and average for
# context; every value is rounded to 2 decimals.

query = f"""
    SELECT
      ROUND(STDDEV_POP(signal),2) AS STD_signal,
      ROUND(MIN(signal),2) AS MIN_signal,
      ROUND(MAX(signal),2) AS MAX_signal,
      ROUND(AVG(signal),2) AS AVG_signal      
    FROM `{mobile_data_cleaned}`
    WHERE year = 2017
    """
# Execute the query
query_df(query)

Unnamed: 0,STD_signal,MIN_signal,MAX_signal,AVG_signal
0,7.18,0.0,65.0,12.66


    - These statistics describe the variability and range of signal strengths in the dataset for the year 2017. The standard deviation of 7.18 means that signal values typically deviate from the mean (12.66) by about 7.18 units, and the signal strength ranged from 0.0 to 65.0 during that year.

4. Calculate the variance of 'speed' in the '{mobile_2015_2017}' table for the year 2017.

In [38]:
# Table used: mobile_data_cleaned
# Dispersion of speed in 2017: population variance, population standard deviation and
# range (maximum minus minimum).

query = f"""
    SELECT
      VAR_POP(speed) VAR_speed,
      STDDEV_POP(speed) AS STD_speed,
      MAX(speed) - MIN(speed) AS Range_speed
    FROM `{mobile_data_cleaned}`
    WHERE year=2017
    """
# Execute the query
query_df(query)  

Unnamed: 0,VAR_speed,STD_speed,Range_speed
0,1185.673229,34.433606,254.9


    - The statistics for the 'speed' variable in the year 2017 reveal a substantial degree of variability, as evidenced by the high population variance, standard deviation, and range. This variation suggests that the data points for 'speed' exhibit a wide dispersion from the mean and cover a significant range, reflecting diverse speed measurements throughout the year 2017.

#### 10. Window Functions

Before we start working with window functions, we will set the stage by inserting sample data into the {mobile_data_cleaned} table. This data will help us explore the distinctions between the RANK() and DENSE_RANK() window functions when ranking operators within networks. Additionally, we will create a view, {netwop_4G_view}, to filter and extract a specific subset of the data, focusing on '4G' networks. This view will serve as the basis for our exploration of window functions, allowing us to rank operators and gain insights into their network activity.

    - Insert new rows

In [132]:
# Table used: mobile_data_cleaned
# Insert three made-up 4G rows (one invented network with two invented operators) so
# the ranking demo below contains a tie. Only the five listed columns are populated.
# NOTE: this modifies the shared table; the rows are removed again by the DELETE cell
# further down, and running this cell more than once inserts the rows again.

query = f"""
    INSERT INTO `{mobile_data_cleaned}` (network, operator, net, signal, speed)
    VALUES
        ('invented_network', 'invented_operator', '4G', 11, 13),
        ('invented_network', 'invented_operator', '4G', 12, 14),
        ('invented_network', 'invented_operator2', '4G', 12, 14)
    """

# Execute the statement (run_query returns no DataFrame)
run_query(query)  

Query successfully executed, and the table has been updated.


In [134]:
# Table used: mobile_data_cleaned
# Verify the insert: list every row whose operator contains the text 'invented'.

query = f"""
    SELECT
        *
    FROM `{mobile_data_cleaned}`
    WHERE REGEXP_CONTAINS(operator,'invented')
    """

# Execute the query
query_df(query)  

Unnamed: 0,date,hour,lat,long,signal,network,operator,status,description,net,...,precission,provider,activity,postal_code,town_name,position_geom,province,year,month,hour_24h
0,,,,,11,invented_network,invented_operator,,,4G,...,,,,,,,,,,
1,,,,,11,invented_network,invented_operator,,,4G,...,,,,,,,,,,
2,,,,,12,invented_network,invented_operator2,,,4G,...,,,,,,,,,,


    - Create View

In [62]:
# Identifier of the view created below: one row per (network, operator) pair
# among 4G records (2015-2017)
netwop_4G_view = "bq-analyst-230590.project_cat_mobile_coverage_2015_2017.netw_op_4G_view"

In [135]:
# Table used: mobile_data_cleaned; view written: netwop_4G_view
# (Re)create the view with one row per (network, operator) pair among 4G records:
# number of records, average signal and average speed (both rounded to 2 decimals).
# CREATE OR REPLACE makes this cell safe to re-run.
# NOTE(review): the ORDER BY sits inside the view definition; row order when selecting
# from the view is presumably not guaranteed. Confirm before relying on it.

query = f"""
    CREATE OR REPLACE VIEW `{netwop_4G_view}` AS
    SELECT
      network,
      operator,
      COUNT(*) AS activity_count,
      ROUND(AVG(signal), 2) AS avg_signal,
      ROUND(AVG(speed), 2) AS avg_speed
    FROM `{mobile_data_cleaned}`
    WHERE net = '4G'
    GROUP BY network, operator
    ORDER BY network
    """

# Execute the statement (run_query returns no DataFrame)
run_query(query)  

Query successfully executed, and the table has been updated.


    - Output

In [136]:
# View used: netwop_4G_view
# Display the full content of the 4G network/operator view.

query = f"""
    SELECT
      *
    FROM `{netwop_4G_view}`
    """

# Execute the query
query_df(query)  

Unnamed: 0,network,operator,activity_count,avg_signal,avg_speed
0,Bouygues Telecom,Bouygues Telecom,1258,12.72,52.45
1,Bytel,Bytel,2,16.5,131.4
2,DEFACE,DEFACE,1,25.0,0.0
3,EE,EE,73,14.49,59.02
4,ESPRT,ESPRT,20,10.75,2.03
5,ETICOM SC,ETICOM SC,1985,14.3,34.27
6,F SFR,F SFR,940,13.23,50.62
7,France Telcom Espana SA,France Telcom Espana SA,15657,13.8,18.47
8,GurbTec,GurbTec,2,27.0,0.0
9,I TIM,I TIM,1,20.0,143.5


1. How do networks with '4G' activity rank based on their number of operators, and which are the top 10 networks in terms of operator count?

In [138]:
# View used: netwop_4G_view
# Top 10 networks by number of operators, comparing RANK with DENSE_RANK.
# The view holds one row per (network, operator) pair, so COUNT(operator) per network
# is that network's operator count. After a tie RANK skips positions while DENSE_RANK
# continues with the next integer (compare the two rank columns in the output).
# `ORDER BY 2` sorts by operator_count, the 2nd selected column.

query = f"""
SELECT
  network,
  COUNT(operator) operator_count,
  RANK() OVER(ORDER BY COUNT(operator) DESC) overall_rank_num_op,
  DENSE_RANK() OVER(ORDER BY COUNT(operator) DESC) overall_DENSE_rank_num_op
FROM `{netwop_4G_view}`
GROUP BY 1
ORDER BY 2 DESC
LIMIT 10
    """

# Execute the query
query_df(query)  

Unnamed: 0,network,operator_count,overall_rank_num_op,overall_DENSE_rank_num_op
0,Orange,9,1,1
1,Movistar,3,2,2
2,Vodafone,2,3,3
3,invented_network,2,3,3
4,EE,1,5,4
5,France Telcom Espana SA,1,5,4
6,Bytel,1,5,4
7,F SFR,1,5,4
8,DEFACE,1,5,4
9,simyo 3g,1,5,4


    - Drop 'invented' rows from original dataset

In [141]:
# Table used: mobile_data_cleaned
# Clean-up: delete the made-up rows inserted for the ranking demo, i.e. every row
# whose operator contains the text 'invented'.

query = f"""
    DELETE FROM `{mobile_data_cleaned}`
    WHERE REGEXP_CONTAINS(operator,'invented')
    """

# Execute the statement (run_query returns no DataFrame)
run_query(query)  

Query successfully executed, and the table has been updated.


2. Within '4G' networks with multiple operators, how are operators ranked based on their activity count, and what is the comparative distribution of their rankings within their respective networks?

In [115]:
# View used: netwop_4G_view
# Rank operators inside each 4G network that has more than one operator.
# FilteredNetworks keeps the networks with more than one distinct operator; the main
# query then computes RANK, PERCENT_RANK and DENSE_RANK per network.
# The window ORDER BY is ascending, so rank 1 is the operator with the LOWEST
# activity_count in its network. There is no final ORDER BY, so the order in which
# the rows are displayed is not guaranteed.

query = f"""
WITH FilteredNetworks AS (
  SELECT
    network
  FROM `{netwop_4G_view}`
  GROUP BY network
  HAVING COUNT(DISTINCT operator) > 1
)

SELECT
  network,
  operator,
  activity_count,
  RANK() OVER (PARTITION BY network ORDER BY activity_count) AS rank_within_network,
  PERCENT_RANK() OVER (PARTITION BY network ORDER BY activity_count) AS percent_rank,
  DENSE_RANK() OVER (PARTITION BY network ORDER BY activity_count) AS dense_rank
FROM `{netwop_4G_view}`
WHERE network IN (SELECT network FROM FilteredNetworks);

    """

# Execute the query
query_df(query)  

Unnamed: 0,network,operator,activity_count,rank_within_network,percent_rank,dense_rank
0,Orange,EUSKALTEL,1,1,0.0,1
1,Orange,FIBRACAT,161,2,0.125,2
2,Orange,RACC,5229,3,0.25,3
3,Orange,Masmovil,5772,4,0.375,4
4,Orange,PARLEM,18025,5,0.5,5
5,Orange,adamo,21518,6,0.625,6
6,Orange,simyo,24090,7,0.75,7
7,Orange,Jazztel,165369,8,0.875,8
8,Orange,Orange,979833,9,1.0,9
9,Vodafone,Lowi,6595,1,0.0,1


1. Rank the provinces in the '{pop_density}' table based on population density in descending order for the year 2016.

In [39]:
# Table used: pop_density
# Show the 2016 rows of the population / density table (input for the ranking below).

query = f"""
    SELECT
      *
    FROM `{pop_density}`
    WHERE year=2016
    """
# Execute the query
query_df(query)  

Unnamed: 0,province,year,population,sq_km,density_per_sq_km
0,Girona,2016,766273,5908,129.7
1,Lleida,2016,742099,12172,61.0
2,Barcelona,2016,5635085,7726,729.4
3,Tarragona,2016,810947,6303,128.7


In [48]:
# Table used: pop_density
# Rank the provinces by 2016 population density, densest first.
# The ranking uses the raw density value: casting it to INT64 first would round the
# densities and could turn nearly equal provinces (e.g. 129.4 and 128.6) into a false
# tie. The final ORDER BY makes the displayed row order follow the rank instead of
# relying on whatever order the engine happens to return.

query = f"""
    SELECT
        province,
        DENSE_RANK() OVER(ORDER BY density_per_sq_km DESC) AS rank_pop_density
    FROM `{pop_density}`
    WHERE year = 2016
    ORDER BY rank_pop_density
    """
# Execute the query
query_df(query)

Unnamed: 0,province,rank_pop_density
0,Barcelona,1
1,Girona,2
2,Tarragona,3
3,Lleida,4


2. Calculate the dense rank of provinces in the '{percapita_income}' table based on per capita income in ascending order for the year 2017.

3. Divide the provinces in the '{mobile_2015_2017}' table into quartiles based on 'speed' in the year 2017 using the ntile() window function.

10. Multiple Join:

    - Join the 'mobile_2015_2017' table with the 'pop_density' and 'percapita_income' tables to create a unified dataset. The result should include information about mobile coverage, income per capita, and population density for each province in the year 2016. Perform a triple join to combine the data from these three tables into a single result set.

In [None]:
# Unified per-province view for 2016: mobile coverage aggregates joined with
# per capita income and population density.
# The exercise asks for the year 2016 only, so the mobile rows are filtered on
# that year; both joins still match on province AND year so each mobile row is
# paired with the income/density figures of its own year.

query = f"""
    SELECT
      m.province,
      m.year,
      ROUND(AVG(m.signal),2) avg_signal,
      ROUND(MAX(m.speed),2) max_speed,
      p.per_capita_income,
      d.density_per_sq_km pop_density_sq_km
    FROM `{mobile_data_cleaned}` m
    JOIN `{percapita_income}` p
        ON m.province = p.province
        AND m.year = p.year
    JOIN `{pop_density}` d
        ON m.province = d.province
        AND m.year = d.year
    WHERE m.year = 2016
    GROUP BY 1,2,5,6
    ORDER BY 1,2 DESC;
    """
# Run the SQL through query_df and show the combined result
query_df(query)

11. Different Joins and Count with NULL Values:

     - In the pop_density table, add some rows with NULL values among its columns to simulate missing data. Then, perform the following join operations with the mobile_2015_2017 table:

In [None]:
# Simulate missing data in pop_density with two INSERT statements sent together.
# pop_density columns: province, year, population, sq_km, density_per_sq_km
#   - 1st INSERT lists only (year, population, sq_km): no province or density supplied.
#   - 2nd INSERT lists only (province, year, sq_km): no population or density supplied.
# NOTE(review): presumably the unlisted columns are stored as NULL — confirm
# against the table schema.
# NOTE(review): re-running this cell looks like it would add the same rows
# again; the "Delete added rows" cell further down removes them.

query = f"""
    INSERT INTO `{pop_density}`(year, population, sq_km)
    VALUES 
        (2015, 600000, 500),
        (2014, 530000, 490),
        (2016, 420000, 400),
        (2017, 385000, 380);
    
    INSERT INTO `{pop_density}`(province, year, sq_km)
    VALUES
        ('Costa Brava', 2017, 678),
        ('Pirineus', 2015, 10000)
    """
# run_query (from query_functions) is the helper used for statements such as
# INSERT/UPDATE/DELETE where no DataFrame is needed
run_query(query)  

    - Output:

In [None]:
# Check the updated pop_density table: select every row and column so the
# rows added by the INSERT cell can be inspected.

query = f"""
    SELECT *
    FROM `{pop_density}`
    """
# Run the SQL through query_df and show the result
query_df(query)  

    - a. LEFT JOIN: Count the number of records for each province and year, including provinces with NULL values in the pop_density table.

In [None]:
# LEFT JOIN from pop_density (d) to the mobile data (m) on province and year.
# Every pop_density row is kept; COUNT(m.signal) counts only non-NULL signal
# values from matched mobile rows, so a (province, year) with no mobile data
# gets an activity_count of 0.

query = f"""
    SELECT
      d.province,
      d.year,
      COUNT(m.signal) activity_count
    FROM `{pop_density}` d
    LEFT JOIN `{mobile_data_cleaned}` m
        ON d.province = m.province
        AND d.year = m.year
    GROUP BY 1,2
    ORDER BY 1,2 DESC;
    """
# Run the SQL through query_df and show the result
query_df(query)  

Left Join: includes all rows from {pop_density}; the columns coming from {mobile_data_cleaned} are NULL where there is no match.

    - b. RIGHT JOIN: Count the number of records for each province and year, excluding provinces with NULL values in the pop_density table.


In [None]:
# RIGHT JOIN from pop_density (d) to the mobile data (m) on province and year.
# Every mobile row is kept. The grouping columns are taken from d, so mobile
# rows without a matching pop_density row are grouped under NULL province/year.
# NOTE(review): the exercise text asks for a record count, while this query
# reports average signal and maximum speed — confirm which is intended.

query = f"""
    SELECT
      d.province,
      d.year,
      ROUND(AVG(m.signal),2) avg_signal,
      ROUND(MAX(m.speed),2) max_speed
    FROM `{pop_density}` d
    RIGHT JOIN `{mobile_data_cleaned}` m
        ON d.province = m.province
        AND d.year = m.year
    GROUP BY 1,2
    ORDER BY 1,2 DESC;
    """
# Run the SQL through query_df and show the result
query_df(query) 

Right Join: includes all rows from {mobile_data_cleaned} and fills in NULL values for the {pop_density} columns where there are no matches.

    - c. INNER JOIN: Count the number of records for each province and year, including provinces from the pop_density table and any additional data from the mobile_2015_2017 table.

In [None]:
# INNER JOIN between pop_density (d) and the mobile data (m) on province and
# year: only (province, year) pairs present in both tables reach the result,
# aggregated to average signal and maximum speed.

query = f"""
    SELECT
      d.province,
      d.year,
      ROUND(AVG(m.signal),2) avg_signal,
      ROUND(MAX(m.speed),2) max_speed
    FROM `{pop_density}` d
    INNER JOIN `{mobile_data_cleaned}` m
        ON d.province = m.province
        AND d.year = m.year
    GROUP BY 1,2
    ORDER BY 1,2 DESC;
    """
# Run the SQL through query_df and show the result
query_df(query)  

Inner Join: includes only records with matching province and year values in both tables, so rows without a match (including the {pop_density} rows with a NULL province) are excluded.

    - d. FULL JOIN: Count the number of records for each province and year, including all available data from both tables.

In [None]:
# FULL JOIN between pop_density (d) and the mobile data (m) on province and
# year: rows from both tables are kept whether or not they match.
# The grouping columns are taken from d, so unmatched mobile rows are grouped
# under NULL province/year, and unmatched pop_density rows have no mobile
# values to aggregate.

query = f"""
    SELECT
      d.province,
      d.year,
      ROUND(AVG(m.signal),2) avg_signal,
      ROUND(MAX(m.speed),2) max_speed
    FROM `{pop_density}` d
    FULL JOIN `{mobile_data_cleaned}` m
        ON d.province = m.province
        AND d.year = m.year
    GROUP BY 1,2
    ORDER BY 1,2 DESC;
    """
# Run the SQL through query_df and show the result
query_df(query)    

Full Join: includes all rows from both tables and provides NULL values for the columns of whichever table has no match.

    - Delete added rows

In [None]:
# Clean-up: remove the rows added for the NULL-value join exercise.
# The filter matches rows with no province (first INSERT) and the two made-up
# provinces 'Costa Brava' and 'Pirineus' (second INSERT).
# NOTE(review): `province IS NULL` would also delete any NULL-province row
# that existed before the exercise — confirm the original table has none.

# Delete all new rows
query = f"""
    DELETE FROM `{pop_density}`
    WHERE 
        province IS NULL 
        OR province IN ('Costa Brava', 'Pirineus');
    """
# run_query (from query_functions) executes the statement without building a DataFrame
run_query(query)    

### Dates

Date range

In [None]:
# Date range covered by the cleaned mobile data: first and last recorded date
# and the number of days between them (DATE_DIFF of MAX and MIN date, in days).
query = f"""
    SELECT
        MIN(date) AS first_date_recorded,
        MAX(date) AS last_date_recorded,
        DATE_DIFF(MAX(date), MIN(date), DAY) AS total_days_recorded
    FROM `{mobile_data_cleaned}`
    """
# Run the SQL through query_df and show the result
query_df(query) 

Top 10 dates with the highest activity

In [None]:
# The 10 dates with the most records: count rows per date, sort by that count
# in descending order and keep the first 10.
query = f"""
    SELECT
      date,
      COUNT(*) AS record_count
    FROM `{mobile_data_cleaned}`
    GROUP BY date
    ORDER BY record_count DESC
    LIMIT 10
    """
# Run the SQL through query_df and show the result
query_df(query) 

Monthly Activity Rank within Quarters and Across the Year

In [None]:
# Monthly record counts ranked both within their quarter and across the year.
# The counts are grouped by quarter and month only, so all recorded years are
# pooled into one figure per calendar month.
#   - quarter_rank: position of the month inside its own quarter (1 = busiest)
#   - month_rank:   position of the month among all twelve months (1 = busiest)
query = f"""
    WITH MonthlyCounts AS (
        SELECT
            EXTRACT(QUARTER FROM date) AS quarter,
            EXTRACT(MONTH FROM date) AS month,
            COUNT(*) AS record_count
        FROM `{mobile_data_cleaned}`
        GROUP BY quarter, month
    )

    SELECT
      quarter,
      month,
      record_count,
      RANK() OVER (PARTITION BY quarter ORDER BY record_count DESC) AS quarter_rank,
      RANK() OVER (ORDER BY record_count DESC) AS month_rank
    FROM MonthlyCounts
    ORDER BY quarter, record_count DESC;
    """
# Execute the query and keep the result: the plotting cells below pivot this
# DataFrame on its quarter, month and record_count columns
month_rank_df = query_df(query) 

month_rank_df

In [None]:
# Stacked bar chart: one bar per quarter, one coloured segment per month.

# One distinct colour per calendar month (12 in total)
palette = sns.color_palette("tab10", n_colors=12)

# Pivot so that quarters are the rows and months the columns to stack
stacked_df = month_rank_df.pivot(index='quarter', columns='month', values='record_count')

# Create the figure and axes explicitly and hand the axes to pandas.
# DataFrame.plot opens a figure of its own when no `ax` is given, which left
# the 12x6 figure made by plt.figure() empty and drew the chart at the default
# size instead.
fig, ax = plt.subplots(figsize=(12, 6))
stacked_df.plot(kind='bar', stacked=True, color=palette, ax=ax)
ax.set_title('Stacked Bar Plot of Quarter-wise Record Count')
ax.set_xlabel('Quarter')
ax.set_ylabel('Record Count')
ax.legend(title='Month', loc='upper right', bbox_to_anchor=(1.15, 1))
ax.ticklabel_format(style='plain', axis='y')  # Disable scientific notation on y-axis
plt.show()


In [None]:
# Second view of the monthly counts: each quarter's row of the pivoted table
# is drawn with its own colour family, plus a dashed line at the overall
# average of the monthly counts.

# Define custom color palettes for each quarter
colors_first_quarter = sns.color_palette("Blues", n_colors=3)  # Blue colors for months 1, 2, 3
colors_second_quarter = sns.color_palette("Greens", n_colors=3)  # Green colors for months 4, 5, 6
colors_third_quarter = sns.color_palette("Oranges", n_colors=3)  # Orange colors for months 7, 8, 9
colors_fourth_quarter = sns.color_palette("Purples", n_colors=3)  # Purple colors for months 10, 11, 12

# Pivot the DataFrame to have months as columns for stacking
stacked_df = month_rank_df.pivot(index='quarter', columns='month', values='record_count')

# Create a stacked bar plot with distinct colors for each month within a quarter
plt.figure(figsize=(12, 6))

# Loop through each quarter and plot stacked bars with the custom color palette
# NOTE(review): zip pairs the palettes with the index values positionally, so
# this assumes the pivot index holds quarters 1-4 in order — confirm.
# NOTE(review): no `ax` is passed; this presumably relies on Series.plot
# drawing onto the current axes of the figure created above — confirm.
# NOTE(review): each palette has 3 colours while a quarter row spans all month
# columns; verify in the rendered chart that the colours land on the intended
# months and whether the bars are actually stacked as the title says.
quarters = stacked_df.index
for quarter, color_palette in zip(quarters, [colors_first_quarter, colors_second_quarter, colors_third_quarter, colors_fourth_quarter]):
    # Row of the pivoted table for this quarter, indexed by month
    quarter_data = stacked_df.loc[quarter]
    quarter_data.plot(kind='bar', stacked=True, color=color_palette, label=f'Quarter {quarter}')

# Calculate the overall average for all values
# (stack() flattens the pivoted table into one value per quarter/month pair;
# presumably the empty quarter/month combinations are dropped — TODO confirm)
overall_average = stacked_df.stack().mean()

# Plot a red horizontal line for the overall average
plt.axhline(y=overall_average, color='red', linestyle='--', label=f'Avg. Overall')

plt.title('Stacked Bar Plot of Quarter-wise Record Count with Distinct Colors for Months')
plt.xlabel('Month')
plt.ylabel('Record Count')
plt.legend(title='Legend', loc='upper right', bbox_to_anchor=(1.15, 1))
plt.ticklabel_format(style='plain', axis='y')  # Disable scientific notation on y-axis
plt.show()


### Hours

Hourly Period Counts

In [None]:
# Record counts per period of the day. The hour of each record is bucketed as:
#   0-6  -> 'Dawn', 7-12 -> 'Morning', 13-18 -> 'Afternoon', anything else -> 'Night'
# and the periods are returned busiest first.
query = f"""
    SELECT
      CASE
        WHEN EXTRACT(HOUR FROM hour) BETWEEN 0 AND 6 THEN 'Dawn'
        WHEN EXTRACT(HOUR FROM hour) BETWEEN 7 AND 12 THEN 'Morning'
        WHEN EXTRACT(HOUR FROM hour) BETWEEN 13 AND 18 THEN 'Afternoon'
        ELSE 'Night'
      END AS period,
      COUNT(*) AS record_count
    FROM `{mobile_data_cleaned}`
    GROUP BY 1
    ORDER BY 2 DESC
    """

# Execute the query and keep the result: the pie chart cell below reads its
# 'period' and 'record_count' columns
hourly_period_counts = query_df(query)

hourly_period_counts

In [None]:
from pathlib import Path

import plotly.express as px

# Display order of the periods in the chart (chronological, not by count)
custom_order = ["Dawn", "Morning", "Afternoon", "Night"]

# Create an interactive pie chart with Plotly: one slice per period of the day
fig = px.pie(hourly_period_counts, values='record_count', names='period', 
             title='Hourly Period Counts', 
             hover_data=['record_count'], 
             labels={'record_count': 'Record Count'},
             category_orders={"period": custom_order})

# Show percentage and label on each slice and pull all four slices slightly apart
fig.update_traces(textinfo='percent+label', pull=[0.1, 0.1, 0.1, 0.1])

# Show the chart
fig.show()

# Save the plot as an HTML file in the 'Python Plots' folder.
# The folder is created first so the export also works on a fresh checkout
# where it does not exist yet.
plot_file_path = "Python Plots/hourly_period_counts.html"
Path(plot_file_path).parent.mkdir(parents=True, exist_ok=True)
fig.write_html(plot_file_path)

[View the Plot](./Python%20Plots/hourly_period_counts.html)

### Towns and Province activity

Top 50 Towns with the Highest Mobile Activity Recorded

In [None]:
# The 50 (postal_code, town_name) pairs with the most records.
# Rows without a postal code are left out before counting.
query = f"""
    SELECT
      postal_code,
      town_name,
      COUNT(*) AS record_count
    FROM
      `{mobile_data_cleaned}`
    WHERE postal_code IS NOT NULL
    GROUP BY postal_code, town_name
    ORDER BY 3 DESC
    LIMIT 50
    """

# Run the SQL through query_df and show the result
query_df(query)

In [None]:
# Record count and percentage share per province.
# The province is derived from the first two characters of the postal code
# (08 Barcelona, 25 Lleida, 17 Girona, 43 Tarragona); anything else is
# reported as 'Not defined'. The percentage divides each count by the total
# over all provinces (window SUM with an empty OVER clause).
# NOTE(review): the table preview at the top of the notebook also shows a
# `province` column — confirm that GROUP BY province resolves to the CASE
# alias rather than to that column.
query = f"""
WITH ProvinceActivity AS (
  SELECT
      CASE
        WHEN LEFT(CAST(postal_code AS STRING), 2) = '08' THEN 'Barcelona'
        WHEN LEFT(CAST(postal_code AS STRING), 2) = '25' THEN 'Lleida'
        WHEN LEFT(CAST(postal_code AS STRING), 2) = '17' THEN 'Girona'
        WHEN LEFT(CAST(postal_code AS STRING), 2) = '43' THEN 'Tarragona'
        ELSE 'Not defined'
      END AS province,
      COUNT(*) AS record_count
  FROM `{mobile_data_cleaned}`
  GROUP BY province
)

SELECT
    province,
    record_count,
    ROUND((record_count / SUM(record_count) OVER ()) * 100,2) AS percentage
FROM ProvinceActivity
ORDER BY 2 DESC;
    """
# Run the SQL through query_df and show the result
query_df(query)

### Network, operators and signal

In [None]:
# Record count and average signal (rounded to one decimal) for every
# (network, operator) pair, busiest pair first.
query = f"""
SELECT
  network,
  operator,
  COUNT(*) record_count,
  ROUND(AVG(signal),1) avg_netw_signal
FROM
  `{mobile_data_cleaned}`
GROUP BY 1,2
ORDER BY 3 DESC;
    """

# Execute the query and keep the result: the pandas cells below read its
# network, operator, record_count and avg_netw_signal columns
netw_oper_signal = query_df(query)

netw_oper_signal

In [None]:
# Number of distinct values in the network and operator columns, in that order
networks, operators = (
    netw_oper_signal[column].nunique() for column in ('network', 'operator')
)

# Render the summary sentence as Markdown in the cell output
summary_text = f"There are {networks} unique networks and {operators} distinct operators"
display(Markdown(summary_text))

Which (national) network has the highest activity recorded?

In [None]:
# Total record count per network, as a flat frame with columns network / record_count
network_activity = netw_oper_signal.groupby('network', as_index=False)['record_count'].sum()

# Keep every network whose total equals the maximum (ties are all retained)
is_top_network = network_activity['record_count'].eq(network_activity['record_count'].max())
max_activity_network = network_activity.loc[is_top_network]

# Show the busiest network as the cell output
max_activity_network

And the operator?

In [None]:
# Total record count per operator, as a flat frame with columns operator / record_count
operator_activity = netw_oper_signal.groupby('operator', as_index=False)['record_count'].sum()

# Keep every operator whose total equals the maximum (ties are all retained)
is_top_operator = operator_activity['record_count'].eq(operator_activity['record_count'].max())
max_activity_operator = operator_activity.loc[is_top_operator]

# Show the busiest operator as the cell output
max_activity_operator

Which network/operator has the highest average signal?

In [None]:
# Row label of the network/operator pair with the highest average signal
signal_column = netw_oper_signal['avg_netw_signal']
max_signal_index = signal_column.idxmax()

# Pull that pair's full row out of the frame
top_signal = netw_oper_signal.loc[max_signal_index]

# Show the row as the cell output
top_signal

Which network/operator has the highest average signal among the top 25% of recorded activities?

In [None]:
# 75th percentile of record_count: pairs at or above it form the top 25% by activity
activity_counts = netw_oper_signal['record_count']
threshold = activity_counts.quantile(0.75)

# Restrict the frame to those high-activity pairs
top_25_percentile = netw_oper_signal.loc[activity_counts.ge(threshold)]

# Among them, pick the row with the highest average signal
strongest_label = top_25_percentile['avg_netw_signal'].idxmax()
max_avg_netw_signal_row = top_25_percentile.loc[strongest_label]

# Show the row as the cell output
max_avg_netw_signal_row

Top Signal Strength by Operator and Network (top 10% record_count, excluding null values)

In [None]:
# SQL query: Top Signal Strength by Network (top 10% record_count, excluding null values)
# Networks are compared case-insensitively (LOWER), split into ten equal-sized
# buckets by record count with NTILE(10), and only the first bucket (the top
# decile) is kept, ordered by average signal.
# The bucket column is named `decile` because NTILE(10) produces tenths, not
# quarters, and the 'null' filter is applied to the lower-cased value so it
# agrees with the lower-cased grouping key.
query = f"""
WITH Top10Percent AS (

  # Per-network totals with their overall rank and decile
  SELECT
    network,
    num_operator,
    record_count,
    RANK() OVER (ORDER BY record_count DESC) AS record_count_rank,
    avg_signal,
    decile
  FROM (
    SELECT
      LOWER(network) AS network,

      # Calculate the number of distinct operators for each network
      COUNT(DISTINCT LOWER(operator)) AS num_operator,

      COUNT(*) AS record_count,
      ROUND(AVG(signal), 1) AS avg_signal,
      NTILE(10) OVER (ORDER BY COUNT(*) DESC) AS decile
    FROM
      `{mobile_data_cleaned}`
    WHERE

      # Exclude rows with network 'null' (stored as string), whatever its casing
      LOWER(network) != 'null'
    GROUP BY 1
  )
)

SELECT
  network,
  num_operator,
  record_count,
  record_count_rank,
  avg_signal
FROM
  Top10Percent
WHERE
  decile = 1
ORDER BY
  avg_signal DESC;
    """
# Run the SQL through query_df and show the result
query_df(query)

Top Network and Operator by Record Count for Each Province

In [None]:
# Top network/operator pair by record count for each province.
# The province is derived from the first two characters of the postal code;
# network and operator are lower-cased so spelling variants are counted together.
# Each (network, operator) pair is ranked once within its province and the
# rank-1 pair is returned. The earlier version ranked the same pairs twice in
# two identical CTEs and joined them back together, which added nothing and
# produced mismatched network/operator combinations whenever pairs tied.
query = f"""
WITH ProvinceActivity AS (
  SELECT
    CASE
      WHEN LEFT(CAST(postal_code AS STRING), 2) = '08' THEN 'Barcelona'
      WHEN LEFT(CAST(postal_code AS STRING), 2) = '25' THEN 'Lleida'
      WHEN LEFT(CAST(postal_code AS STRING), 2) = '17' THEN 'Girona'
      WHEN LEFT(CAST(postal_code AS STRING), 2) = '43' THEN 'Tarragona'
      ELSE 'Not defined'
    END AS province,
    LOWER(network) AS network,
    LOWER(operator) AS operator,
    COUNT(*) AS record_count
  FROM `{mobile_data_cleaned}`
  GROUP BY province, network, operator
)

# Rank every network/operator pair by record count within its province
, RankedPairs AS (
  SELECT
    province,
    network,
    operator,
    record_count,
    RANK() OVER (PARTITION BY province ORDER BY record_count DESC) AS pair_rank
  FROM ProvinceActivity
)

# Keep the top-ranked pair for each province
SELECT
  province,
  network AS top_network,
  operator AS top_operator
FROM RankedPairs
WHERE pair_rank = 1
ORDER BY province;
    """
# Run the SQL through query_df and show the result
query_df(query)

### Description and activity

Description:
    
    - STATE_IN_SERVICE (0),
    - STATE_OUT_OF_SERVICE (1),
    - STATE_EMERGENCY_ONLY (2),
    - STATE_POWER_OFF (3)
    
User Activity:

    - IN_VEHICLE
    - STILL
    - ON_FOOT
    - TILTING
    - UNKNOWN
    - ON_BICYCLE

Activity Rank by Description with Overall Rank

In [None]:
# Rank user activities inside each service-state description and overall.
#   - rank_by_description: rank of the activity by record count within its description
#   - overall_rank: rank over all rows, ordered first by rank_by_description and
#     then by record count (descending)
# The output is sorted by description and then by overall_rank (column 4).
query = f"""
# Activity Rank by Description with Overall Rank
WITH ActivityRank AS (
  SELECT
    description,
    activity,
    RANK() OVER (PARTITION BY description ORDER BY COUNT(*) DESC) AS rank_by_description,
    COUNT(*) AS record_count
  FROM
    `{mobile_data_cleaned}`
  GROUP BY 1, 2
)

SELECT
  description,
  activity,
  rank_by_description,
  #record_count,
  RANK() OVER (ORDER BY rank_by_description, record_count DESC) AS overall_rank
FROM ActivityRank
ORDER BY 1, 4 ASC;
    """
# Run the SQL through query_df and show the result
query_df(query)

Calculate activity rank by description and activity, including most frequent town and overall rank

In [None]:
# Activity rank by description, with the most frequent town of each
# (description, activity) pair and an overall rank.
# Both CTEs read the cleaned table. The ranking CTE previously pointed at the
# hard-coded public raw table, so its counts came from different data than the
# towns they were joined to.
query = f"""
# Subquery to calculate the rank of activities by description
WITH ActivityRank AS (
  SELECT
    description,
    activity,
    RANK() OVER (PARTITION BY description ORDER BY COUNT(*) DESC) AS rank_by_description,
    COUNT(*) AS record_count
  FROM
    `{mobile_data_cleaned}`
  GROUP BY description, activity
),

# Subquery to find the most frequent town for each unique pair of description and activity
MostFrequentTown AS (
  SELECT
    description,
    activity,
    town_name,
    RANK() OVER (PARTITION BY description, activity ORDER BY COUNT(*) DESC) AS town_rank
  FROM
    `{mobile_data_cleaned}`
  GROUP BY description, activity, town_name
)

# Main query to combine the results and calculate overall rank
SELECT
  ar.description,
  ar.activity,
  ar.rank_by_description,
  RANK() OVER (ORDER BY ar.rank_by_description, ar.record_count DESC) AS overall_rank,
  mft.town_name AS most_frequent_town
FROM ActivityRank ar
# Join with the MostFrequentTown subquery to include the most frequent town
JOIN MostFrequentTown mft ON ar.description = mft.description AND ar.activity = mft.activity
# Filter for the most frequent town
WHERE mft.town_rank = 1 
ORDER BY 1, 3 ASC;
    """
# Run the SQL through query_df and show the result
query_df(query)