In [1]:
import pandas as pd
from google.cloud import bigquery
from IPython.display import display
from IPython.display import Markdown

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [10]:
# The location of the service-account key is not hard-coded in the notebook:
# it is read from a local text file.
with open("bq_key_path.txt", "r") as f:
    # strip() drops surrounding whitespace such as a trailing newline,
    # which would otherwise end up inside the path.
    credentials_path = f.read().strip()

# Build the BigQuery client from the service-account JSON key at that path.
# Every query in this notebook goes through this `client` object.
client = bigquery.Client.from_service_account_json(credentials_path)


### Function: Interactive SQL Query to Pandas DataFrame Converter

In [8]:
def query_df(query, df_name):
    """
    Run a SQL query on BigQuery and publish the result as a named pandas DataFrame.

    The query is executed with the module-level BigQuery ``client`` object, which
    must be created before this function is called. The result is stored in this
    module's global namespace under ``df_name`` (so it can be used by later cells)
    and is also returned. The SQL text and the DataFrame name are echoed to the
    cell output.

    Args:
    - query (str): The SQL query to execute.
    - df_name (str): Name of the global variable that will hold the resulting
      DataFrame. Must be a valid Python identifier.

    Returns:
    - pandas.DataFrame: One row per result row, one column per field of the
      result schema (empty, but with the right columns, if the query returns
      no rows).

    Raises:
    - ValueError: If ``df_name`` is not a valid Python identifier. This is
      checked before the query is sent, so no query is run for a name that
      could never be used as a variable.

    Example:
        query_df("SELECT 1 AS one", "one_df")
        one_df.head()
    """
    # Fail fast: a non-identifier name would create a global that cannot be
    # referenced as a variable, after already paying for the query.
    if not (isinstance(df_name, str) and df_name.isidentifier()):
        raise ValueError(
            f"df_name must be a valid Python identifier, got {df_name!r}"
        )

    # Submit the query and block until its result is available
    results = client.query(query).result()

    # Column labels come from the result schema, so they are present even
    # when the query returns zero rows
    column_names = [field.name for field in results.schema]

    # Materialise every result row as a plain list of values
    rows = [list(row) for row in results]
    df = pd.DataFrame(rows, columns=column_names)

    # Publish the DataFrame under the caller-chosen name
    globals()[df_name] = df

    # Echo the SQL that produced the result
    display(Markdown("Query: "))
    print(query)

    # Echo the name under which the DataFrame is now available
    display(Markdown(f"Dataframe: **{df_name}**"))

    return df

In [9]:
## Preview of the cleaned table

# Name under which query_df() publishes the resulting DataFrame
df_name = "mobdata"

# Fetch 100 rows from the cleaned copy of the mobile-coverage table
query = """
    SELECT *
    FROM `bq-analyst-230590.project_cat_mobile_coverage_2015_2017.mobile_data_2015_2017_cleaned`
    LIMIT 100
    """

# Run the query; the returned DataFrame is shown as the cell output
query_df(query=query, df_name=df_name)

Query: 


    SELECT *
    FROM `bq-analyst-230590.project_cat_mobile_coverage_2015_2017.mobile_data_2015_2017_cleaned`
    LIMIT 100
    


Dataframe: **mobdata**

Unnamed: 0,lat,long,signal,network,operator,status,description,net,speed,satellites,precission,provider,activity,postal_code,town_name,position_geom,province,year,hour_24h,month
0,41.60244,2.33614,7,EE,EE,2,STATE_EMERGENCY_ONLY,2G,129.7,5.0,22.0,GPS,IN_VEHICLE,081810,la Roca del Vallès,POINT(2.33614 41.60244),Barcelona,2015,6,9
1,41.30230,2.05657,31,EE,EE,0,STATE_IN_SERVICE,Undefined net,71.2,5.0,16.0,GPS,IN_VEHICLE,082009,Sant Boi de Llobregat,POINT(2.05657 41.3023),Barcelona,2015,12,11
2,41.69348,2.75208,4,EE,EE,2,STATE_EMERGENCY_ONLY,2G,46.4,5.0,13.0,GPS,IN_VEHICLE,082845,Tordera,POINT(2.75208 41.69348),Barcelona,2015,20,9
3,41.68127,2.77911,19,EE,EE,2,STATE_EMERGENCY_ONLY,4G,34.1,4.0,18.0,GPS,IN_VEHICLE,170237,Blanes,POINT(2.77911 41.68127),Girona,2015,23,9
4,42.11037,3.14279,11,EE,EE,2,STATE_EMERGENCY_ONLY,4G,24.9,6.0,8.0,GPS,IN_VEHICLE,170622,l'Escala,POINT(3.14279 42.11037),Girona,2015,17,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,41.40618,2.20713,25,Movistar,ONO,2,STATE_EMERGENCY_ONLY,3G,7.3,3.0,10.0,GPS,IN_VEHICLE,080193,Barcelona,POINT(2.20713 41.40618),Barcelona,2015,7,4
96,41.38840,2.13425,16,Movistar,ONO,2,STATE_EMERGENCY_ONLY,3G,8.0,6.0,51.0,GPS,IN_VEHICLE,080193,Barcelona,POINT(2.13425 41.3884),Barcelona,2015,17,10
97,41.42271,2.19828,14,Movistar,ONO,2,STATE_EMERGENCY_ONLY,3G,19.4,4.0,3.0,GPS,IN_VEHICLE,080193,Barcelona,POINT(2.19828 41.42271),Barcelona,2015,8,4
98,41.40332,2.20036,14,Movistar,ONO,0,STATE_IN_SERVICE,3G,1.4,6.0,7.0,GPS,ON_FOOT,080193,Barcelona,POINT(2.20036 41.40332),Barcelona,2015,8,4


In [None]:
# Name under which query_df() publishes the resulting DataFrame
df_name = "preview_table"

# Fetch 10 rows of the public source table to inspect its structure
query = """
    SELECT *
    FROM `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
    LIMIT 10
    """

# Run the query; the returned DataFrame is shown as the cell output
query_df(query=query, df_name=df_name)

In [None]:
# List the column labels of the `preview_table` DataFrame (published by query_df
# under that name) to see which fields the source table exposes.
preview_table.columns

### Dates

Date range

In [None]:
# Name under which query_df() publishes the resulting DataFrame
df_name = "date_range"

# Earliest and latest `date` in the table, plus the number of days between them
query = """
    SELECT
        MIN(date) AS first_date_recorded,
        MAX(date) AS last_date_recorded,
        DATE_DIFF(MAX(date), MIN(date), DAY) AS total_days_recorded
    FROM `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
    """

# Run the query; the returned DataFrame is shown as the cell output
query_df(query=query, df_name=df_name)

Top 10 dates with the highest activity

In [None]:
# Name under which query_df() publishes the resulting DataFrame
df_name = "top_10_dates_act"

# Count records per date and keep the 10 dates with the most records
query = """
    SELECT
      date,
      COUNT(*) AS record_count
    FROM `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
    GROUP BY date
    ORDER BY record_count DESC
    LIMIT 10
    """

# Run the query; the returned DataFrame is shown as the cell output
query_df(query=query, df_name=df_name)

Monthly Activity Rank within Quarters and Across the Year

In [None]:
# SQL query
# MonthlyCounts aggregates the record count per (quarter, month).
# The outer query then ranks each month twice:
#   - month_rank_in_quarter: position of the month inside its own quarter
#   - month_rank:            position of the month across the whole year
# Downstream cells only rely on quarter, month and record_count, so the extra
# within-quarter rank column is purely additive.
query = """
    WITH MonthlyCounts AS (
        SELECT
            EXTRACT(QUARTER FROM date) AS quarter,
            EXTRACT(MONTH FROM date) AS month,
            COUNT(*) AS record_count
        FROM `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
        GROUP BY quarter, month
    )

    SELECT
      quarter,
      month,
      record_count,
      RANK() OVER (PARTITION BY quarter ORDER BY record_count DESC) AS month_rank_in_quarter,
      RANK() OVER (ORDER BY record_count DESC) AS month_rank
    FROM MonthlyCounts
    ORDER BY quarter, record_count DESC;
    """
# DataFrame name
df_name = "month_rank_df"

# Execute the query and store the result in the DataFrame
query_df(query, df_name)

In [None]:
# One distinct colour per calendar month. "Paired" provides 12 different
# colours; "tab10" only has 10, so asking it for 12 repeats the first two.
palette = sns.color_palette("Paired", n_colors=12)

# Pivot so that each quarter is a row and each month is a column; months that
# do not belong to a quarter are NaN in that row and add nothing to its bar.
stacked_df = month_rank_df.pivot(index='quarter', columns='month', values='record_count')

# Create the figure explicitly and hand its axes to pandas. Without `ax=`,
# DataFrame.plot opens a figure of its own, which leaves a blank 12x6 figure
# behind and ignores the requested size.
fig, ax = plt.subplots(figsize=(12, 6))
stacked_df.plot(kind='bar', stacked=True, color=palette, ax=ax)

ax.set_title('Stacked Bar Plot of Quarter-wise Record Count with Different Colors for Months')
ax.set_xlabel('Quarter')
ax.set_ylabel('Record Count')
ax.legend(title='Month', loc='upper right', bbox_to_anchor=(1.15, 1))
ax.ticklabel_format(style='plain', axis='y')  # Disable scientific notation on y-axis
plt.show()


In [None]:
# Define custom color palettes for each quarter (three shades, one per month)
colors_first_quarter = sns.color_palette("Blues", n_colors=3)  # Blue colors for months 1, 2, 3
colors_second_quarter = sns.color_palette("Greens", n_colors=3)  # Green colors for months 4, 5, 6
colors_third_quarter = sns.color_palette("Oranges", n_colors=3)  # Orange colors for months 7, 8, 9
colors_fourth_quarter = sns.color_palette("Purples", n_colors=3)  # Purple colors for months 10, 11, 12

# Look palettes up by quarter number rather than by position, so a quarter
# missing from the data cannot shift the colours of the remaining ones.
quarter_palettes = {
    1: colors_first_quarter,
    2: colors_second_quarter,
    3: colors_third_quarter,
    4: colors_fourth_quarter,
}

# Pivot so that each quarter is a row and each month is a column
stacked_df = month_rank_df.pivot(index='quarter', columns='month', values='record_count')

# Draw every quarter on the same, explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))

# Each quarter's row holds one value per month (NaN outside the quarter), so
# every pass draws only that quarter's three bars at their month positions.
quarters = stacked_df.index
for quarter in quarters:
    color_palette = quarter_palettes[quarter]
    quarter_data = stacked_df.loc[quarter]
    quarter_data.plot(kind='bar', color=color_palette, label=f'Quarter {quarter}', ax=ax)

# The bars are one per month and the legend entries are quarters, so the
# title and legend title say so.
ax.set_title('Monthly Record Count, Colored by Quarter')
ax.set_xlabel('Month')
ax.set_ylabel('Record Count')
ax.legend(title='Quarter', loc='upper right', bbox_to_anchor=(1.15, 1))
ax.ticklabel_format(style='plain', axis='y')  # Disable scientific notation on y-axis
plt.show()


### Hours

Hourly Period Counts

In [None]:
# Name under which query_df() publishes the resulting DataFrame
df_name = "hourly_period_counts"

# Bucket each record by the hour extracted from its `hour` column
# (0-6 Dawn, 7-12 Morning, 13-18 Afternoon, anything else Night)
# and count the records per bucket, largest bucket first
query = """
    SELECT
      CASE
        WHEN EXTRACT(HOUR FROM hour) BETWEEN 0 AND 6 THEN 'Dawn'
        WHEN EXTRACT(HOUR FROM hour) BETWEEN 7 AND 12 THEN 'Morning'
        WHEN EXTRACT(HOUR FROM hour) BETWEEN 13 AND 18 THEN 'Afternoon'
        ELSE 'Night'
      END AS period,
      COUNT(*) AS record_count
    FROM `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
    GROUP BY 1
    ORDER BY 2 DESC
    """

# Run the query; the returned DataFrame is shown as the cell output
query_df(query=query, df_name=df_name)

In [None]:
# Slice sizes and slice captions both come from the period summary
sizes = hourly_period_counts['record_count']
labels = hourly_period_counts['period']

# Pie chart with one slice per period, each annotated with its percentage
plt.pie(sizes, startangle=140, autopct='%1.1f%%', labels=labels)

# Keep the aspect ratio equal so the pie is drawn as a circle
plt.axis('equal')

plt.title('Hourly Period Counts')

# Render the figure
plt.show()

In [None]:
# plotly.express (px) is imported with the other dependencies in the
# notebook's import cell.

# Create an interactive pie chart with Plotly: one slice per period,
# sized by its record count
fig = px.pie(
    hourly_period_counts,
    values='record_count',
    names='period',
    title='Hourly Period Counts',
    hover_data=['record_count'],
    labels={'record_count': 'Record Count'},
)

# Show percentage and label on each slice and pull every slice slightly out
# of the pie. The pull list is sized from the data rather than hard-coded,
# so it stays correct if the number of periods changes.
fig.update_traces(textinfo='percent+label', pull=[0.1] * len(hourly_period_counts))

# Show the chart
fig.show()


### Towns and Province activity

Top 50 Towns with the Highest Mobile Activity Recorded

In [None]:
# SQL query
# Count records per (postal_code, town_name), ignoring rows without a postal
# code, and keep the 50 combinations with the most records.
query = """
    SELECT
      postal_code,
      town_name,
      COUNT(*) AS record_count
    FROM
      `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
    WHERE postal_code IS NOT NULL
    GROUP BY postal_code, town_name
    ORDER BY 3 DESC
    LIMIT 50
    """
# DataFrame name
# Use a dedicated name: reusing "hourly_period_counts" here would overwrite
# the DataFrame that the hourly pie-chart cells read.
df_name = "top_50_towns"

# Execute the query and store the result in the DataFrame
query_df(query, df_name)

In [None]:
# SQL query
# ProvinceActivity maps the first two characters of the postal code to a
# province name ('Not defined' for any other prefix) and counts records per
# province. The outer query adds each province's share of all records as a
# percentage rounded to two decimals.
query = """
WITH ProvinceActivity AS (
  SELECT
      CASE
        WHEN LEFT(CAST(postal_code AS STRING), 2) = '08' THEN 'Barcelona'
        WHEN LEFT(CAST(postal_code AS STRING), 2) = '25' THEN 'Lleida'
        WHEN LEFT(CAST(postal_code AS STRING), 2) = '17' THEN 'Girona'
        WHEN LEFT(CAST(postal_code AS STRING), 2) = '43' THEN 'Tarragona'
        ELSE 'Not defined'
      END AS province,
      COUNT(*) AS record_count
  FROM `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
  GROUP BY province
)

SELECT
    province,
    record_count,
    ROUND((record_count / SUM(record_count) OVER ()) * 100,2) AS percentage
FROM ProvinceActivity
ORDER BY 2 DESC;
    """
# DataFrame name
# Use a dedicated name: reusing "hourly_period_counts" here would overwrite
# the DataFrame that the hourly pie-chart cells read.
df_name = "province_activity"

# Execute the query and store the result in the DataFrame
query_df(query, df_name)

### Network, operators and signal

In [None]:
# Name under which query_df() publishes the resulting DataFrame
df_name = "netw_oper_signal"

# For every lower-cased (network, operator) pair: number of records and the
# average signal rounded to one decimal, most active pair first
query = """
SELECT
  LOWER(network) AS network,
  LOWER(operator) AS operator,
  COUNT(*) record_count,
  ROUND(AVG(signal),1) avg_netw_signal
FROM
  `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
GROUP BY 1,2
ORDER BY 3 DESC;
    """

# Run the query; the returned DataFrame is shown as the cell output
query_df(query=query, df_name=df_name)

In [None]:
# Number of different values in the operator and network columns
operators = netw_oper_signal['operator'].nunique()
networks = netw_oper_signal['network'].nunique()

# Report both counts as a rendered Markdown sentence
display(
    Markdown(f"There are {networks} unique networks and {operators} distinct operators")
)

Which (national) network has the highest activity recorded?

In [None]:
# Total record count per network, summed over all of its operators
network_activity = (
    netw_oper_signal
    .groupby('network')['record_count']
    .sum()
    .reset_index()
)

# Keep the row(s) whose total equals the largest total
max_activity_network = network_activity.loc[
    network_activity['record_count'].eq(network_activity['record_count'].max())
]

# Show the result as the cell output
max_activity_network

And the operator?

In [None]:
# Total record count per operator, summed over all of its networks
operator_activity = (
    netw_oper_signal
    .groupby('operator')['record_count']
    .sum()
    .reset_index()
)

# Keep the row(s) whose total equals the largest total
max_activity_operator = operator_activity.loc[
    operator_activity['record_count'].eq(operator_activity['record_count'].max())
]

# Show the result as the cell output
max_activity_operator

Which network/operator has the highest average signal?

In [None]:
# Index label of the network/operator row with the largest average signal
max_signal_index = netw_oper_signal.loc[:, 'avg_netw_signal'].idxmax()

# Pull that row out of the DataFrame
top_signal = netw_oper_signal.loc[max_signal_index]

# Show the row as the cell output
top_signal

Which network/operator pair has the highest average signal among the pairs in the top 25% by record count?

In [None]:
# 75th percentile of record_count: rows at or above it form the top 25%
threshold = netw_oper_signal['record_count'].quantile(q=0.75)

# Keep only the rows whose record_count reaches that threshold
top_25_percentile = netw_oper_signal.loc[
    netw_oper_signal['record_count'].ge(threshold)
]

# Within that subset, take the row with the largest average signal
max_avg_netw_signal_row = top_25_percentile.loc[
    top_25_percentile['avg_netw_signal'].idxmax()
]

# Show the row as the cell output
max_avg_netw_signal_row

Average signal strength by network, with its number of operators (top 10% of networks by record count, excluding networks stored as the string 'null')

In [None]:
# SQL query
# The inner query aggregates per lower-cased network: number of distinct
# operators, record count, average signal, and the decile (NTILE(10)) of the
# network when networks are ordered by record count. The outer query keeps
# only the first decile, i.e. the 10% of networks with the most records.
# The 'null' filter is applied to the lower-cased value so that it matches the
# lower-cased grouping key regardless of how the string was capitalised.
query = """
WITH Top10Percent AS (

  # Per-network aggregates plus the overall rank by record count
  SELECT
    network,
    num_operator,
    record_count,
    RANK() OVER (ORDER BY record_count DESC) AS record_count_rank,
    avg_signal,
    decile
  FROM (
    SELECT
      LOWER(network) AS network,

      # Calculate the number of distinct operators for each network
      COUNT(DISTINCT LOWER(operator)) AS num_operator,

      COUNT(*) AS record_count,
      ROUND(AVG(signal), 1) AS avg_signal,

      # Split the networks into 10 equally sized buckets by record count
      NTILE(10) OVER (ORDER BY COUNT(*) DESC) AS decile
    FROM
      `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
    WHERE

    # Exclude rows with network 'null' (stored as string, any capitalisation)
      LOWER(network) != 'null'
    GROUP BY 1
  )
)

SELECT
  network,
  num_operator,
  record_count,
  record_count_rank,
  avg_signal
FROM
  Top10Percent
WHERE
  decile = 1
ORDER BY
  avg_signal DESC;
    """
# DataFrame name
df_name = "net_oper_signal"

# Execute the query and store the result in the DataFrame
query_df(query, df_name)

Top Network and Operator by Record Count for Each Province

In [None]:
# SQL query
# ProvinceActivity counts records per (province, network, operator), deriving
# the province from the first two characters of the postal code.
# RankedNetwork and RankedOperator then rank networks and operators
# independently: each sums the record counts over the other dimension before
# ranking inside the province. The final SELECT pairs the rank-1 network with
# the rank-1 operator of every province. Ranking the (network, operator) pairs
# instead, as two identical CTEs would, cannot tell the top network apart from
# the top operator and multiplies rows when pairs tie.
query = """
WITH ProvinceActivity AS (
  SELECT
    CASE
      WHEN LEFT(CAST(postal_code AS STRING), 2) = '08' THEN 'Barcelona'
      WHEN LEFT(CAST(postal_code AS STRING), 2) = '25' THEN 'Lleida'
      WHEN LEFT(CAST(postal_code AS STRING), 2) = '17' THEN 'Girona'
      WHEN LEFT(CAST(postal_code AS STRING), 2) = '43' THEN 'Tarragona'
      ELSE 'Not defined'
    END AS province,
    LOWER(network) AS network,
    LOWER(operator) AS operator,
    COUNT(*) AS record_count
  FROM `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
  GROUP BY 1, 2, 3
)

# Rank networks inside each province by their total record count (all operators combined)
, RankedNetwork AS (
  SELECT
    province,
    network,
    SUM(record_count) AS network_record_count,
    RANK() OVER (PARTITION BY province ORDER BY SUM(record_count) DESC) AS network_rank
  FROM ProvinceActivity
  GROUP BY province, network
)

# Rank operators inside each province by their total record count (all networks combined)
, RankedOperator AS (
  SELECT
    province,
    operator,
    SUM(record_count) AS operator_record_count,
    RANK() OVER (PARTITION BY province ORDER BY SUM(record_count) DESC) AS operator_rank
  FROM ProvinceActivity
  GROUP BY province, operator
)

# Select the Top Network and Top Operator for Each Province
SELECT
  n.province,
  n.network AS top_network,
  o.operator AS top_operator
  #n.network_record_count,
  #o.operator_record_count
FROM RankedNetwork n
JOIN RankedOperator o ON n.province = o.province
WHERE n.network_rank = 1 AND o.operator_rank = 1
ORDER BY n.province;
    """
# DataFrame name
df_name = "top_net_and_ope_by_province"

# Execute the query and store the result in the DataFrame
query_df(query, df_name)

### Description and activity

Description:
    
    - STATE_IN_SERVICE (0),
    - STATE_OUT_OF_SERVICE (1),
    - STATE_EMERGENCY_ONLY (2),
    - STATE_POWER_OFF (3)
    
User Activity:

    - IN_VEHICLE
    - STILL
    - ON_FOOT
    - TILTING
    - UNKNOWN
    - ON_BICYCLE

Activity Rank by Description with Overall Rank

In [None]:
# Name under which query_df() publishes the resulting DataFrame
df_name = "descr_act_rank"

# ActivityRank counts records per (description, activity) and ranks the
# activities inside each description by that count. The outer query adds an
# overall rank ordered by the per-description rank, then by record count.
query = """
# Activity Rank by Description with Overall Rank
WITH ActivityRank AS (
  SELECT
    description,
    activity,
    RANK() OVER (PARTITION BY description ORDER BY COUNT(*) DESC) AS rank_by_description,
    COUNT(*) AS record_count
  FROM
    `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
  GROUP BY 1, 2
)

SELECT
  description,
  activity,
  rank_by_description,
  #record_count,
  RANK() OVER (ORDER BY rank_by_description, record_count DESC) AS overall_rank
FROM ActivityRank
ORDER BY 1, 4 ASC;
    """

# Run the query; the returned DataFrame is shown as the cell output
query_df(query=query, df_name=df_name)

Calculate activity rank by description and activity, including most frequent town and overall rank

In [None]:
# SQL query
# ActivityRank ranks activities inside each description by record count.
# MostFrequentTown ranks towns inside each (description, activity) pair by
# record count. The main query joins the two, keeps the rank-1 town(s) of
# every pair and adds the overall rank.
query = """
# Subquery to calculate the rank of activities by description
WITH ActivityRank AS (
  SELECT
    description,
    activity,
    RANK() OVER (PARTITION BY description ORDER BY COUNT(*) DESC) AS rank_by_description,
    COUNT(*) AS record_count
  FROM
    `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
  GROUP BY description, activity
),

# Subquery to find the most frequent town for each unique pair of description and activity
MostFrequentTown AS (
  SELECT
    description,
    activity,
    town_name,
    RANK() OVER (PARTITION BY description, activity ORDER BY COUNT(*) DESC) AS town_rank
  FROM
    `bigquery-public-data.catalonian_mobile_coverage_eu.mobile_data_2015_2017`
  GROUP BY description, activity, town_name
)

# Main query to combine the results and calculate overall rank
SELECT
  ar.description,
  ar.activity,
  ar.rank_by_description,
  RANK() OVER (ORDER BY ar.rank_by_description, ar.record_count DESC) AS overall_rank,
  mft.town_name AS most_frequent_town
FROM ActivityRank ar
# Join with the MostFrequentTown subquery to include the most frequent town
JOIN MostFrequentTown mft ON ar.description = mft.description AND ar.activity = mft.activity
# Filter for the most frequent town
WHERE mft.town_rank = 1
ORDER BY 1, 3 ASC;
    """
# DataFrame name
# Use a dedicated name: reusing "descr_act_rank" would silently replace the
# result of the previous ranking query, which has a different set of columns.
df_name = "descr_act_town_rank"

# Execute the query and store the result in the DataFrame
query_df(query, df_name)