In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Topics
1. **Getting Started With SQL and BigQuery**: Workflow for handling big datasets with BigQuery and SQL
1. **Select, From & Where**: Foundational components for all SQL queries
1. **Group By, Having & Count**: Get more interesting insights directly from SQL queries
1. Order the results and focus on them ost important data for use case
1. Organize queries for better readability, important for complex queries
1. Combine data sources for almost all real-world data problems

## 1. Getting Started With SQL and BigQuery

In [5]:
# import Python package for BigQuery
from google.cloud import bigquery

In [8]:
# Create a Client object, retrieving info. from BigQuery
client = bigquery.Client()  # Using Kaggle's public dataset BigQuery integration.

Using Kaggle's public dataset BigQuery integration.


In [12]:
# Dataset: hacker_news, Hacker News posts
# A website focusing on computer science and cybersecurity news.
# https://news.ycombinator.com/
# BigQuery
# ---------
#     \-- project
#              \-- bigquery-public-data
#                           \-- hacker_news

# Construct a reference to the 'hacker_news' dataset
dataset_ref = client.dataset("hacker_news", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)
#help(dataset)

In [17]:
# List all the tables in the "hacker_news" dataset
tables = list(client.list_tables(dataset))

# Print names of all tables in the dataset (there are four!)
for table in tables:  
    print(table.table_id)  # only one table - 'full'

full


In [21]:
# Construct a reference ot the 'full' table
table_ref = dataset_ref.table('full')

# API request - fetch table
table = client.get_table(table_ref)
#table   # Table(TableReference(DatasetReference('bigquery-public-data', 'hacker_news'), 'full'))


![](https://storage.googleapis.com/kaggle-media/learn/images/biYqbUB.png)

### Table schema   
The structure of table

In [23]:
# Print the info on all the columns in the 'full' tablre in the 'hacker_news" dataset'
table.schema

[SchemaField('title', 'STRING', 'NULLABLE', 'Story title', (), None),
 SchemaField('url', 'STRING', 'NULLABLE', 'Story url', (), None),
 SchemaField('text', 'STRING', 'NULLABLE', 'Story or comment text', (), None),
 SchemaField('dead', 'BOOLEAN', 'NULLABLE', 'Is dead?', (), None),
 SchemaField('by', 'STRING', 'NULLABLE', "The username of the item's author.", (), None),
 SchemaField('score', 'INTEGER', 'NULLABLE', 'Story score', (), None),
 SchemaField('time', 'INTEGER', 'NULLABLE', 'Unix time', (), None),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', 'Timestamp for the unix time', (), None),
 SchemaField('type', 'STRING', 'NULLABLE', 'Type of details (comment, comment_ranking, poll, story, job, pollopt)', (), None),
 SchemaField('id', 'INTEGER', 'NULLABLE', "The item's unique id.", (), None),
 SchemaField('parent', 'INTEGER', 'NULLABLE', 'Parent comment ID', (), None),
 SchemaField('descendants', 'INTEGER', 'NULLABLE', 'Number of story or poll descendants', (), None),
 SchemaField

SchemaField('by', 'string', 'NULLABLE', "The username of the item's author.",())

This tells us:

the field (or column) is called by,
the data in this field is strings,
NULL values are allowed, and
it contains the usernames corresponding to each item's author.

In [29]:
# Preview the first five lines of the 'full' table
rows = client.list_rows(table, max_results=12).to_dataframe() # to panda's df
print(type(rows))
rows

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,"I would rather just have wired earbuds, period...",,zeveb,,1591717736,2020-06-09 15:48:56+00:00,comment,23467666,23456782,,,
1,,,DNS?,,nly,,1572810465,2019-11-03 19:47:45+00:00,comment,21436112,21435130,,,
2,,,These benchmarks seem pretty good. Filterable...,,mrkeen,,1591717727,2020-06-09 15:48:47+00:00,comment,23467665,23467426,,,
3,,,Oh really?<p>* Excel alone uses 86.1MB of priv...,,oceanswave,,1462987532,2016-05-11 17:25:32+00:00,comment,11677248,11676886,,,
4,,,These systems are useless. Of the many flaws:...,,nyxxie,,1572810473,2019-11-03 19:47:53+00:00,comment,21436113,21435025,,,
5,,,<i>I don&#x27;t worry about the server</i> and...,,dahfizz,,1566231278,2019-08-19 16:14:38+00:00,comment,20739051,20738305,,,
6,,,Why do you take it as a given that the Secreta...,,chatmasta,,1425350594,2015-03-03 02:43:14+00:00,comment,9135815,9135800,,,
7,,,Let me know what you guys think so far and if ...,,augustin1989,,1394076499,2014-03-06 03:28:19+00:00,comment,7351690,7351684,,,
8,,,I often feel that the main result of this priv...,,jaynetics,,1572810482,2019-11-03 19:48:02+00:00,comment,21436115,21435981,,,
9,,,That is exactly why I generally prefer median-...,,bugra,,1393290376,2014-02-25 01:06:16+00:00,comment,7294506,7293801,,,


In [31]:
# Preview the first five entries in the "title" column of the "full" table
client.list_rows(table, selected_fields=table.schema[:1], max_results=5).to_dataframe()

Unnamed: 0,title
0,
1,
2,
3,
4,


### Exercise:    
### Practice the commands to explore the structure of a dataset with crimes in the city of Chicago

**Fetch data from bigquery**     
* dataset = chicago_crime
* project = bigquery-public-data

In [36]:
# moudle, dataset access setup

from google.cloud import bigquery

# Create a 'Client' obj
client = bigquery.Client()

# Construct a ref. to the dataset - 'chicago_crime'
dataset_ref = client.dataset("chicago_crime", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

Using Kaggle's public dataset BigQuery integration.


In [49]:
# count tables in the dataset
tables = list(client.list_tables(dataset))
num_tables = len(tables)
print(num_tables)

# print the table names
for table in tables:
    print(table.table_id)

1
crime


In [71]:
# explore the table schema
# How many columns in the crime table have TIMESTAMP data?

# Construct a reference ot the 'full' table
table_id = "crime"
table_ref = dataset_ref.table(table_id)

# API request - fetch table
table = client.get_table(table_ref)
print("Printing table schema item by item..")
for field in table.schema:
    print(field)
#print(table.schema)
print("-" * 80)

# Counting columns with TIMESTAMP' field.
# List with the field of interest
filed_to_seek = "TIMESTAMP"
fields_of_interest = [item for item in table.schema if filed_to_seek in str(item)]

print(f"{len(fields_of_interest)} column(s) found for \"{filed_to_seek}\" field, from the table \"{table.table_id}\".")
print(fields_of_interest)

Printing table schema item by item..
SchemaField('unique_key', 'INTEGER', 'REQUIRED', None, (), None)
SchemaField('case_number', 'STRING', 'NULLABLE', None, (), None)
SchemaField('date', 'TIMESTAMP', 'NULLABLE', None, (), None)
SchemaField('block', 'STRING', 'NULLABLE', None, (), None)
SchemaField('iucr', 'STRING', 'NULLABLE', None, (), None)
SchemaField('primary_type', 'STRING', 'NULLABLE', None, (), None)
SchemaField('description', 'STRING', 'NULLABLE', None, (), None)
SchemaField('location_description', 'STRING', 'NULLABLE', None, (), None)
SchemaField('arrest', 'BOOLEAN', 'NULLABLE', None, (), None)
SchemaField('domestic', 'BOOLEAN', 'NULLABLE', None, (), None)
SchemaField('beat', 'INTEGER', 'NULLABLE', None, (), None)
SchemaField('district', 'INTEGER', 'NULLABLE', None, (), None)
SchemaField('ward', 'INTEGER', 'NULLABLE', None, (), None)
SchemaField('community_area', 'INTEGER', 'NULLABLE', None, (), None)
SchemaField('fbi_code', 'STRING', 'NULLABLE', None, (), None)
SchemaField('x

In [75]:
# preview the table data in df.
rows = client.list_rows(table, max_results=12).to_dataframe() # to panda's df
rows
"""
Thinking about the question above, there are a few columns that appear to have geographic data. Look at a few values (with the list_rows() command) to see if you can determine their relationship. Two columns will still be hard to interpret. But it should be obvious how the location column relates to latitude and longitude.
"""

'\nThinking about the question above, there are a few columns that appear to have geographic data. Look at a few values (with the list_rows() command) to see if you can determine their relationship. Two columns will still be hard to interpret. But it should be obvious how the location column relates to latitude and longitude.\n'

In [74]:
# When mapping crime locations on map, what two fileds to pull out of the table?
fields_for_plotting = ['latitude', 'longitude']


## 2. Select, From & Where

### Example: What are all the U.S. cities in the OpenAQ dataset?   
dataset: 'openAQ', Fighting air inequality through
open data. https://openaq.org/

In [83]:
# setup

# 1. module for bigquery
from google.cloud import bigquery

# 'Client' obj
client = bigquery.Client()

# Ref. to dataset - 'openaq'
dataset_ref = client.dataset("openaq", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# List of all tables in the dataset
tables = list(client.list_tables(dataset))

# Print all table names in dataset
for table in tables:
    print(table.table_id)  # The dataset contains only one table, called global_air_quality

Using Kaggle's public dataset BigQuery integration.
global_air_quality


In [86]:
# Ref. to the table - 'global_air_quality'
table_ref = dataset_ref.table("global_air_quality")

# API req. - fetch the table
table = client.get_table(table_ref)

# Preview the first 12 rows of the table in pandas df.
client.list_rows(table, max_results=12).to_dataframe()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Borówiec, ul. Drapałka",Borówiec,PL,bc,0.85217,2022-04-28 07:00:00+00:00,µg/m³,GIOS,1.0,52.276794,17.074114,POINT(52.276794 1)
1,"Kraków, ul. Bulwarowa",Kraków,PL,bc,0.91284,2022-04-27 23:00:00+00:00,µg/m³,GIOS,1.0,50.069308,20.053492,POINT(50.069308 1)
2,"Płock, ul. Reja",Płock,PL,bc,1.41,2022-03-30 04:00:00+00:00,µg/m³,GIOS,1.0,52.550938,19.709791,POINT(52.550938 1)
3,"Elbląg, ul. Bażyńskiego",Elbląg,PL,bc,0.33607,2022-05-03 13:00:00+00:00,µg/m³,GIOS,1.0,54.167847,19.410942,POINT(54.167847 1)
4,"Piastów, ul. Pułaskiego",Piastów,PL,bc,0.51,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,52.191728,20.837489,POINT(52.191728 1)
5,"Biała, ul. Kmicica",Biała,PL,bc,5.64,2022-05-10 06:00:00+00:00,µg/m³,GIOS,1.0,52.602534,19.6451,POINT(52.602534 1)
6,"Białystok, ul. Waszyngtona",Białystok,PL,bc,0.28,2022-05-09 14:00:00+00:00,µg/m³,GIOS,1.0,53.126689,23.155869,POINT(53.126689 1)
7,"Gdańsk, ul. Leczkowa",Gdańsk,PL,bc,0.3726,2022-05-08 17:00:00+00:00,µg/m³,GIOS,1.0,54.380279,18.620274,POINT(54.380279 1)
8,"Zdzieszowice, ul. Piastów",Zdzieszowice,PL,bc,0.08659,2022-05-15 19:00:00+00:00,µg/m³,GIOS,1.0,50.423533,18.120739,POINT(50.423533 1)
9,"Mielec, ul. Biernackiego",Mielec,PL,bc,0.49923,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,50.299128,21.440942,POINT(50.299128 1)


In [87]:
# Prepare query to select all the items from the "city" column where the "country" column is 'US'
query = """
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
"""

In [91]:
# Submit the query to the dataset

# Create a "Client" object
client = bigquery.Client()

# Set up the query
query_job = client.query(query)

# Run the query, results to pandas df
us_cities = query_job.to_dataframe()

Using Kaggle's public dataset BigQuery integration.


In [92]:
# check the results of the query in df
us_cities

Unnamed: 0,city
0,HOWARD
1,HOWARD
2,HOWARD
3,HOWARD
4,HOWARD
...,...
1421346,New York-Northern New Jersey-Long Island
1421347,New York-Northern New Jersey-Long Island
1421348,New York-Northern New Jersey-Long Island
1421349,New York-Northern New Jersey-Long Island


In [93]:
# What five cities have the most measurements?
us_cities.city.value_counts().head()

Phoenix-Mesa-Scottsdale                     39414
Los Angeles-Long Beach-Santa Ana            27479
Riverside-San Bernardino-Ontario            26887
New York-Northern New Jersey-Long Island    25417
San Francisco-Oakland-Fremont               22710
Name: city, dtype: int64

In [95]:
# Some more queries
query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """
# Set up the query
query_job = client.query(query)

# Run the query, results to pandas df
entire_table = query_job.to_dataframe()

# check the results of the query in df
entire_table

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,Laney College,San Francisco-Oakland-Fremont,US,bc,0.48,2022-05-16 13:00:00+00:00,µg/m³,AirNow,1.0,37.793624,-122.263376,POINT(37.793624 1)
1,Portland Near Road,Portland-Vancouver-Beaverton,US,bc,0.38,2022-05-14 07:00:00+00:00,µg/m³,AirNow,1.0,45.399160,-122.745500,POINT(45.39916 1)
2,San Jose - Knox Ave,San Jose-Sunnyvale-Santa Clara,US,bc,0.28,2022-05-17 19:00:00+00:00,µg/m³,AirNow,1.0,37.338202,-121.849892,POINT(37.338202 1)
3,McMillan Reservoir,Washington-Arlington-Alexandria,US,bc,0.23,2022-05-23 02:00:00+00:00,µg/m³,AirNow,1.0,38.921848,-77.013176,POINT(38.921848 1)
4,Howard County Near R,HOWARD,US,bc,0.80,2022-05-14 20:00:00+00:00,µg/m³,AirNow,1.0,39.143197,-76.846192,POINT(39.143197 1)
...,...,...,...,...,...,...,...,...,...,...,...,...
1421346,St. Paul-Harding H.S,Minneapolis-St. Paul-Bloomington,US,pm25,13.00,2022-05-11 16:00:00+00:00,µg/m³,AirNow,1.0,44.959400,-93.035600,POINT(44.9594 1)
1421347,Gresham Centennial H,MULTNOMAH,US,pm25,1.70,2022-05-08 20:00:00+00:00,µg/m³,AirNow,1.0,45.496200,-122.483400,POINT(45.4962 1)
1421348,HICKORY,Hickory-Lenoir-Morganton,US,pm25,12.00,2022-04-28 05:00:00+00:00,µg/m³,AirNow,1.0,35.728889,-81.365556,POINT(35.728889 1)
1421349,STILWELL CASTNET & N,ADAIR,US,pm25,16.00,2022-05-09 04:00:00+00:00,µg/m³,AirNow,1.0,35.750599,-94.669701,POINT(35.750599 1)


In [96]:
# Some more queries
query = """
        SELECT city, country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """
# Set up the query
query_job = client.query(query)

# Run the query, results to pandas df
city_country = query_job.to_dataframe()

# check the results of the query in df
city_country

Unnamed: 0,city,country
0,HOWARD,US
1,HOWARD,US
2,HOWARD,US
3,HOWARD,US
4,HOWARD,US
...,...,...
1421346,New York-Northern New Jersey-Long Island,US
1421347,New York-Northern New Jersey-Long Island,US
1421348,New York-Northern New Jersey-Long Island,US
1421349,New York-Northern New Jersey-Long Island,US


### Working witht big datasets    
Each Kaggle user can scan 5TB every 30 days for free. Once you hit that limit, you'll have to wait for it to reset.

* **Estimate query cost beforehand - query dry run**
* **Limit the scan size - query safe run**

In [97]:
# Estimate query cost beforehand - query dry run

# Query to get the score column from every row where the type column has value "job"
query = """
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job" 
        """

# Create a QueryJobConfig object to estimate size of query without running it
dry_run_config = bigquery.QueryJobConfig(dry_run=True)

# API request - dry run query to estimate costs
dry_run_query_job = client.query(query, job_config=dry_run_config)

print("This query will process {} bytes.".format(dry_run_query_job.total_bytes_processed))

This query will process 553320240 bytes.


In [98]:
# Limit how much data you are willing to scan.

# Only run the query if it's less than 1 GB
ONE_GB = 1000*1000*1000
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=ONE_GB)

# Set up the query (will only run if it's less than 1 GB)
safe_query_job = client.query(query, job_config=safe_config)

# API request - try to run the query, and return a pandas DataFrame
job_post_scores = safe_query_job.to_dataframe()

# Print average score for job posts
job_post_scores.score.mean()

1.7267060367454068

In [99]:
print(job_post_scores)

       score                                              title
0        1.0  Full-stack Ruby on Rails engineer not afraid t...
1        1.0  Come help us hack the Auto Industry - Dealer S...
2        1.0  First full-stack web developer role @ Airware ...
3        1.0  Generally Intelligent (YC S17) Is Hiring Syste...
4        1.0  Factored Quality (YC W20) Is Hiring a Front En...
...      ...                                                ...
15843    8.0           Mixpanel (S09) hiring Software Engineers
15844   15.0  Missed Work at a Startup but still want to wor...
15845   19.0  Justin.tv is still hiring (join us in our beau...
15846   29.0  Adioso (YC W09) needs FE developers to help ma...
15847   32.0  Work at Socialcam and help the world share mob...

[15848 rows x 2 columns]


### Exercise: Select, From & Where

In [103]:
# Fetches the global_air_quality table from the openaq dataset. 
# Preview the first twelve rows of the table.

# Create a 'Clent' obj
client = bigquery.Client()

# Ref. to 'openaq' dataset, project = 'bigquery-public-data'
dataset_ref = client.dataset("openaq", project="bigquery-public-data")

# API req. - fetch the datset
dataset = client.get_dataset(dataset_ref)

# Ref. to the table 'global_air_quality'
table_ref = dataset_ref.table("global_air_quality")

# API req. - fetch the table
table = client.get_table(table_ref)

# Preview in df
client.list_rows(table, max_results=12).to_dataframe()

Using Kaggle's public dataset BigQuery integration.


Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Borówiec, ul. Drapałka",Borówiec,PL,bc,0.85217,2022-04-28 07:00:00+00:00,µg/m³,GIOS,1.0,52.276794,17.074114,POINT(52.276794 1)
1,"Kraków, ul. Bulwarowa",Kraków,PL,bc,0.91284,2022-04-27 23:00:00+00:00,µg/m³,GIOS,1.0,50.069308,20.053492,POINT(50.069308 1)
2,"Płock, ul. Reja",Płock,PL,bc,1.41,2022-03-30 04:00:00+00:00,µg/m³,GIOS,1.0,52.550938,19.709791,POINT(52.550938 1)
3,"Elbląg, ul. Bażyńskiego",Elbląg,PL,bc,0.33607,2022-05-03 13:00:00+00:00,µg/m³,GIOS,1.0,54.167847,19.410942,POINT(54.167847 1)
4,"Piastów, ul. Pułaskiego",Piastów,PL,bc,0.51,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,52.191728,20.837489,POINT(52.191728 1)
5,"Biała, ul. Kmicica",Biała,PL,bc,5.64,2022-05-10 06:00:00+00:00,µg/m³,GIOS,1.0,52.602534,19.6451,POINT(52.602534 1)
6,"Białystok, ul. Waszyngtona",Białystok,PL,bc,0.28,2022-05-09 14:00:00+00:00,µg/m³,GIOS,1.0,53.126689,23.155869,POINT(53.126689 1)
7,"Gdańsk, ul. Leczkowa",Gdańsk,PL,bc,0.3726,2022-05-08 17:00:00+00:00,µg/m³,GIOS,1.0,54.380279,18.620274,POINT(54.380279 1)
8,"Zdzieszowice, ul. Piastów",Zdzieszowice,PL,bc,0.08659,2022-05-15 19:00:00+00:00,µg/m³,GIOS,1.0,50.423533,18.120739,POINT(50.423533 1)
9,"Mielec, ul. Biernackiego",Mielec,PL,bc,0.49923,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,50.299128,21.440942,POINT(50.299128 1)


In [110]:
# 1) Units of measurement
# Which countries have reported pollution levels in units of "ppm"? 

# Set first_query to an SQL query that pulls the appropriate entries from the country column.
first_query = """
        SELECT DISTINCT country
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE unit = "ppm"
        """
# Set query scan limit to 10GB
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
first_query_job = client.query(first_query, job_config=safe_config)

# API req. - run the query, return to pands of
first_results = first_query_job.to_dataframe()

# Prievew the df, first 12 rows
print(first_results.head(12))

   country
0       AR
1       TW
2       IL
3       CO
4       EC
5       RW
6       AU
7       BR
8       CA
9       MX
10      TH
11      US


In [111]:
# 2) High air quality
# Which pollution levels were reported to be exactly 0?
# Set zero_pollution_query to select all columns of the rows where the value column is 0.
# Set zero_pollution_results to a pandas DataFrame containing the query results.

# Query to select all columns where pollution levels are exactly 0
zero_pollution_query = """
        SELECT *
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE value = 0
        """
# Set up the query with safe config
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(zero_pollution_query, job_config=safe_config)

# API req. - run the query and return a pandas DataFrame
zero_pollution_results = query_job.to_dataframe()

# Preview df
print(zero_pollution_results.head())

                       location      city country pollutant  value  \
0     Zielonka, Bory Tucholskie  Zielonka      PL        bc    0.0   
1    Toruń, ul. Przy Kaszowniku     Toruń      PL        bc    0.0   
2           Kielce, ul. Targowa    Kielce      PL        bc    0.0   
3     Zielonka, Bory Tucholskie  Zielonka      PL        bc    0.0   
4  Koszalin, ul. Armii Krajowej  Koszalin      PL        bc    0.0   

                  timestamp   unit source_name  latitude  longitude  \
0 2022-04-29 14:00:00+00:00  µg/m³        GIOS       1.0  53.662136   
1 2022-04-19 04:00:00+00:00  µg/m³        GIOS       1.0  53.017628   
2 2022-05-07 17:00:00+00:00  µg/m³        GIOS       1.0  50.878998   
3 2022-05-19 14:00:00+00:00  µg/m³        GIOS       1.0  53.662136   
4 2022-05-12 20:00:00+00:00  µg/m³        GIOS       1.0  54.193986   

   averaged_over_in_hours       location_geom  
0               17.933986  POINT(53.662136 1)  
1               18.612808  POINT(53.017628 1)  
2       

## 3. Group By, Having & Count    
Interesting insights directly from queries like:    
* How many of each kind of fruit has our store sold?
* How many species of animal has the vet office treated?
* **COUNT(), SUM(), AVG(), MIN(), MAX(), GROUP BY, GROUP BY HAVING**

### Example: Which Hacker News comments generated the most discussion?   
* The Hacker News dataset contains information on stories and comments from the Hacker News social networking site.   
* Will work with `full` table

In [117]:
# setup, connect , fetch

from google.cloud import bigquery

# Create a "Client" object
client = bigquery.Client()

# Construct a reference to the "hacker_news" dataset
dataset_ref = client.dataset("hacker_news", project="bigquery-public-data")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Print the list of tables in dataset
tables = list(client.list_tables(dataset))
for table in tables:
    print(table.table_id)

# Construct a reference to the "full" table
table_ref = dataset_ref.table("full")

# API request - fetch the table
table = client.get_table(table_ref)

# Preview the first five lines of the "comments" table
client.list_rows(table, max_results=5).to_dataframe()

Using Kaggle's public dataset BigQuery integration.
full


Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,,,"I would rather just have wired earbuds, period...",,zeveb,,1591717736,2020-06-09 15:48:56+00:00,comment,23467666,23456782,,,
1,,,DNS?,,nly,,1572810465,2019-11-03 19:47:45+00:00,comment,21436112,21435130,,,
2,,,These benchmarks seem pretty good. Filterable...,,mrkeen,,1591717727,2020-06-09 15:48:47+00:00,comment,23467665,23467426,,,
3,,,Oh really?<p>* Excel alone uses 86.1MB of priv...,,oceanswave,,1462987532,2016-05-11 17:25:32+00:00,comment,11677248,11676886,,,
4,,,These systems are useless. Of the many flaws:...,,nyxxie,,1572810473,2019-11-03 19:47:53+00:00,comment,21436113,21435025,,,


Let's use the table to see which comments generated the most replies. Since:

the parent column indicates the comment that was replied to, and
the id column has the unique ID used to identify each comment,
we can GROUP BY the parent column and COUNT() the id column in order to figure out the number of comments that were made as responses to a specific comment. (This might not make sense immediately -- take your time here to ensure that everything is clear!)

Furthermore, since we're only interested in popular comments, we'll look at comments with more than ten replies. So, we'll only return groups HAVING more than ten ID's.

In [122]:
# Query to select comments that received more than 10 replies
query_popular = """
        SELECT parent, COUNT(id)
        FROM `bigquery-public-data.hacker_news.full`
        GROUP BY parent
        HAVING COUNT(id) > 10
        """

In [123]:
# Set up the query with quata limited to 10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(query_popular, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
popular_comments = query_job.to_dataframe()

# Print the first twelve rows of the DataFrame
popular_comments.head(12)

Unnamed: 0,parent,f0_
0,11266796.0,56
1,9118977.0,48
2,20584311.0,755
3,28474997.0,52
4,23734093.0,63
5,7239333.0,43
6,27335574.0,135
7,17489934.0,77
8,22577132.0,127
9,24789379.0,89


A couple hints to make your queries even better:

The column resulting from COUNT(id) was called f0__. That's not a very descriptive name. You can change the name by adding AS NumPosts after you specify the aggregation. This is called aliasing, and it will be covered in more detail in an upcoming lesson.
If you are ever unsure what to put inside the COUNT() function, you can do **COUNT(1)** to count the rows in each group. Most people find it especially readable, because we know it's not focusing on other columns. It also scans less data than if supplied column names (making it faster and using less of your data access quota).

In [124]:
# Improved version of earlier query, now with aliasing & improved readability
query_improved = """
                 SELECT parent, COUNT(1) AS NumPosts
                 FROM `bigquery-public-data.hacker_news.full`
                 GROUP BY parent
                 HAVING COUNT(1) > 10
                 """

safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(query_improved, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
improved_df = query_job.to_dataframe()

# Print the first five rows of the DataFrame
improved_df.head(12)

Unnamed: 0,parent,NumPosts
0,33468611.0,49
1,6761297.0,108
2,9185356.0,83
3,13682949.0,298
4,31099186.0,81
5,11340510.0,121
6,1869046.0,55
7,20990583.0,164
8,5663157.0,45
9,11456907.0,46


**Note on using GROUP BY**    
Note that because it tells SQL how to apply aggregate functions (like COUNT()), it doesn't make sense to use GROUP BY without an aggregate function. Similarly, if you have any GROUP BY clause, then all variables must be passed to either a

1. GROUP BY command, or
1. an aggregation function.

### Exercise: Group By, Having & Count