In [1]:
from google.cloud import bigquery

In [2]:
# Create the BigQuery client (no arguments are passed, so library defaults apply).
client = bigquery.Client()

# Construct a reference to the "hacker_news" dataset in the
# "bigquery-public-data" project. DatasetReference is built directly because
# Client.dataset() is deprecated in google-cloud-bigquery; the resulting
# object still offers .table(...) for building table references.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

In [3]:
# A dataset contains multiple tables: list them and print each table's id.
tables = list(client.list_tables(dataset))

# The loop variable is deliberately not called `table`: notebook loop
# variables stay in the global namespace, and `table` is the name a later
# cell assigns to the fetched "full" table object.
for table_item in tables:
    print(table_item.table_id)

comments
full
full_201510
stories


In [4]:
# Point at the "full" table through the dataset reference.
table_ref = dataset_ref.table(table_id="full")

# API request: retrieve the table object that the reference identifies.
table = client.get_table(table_ref)

In [5]:
# Display the schema of the fetched "full" table (a list of SchemaField entries).
table.schema

[SchemaField('title', 'STRING', 'NULLABLE', 'Story title', ()),
 SchemaField('url', 'STRING', 'NULLABLE', 'Story url', ()),
 SchemaField('text', 'STRING', 'NULLABLE', 'Story or comment text', ()),
 SchemaField('dead', 'BOOLEAN', 'NULLABLE', 'Is dead?', ()),
 SchemaField('by', 'STRING', 'NULLABLE', "The username of the item's author.", ()),
 SchemaField('score', 'INTEGER', 'NULLABLE', 'Story score', ()),
 SchemaField('time', 'INTEGER', 'NULLABLE', 'Unix time', ()),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', 'Timestamp for the unix time', ()),
 SchemaField('type', 'STRING', 'NULLABLE', 'Type of details (comment, comment_ranking, poll, story, job, pollopt)', ()),
 SchemaField('id', 'INTEGER', 'NULLABLE', "The item's unique id.", ()),
 SchemaField('parent', 'INTEGER', 'NULLABLE', 'Parent comment ID', ()),
 SchemaField('descendants', 'INTEGER', 'NULLABLE', 'Number of story or poll descendants', ()),
 SchemaField('ranking', 'INTEGER', 'NULLABLE', 'Comment ranking', ()),
 SchemaField(

In [7]:
# Preview the first five rows of the "full" table.
# list_rows() returns a row iterator; its built-in to_dataframe() method
# turns those rows into a pandas DataFrame.
full_df = (
    client
    .list_rows(table, max_results=5)
    .to_dataframe()
)
full_df.head()

Unnamed: 0,title,url,text,dead,by,score,time,timestamp,type,id,parent,descendants,ranking,deleted
0,Monsanto accepts sweeter bid from Bayer,http://m.dw.com/en/monsanto-accepts-sweeter-bi...,,True,sjreese,1.0,1473854357,2016-09-14 11:59:17+00:00,story,12495969,,,,
1,,,"Actually, from a scientific standpoint, the mo...",,lutusp,,1379352749,2013-09-16 17:32:29+00:00,comment,6394468,6394291.0,,,
2,,,"Smart phones are locked down, harder to hack, ...",,ikeboy,,1467392428,2016-07-01 17:00:28+00:00,comment,12017811,12017577.0,,,
3,,,Down-voted because people find it romantic liv...,,jonesb6,,1458877643,2016-03-25 03:47:23+00:00,comment,11358396,11358234.0,,,
4,Air Jordan After Game,http://www.newsneakergreat.com/jordan-shoes-ai...,,True,wvey0669,1.0,1337663547,2012-05-22 05:12:27+00:00,story,4006348,,-1.0,,


In [14]:
# Select only the first three schema fields (title, url, text) and convert
# the first five rows to a DataFrame.
# Python slices include the start index and exclude the stop index, so
# schema[:3] picks the fields at positions 0, 1 and 2.
# NOTE(review): the name `by_df` suggests the `by` column, but the selected
# fields are title/url/text; the name is kept so other cells that may
# reference it keep working.
by_df = client.list_rows(table, selected_fields=table.schema[:3], max_results=5).to_dataframe()
by_df.head()

Unnamed: 0,title,url,text
0,Monsanto accepts sweeter bid from Bayer,http://m.dw.com/en/monsanto-accepts-sweeter-bi...,
1,,,"Actually, from a scientific standpoint, the mo..."
2,,,"Smart phones are locked down, harder to hack, ..."
3,,,Down-voted because people find it romantic liv...
4,Air Jordan After Game,http://www.newsneakergreat.com/jordan-shoes-ai...,


In [15]:


# SQL that selects the score and title of every row whose type column is "job"
query = """
        SELECT score, title
        FROM `bigquery-public-data.hacker_news.full`
        WHERE type = "job" 
        """

# Job configuration flagged as a dry run, so the query size can be
# estimated without actually running the query
dry_run_config = bigquery.QueryJobConfig()
dry_run_config.dry_run = True

# API request - submit the query as a dry run to estimate its cost
dry_run_query_job = client.query(query, job_config=dry_run_config)

print(
    "This query will process {} bytes.".format(
        dry_run_query_job.total_bytes_processed
    )
)

This query will process 393958103 bytes.


In [30]:
# For each parent id in the comments table, count its rows (NumPosts) and
# keep only parents with more than ten.
# The HAVING filter uses the same COUNT(1) expression as the NumPosts column,
# so the threshold applies to exactly the value that is displayed. COUNT(id)
# would ignore rows whose id is NULL and could disagree with NumPosts
# (NOTE(review): nullability of `id` in the comments table is not shown here).
query_popular = """
                SELECT parent, COUNT(1) AS NumPosts
                FROM `bigquery-public-data.hacker_news.comments`
                GROUP BY parent
                HAVING COUNT(1) > 10
                """

In [31]:
# Guard the quota: configure the job so the query is cancelled if it would
# bill more than the limit, set here to 10 GB (10**10 bytes)
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 10**10

# Submit the query through client.query using the capped configuration
query_job = client.query(
    query_popular,
    job_config=safe_config,
)

# API request - run the query and load its results into a pandas DataFrame
popular_comments = query_job.to_dataframe()

# Show the first five rows of the DataFrame
popular_comments.head()

Unnamed: 0,parent,NumPosts
0,801208,56
1,5463210,55
2,6455391,67
3,8336025,50
4,3785277,85
