In [1]:
# Google Cloud project that the BigQuery client below is created under
PROJECT_ID = "hidden-cosmos-240318"
from google.cloud import bigquery

In [2]:
# Create the BigQuery client for our project, with "US" as its location
client = bigquery.Client(
    project=PROJECT_ID,
    location="US",
)

In [3]:
# Build a reference to the public "hacker_news" dataset.
# DatasetReference is constructed directly because Client.dataset() is
# deprecated in google-cloud-bigquery; the resulting reference is the same
# (project "bigquery-public-data", dataset "hacker_news").
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "hacker_news")

In [4]:
# API request - fetch the dataset that dataset_ref points to
dataset = client.get_dataset(dataset_ref)

In [5]:
# Build a reference to the "comments" table inside the dataset
table_ref = dataset_ref.table("comments")

In [6]:
# API request - fetch the table that table_ref points to
table = client.get_table(table_ref)

In [7]:
# Display the first 50 rows of the table as a pandas DataFrame
(
    client
    .list_rows(table, max_results=50)
    .to_dataframe()
)

Unnamed: 0,id,by,author,time,time_ts,text,parent,deleted,dead,ranking
0,2701393,5l,5l,1309184881,2011-06-27 14:28:01+00:00,And the glazier who fixed all the broken windo...,2701243,,,0
1,5811403,99,99,1370234048,2013-06-03 04:34:08+00:00,Does canada have the equivalent of H1B/Green c...,5804452,,,0
2,21623,AF,AF,1178992400,2007-05-12 17:53:20+00:00,"Speaking of Rails, there are other options in ...",21611,,,0
3,10159727,EA,EA,1441206574,2015-09-02 15:09:34+00:00,Humans and large livestock (and maybe even pet...,10159396,,,0
4,2988424,Iv,Iv,1315853580,2011-09-12 18:53:00+00:00,I must say I reacted in the same way when I re...,2988179,,,0
5,3867418,Iv,Iv,1334921984,2012-04-20 11:39:44+00:00,&#62; There's a whole class of 'mom &#38; pop'...,3867404,,,0
6,3925617,Iv,Iv,1336076765,2012-05-03 20:26:05+00:00,I'm also in this ballpark (300-600€ / day) whi...,3924840,,,0
7,3107534,Iv,Iv,1318520044,2011-10-13 15:34:04+00:00,how do you run unity in non-accelerated mode ?...,3107241,,,0
8,8409259,Iv,Iv,1412421647,2014-10-04 11:20:47+00:00,Polio is not exterminated even if it is absent...,8409226,,,0
9,2855741,Jd,Jd,1312690646,2011-08-07 04:17:26+00:00,"Yep, I didn't find Rescuetime very helpful. I ...",2855343,,,0


The parent column indicates the comment that was replied to, and the id column holds the unique ID used to identify each comment.


In [8]:
# Query to select comments that received more than 10 replies.
# Rows are grouped by `parent` (the comment being replied to) and the ids in
# each group are counted; HAVING keeps only groups with a count above 10.
# The count column has no alias here, so it shows up as `f0_` in the result.
query_popular = """
                SELECT parent, COUNT(id)
                FROM `bigquery-public-data.hacker_news.comments`
                GROUP BY parent
                HAVING COUNT(id) > 10
                """

In [9]:
# Set up the query with a safety cap: the query is cancelled if it would use
# too much of your quota (limit set to 10**10 bytes, i.e. 10 GB).
safe_config = bigquery.QueryJobConfig()
safe_config.maximum_bytes_billed = 10**10

# Submit the query under that configuration
query_job = client.query(query_popular, job_config=safe_config)

In [10]:
# API request - run the query, and convert the results to a pandas DataFrame
# (query_job is the job created from query_popular in the previous cell)
popular_comments = query_job.to_dataframe()

In [11]:
# Display the first five rows of the result
popular_comments.head(n=5)

Unnamed: 0,parent,f0_
0,4799163,45
1,726969,51
2,480171,50
3,1326047,63
4,10373608,50


Each row in the popular_comments DataFrame corresponds to a comment that received more than ten replies. For instance, the comment with ID 4799163 received 45 replies.


In [12]:
# Improved version of the earlier query, now with aliasing (the count column
# is renamed to NumPosts using AS) and improved readability: COUNT(1) is used
# in place of COUNT(id).
query_improved = """
                 SELECT parent, COUNT(1) AS NumPosts
                 FROM `bigquery-public-data.hacker_news.comments`
                 GROUP BY parent
                 HAVING COUNT(1) > 10
                 """

In [13]:
# Query configuration that caps the bytes billed at 10**10
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)

In [14]:
# Submit the aliased query under the bytes-billed cap in safe_config
query_job = client.query(query_improved, job_config=safe_config)

In [15]:
# API request - run the query, and convert the results to a pandas DataFrame
# (query_job is the job created from query_improved in the previous cell)
improved_df = query_job.to_dataframe()

In [16]:
# Display the first five rows of the DataFrame
improved_df.head(n=5)

Unnamed: 0,parent,NumPosts
0,7204982,41
1,2559177,61
2,9070488,97
3,9459364,106
4,3402359,54
