In [1]:
from google.cloud import bigquery

In [5]:
# Create a "Client" object
client = bigquery.Client()

# Construct a reference to the "github_repos" dataset.
# The DatasetReference constructor is used because Client.dataset() is
# deprecated; both produce the same (project, dataset_id) reference.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "github_repos")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

# Construct a reference to the "licenses" table inside that dataset
table_ref = dataset_ref.table("licenses")

# API request - fetch the table
license_table = client.get_table(table_ref)

In [6]:
# Display the column definitions (name, type, mode) of the licenses table
license_table.schema

[SchemaField('repo_name', 'STRING', 'NULLABLE', None, ()),
 SchemaField('license', 'STRING', 'NULLABLE', None, ())]

In [7]:
# Preview the first five rows of the licenses table as a DataFrame
client.list_rows(license_table, max_results=5).to_dataframe()

Unnamed: 0,repo_name,license
0,Manwar/WWW-Google-APIDiscovery,artistic-2.0
1,FindAllTogether/LifeIDE,artistic-2.0
2,skaji/perl6-HTTP-Tinyish,artistic-2.0
3,jonathanstowe/Oyatul,artistic-2.0
4,gitpan/App-FastishCGI,artistic-2.0


In [8]:
# Construct a reference to the "sample_files" table in the dataset that
# dataset_ref currently points at
files_table_ref = dataset_ref.table("sample_files")

# API request - fetch the table
file_table = client.get_table(files_table_ref)

In [9]:
# Preview the first ten rows of the sample_files table as a DataFrame
client.list_rows(file_table, max_results=10).to_dataframe()

Unnamed: 0,repo_name,ref,path,mode,id,symlink_target
0,git/git,refs/heads/master,RelNotes,40960,62615ffa4e97803da96aefbc798ab50f949a8db7,Documentation/RelNotes/2.10.0.txt
1,np/ling,refs/heads/master,tests/success/plug_compose.t/plug_compose.ll,40960,0c1605e4b447158085656487dc477f7670c4bac1,../../../fixtures/all/plug_compose.ll
2,np/ling,refs/heads/master,fixtures/strict-par-success/parallel_assoc_lef...,40960,b59bff84ec03d12fabd3b51a27ed7e39a180097e,../all/parallel_assoc_left.ll
3,np/ling,refs/heads/master,fixtures/sequence/parallel_assoc_2tensor2_left.ll,40960,f29523e3fb65702d99478e429eac6f801f32152b,../all/parallel_assoc_2tensor2_left.ll
4,np/ling,refs/heads/master,fixtures/success/my_dual.ll,40960,38a3af095088f90dfc956cb990e893909c3ab286,../all/my_dual.ll
5,np/ling,refs/heads/master,tests/success/literals.t/literals.ll,40960,e933ac4bb885bf2ce6dddf01853a61310a684dd3,../../../fixtures/all/literals.ll
6,np/ling,refs/heads/master,fixtures/sequence/feed_recv.ll,40960,3f0bd39ae146acade476d76a3f0ebd391974c570,../all/feed_recv.ll
7,np/ling,refs/heads/master,tests/success/lettype.t/lettype.ll,40960,065788982da09b08b8defc3e3b0f9764fae2a7b9,../../../fixtures/all/lettype.ll
8,np/ling,refs/heads/master,fixtures/strict-par-success/literals.ll,40960,3cf9f78c9276d38d6192149d2d30d4b4b0dba119,../all/literals.ll
9,np/ling,refs/heads/master,fixtures/success/parallel_assoc_tensor3_left.ll,40960,0c491fa21a5fdabfd08392e7f45e70fae33cc9f9,../all/parallel_assoc_tensor3_left.ll


In [10]:
# Count the rows of sample_files per license by joining sample_files to
# licenses on repo_name; results are grouped and ordered by license.
# NOTE(review): the files table is aliased as `SF` but referenced as `sf` in
# the ON clause. The recorded output suggests BigQuery resolves aliases
# case-insensitively - confirm before reusing this SQL elsewhere.
query = """
        SELECT L.license AS license, COUNT(1) as count_files
        FROM `bigquery-public-data.github_repos.sample_files` as SF 
        INNER JOIN `bigquery-public-data.github_repos.licenses` as L
            ON sf.repo_name = L.repo_name
        GROUP BY L.license
        ORDER BY L.license
        
        """

# Cap the bytes this query may bill at 10**10 (10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(query, job_config=safe_config)

# API request - run the query, and convert the results to a pandas DataFrame
file_count_by_license = query_job.to_dataframe()

In [12]:
# Show the first ten rows: licenses in ORDER BY (alphabetical) order with
# their file counts
file_count_by_license.head(10)

Unnamed: 0,license,count_files
0,agpl-3.0,1294168
1,apache-2.0,7149872
2,artistic-2.0,148414
3,bsd-2-clause,697185
4,bsd-3-clause,2937749
5,cc0-1.0,406208
6,epl-1.0,320203
7,gpl-2.0,16859832
8,gpl-3.0,4943732
9,isc,117396


In [33]:
## Example with functions

from google.cloud import bigquery

# Create a "Client" object
client = bigquery.Client()

# Construct a reference to the "stackoverflow" dataset.
# The DatasetReference constructor is used because Client.dataset() is
# deprecated; both produce the same (project, dataset_id) reference.
dataset_ref = bigquery.DatasetReference("bigquery-public-data", "stackoverflow")

# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)

def expert_finder(topic, client):
    '''
    Find the users who answered Stack Overflow questions tagged with a topic.

    Joins posts_questions to posts_answers on the question id, keeps the
    questions whose tags contain `topic` as a substring, and counts the
    answers per answer owner.

    Inputs:
        topic: A string with the topic of interest (matched as a substring of the tags)
        client: The BigQuery client object used to run the query

    Outputs:
        A DataFrame with columns for user_id and number_of_answers,
        one row per answer owner.
    '''

    sql = """
               SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
               FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
               INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
                   ON q.id = a.parent_Id
               WHERE q.tags like @topic
               GROUP BY a.owner_user_id
               """

    # Wrap the topic in % wildcards so LIKE matches it anywhere in the tags.
    # It is bound as the @topic query parameter, not spliced into the SQL text.
    like_pattern = "%" + topic + "%"
    topic_param = bigquery.ScalarQueryParameter("topic", "STRING", like_pattern)

    # Cap the bytes billed at 10**10 (a real service would have good error
    # handling for queries that scan too much data)
    job_config = bigquery.QueryJobConfig(
        query_parameters=[topic_param],
        maximum_bytes_billed=10**10,
    )

    # API request - run the query, and return a pandas DataFrame
    return client.query(sql, job_config=job_config).to_dataframe()

# Example call: users who answered questions whose tags contain "nosql"
expert_finder("nosql", client)

Unnamed: 0,user_id,number_of_answers
0,7463546.0,1
1,3778976.0,7
2,7715250.0,1
3,3477070.0,1
4,7676577.0,1
...,...,...
8420,781707.0,2
8421,285462.0,1
8422,283851.0,1
8423,736524.0,1
