In [12]:
# imports relevant packages. (Shift + Enter) to execute.

import ast
import os
import json
import re
from firecloud import api as fapi
import pandas as pd
from io import StringIO
import csv
import pprint
from collections import OrderedDict

import requests
from google.cloud import bigquery
from oauth2client.client import GoogleCredentials

# Sets up workspace environment variables. (Shift + Enter) to execute.

ws_project = os.environ['WORKSPACE_NAMESPACE']
ws_name = os.environ['WORKSPACE_NAME']
ws_bucket = os.environ['WORKSPACE_BUCKET']
google_project = os.environ['GOOGLE_PROJECT']

print(ws_project + "\n" + ws_name + "\n" + "bucket: " + ws_bucket + "\n" + "google project: " + google_project)


anvil-stage-demo
internal_demo_AnVIL_Data_Browser_completed
bucket: gs://fc-b9d72b57-5a36-46ff-967b-38098b2a1889
google project: terra-bf174a68


In [13]:
# gets list of single entity types in workspace. (Shift + Enter) to execute.
    
# API call to get all entity types in workspace
res_etypes = fapi.list_entity_types(ws_project, ws_name)
dict_all_etypes = json.loads(res_etypes.text)

# get non-set entities and add to list
# the unique ID of any single entity is not modified so sets should remain the same
single_etypes_list = []
single_etypes_list = [key for key in dict_all_etypes.keys() if not key.endswith("_set")]

print(f"List of entity types that will be updated, if applicable:")
print('\n'.join(['\t' * 7 + c for c in single_etypes_list]))

List of entity types that will be updated, if applicable:
							activities
							donors
							files
							datasets
							biosamples


In [14]:
# functions. (Shift + Enter) to execute.
def get_access_token():
    """Get access token."""

    scopes = ["https://www.googleapis.com/auth/userinfo.profile", "https://www.googleapis.com/auth/userinfo.email", "openid"]
    credentials = GoogleCredentials.get_application_default()
    credentials = credentials.create_scoped(scopes)

    return credentials.get_access_token().access_token


def get_query_results(query):
    """Performs a BQ query."""
    
    # create BQ connection
    bq = bigquery.Client(google_project)
    
    executed_query = bq.query(query)
    result = executed_query.result()
    
    df_result = result.to_dataframe()
    
    return df_result


def get_dataset_info(dataset_id):
    """"Get dataset details from retrieveDataset API given a datasetID."""
    
    uri = f"https://data.terra.bio/api/repository/v1/datasets/{dataset_id}?include=SCHEMA%2CPROFILE%2CDATA_PROJECT%2CSTORAGE"
    
    headers = {"Authorization": "Bearer " + get_access_token(), "accept": "application/json"}
    
    response = requests.get(uri, headers=headers)
    status_code = response.status_code
    
    if status_code != 200:
        return response.text
    
    print(f"Successfully retrieved details for dataset with datasetID {dataset_id}.")
    return json.loads(response.text)


def get_snapshot_info(snapshot_id):
    """"Get dataset details from retrieveDataset API given a datasetID."""
    
    uri = f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot_id}?include="

    headers = {"Authorization": "Bearer " + get_access_token(), "accept": "application/json"}
    
    response = requests.get(uri, headers=headers)
    status_code = response.status_code
    
    if status_code != 200:
        return response.text
    
    print(f"Successfully retrieved details for dataset with datasetID {snapshot_id}.")
    return json.loads(response.text)


def get_dataset_access_info(dataset_id):
    """"Get dataset access details from retrieveDataset API given a datasetID."""
    
    uri = f"https://data.terra.bio/api/repository/v1/datasets/{dataset_id}?include=ACCESS_INFORMATION"
    
    headers = {"Authorization": "Bearer " + get_access_token(),
               "accept": "application/json"}
    
    response = requests.get(uri, headers=headers)
    status_code = response.status_code
    
    if status_code != 200:
        return response.text
    
    print(f"Successfully retrieved access information for dataset with datasetID {dataset_id}.")
    return json.loads(response.text)


def get_snapshot_access_info(snapshot_id):
    """Get snapshot access information from retrieveSnapshot API given a snapshotID"""
    
    uri = f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot_id}?include=ACCESS_INFORMATION"
    
    headers = {"Authorization": "Bearer " + get_access_token(),
               "accept": "application/json"}
    
    response = requests.get(uri, headers=headers)
    status_code = response.status_code
    
    if status_code != 200:
        return response.text
    
    print(f"Successfully retrieved access information for snapshot with snapshotID {snapshot_id}.")
    return json.loads(response.text)


def get_fq_table(entity_id, table_name, entity_type='dataset'):
    """Given a datset or snapshot id, table name, and entity type {dataset,snapshot}, retrieve its fully qualified BQ table name"""
    if entity_type == 'dataset':
        access_info = get_dataset_access_info(entity_id)
    elif entity_type == 'snapshot':
        access_info = get_snapshot_access_info(entity_id)

    project_id = access_info['accessInformation']['bigQuery']['projectId']
    tables = access_info['accessInformation']['bigQuery']['tables']

    # pull out desired table
    table_fq = None  # fq = fully qualified name, i.e. project.dataset.table
    for table_info in tables:
        if table_info['name'] == table_name:
            table_fq = table_info['qualifiedName'] 
    
    return table_fq


def rename_etype(original_etype, new_etype):
    """Rename PFB handoff tables with `anvil_` prefix."""
    
    uri = f"https://rawls.dsde-prod.broadinstitute.org/api/workspaces/{ws_project}/{ws_name}/entityTypes/{original_etype}"
    
    headers = {"Authorization": "Bearer " + get_access_token(), "accept": "*/*", "Content-Type": "application/json"}
    
    # capture response from API and parse out status code
    d = {"newName": new_etype}
    data = json.dumps(d)
    response = requests.patch(uri, headers=headers, data=data)
    
    status_code = response.status_code
    
    if status_code != 204:
        print(f"Warning: Renaming table from {original_etype} to {new_etype} failed. See error for details:\n")
        print(response.text)
        return status_code, response.text
    
    print(f"Successfully renamed table {original_etype} to {new_etype}.")
    return status_code, response.text
    
    
def create_data_table(entities_tsv):
    """Create a Terra data table given a Terra load tsv."""
    response = fapi.upload_entities_tsv(ws_project, ws_name, entities_tsv, model='flexible')
    
    if response.status_code not in [200, 202]:
        print(f"Loading {entities_tsv} to Terra workspace failed.\n")
        print(response.text)

    print(f"Finished loading {entities_tsv} to Terra workspace.")

        
def group_list(input_list, group_column, entity_name):
    """Group data by their source data table names."""
    
    # create df grouping by input column value
    grouped_df = pd.DataFrame(input_list).groupby(group_column).agg(list)
    
    for index, row in grouped_df.iterrows():
        print(index)
        group_dict = row.to_dict()
        group_df = pd.DataFrame(group_dict)
        
        if index == "file_inventory":
            original_col = "file_id"
            etype_name = "entity:file_inventory_id"
        elif index == "workspace_attributes":
            original_col = "datarepo_row_id"
            etype_name = "entity:workspace_attributes_id"
        else:
            original_col = f"{index}_id"
            etype_name = f"entity:{index}_id"
            
        # rename the id column with entity:[]_id and move to first column
        group_df = group_df.rename(columns={original_col: etype_name})
        
        first_column = group_df.pop(etype_name)
        group_df.insert(0, etype_name, first_column)
        
        filename = write_df_to_tsv(group_df, index)
        create_data_table(filename)
    
    
def write_df_to_tsv(input_df, entity_name):
    """Create a Terra load tsv given a list of dictionaries."""
    
    outfile_name = f"{entity_name}.tsv"

    # rename the first column to have a new entity_name (entity:entity_name_combined:id)
    input_df.to_csv(outfile_name, sep='\t', index=False)
    
    print(f"Finished writing {entity_name} to Terra load file: {outfile_name} \n")
    return outfile_name

    
def get_nfs_data(entity_data):
    """Return non findability subset data mapping to findability subset inputs."""

    nfs = [] # holds list of dictionaries - each dict = single row of NON-findability data
    
    for index, row in entity_data.iterrows():
        # dictionary of findability subset values for a single row
        fs_dict = row.to_dict()
        src_snapshot_id = row["pfb:source_datarepo_snapshot_id"]
        # TODO: handle what to do if the source datarepo row ids have more than 1 source repo id
        src_drr_ids = row["pfb:source_datarepo_row_ids"][0].split(":")[1]
        src_table_name = row["pfb:source_datarepo_row_ids"][0].split(":")[0]
        fq_src_table_name = get_fq_table(src_snapshot_id, src_table_name, entity_type='snapshot')

        print(f"Extracting non-findability subset data for source datarepo_row_id: {src_drr_ids}")
        
        query = f"""SELECT * FROM `{fq_src_table_name}` WHERE datarepo_row_id = '{src_drr_ids}'"""
        results = get_query_results(query)

        # dictionary of NON-findability subset values for a single row
        nfs_dict = results.to_dict("records")[0] # convert results df to a dictionary without index value

        # if any of the dictionary keys (table column names) have Terra reserved names, rename
        # file_inventory table in snapshot has `name` column and fails to load to Terra
        # renaming it to make terra load table work without failure
        if nfs_dict.get('name'):
            nfs_dict["file_name"] = nfs_dict.pop("name")
        
        # add the source table name into dictionary to organize into original workspace table structure
        nfs_dict["source_table_name"] = src_table_name
        
        # TODO: don't combine nfs with fs data, keep in separate tables
        # updated the findability subset dictionary with its non-findability subset data
#         fs_dict.update(nfs_dict)
        # append the combined fs and nfs dictionary to a list
        nfs.append(nfs_dict)
        
        print(f"Finished extracting non-findability subset data: {src_drr_ids} \n\n")

    return nfs

    
def get_ws_nfs_data(ws_project, ws_name):
    """For each entity in the workspace, create combined findability and non-findability subset tables."""
    
    # gets list of single entity types in workspace. (Shift + Enter) to execute.
    
    # API call to get all entity types in workspace
    res_etypes = fapi.list_entity_types(ws_project, ws_name)
    dict_all_etypes = json.loads(res_etypes.text)

    # get non-set entities and add to list
    # the unique ID of any single entity is not modified so sets should remain the same
    single_etypes_list = []
    single_etypes_list = [key for key in dict_all_etypes.keys() if not key.endswith("_set")]

    print(f"List of entity types that will be updated, if applicable:")
    print('\n'.join(['\t' * 7 + c for c in single_etypes_list]))
    
    for etype in single_etypes_list:
        print(f"Starting: {etype}")
        # rename original etype with `anvil_` prefix to match TDR
        print(f"Renaming {etype} to anvil_{etype}.")
        renamed_etype = f"anvil_{etype}"
        status_code, response = rename_etype(etype, renamed_etype)
              
        response = pd.read_csv(StringIO(fapi.get_entities_tsv(ws_project, ws_name, renamed_etype, model='flexible').text), sep='\t')
        # set column of source datarepo_row_ids to type array (come in as string originally)
        response['pfb:source_datarepo_row_ids'] = response['pfb:source_datarepo_row_ids'].map(ast.literal_eval)

        nfs_data_list = get_nfs_data(response)
#         nfs_data_list = get_nfs_data(response.head(5))
        # combined_data_list contains dictionaries - each dict contains source_table_name
        # organize the list items by source_table_name values and then create tsv files and ingest into Terra
        df = group_list(nfs_data_list, "source_table_name", etype)



In [15]:
# RUN FUNCTION
get_ws_nfs_data(ws_project, ws_name)

List of entity types that will be updated, if applicable:
							activities
							donors
							files
							datasets
							biosamples
Starting: activities
Renaming activities to anvil_activities.
Successfully renamed table activities to anvil_activities.
Successfully retrieved access information for snapshot with snapshotID e6412f80-f150-4212-a4a7-bac6f1a1fb33.
Extracting non-findability subset data for source datarepo_row_id: 96c4b433-2c60-433f-9596-2169a342d111
Finished extracting non-findability subset data: 96c4b433-2c60-433f-9596-2169a342d111 


Successfully retrieved access information for snapshot with snapshotID e6412f80-f150-4212-a4a7-bac6f1a1fb33.
Extracting non-findability subset data for source datarepo_row_id: 812ac0b4-7ed4-49a1-bd8d-9d2e79e520b1
Finished extracting non-findability subset data: 812ac0b4-7ed4-49a1-bd8d-9d2e79e520b1 


Successfully retrieved access information for snapshot with snapshotID a6c7c091-834f-498d-bce2-211b7557f8f9.
Extracting non-findability su

In [None]:
!gsutil cp *.tsv $ws_bucket


In [32]:
# TODO: handle what happens if src data repo row ids list has more than one source data repo row id
# TODO: what is user flow? do they pass in a snapshots worth of data or do we run everything in the data tables each time


