### Background
Until recently, AnVIL data has been hosted and shared from Terra workspaces. Depending on the AnVIL dataset/study, the data tables in a workspace have varying schemas. To ingest all of the AnVIL datasets into the Terra Data Repository (TDR), we had to find a way to create a common schema across all AnVIL datasets so that the AnVIL browser could be developed successfully.

When a user creates a cohort from the AnVIL browser, the data that is handed off to the workspace is a subset of data called the Findability Subset - that is to say that there are additional fields and information that remain in the TDR snapshot that the user will not see in their destination workspaces.

### What does the notebook do?
At the point of hand-off, the user will see data tables that contain only the Findability Subset. To retrieve the additional information, the non-Findability Subset (NFS), they must run this notebook in the workspace that received the hand-off.

The result of the notebook should be data tables that are organized the same way as in the original Terra AnVIL workspace, alongside the tables that were created as views for the purposes of the AnVIL browser.

### How to run the notebook?
1. Start the notebook environment.
2. Once running, click Cell → Run All.
3. The user should see stdout that displays the status of the notebook’s actions as it performs its NFS data extraction.

In [1]:
# Notebook version banner (version - date - author initials).
print("version 3 - 2/02/2023 - SC")

version 3 - 2/02/2023 - SC


In [24]:
# imports relevant packages. (Shift + Enter) to execute.

import ast
import os
import json
import re
from firecloud import api as fapi
import pandas as pd
from io import StringIO
import csv
import pprint
from collections import OrderedDict

import requests
from google.cloud import bigquery
from oauth2client.client import GoogleCredentials

# Read the workspace settings from the notebook's environment variables. (Shift + Enter) to execute.
# A missing variable raises KeyError here rather than failing later inside an API call.
ws_project = os.environ['WORKSPACE_NAMESPACE']
ws_name = os.environ['WORKSPACE_NAME']
ws_bucket = os.environ['WORKSPACE_BUCKET']
google_project = os.environ['GOOGLE_PROJECT']

# Echo the settings, one per line, so the user can confirm which workspace is targeted.
summary_lines = [
    ws_project,
    ws_name,
    "bucket: " + ws_bucket,
    "google project: " + google_project,
]
print("\n".join(summary_lines))


In [22]:
# functions. (Shift + Enter) to execute.
def get_access_token():
    """Return an access token for the application-default Google credentials.

    The default credentials are scoped to the user's profile, email and
    OpenID before the token is requested.
    """
    oauth_scopes = [
        "https://www.googleapis.com/auth/userinfo.profile",
        "https://www.googleapis.com/auth/userinfo.email",
        "openid",
    ]

    default_credentials = GoogleCredentials.get_application_default()
    scoped_credentials = default_credentials.create_scoped(oauth_scopes)
    token_info = scoped_credentials.get_access_token()

    return token_info.access_token


def get_query_results(query):
    """Run a BigQuery query and return its rows as a pandas DataFrame.

    The query is executed with a client created for the notebook's
    `google_project`.
    """
    client = bigquery.Client(google_project)

    # submit the query, fetch its result rows, and convert them to a dataframe
    return client.query(query).result().to_dataframe()


def get_dataset_info(dataset_id):
    """Get dataset details from the retrieveDataset API given a datasetID.

    The request asks for the SCHEMA, PROFILE, DATA_PROJECT and STORAGE
    sections. Returns the parsed JSON body on HTTP 200; for any other status
    the raw response text is returned instead.
    """
    endpoint = f"https://data.terra.bio/api/repository/v1/datasets/{dataset_id}?include=SCHEMA%2CPROFILE%2CDATA_PROJECT%2CSTORAGE"
    request_headers = {
        "Authorization": "Bearer " + get_access_token(),
        "accept": "application/json",
    }

    api_response = requests.get(endpoint, headers=request_headers)

    if api_response.status_code == 200:
        print(f"Successfully retrieved details for dataset with datasetID {dataset_id}.")
        return json.loads(api_response.text)

    # non-200: hand the raw error body back to the caller
    return api_response.text


def get_snapshot_info(snapshot_id):
    """Get snapshot details from the retrieveSnapshot API given a snapshotID.

    Returns the parsed JSON body on HTTP 200; for any other status the raw
    response text is returned instead.
    """

    # the `include` parameter is sent with an empty value
    uri = f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot_id}?include="

    headers = {"Authorization": "Bearer " + get_access_token(), "accept": "application/json"}

    response = requests.get(uri, headers=headers)
    status_code = response.status_code

    if status_code != 200:
        return response.text

    # report the entity as a snapshot (the message used to say "dataset with datasetID")
    print(f"Successfully retrieved details for snapshot with snapshotID {snapshot_id}.")
    return json.loads(response.text)


def get_dataset_access_info(dataset_id):
    """Get dataset access information from the retrieveDataset API given a datasetID.

    Only the ACCESS_INFORMATION section is requested. Returns the parsed JSON
    body on HTTP 200; for any other status the raw response text is returned
    instead.
    """
    endpoint = f"https://data.terra.bio/api/repository/v1/datasets/{dataset_id}?include=ACCESS_INFORMATION"
    bearer_token = get_access_token()
    request_headers = {"Authorization": "Bearer " + bearer_token, "accept": "application/json"}

    api_response = requests.get(endpoint, headers=request_headers)

    if api_response.status_code == 200:
        print(f"Successfully retrieved access information for dataset with datasetID {dataset_id}.")
        return json.loads(api_response.text)

    # non-200: hand the raw error body back to the caller
    return api_response.text


def get_snapshot_access_info(snapshot_id):
    """Get snapshot access information from the retrieveSnapshot API given a snapshotID.

    Only the ACCESS_INFORMATION section is requested. Returns the parsed JSON
    body on HTTP 200; for any other status the raw response text is returned
    instead.
    """
    endpoint = f"https://data.terra.bio/api/repository/v1/snapshots/{snapshot_id}?include=ACCESS_INFORMATION"
    bearer_token = get_access_token()
    request_headers = {"Authorization": "Bearer " + bearer_token, "accept": "application/json"}

    api_response = requests.get(endpoint, headers=request_headers)

    if api_response.status_code == 200:
        print(f"Successfully retrieved access information for snapshot with snapshotID {snapshot_id}.")
        return json.loads(api_response.text)

    # non-200: hand the raw error body back to the caller
    return api_response.text


def get_fq_table(entity_id, table_name, entity_type='dataset'):
    """Return the fully qualified BigQuery name of a table in a dataset or snapshot.

    Args:
        entity_id: id of the dataset or snapshot to look in.
        table_name: name of the table whose qualified name is wanted.
        entity_type: 'dataset' (default) or 'snapshot'.

    Returns:
        The table's `qualifiedName` from the entity's BigQuery access
        information, or None when no table with that name is listed.

    Raises:
        ValueError: if `entity_type` is not 'dataset' or 'snapshot', or if the
            access information could not be retrieved.
    """
    if entity_type == 'dataset':
        access_info = get_dataset_access_info(entity_id)
    elif entity_type == 'snapshot':
        access_info = get_snapshot_access_info(entity_id)
    else:
        # previously fell through and failed later with an UnboundLocalError
        raise ValueError(f"entity_type must be 'dataset' or 'snapshot', got {entity_type!r}.")

    # the access-info helpers return the raw response text (a str) when the API
    # call does not succeed; fail with that text instead of an opaque TypeError
    if not isinstance(access_info, dict):
        raise ValueError(f"Could not retrieve access information for {entity_type} {entity_id}: {access_info}")

    tables = access_info['accessInformation']['bigQuery']['tables']

    # pull out desired table
    table_fq = None  # fq = fully qualified name, i.e. project.dataset.table
    for table_info in tables:
        if table_info['name'] == table_name:
            table_fq = table_info['qualifiedName']

    return table_fq
    
    
def create_data_table(entities_tsv):
    """Upload a Terra load tsv to the workspace as a data table.

    Prints a success or warning message and returns the HTTP status code of
    the upload in both cases (200 and 202 count as success).
    """
    upload_response = fapi.upload_entities_tsv(ws_project, ws_name, entities_tsv, model='flexible')
    upload_status = upload_response.status_code

    if upload_status in (200, 202):
        print(f"Finished uploading {entities_tsv} to Terra workspace as a data table.")
    else:
        print(f"WARNING: Loading {entities_tsv} to Terra workspace failed: \n {upload_response.text}")

    return upload_status
    
    
def group_list(input_df, group_column):
    """Split the data by `group_column` and load each group into Terra as its own table.

    For every distinct value of `group_column` the matching rows are
    de-duplicated on `datarepo_row_id`, the id column is renamed to Terra's
    `entity:<table>_id` form and moved to the front, and the result is written
    to a tsv and uploaded. Nothing is returned.
    """
    # tables whose id column does not follow the "<table>_id" convention:
    # table name -> (existing id column, Terra entity column)
    id_column_overrides = {
        "file_inventory": ("file_id", "entity:file_inventory_id"),
        "workspace_attributes": ("datarepo_row_id", "entity:workspace_attributes_id"),
    }

    # one row per group value, each cell holding the list of that column's values
    collapsed = pd.DataFrame(input_df).groupby(group_column).agg(list)

    for table_name, collapsed_row in collapsed.iterrows():
        # expand the per-column lists back into a regular dataframe for this table
        table_df = pd.DataFrame(collapsed_row.to_dict())

        # keep a single row per datarepo_row_id
        table_df = table_df.sort_values("datarepo_row_id")
        table_df = table_df.drop_duplicates(subset="datarepo_row_id", keep="first")

        if table_name in id_column_overrides:
            source_id_col, entity_id_col = id_column_overrides[table_name]
        else:
            source_id_col = f"{table_name}_id"
            entity_id_col = f"entity:{table_name}_id"

        # Terra expects the entity id as the first column
        table_df = table_df.rename(columns={source_id_col: entity_id_col})
        id_values = table_df.pop(entity_id_col)
        table_df.insert(0, entity_id_col, id_values)

        # write the table to a tsv and load it into Terra
        tsv_path = write_df_to_tsv(table_df, table_name)
        create_data_table(tsv_path)

        
def write_df_to_tsv(input_df, entity_name):
    """Write a dataframe to `<entity_name>.tsv` and return that file name.

    The file is tab-separated and written without the dataframe index.
    """
    tsv_name = "{}.tsv".format(entity_name)

    input_df.to_csv(tsv_name, index=False, sep='\t')
    print(f"Finished writing {entity_name} as {tsv_name} to Terra load file.")

    return tsv_name


def rename_etype(original_etype):
    """Rename a workspace entity type (data table) by adding the `anvil_` prefix.

    Sends a PATCH to the Rawls entityTypes endpoint for this workspace. Status
    204 is treated as success; anything else prints a warning with the
    response body. Returns `(status_code, response_text)` either way.
    """
    new_etype = f"anvil_{original_etype}"

    endpoint = f"https://rawls.dsde-prod.broadinstitute.org/api/workspaces/{ws_project}/{ws_name}/entityTypes/{original_etype}"
    request_headers = {
        "Authorization": "Bearer " + get_access_token(),
        "accept": "*/*",
        "Content-Type": "application/json",
    }
    payload = json.dumps({"newName": new_etype})

    api_response = requests.patch(endpoint, headers=request_headers, data=payload)
    rename_status = api_response.status_code

    if rename_status == 204:
        print(f"Successfully renamed table {original_etype} to {new_etype}. \n\n")
    else:
        print(f"Warning: Renaming table from {original_etype} to {new_etype} failed. See error for details:\n")
        print(api_response.text)

    return rename_status, api_response.text


def query_source_tables(query_terms_dict):
    """Query source TDR snapshot tables by datarepo_row_id to get NFS data.

    Args:
        query_terms_dict: maps "<source table name>:<snapshot id>" to an
            iterable of datarepo_row_id strings to fetch from that table.

    Returns:
        One DataFrame holding the rows returned for every key, with a
        `source_table_name` column recording the table each row came from.
        Empty when `query_terms_dict` is empty.

    Raises:
        ValueError: if a source table cannot be found in its snapshot, or if a
            result has both a `name` and a `_name_` column.
    """
    chunk_size = 10000  # max number of datarepo_row_ids placed in one query
    col_new_name = "_name_"  # replacement for the Terra-reserved `name` column

    table_frames = []  # one dataframe per source table
    for key, value in query_terms_dict.items():
        key_parts = key.split(":")
        src_table_name = key_parts[0]
        src_snapshot_id = key_parts[1]
        unique_row_ids = list(set(value))  # set to remove duplicate values

        # split the ids into chunks so the IN (...) list of a single query stays bounded
        row_id_chunks = [unique_row_ids[i:i + chunk_size] for i in range(0, len(unique_row_ids), chunk_size)]

        # get bq qualified table name for bq query
        fq_src_table_name = get_fq_table(src_snapshot_id, src_table_name, entity_type='snapshot')
        if fq_src_table_name is None:
            # without this check the query would be run against a table literally named `None`
            raise ValueError(f"Table {src_table_name} was not found in snapshot {src_snapshot_id}.")

        chunk_frames = []
        for chunk in row_id_chunks:
            # NOTE(review): the ids are interpolated into the SQL text; they come
            # from the workspace's own data tables - confirm that source is trusted.
            id_list = "('" + "','".join(chunk) + "')"
            query = f"""SELECT * FROM `{fq_src_table_name}` WHERE datarepo_row_id IN {id_list}"""
            chunk_frames.append(get_query_results(query))

        # concatenate once per table instead of growing a dataframe chunk by chunk
        results = pd.concat(chunk_frames, axis=0) if chunk_frames else pd.DataFrame()

        # record the source table so rows can be organized into the original workspace table structure
        results["source_table_name"] = src_table_name

        # if any of the df column names have Terra reserved names, rename to load to terra without failure
        # ex: file_inventory table in snapshot has `name` column and fails to load to Terra
        if 'name' in results.columns:
            if col_new_name in results.columns:
                raise ValueError(f"{col_new_name} already exists in {src_table_name}. Pick another name and retry.")
            results = results.rename(columns={"name": col_new_name})

        table_frames.append(results)

    if not table_frames:
        return pd.DataFrame()

    return pd.concat(table_frames, axis=0)
    
    
def get_nfs_data(entity_data):
    """Return the non-findability subset data for the rows of one findability subset table.

    Each row's `pfb:source_datarepo_row_ids` entries ("<table>:<row id>") are
    collected per "<table>:<snapshot id>" pair, and the resulting mapping is
    handed to `query_source_tables`, whose dataframe is returned.
    """
    row_ids_by_source = {}  # "<table>:<snapshot id>" -> set of datarepo_row_ids

    for _, fs_row in entity_data.iterrows():
        snapshot_id = fs_row["pfb:source_datarepo_snapshot_id"]

        # e.g. "sequencing:f9e70781-gjt6-422d-a93a-733ac060cb05"
        for qualified_row_id in fs_row["pfb:source_datarepo_row_ids"]:
            id_parts = qualified_row_id.split(":")
            row_id = id_parts[1]      # f9e70781-gjt6-422d-a93a-733ac060cb05
            table_name = id_parts[0]  # sequencing

            # table + snapshot form the unique key the row ids are grouped under
            source_key = f"{table_name}:{snapshot_id}"
            row_ids_by_source.setdefault(source_key, set()).add(row_id)

    # query the source tables and return the combined results
    return query_source_tables(row_ids_by_source)
        

def get_entity_df(ws_project, ws_name, etype):
    """Download one entity type's table from a Terra workspace as a dataframe.

    Raises ValueError with the response body when the tsv download does not
    return HTTP 200.
    """
    tsv_response = fapi.get_entities_tsv(ws_project, ws_name, etype, model='flexible')

    if tsv_response.status_code == 200:
        # parse the tab-separated body straight from memory
        return pd.read_csv(StringIO(tsv_response.text), sep='\t')

    raise ValueError(f"Error getting {etype} table data from workspace: {tsv_response.text}")

    
def get_entities(ws_project, ws_name):
    """Get applicable entity types for nfs extraction.

    Lists the workspace's entity types and keeps those that are not sets
    (`*_set`), are not in the ignore list below, and have not already been
    renamed with the `anvil_` prefix.

    Returns:
        List of entity type names to process.

    Raises:
        ValueError: if the entity types could not be listed.
    """

    # API call to get all entity types in workspace
    res_etypes = fapi.list_entity_types(ws_project, ws_name)
    if res_etypes.status_code != 200:
        # without this check the keys of an error body would be treated as entity types
        raise ValueError(f"Error getting entity types from workspace: {res_etypes.text}")
    dict_all_etypes = json.loads(res_etypes.text)

    # do not attempt to analyze and rename data in these tables:
    # tables in the ignore list are resulting NFS tables that do not have PFB values
    ignore = {"sample", "file_inventory", "subject", "workspace_attributes", "sequencing", "family",
              "participant"}

    # keep non-set entity types that are neither ignored nor already renamed with "anvil_"
    fltrd_entities = [
        etype for etype in dict_all_etypes
        if not etype.endswith("_set")
        and etype not in ignore
        and not etype.startswith("anvil_")
    ]

    print("List of entity types that will be updated, if applicable:")
    print('\n'.join(['\t' * 7 + c for c in fltrd_entities]))

    return fltrd_entities


def get_ws_nfs_data(ws_project, ws_name):
    """For each entity in the workspace, create combined findability and non-findability subset tables.

    For every applicable entity type the findability subset table is read from
    the workspace, the matching non-findability subset rows are queried from
    the source tables, and the workspace table is renamed with the `anvil_`
    prefix. The collected rows are then grouped by `source_table_name` and
    loaded into Terra as data tables.

    Args:
        ws_project: workspace namespace.
        ws_name: workspace name.

    Returns:
        None. Progress is reported on stdout.
    """

    # get list of viable tables to run NFS extraction
    entities = get_entities(ws_project, ws_name)

    entity_frames = []  # one dataframe of nfs data per entity type
    for etype in entities:
        print(f"Starting: {etype}")

        # call Terra API to get table data
        print(f"Extracting {etype}'s findability subset data from Terra data tables.")
        response = get_entity_df(ws_project, ws_name, etype)
        # set column of source datarepo_row_ids to type array (come in as string)
        response['pfb:source_datarepo_row_ids'] = response['pfb:source_datarepo_row_ids'].map(ast.literal_eval)

        # get df of all nfs data for entity
        print(f"Starting extraction of {etype}'s non-findability subset data.")
        entity_nfs = get_nfs_data(response)

        entity_frames.append(entity_nfs)
        print(f"Finished extraction of {etype}'s non-findability subset data.")

        # rename original etype with `anvil_` prefix to match TDR
        # TODO: consider how to handle tables that have already been renamed or modified from a previous notebook run
        print(f"Renaming {etype} to anvil_{etype}.")
        rename_etype(etype)

    # concatenate all entities' data once instead of growing a dataframe inside the loop
    all_entities_nfs = pd.concat(entity_frames, axis=0) if entity_frames else pd.DataFrame()

    # nothing to load, e.g. every table was already renamed by an earlier run;
    # grouping an empty dataframe by `source_table_name` would raise a KeyError
    if all_entities_nfs.empty:
        print("No non-findability subset data was found; no Terra data tables were created.")
        return

    print("Starting creation of Terra data tables.")
    # organize all entities' nfs data by source_table_name values and then create tsv files and ingest into Terra
    group_list(all_entities_nfs, "source_table_name")


In [25]:
# RUN FUNCTION
# Runs the NFS extraction for the workspace identified by ws_project / ws_name,
# which are read from the environment in the setup cell.
get_ws_nfs_data(ws_project, ws_name)