In [None]:
# Version
# Change log for this notebook: one print per release. Earlier entries are kept
# commented out so that only the current version line is emitted when the cell runs.
#print('Version 1.0.0: 2/22/2023 8:30pm - Nate Calvanese - Initial version')
print('Version 1.0.1: 3/13/2023 2:14pm - Nate Calvanese - Updated query for identifying records to flag')


In [None]:
#!pip install data_repo_client

In [1]:
## imports and environment variables

# Imports
import import_ipynb
import pandas as pd
import os
import re
import json
import data_repo_client
from google.cloud import bigquery
import ingest_pipeline_utilities as utils
import output_data_validation as odv
import logging
from time import sleep

# Configure logging format
# Messages are rendered as "<timestamp> - <LEVEL>: <message>" and INFO and above are emitted.
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO)

# workspace environment variables
# Read with os.environ[...] (not .get), so a missing variable raises KeyError
# as soon as this cell runs rather than failing later inside a function.
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name without the leading "gs://" scheme prefix.
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# Uncomment to inspect the resolved workspace settings:
# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")

importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.25: 3/10/2023 8:46am - Nate Calvanese - Turned on the predictable file IDs dataset creation parameter
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.0: 3/7/2022 9:32pm - Nate Calvanese - Massive performance improvement with use of gsutil parsing
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.9: 3/8/2023 12:09pm - Nate Calvanese - Performance improvements for file ref lookups
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.12: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.6: 2/28/2023 11:33am -- Updated notebook to be usable in dev (removed TDR host hardcoding)
importing Jupyter notebook 

In [2]:
# Function to identify supplementary files and update anvil_file records to properly mark them
def identify_supplementary_files(params, dataset_id):
    
    # Establish TDR API client and retrieve the schema for the specified dataset
    logging.info("Attempting to identify the TDR object, and the necessary attributes...")
    api_client = utils.refresh_tdr_api_client()
    full_tdr_schema, bq_project, bq_schema, skip_bq_queries = odv.retrieve_tdr_schema(dataset_id, "dataset", api_client)
    if skip_bq_queries:
        return "Error retrieving BQ project and schema", "Error"
    
    # Determine if field needs to be added, and add if so
    logging.info("Adding anvil_file.is_supplementary to dataset schema, if necessary.")
    field_found = False
    for table in full_tdr_schema["tables"]:
        if table["name"] == "anvil_file":
            for col in table["columns"]:
                if col["name"] == "is_supplementary":
                    field_found = True
                    logging.info("Field already found! Skipping schema update.")
                    break
            break
    if field_found == False:
        logging.info("Field not found. Running dataset schema update.")
        schema_update_request = {
            "description": "Adding is_supplementary column to anvil_file",
            "changes": {
                "addColumns": [
                  {
                    "tableName": "anvil_file",
                    "columns": [
                      {
                        "name": "is_supplementary",
                        "datatype": "boolean",
                        "array_of": False,
                        "required": False
                      }
                    ]
                  }
                ]
            }
        }
        attempt_counter = 0
        while True:
            try:
                schema_update_result, job_id = utils.wait_for_tdr_job(datasets_api.update_schema(id=dataset_id, dataset_schema_update_model=schema_update_request))
                logging.info("Dataset schema update succeeded!")
                break
            except Exception as e:
                logging.error("Error on dataset schema update: {}".format(str(e)))
                attempt_counter += 1
                if attempt_counter < 2:
                    logging.info("Retrying dataset schema update (attempt #{})...".format(str(attempt_counter)))
                    sleep(15)
                    continue
                else:
                    logging.error("Maximum number of retries exceeded. Unable to update dataset schema.")
                    return "Error updating TDR dataset schema", "Error"
        
    # Re-process anvil_file data to include is_supplementary (where appropriate) and ingest into TDR dataset
    logging.info("Re-processing existing anvil_file data to include is_supplementary value.")
    client = bigquery.Client()
    target_file = "anvil_file.json"
    destination_dir = params["t_output_dir"]
    query = """BEGIN
        
        CREATE TEMPORARY TABLE activity_exp AS WITH activity_agg
        AS
        (
          SELECT used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_activity`
          UNION ALL 
          SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_alignmentactivity`
          UNION ALL 
          SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_assayactivity`
          UNION ALL 
          SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_sequencingactivity`
          UNION ALL 
          SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_variantcallingactivity`
        )
        SELECT file_id, int_file_id, biosample_id
        FROM activity_agg
            LEFT JOIN UNNEST(used_biosample_id) AS biosample_id
            LEFT JOIN UNNEST(generated_file_id) as file_id
            LEFT JOIN UNNEST(used_file_id) as int_file_id
        ;
        
        CREATE TEMPORARY TABLE act_exp_lookup
        AS
        (
            SELECT file_id, MAX(biosample_id) AS biosample_id
          FROM
          (
            --Level 1:
            SELECT file_id, biosample_id
            FROM activity_exp
            WHERE int_file_id IS NULL AND file_id IS NOT NULl AND biosample_id IS NOT NULL
            --Level 2:
            UNION ALL
            SELECT a2.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
            WHERE a2.int_file_id IS NOT NULL AND a2.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 3:
            UNION ALL
            SELECT a3.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
            WHERE a3.int_file_id IS NOT NULL AND a3.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 4:
            UNION ALL
            SELECT a4.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
              LEFT JOIN activity_exp a4
              ON a3.file_id = a4.int_file_id
            WHERE a4.int_file_id IS NOT NULL AND a4.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 5:
            UNION ALL
            SELECT a5.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
              LEFT JOIN activity_exp a4
              ON a3.file_id = a4.int_file_id
              LEFT JOIN activity_exp a5
              ON a4.file_id = a5.int_file_id
            WHERE a5.int_file_id IS NOT NULL AND a5.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
          )
          GROUP BY file_id
        );
        
        SELECT t1.file_id, data_modality, file_format, file_size, file_md5sum, reference_assembly, file_name, file_ref, source_datarepo_row_ids,
        CASE WHEN t2.biosample_id IS NULL THEN TRUE ELSE FALSE END AS is_supplementary
        FROM `{project}.{dataset}.anvil_file` t1
          LEFT JOIN act_exp_lookup t2
          ON t1.file_id = t2.file_id
        ;
        
        END
        """.format(project=bq_project, dataset=bq_schema)
    try:
        df = client.query(query).result().to_dataframe()
        records_json = df.to_json(orient='records') 
        records_list = json.loads(records_json)
        records_cnt = len(records_list)
        with open(target_file, 'w') as outfile:
            for idx, val in enumerate(records_list):
                json.dump(val, outfile)
                if idx < (records_cnt - 1):
                    outfile.write('\n')
        !gsutil cp $target_file $ws_bucket/$destination_dir/ 2> stdout
        !rm $target_file
        logging.info("Successfully created new anvil_file.json file.")
    except Exception as e:
        logging.error("Error creating new anvil_file.json file: {}".format(str(e)))
        return "Error creating new anvil_file.json file", "Error"

    # Ingest updated anvil_file data
    logging.info("Ingesting updated anvil_file data into TDR dataset.")
    source_full_file_path = "{}/{}/{}".format(ws_bucket, destination_dir, "anvil_file.json")
    ingest_request = {
        "table": "anvil_file",
        "profile_id": params["profile_id"],
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Ingest for {}".format(dataset_id),
        "path": source_full_file_path
    }
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest from file anvil_file.json succeeded: {}".format(str(ingest_request_result)[0:1000]))
            ingest_str = "Ingest Succeeded: {}".format(str(ingest_request_result)[0:1000])
            break
        except Exception as e:
            logging.error("Error on Dataset Ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                logging.info("Retrying Dataset Ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            else:
                logging.error("Maximum number of retries exceeded.")
                return "Error ingesting new anvil_file.json file into TDR dataset", "Error"
    
    # Return success message if no failures recorded
    logging.info("Supplementary file identification ran successfully.")
    return ingest_str, "Success"


In [3]:
# Test
# params = {}
# dataset_id = "f85ea65e-1943-4bd6-a541-71c5d8465ca9"
# params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
# params["t_output_dir"] = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
# output, status = identify_supplementary_files(params, dataset_id)
# print(status)
# print(output)

03/13/2023 06:22:24 PM - INFO: Attempting to identify the TDR object, and the necessary attributes...
03/13/2023 06:22:25 PM - INFO: Adding anvil_file.is_supplementary to dataset schema, if necessary.
03/13/2023 06:22:25 PM - INFO: Field already found! Skipping schema update.
03/13/2023 06:22:25 PM - INFO: Re-processing existing anvil_file data to include is_supplementary value.
03/13/2023 06:22:41 PM - INFO: Successfully created new anvil_file.json file.
03/13/2023 06:22:41 PM - INFO: Ingesting updated anvil_file data into TDR dataset.
TDR Job ID: -h3Os7FnTrG0RX2dIOYCLg
03/13/2023 06:23:52 PM - INFO: Ingest from file anvil_file.json succeeded: {'dataset_id': 'f85ea65e-1943-4bd6-a541-71c5d8465ca9', 'dataset': 'ANVIL_ccdg_broad_ai_ibd_daly_vermeire_gsa_20221121', 'table': 'anvil_file', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/transformed/anvil/f85ea65e-1943-4bd6-a541-71c5d8465ca9/table_data/anvil_file.json', 'load_tag': 'Ingest for f85ea65e-1943-4bd6-