In [None]:
# Version
# Only the current version line below is printed when this cell runs; the earlier
# entries are kept as a commented-out changelog (oldest first).
#print('Version 1.0.0: 11/4/2022 8:45am - Nate Calvanese - Initial version')
#print('Version 1.0.1: 11/8/2022 4:03pm - Nate Calvanese - Expanded query to cover additional cases')
print('Version 1.0.2: 10/6/2023 11:20am - Nate Calvanese - Updated query and added validation logic')


In [None]:
#!pip install data_repo_client

In [1]:
## imports and environment variables

# Imports
import import_ipynb
import pandas as pd
import os
import re
import json
import data_repo_client
from google.cloud import bigquery
import ingest_pipeline_utilities as utils
import output_data_validation as odv
import logging
from time import sleep
import numpy as np

# Configure logging format
# INFO-level records are emitted as "<timestamp> - <LEVEL>: <message>" with a 12-hour clock timestamp.
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO)

# workspace environment variables
# Each lookup raises KeyError if the corresponding variable is not set in the environment.
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name without the leading "gs://" scheme
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# Debug output for the values above (uncomment to inspect)
# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")


importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.31: 9/1/2023 11:44am - Nate Calvanese - Updated dataset creation to conditionally enable secure monitoring by public status
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.3: 10/6/2023 9:29am - Nate Calvanese - Tweaked file extension parsing logic
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.9: 3/8/2023 12:09pm - Nate Calvanese - Performance improvements for file ref lookups
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.12: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.6: 2/28/2023 11:33am -- Updated notebook to be usable in dev (removed TDR host hardcoding)
importing Jupyter notebook

In [None]:
def ingest_records(params, dataset_id, table, records_dict, max_attempts=2, retry_delay_seconds=10):
    """Submit an "array"-format ingest of inferred file relationships and wait for the job.

    A fresh API client is requested before every attempt, and any exception raised while
    submitting or waiting on the job is caught, logged, and (while attempts remain) retried.

    Args:
        params: Pipeline parameters; only params["profile_id"] is read here (a missing key
            raises KeyError before any ingest is attempted).
        dataset_id: ID of the dataset to ingest into; also embedded in the load tag.
        table: Name of the target table.
        records_dict: List of record dicts sent inline as the ingest payload.
        max_attempts: Total number of ingest attempts before giving up. Values below 1 are
            treated as 1. The default of 2 means one retry.
        retry_delay_seconds: Seconds to sleep between a failed attempt and the next one.

    Returns:
        Tuple (message, status). On success status is "Success" and message contains the
        first 1000 characters of the job result; once attempts are exhausted status is
        "Error" and message contains the last exception text.
    """
    logging.info("File relationships found: {} new records to ingest".format(str(len(records_dict))))
    logging.info("Submitting ingest for inferred file relationships.")

    # Build, submit, and monitor ingest request
    ingest_request = {
        "table": table,
        "profile_id": params["profile_id"],
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "array",
        "load_tag": "File relationships inference ingest for {}".format(dataset_id),
        "records": records_dict
    }

    # Always try at least once, even if a caller passes a non-positive attempt count.
    total_attempts = max(1, max_attempts)
    for attempt in range(1, total_attempts + 1):
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            # Results can be large, so only the first 1000 characters are logged/returned.
            result_summary = str(ingest_request_result)[0:1000]
            logging.info("File relationships inference ingest succeeded: {}".format(result_summary))
            return "Ingest Succeeded: {}".format(result_summary), "Success"
        except Exception as e:
            logging.error("Error on file relationships inference ingest: {}".format(str(e)))
            if attempt < total_attempts:
                logging.info("Retrying file relationships inference ingest (attempt #{})...".format(str(attempt)))
                sleep(retry_delay_seconds)
                continue
            logging.error("Maximum number of retries exceeded. Logging error.")
            return "Ingest Failed ({})".format(str(e)), "Error"

# Table/column names that differ between the two inference methods:
#   "file_name" joins anvil_file to itself on file_name,
#   "file_path" joins file_inventory to itself on path.
_FILE_RELATIONSHIP_SOURCES = {
    "file_name": {
        "table": "anvil_file",
        "id_col": "file_id",
        "format_col": "file_format",
        "name_col": "file_name",
        "row_ids_expr": "ARRAY_CONCAT(t0.source_datarepo_row_ids, t1.source_datarepo_row_ids)",
    },
    "file_path": {
        "table": "file_inventory",
        "id_col": "file_ref",
        "format_col": "full_extension",
        "name_col": "path",
        "row_ids_expr": "[t0.datarepo_row_id, t1.datarepo_row_id]",
    },
}

# Inference query shared by both methods. A file t0 is treated as generated from file t1 when
# either (a) t0's name with its extension removed equals t1's name (e.g. an extension appended
# to the original name), or (b) both names match once their extensions are removed and the
# pair is cram/crai or bam/bai. Pairs already present in anvil_activity are excluded.
_FILE_RELATIONSHIP_QUERY_TEMPLATE = """WITH existing_activity_records
                AS
                (
                  SELECT activity_id, used_file_id, generated_file_id
                  FROM `{project}.{schema}.anvil_activity`,
                  UNNEST(used_file_id) AS used_file_id,
                  UNNEST(generated_file_id) AS generated_file_id
                ),
                potential_activity_records
                AS
                (
                  SELECT *, ROW_NUMBER() OVER (PARTITION BY used_file_id, generated_file_id ORDER BY 1) AS RN
                  FROM
                  (
                    SELECT t1.{id_col} AS used_file_id, t1.{format_col} AS used_file_format, t0.{id_col} AS generated_file_id, t0.{format_col} AS generated_file_format,
                    {row_ids_expr} AS source_datarepo_row_ids
                    FROM `{project}.{schema}.{table}` t0
                      INNER JOIN `{project}.{schema}.{table}` t1
                      ON REPLACE(t0.{name_col}, t0.{format_col}, '') = t1.{name_col}
                      AND t0.{name_col} <> t1.{name_col}
                    WHERE t0.{id_col} IS NOT NULL
                    UNION ALL
                    SELECT t1.{id_col} AS used_file_id, t1.{format_col} AS used_file_format, t0.{id_col} AS generated_file_id, t0.{format_col} AS generated_file_format,
                    {row_ids_expr} AS source_datarepo_row_ids
                    FROM `{project}.{schema}.{table}` t0
                      INNER JOIN `{project}.{schema}.{table}` t1
                      ON REPLACE(t0.{name_col}, t0.{format_col}, '') = REPLACE(t1.{name_col}, t1.{format_col}, '')
                      AND t0.{name_col} <> t1.{name_col}
                      AND ((t1.{format_col} = '.cram' AND t0.{format_col} = '.crai') OR (t1.{format_col} = '.bam' AND t0.{format_col} = '.bai'))
                    WHERE t0.{id_col} IS NOT NULL
                  )
                )
                SELECT `dsp-data-ingest.transform_resources`.uuid_hash_value(t0.generated_file_id||t0.used_file_id) AS activity_id,
                CASE WHEN generated_file_format IN ('.crai', '.bai', '.tbi', '.csi') THEN 'Indexing'
                      WHEN generated_file_format = '.md5' THEN 'Checksum'
                      ELSE 'Unknown'
                END AS activity_type,
                [t0.used_file_id] AS used_file_id,
                [t0.generated_file_id] AS generated_file_id,
                t0.source_datarepo_row_ids
                FROM potential_activity_records t0
                  LEFT JOIN existing_activity_records t1
                  ON t0.used_file_id = t1.used_file_id AND t0.generated_file_id = t1.generated_file_id
                WHERE t1.activity_id IS NULL
                  AND t0.RN = 1"""

_VALIDATION_QUERY_TEMPLATE = """
        WITH activity_flattened AS
        (
          SELECT DISTINCT generated_file, activity_type, used_file
          FROM `{project}.{schema}.anvil_activity`
            CROSS JOIN UNNEST(used_file_id) AS used_file
            CROSS JOIN UNNEST(generated_file_id) AS generated_file
          WHERE ARRAY_LENGTH(used_biosample_id) = 0
        ),
        activity_agg AS
        (
          SELECT generated_file, activity_type, COUNT(DISTINCT used_file)
          FROM activity_flattened
          GROUP BY generated_file, activity_type
          HAVING COUNT(DISTINCT used_file) > 1
        )
        SELECT *
        FROM
        (
          SELECT 'Files generated from multiple file activities (Activity Type - All)' AS metric, COUNT(DISTINCT generated_file) AS result
          FROM activity_agg
          UNION ALL
          SELECT 'Files generated from multiple file activities (Activity Type - ' || activity_type || ')' AS metric, COUNT(DISTINCT generated_file) AS result
          FROM activity_agg
          GROUP BY activity_type
        )
        ORDER BY metric
        """


def _build_file_relationship_query(bq_project, bq_schema, method):
    """Return the inference SQL for `method` ("file_name" or "file_path")."""
    return _FILE_RELATIONSHIP_QUERY_TEMPLATE.format(
        project=bq_project, schema=bq_schema, **_FILE_RELATIONSHIP_SOURCES[method]
    )


def _to_ingest_records(df):
    """Convert a result DataFrame to a list of record dicts, turning numpy arrays into lists."""
    return [
        {key: (val.tolist() if isinstance(val, np.ndarray) else val) for key, val in record.items()}
        for record in df.to_dict(orient="records")
    ]


def _summarize_validation_failures(records):
    """Build the validation error message from the metric/result rows of the validation query."""
    total_file_count = 0
    index_file_count = 0
    checksum_file_count = 0
    unknown_file_count = 0
    for record in records:
        metric = record["metric"]
        if metric == "Files generated from multiple file activities (Activity Type - All)":
            total_file_count = record["result"]
        elif metric == "Files generated from multiple file activities (Activity Type - Indexing)":
            index_file_count = record["result"]
        elif metric == "Files generated from multiple file activities (Activity Type - Checksum)":
            checksum_file_count = record["result"]
        else:
            # Several other activity types can be present; accumulate them rather than
            # letting the last row overwrite the earlier ones.
            unknown_file_count += record["result"]
    return f"Errors found when validating ingested file relationships (files generated from multiple file activities). All: {str(total_file_count)} Indexing Activities: {str(index_file_count)} Checksum Activities: {str(checksum_file_count)} Unknown Activities: {str(unknown_file_count)}"


def infer_file_relationships(params, dataset_id):
    """Infer file-to-file activity records for a dataset, ingest them, and validate the result.

    Steps:
      1. Look up the dataset's BigQuery project and schema.
      2. Pick the inference method: "file_name" when every row of anvil_file has a distinct
         file_name, otherwise "file_path" (which matches on file_inventory.path).
      3. Query for (used file, generated file) pairs that are not yet in anvil_activity.
      4. Ingest the resulting records into anvil_activity via ingest_records.
      5. Query anvil_activity for generated files that are linked to more than one used file
         under the same activity type and report any as an error.

    Args:
        params: Pipeline parameters, passed through to ingest_records.
        dataset_id: ID of the dataset to process.

    Returns:
        Tuple (message, status) where status is "Success" or "Error". Query and ingest
        failures are reported through the returned status rather than raised.
    """
    # Establish TDR API client and retrieve the schema for the specified dataset
    logging.info("Attempting to identify the TDR object, and the necessary attributes...")
    api_client = utils.refresh_tdr_api_client()
    full_tdr_schema, bq_project, bq_schema, skip_bq_queries = odv.retrieve_tdr_schema(dataset_id, "dataset", api_client)
    if skip_bq_queries:
        return "Error retrieving BQ project and schema", "Error"

    # Check files for duplicate names
    logging.info("Determining file relationship inference method...")
    client = bigquery.Client()
    file_query = """
        SELECT COUNT(*) file_count, COUNT(DISTINCT file_name) AS distinct_file_names
        FROM `{project}.{schema}.anvil_file`
        """.format(project=bq_project, schema=bq_schema)
    try:
        df = client.query(file_query).result().to_dataframe()
        method = "file_name"
        if not df.empty:
            file_count = df["file_count"].values[0]
            distinct_name_count = df["distinct_file_names"].values[0]
            # Duplicate file names make name-based matching ambiguous, so fall back to paths.
            if file_count != distinct_name_count:
                method = "file_path"
    except Exception as e:
        logging.error("Error during query execution: {}".format(str(e)))
        return "Error during query execution: {}".format(str(e)), "Error"
    infer_file_rel_query = _build_file_relationship_query(bq_project, bq_schema, method)

    # Construct records to ingest
    logging.info("Attempting to infer and ingest file relationships...")
    try:
        df = client.query(infer_file_rel_query).result().to_dataframe()
        final_records_dict = _to_ingest_records(df)
    except Exception as e:
        logging.error("Error during query execution: {}".format(str(e)))
        return "Error during query execution: {}".format(str(e)), "Error"

    # Ingest records
    if not final_records_dict:
        logging.info("No new file relationships found, no records to ingest.")
        return "No new file relationships found, no records to ingest.", "Success"
    ingest_str, ingest_status = ingest_records(params, dataset_id, "anvil_activity", final_records_dict)
    if ingest_status != "Success":
        # Nothing new was written, so validating (and reporting success) would be misleading.
        logging.error("File relationships ingest did not succeed; skipping validation.")
        return ingest_str, ingest_status

    # Validate ingested file relationships
    logging.info("Validating ingested file relationships...")
    validation_query = _VALIDATION_QUERY_TEMPLATE.format(project=bq_project, schema=bq_schema)
    try:
        df = client.query(validation_query).result().to_dataframe()
        # With no offending files the query yields just the "All" row with a count of 0.
        if df.empty or (len(df) == 1 and df["result"].values[0] == 0):
            logging.info("No failures reported in validation of ingested file relationships.")
        else:
            err_msg = _summarize_validation_failures(json.loads(df.to_json(orient='records')))
            logging.error(err_msg)
            return err_msg, "Error"
    except Exception as e:
        logging.error("Error during query execution: {}".format(str(e)))
        return "Error during query execution: {}".format(str(e)), "Error"

    # Return success message if no failures recorded
    logging.info("File relationships ingested and validated successfully.")
    return ingest_str, ingest_status


In [None]:
# Test
# Manual check, disabled by default: uncomment the lines below to run the inference against
# the dataset ID shown. Note that infer_file_relationships submits an ingest to that dataset
# when it finds new relationships, so only run this against a dataset you intend to modify.
# params = {}
# params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
# dataset_id = "8fbfea50-6a71-4b19-98e9-f95e3a8594c7"
# output, status = infer_file_relationships(params, dataset_id)
# print(status)
# print(output)