# Imports and Common Functions

In [None]:
#!pip install --upgrade data_repo_client
# !wget https://aka.ms/downloadazcopy-v10-linux
# !tar -xvf downloadazcopy-v10-linux

In [5]:
# Imports
import import_ipynb
import data_repo_client
import google.auth
import datetime
import os
import sys
import logging
from time import sleep
from google.cloud import bigquery
from google.cloud import storage
import ingest_pipeline_utilities as utils
import pandas as pd
import json
import re
import math
import requests
import subprocess

# Configure logging format
while logging.root.handlers:
    logging.root.removeHandler(logging.root.handlers[-1])
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)

# Function to refresh TDR API client
def refresh_tdr_api_client(host):
    creds, project = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    config = data_repo_client.Configuration()
    config.host = host
    config.access_token = creds.token
    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Function to wait for TDR job completion
def wait_for_tdr_job(job_model, host):
    result = job_model
    print("TDR Job ID: " + job_model.id)
    counter = 0
    job_state = "UNKNOWN"
    while True:
        # Re-establish credentials and API clients every 30 minutes
        if counter == 0 or counter%180 == 0:
            api_client = refresh_tdr_api_client(host)
            jobs_api = data_repo_client.JobsApi(api_client=api_client)
        # Check for TDR connectivity issues and raise exception if the issue persists
        conn_err_counter = 0
        while job_state == "UNKNOWN":
            conn_err_counter += 1
            if conn_err_counter >= 10:
                raise Exception("Error interacting with TDR: {}".format(result.status_code)) 
            elif result == None or result.status_code in ["500", "502", "503", "504"]:
                sleep(10)
                counter += 1
                attempt_counter = 0
                while True:
                    try:
                        result = jobs_api.retrieve_job(job_model.id)
                        break
                    except Exception as e:
                        if attempt_counter < 5:
                            attempt_counter += 1
                            sleep(10)
                            continue
                        else:
                            raise Exception("Error retrieving job status from TDR: {}".format(str(e)))
            else:
                job_state = "KNOWN"
        # Check if job is still running, and sleep/re-check if so
        if job_state == "KNOWN" and result.job_status == "running":
            sleep(10)
            counter += 1
            attempt_counter = 0
            while True:
                try:
                    result = jobs_api.retrieve_job(job_model.id)
                    break
                except Exception as e:
                    if attempt_counter < 5:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        raise Exception("Error retrieving job status from TDR: {}".format(str(e)))
        # If job has returned as failed, confirm this is the correct state and retrieve result if so
        elif job_state == "KNOWN" and result.job_status == "failed":
            fail_counter = 0
            while True:
                attempt_counter = 0
                while True:
                    try:
                        result = jobs_api.retrieve_job(job_model.id)
                        if result.job_status == "failed":
                            fail_counter += 1
                        break
                    except Exception as e:
                        if attempt_counter < 5:
                            sleep(10)
                            attempt_counter += 1
                            continue
                        else:
                            raise Exception("Error retrieving job status from TDR: {}".format(str(e)))
                if fail_counter >= 3:
                    try:
                        fail_result = jobs_api.retrieve_job_result(job_model.id)
                        raise Exception("Job " + job_model.id + " failed: " + fail_result)
                    except Exception as e:
                        raise Exception("Job " + job_model.id + " failed: " + str(e))
        # If a job has returned as succeeded, retrieve result
        elif job_state == "KNOWN" and result.job_status == "succeeded":
            attempt_counter = 0
            while True:
                try:
                    return jobs_api.retrieve_job_result(job_model.id), job_model.id
                except Exception as e:
                    if attempt_counter < 3:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        return "Job succeeded, but error retrieving job result: {}".format(str(e)), job_model.id
        else:
            raise Exception("Unrecognized job state: {}".format(result.job_status))

# Migrating TDR Datasets

## Step 1: Pre-Connector Processing
For the list of GCP TDR datasets provided:
1. Extract the schema
2. Create an Azure TDR dataset using the extracted schema
3. Build a manifest of files to be copied from the GCP dataset to the Azure dataset and write to BigQuery.

In [None]:
#############################################
## Functions
#############################################

# Function to build default target TDR dataset name
def format_dataset_name(input_str):
    current_datetime = datetime.datetime.now()
    current_date_string = current_datetime.strftime("%Y%m%d")
    input_str = input_str[:-9]
    output_str = "ANVIL_" + re.sub("^ANVIL[_]?", "", input_str, flags=re.IGNORECASE) + "_" + current_date_string
    output_str = re.sub("[^a-zA-Z0-9_]", "_", output_str)
    return output_str

# Function to create a new TDR dataset from an existing TDR dataset
def create_dataset_from_dataset(src_tdr_object_uuid, tar_tdr_object_uuid, billing_profile):

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Retrieve original dataset details
    logging.info(f"Retrieving original dataset details from prod environment. UUID:  {src_tdr_object_uuid}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=src_tdr_object_uuid, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_dataset = dataset_details["access_information"]["big_query"]["dataset_name"]
        orig_object_name = dataset_details["name"]
    except Exception as e:
        error_str = f"Error retrieving details from dataset {src_tdr_object_uuid} in TDR prod environment: {str(e)}"
        logging.error(error_str)
        return None, None, None, None, None

    # If target dataset specified, retrieve name
    if tar_tdr_object_uuid:
        new_dataset_id = tar_tdr_object_uuid
        logging.info(f"Retrieving new dataset details from prod environment. UUID:  {tar_tdr_object_uuid}")
        try:
            dataset_details = datasets_api.retrieve_dataset(id=tar_tdr_object_uuid, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
            new_object_name = dataset_details["name"]
        except Exception as e:
            error_str = f"Error retrieving details from dataset {tar_tdr_object_uuid} in TDR prod environment: {str(e)}"
            logging.error(error_str)
            return None, None, None, None, None 
    else:
        # Build new dataset schema
        apply_anvil_transforms = True
        new_schema_dict = {"tables": [], "relationships": [], "assets": []}
        for table_entry in dataset_details["schema"]["tables"]:
            int_table_dict = table_entry.copy()
            int_table_dict["primaryKey"] = int_table_dict.pop("primary_key")
            for key in ["partition_mode", "date_partition_options", "int_partition_options", "row_count"]:
                del int_table_dict[key]
            for idx, column_entry in enumerate(table_entry["columns"]):
                if column_entry["datatype"] == "integer":
                    table_entry["columns"][idx]["datatype"] = "int64"
            if apply_anvil_transforms:
                if table_entry["name"] == "file_inventory":
                    int_table_dict["columns"].append({"name": "orig_file_ref", "datatype": "string", "array_of": False, "required": False})
                    int_table_dict["columns"].append({"name": "orig_datarepo_row_id", "datatype": "string", "array_of": False, "required": False})
                elif "anvil_" not in table_entry["name"]:
                    int_table_dict["columns"].append({"name": "orig_datarepo_row_id", "datatype": "string", "array_of": False, "required": False})
            new_schema_dict["tables"].append(int_table_dict)
        for rel_entry in dataset_details["schema"]["relationships"]:
            int_rel_dict = rel_entry.copy()
            int_rel_dict["from"] = int_rel_dict.pop("_from")
            new_schema_dict["relationships"].append(int_rel_dict)
        for asset_entry in dataset_details["schema"]["assets"]:
            int_asset_dict = asset_entry.copy()
            int_asset_dict["rootTable"] = int_asset_dict.pop("root_table")
            int_asset_dict["rootColumn"] = int_asset_dict.pop("root_column")
            new_schema_dict["assets"].append(int_asset_dict)

        # Retrieve original dataset policies
        try:
            dataset_policies = datasets_api.retrieve_dataset_policies(id=src_tdr_object_uuid).to_dict()
            for policy in dataset_policies["policies"]:
                if policy["name"] == "steward":
                    stewards_list = policy["members"]
                elif policy["name"] == "custodian":
                    custodians_list = policy["members"]
                elif policy["name"] == "snapshot_creator":
                    snapshot_creators_list = policy["members"]
        except:
            logging.info("Error retrieving original dataset policies. Skipping policy copy.")
            stewards_list = []
            custodians_list = []
            snapshot_creators_list = []
        policies = {
            "stewards": stewards_list,
            "custodians": custodians_list,
            "snapshotCreators": snapshot_creators_list
        }

        # Determine dataset properties
        new_object_name = format_dataset_name(orig_object_name)
        new_description = dataset_details["description"] + f"\n\nCopy of dataset {orig_object_name} from TDR prod."
        self_hosted = False
        dedicated_ingest_sa = False
        phs_id = dataset_details["phs_id"]
        predictable_file_ids = dataset_details["predictable_file_ids"]
        secure_monitoring_enabled = dataset_details["secure_monitoring_enabled"]
        properties = dataset_details["properties"]
        tags = dataset_details["tags"]

        # Create new TDR dataset
        logging.info("Submitting dataset creation request.")
        dataset_request = {
            "name": new_object_name,
            "description": new_description,
            "defaultProfileId": billing_profile,
            "cloudPlatform": "azure",
            "region": "southcentralus",
            "phsId": phs_id,
            "experimentalSelfHosted": self_hosted,
            "experimentalPredictableFileIds": predictable_file_ids,
            "dedicatedIngestServiceAccount": dedicated_ingest_sa,
            "enableSecureMonitoring": secure_monitoring_enabled,
            "properties": properties,
            "tags": tags,
            "policies": policies,
            "schema": new_schema_dict
        }
        attempt_counter = 1
        while True:
            try:
                create_dataset_result, job_id = wait_for_tdr_job(datasets_api.create_dataset(dataset=dataset_request), "https://data.terra.bio")
                logging.info("Dataset Creation succeeded: {}".format(create_dataset_result))
                new_dataset_id = create_dataset_result["id"]
                break
            except Exception as e:
                error_str = f"Error on Dataset Creation: {str(e)}"
                logging.error(error_str)
                if attempt_counter < 3:
                    logging.info("Retrying Dataset Creation (attempt #{})...".format(str(attempt_counter)))
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    logging.error("Maximum number of retries exceeded. Exiting job.")
                    return None, None, None, None, None
        
    # Exit function
    return orig_object_name, new_dataset_id, new_object_name, bq_project, bq_dataset

# Function to create file transfer details
def output_file_details(orig_dataset_id, orig_dataset_name, new_dataset_id, new_dataset_name, bq_project, bq_dataset, public_flag, target_bigquery_table, delete_existing_records):
    
    # Setup/refresh TDR clients (and BQ client)
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    client = bigquery.Client()
    
    # Clear records from target BQ table
    if delete_existing_records:
        logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
        delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE gcp_dataset_id = '{orig_dataset_id}'"""
        try:
            delete_query_job = client.query(delete_query)
            delete_query_job.result()
        except Exception as e:
            logging.info("Error deleting records for the original dataset from the target BQ table.")
    
    # Retrieve table data from the original dataset and write to target BQ table
    logging.info(f"Fetching and recording all rows from table 'file_inventory' in the original dataset ({orig_dataset_id}). BQ Project = '{bq_project}' and BQ Dataset = '{bq_dataset}'.")
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    job_config = bigquery.QueryJobConfig()
    job_config.destination = target_bigquery_table
    job_config.write_disposition = "WRITE_APPEND"
    query = f"""WITH drlh_deduped AS
                (
                  SELECT DISTINCT file_id, target_path, source_name 
                  FROM 
                  (
                    SELECT *, ROW_NUMBER() OVER (PARTITION BY source_name ORDER BY load_time DESC) AS rn
                    --SELECT *, ROW_NUMBER() OVER (PARTITION BY source_name, target_path ORDER BY load_time DESC) AS rn
                    FROM `{bq_project}.{bq_dataset}.datarepo_load_history`
                    WHERE state = "succeeded" 
                  )
                  WHERE rn = 1
                ),
                file_records AS
                (
                  SELECT '{orig_dataset_id}' AS gcp_dataset_id, '{orig_dataset_name}' AS gcp_dataset_name, 
                  '{new_dataset_id}' AS az_dataset_id, '{new_dataset_name}' AS az_dataset_name, 
                  b.source_name AS source_path, b.target_path, a.size_in_bytes, a.md5_hash, a.file_ref AS orig_tdr_file_id,
                  '{current_datetime_string}' AS date_added, '{public_flag}' AS public_flag, ROW_NUMBER() OVER (PARTITION BY a.file_ref ORDER BY b.source_name) AS rn
                  FROM `{bq_project}.{bq_dataset}.file_inventory` a
                      LEFT JOIN drlh_deduped b
                      ON a.uri = b.source_name
                      LEFT JOIN `broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list` c
                      ON a.file_ref = c.orig_tdr_file_id AND c.az_dataset_id = '{new_dataset_id}'
                  WHERE c.source_path IS NULL
                )
                SELECT * EXCEPT(rn)
                FROM file_records
                WHERE rn = 1"""
    #print(query)
    attempt_counter = 0
    while True:
        try:
            query_job = client.query(query, job_config=job_config)
            query_job.result()
            logging.info("Records recorded successfully.")
            return
        except Exception as e:
            if attempt_counter < 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_str = f"Error recording records for all rows of table 'file_inventory': {str(e)}."
                logging.error(err_str)
                return
    

#############################################
## Input Parameters
#############################################

# General parameters
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list"
azure_billing_profile = "9ee23bed-b46c-4561-9103-d2a723113f7f"

# Specify the list of datasets to process, leaving the target Azure dataset ID empty to create a new one
migration_list = [
    #["src_gcp_dataset_id", "tar_az_dataset_id", "open_access (Y/N)"]
    ['902596ce-714e-49b3-8271-f3dfece52309', 'e091028e-a6b1-4989-9477-498e7ea206f0', 'N'],
]

# Specify whether existing records in the azure_migration_file_list table should be deleted before running
delete_existing_records = False


#############################################
## Execution
#############################################

# Loop through migration list and process entries
results = []
for entry in migration_list:
    logging.info(f"Processing Migration List Entry: {str(entry)}")
    dataset_name, new_dataset_id, new_dataset_name, bq_project, bq_dataset = create_dataset_from_dataset(entry[0], entry[1], azure_billing_profile)
    if new_dataset_id:
        output_file_details(entry[0], dataset_name, new_dataset_id, new_dataset_name, bq_project, bq_dataset, entry[2], target_bigquery_table, delete_existing_records)
        results.append([entry[0], dataset_name, "Success", new_dataset_id, new_dataset_name])
    else:
        results.append([entry[0], dataset_name, "Failure", new_dataset_id, new_dataset_name])
        
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Source Dataset ID", "Source Dataset Name", "Status", "New Dataset ID", "New Dataset Name"])
display(results_df)
            

## Step 2: Post-Connector Processing
For each GCP Dataset - Azure Dataset pair:
1. Retrieve the source GCP Dataset for the Snapshot
2. Extract, pre-process, and ingest tabular data from the GCP Dataset to the Azure Dataset
3. Create a new Azure snapshot based on the GCP snapshot

In [None]:
#############################################
## Functions
#############################################

# Function to fetch data from BigQuery
def fetch_source_records_bigquery(config, new_dataset_id, array_col_dict, table, start_row, end_row):
    # Extract parameters from config
    src_tdr_object_uuid = config["source_dataset_id"]
    src_tdr_object_type = "dataset"
    tdr_host = config["tdr_host"]
    files_already_ingested = True
    datarepo_row_ids_to_ingest = []
    apply_anvil_transforms = True
    bq_project = config["bigquery_project"]
    bq_dataset = config["bigquery_dataset"]
    
    # Setup/refresh TDR clients (and BQ client)
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    client = bigquery.Client()
    
    # Retrieve table data from the original dataset
    logging.info(f"Fetching rows {str(start_row)}-{str(end_row)} from table '{table}' in the original {src_tdr_object_type} ({src_tdr_object_uuid}).")
    table_recs_str = f"Table: {table} -- Rows: {str(start_row)}-{str(end_row)}"
    final_records = []
    if apply_anvil_transforms and "anvil_" not in table:
        if table == "file_inventory":
            if files_already_ingested == False:
                file_ref_sql = "TO_JSON_STRING(STRUCT(source_name AS sourcePath, target_path AS targetPath, 'Ingest of '||source_name AS description, COALESCE(content_type, 'application/octet-stream') AS mimeType))"
            else:
                file_ref_sql = "file_ref"
            rec_fetch_query = f"""WITH drlh_deduped AS
                            (
                              SELECT DISTINCT file_id, target_path, source_name
                              FROM `{bq_project}.{bq_dataset}.datarepo_load_history`
                              WHERE state = "succeeded" 
                            )
                            SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT datarepo_row_id, datarepo_row_id AS orig_datarepo_row_id, a.file_id, name, path, target_path AS uri, content_type, full_extension, size_in_bytes, crc32c, md5_hash, ingest_provenance,
                              file_ref AS orig_file_ref, {file_ref_sql} AS file_ref,
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}` a
                                  LEFT JOIN drlh_deduped b
                                  ON a.file_ref = b.file_id
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""
        else:
            rec_fetch_query = f"""SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT *, datarepo_row_id AS orig_datarepo_row_id,
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}`
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""
    else:
        rec_fetch_query = f"""SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT *, 
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}`
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""
    attempt_counter = 0
    while True:
        try:
            df = client.query(rec_fetch_query).result().to_dataframe()
            df = df.astype(object).where(pd.notnull(df),None)
            for column in array_col_dict[table]:
                df[column] = df[column].apply(lambda x: list(x))
            if apply_anvil_transforms and table == "file_inventory" and files_already_ingested == False: 
                df["file_ref"] = df.apply(lambda x: json.loads(x["file_ref"].replace("\'", "\"")), axis=1)
            final_records = df.to_dict(orient="records")
            break
        except Exception as e:
            if attempt_counter < 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_str = f"Error retrieving records for rows {str(start_row)}-{str(end_row)} of table {table}: {str(e)}."
                logging.error(err_str)
                config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
                return {}
    
    # Filter retrieved data if necessary and return as dict of records
    if final_records:
        df_temp = pd.DataFrame.from_dict(final_records)
        if datarepo_row_ids_to_ingest:
            df_orig = df_temp[df_temp["datarepo_row_id"].isin(datarepo_row_ids_to_ingest)].copy()
        else:
            df_orig = df_temp.copy()
        del df_temp
        df_orig.drop(columns=["datarepo_row_id"], inplace=True, errors="ignore")
        df_orig = df_orig.astype(object).where(pd.notnull(df_orig),None)
        records_orig = df_orig.to_dict(orient="records")
        if not records_orig:
            msg_str = f"No records found in rows {str(start_row)}-{str(end_row)} of table {table} after filtering based on datarepo_row_ids_to_ingest parameter. Continuing to next record set or table validation."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Skipped", msg_str])
            return records_orig
        elif len(final_records) != len(records_orig):
            logging.info(f"Filtering records to ingest based on the datarepo_row_ids_to_ingest parameter. {str(len(records_orig))} of {str(len(final_records))} records to be ingested.")
            return records_orig
        else:
            return records_orig
    else:
        msg_str = f"No records found for rows {str(start_row)}-{str(end_row)} of table {table} in original {src_tdr_object_type}. Continuing to next record set or table validation."
        logging.info(msg_str)
        config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Skipped", msg_str])
        return final_records

# Function to process ingests for specific table
def ingest_table_data(config, new_dataset_id, array_col_dict, float_col_dict, table, start_row, end_row):
    
    # Extract parameters from config
    src_tdr_object_uuid = config["source_dataset_id"]
    src_tdr_object_type = "dataset"
    tdr_host = config["tdr_host"]
    tar_tdr_billing_profile = config["tar_tdr_billing_profile"]
    records_processing_method = "in_memory"
    write_to_cloud_platform = ""
    apply_anvil_transforms = True
    dr_row_id_xwalk = config["dr_row_id_xwalk"]

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    
    # Retrieve table data from the original dataset
    table_recs_str = f"Table: {table} -- Rows: {str(start_row)}-{str(end_row)}"
    records_orig = fetch_source_records_bigquery(config, new_dataset_id, array_col_dict, table, start_row, end_row)
    if not records_orig:
        return

    # Pre-process records before ingest
    if "anvil_" in table:
        try:
            # Pre-process records in AnVIL_ records to use new datarepo_row_ids in the source_datarepo_row_ids field
            logging.info("FSS (anvil_%) table with ingest.apply_anvil_transforms parameter set to 'True'. Pre-processing records before submitting ingestion request.")
            records_processed = []
            for record in records_orig:
                int_record = record.copy()
                new_dr_row_id_list = []
                for row_id in int_record["source_datarepo_row_ids"]:
                    new_row_id = dr_row_id_xwalk.get(row_id)
                    if new_row_id:
                        new_dr_row_id_list.append(new_row_id)
                    else:
                        err_str = f"Failure in pre-processing: row_id '{row_id}'' not found in datarepo_row_id crosswalk."
                        logging.error(err_str)
                        config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
                        return   
                int_record["source_datarepo_row_ids"] = new_dr_row_id_list
                for fcol in float_col_dict[table]:
                    if int_record[fcol]:
                        int_record[fcol] = float(int_record[fcol])
                records_processed.append(int_record)
        except Exception as e:
            err_str = f"Failure in pre-processing: {str(e)}"
            logging.error(err_str)
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
            return
    else:
        records_processed = []
        for record in records_orig:
            int_record = record.copy()
            for fcol in float_col_dict[table]:
                if int_record[fcol]:
                    int_record[fcol] = float(int_record[fcol])
            records_processed.append(int_record)
    
    # Write out records to cloud, if specified by user
    if records_processing_method == "write_to_cloud":
        logging.info(f"Writing records to a control file in the cloud.")
        if write_to_cloud_platform == "gcp":
            control_file_path = write_records_to_gcp(config, table, records_processed)
        else:
            control_file_path = write_records_to_azure(config, table, records_processed)

    # Build, submit, and monitor ingest request
    logging.info(f"Submitting ingestion request to new dataset ({new_dataset_id}).")
    if records_processing_method == "write_to_cloud":
        ingest_request = {
            "table": table,
            "profile_id": tar_tdr_billing_profile,
            "ignore_unknown_values": True,
            "resolve_existing_files": True,
            "updateStrategy": "append",
            "format": "json",
            "load_tag": "Ingest for {}".format(new_dataset_id),
            "path": control_file_path
        }        
    else:
        ingest_request = {
            "table": table,
            "profile_id": tar_tdr_billing_profile,
            "ignore_unknown_values": True,
            "resolve_existing_files": True,
            "updateStrategy": "append",
            "format": "array",
            "load_tag": "Ingest for {}".format(new_dataset_id),
            "records": records_processed
        }
    attempt_counter = 1
    while True:
        try:
            api_client = refresh_tdr_api_client(tdr_host)
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = wait_for_tdr_job(datasets_api.ingest_dataset(id=new_dataset_id, ingest=ingest_request), tdr_host)
            logging.info("Ingest succeeded: {}".format(str(ingest_request_result)[0:1000]))
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Success", str(ingest_request_result)[0:1000]])
            break
        except Exception as e:
            logging.error("Error on ingest: {}".format(str(e)[0:2500]))
            if attempt_counter < 3:
                logging.info("Retrying ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                attempt_counter += 1
                continue
            else:
                logging.error("Maximum number of retries exceeded. Logging error.")
                err_str = f"Error on ingest: {str(e)[0:2500]}"
                config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])  
                break

    # Remove control file from cloud, if written out
    if records_processing_method == "write_to_cloud":
        logging.info(f"Removing control file from the cloud.")
        if write_to_cloud_platform == "gcp":
            client = storage.Client()
            target_bucket = control_file_path.split("/")[2]
            target_object = "/".join(control_file_path.split("/")[3:])
            bucket = client.bucket(target_bucket)
            blob = bucket.blob(target_object)
            blob.delete()
        else:
            blob = BlobClient.from_blob_url(control_file_path)
            blob.delete_blob()

# Function to orchestration the migration of tabular data
def migrate_tabular_data(config):

    # Extract parameters from config
    source_dataset_id = config["source_dataset_id"]
    target_dataset_id = config["target_dataset_id"] 
    tables_to_ingest = config["tables_to_ingest"] 
    tdr_host = config["tdr_host"] 
    tdr_sa_to_use = config["tdr_sa_to_use"] 
    chunk_size = config["chunk_size"] 
    max_combined_rec_ref_size = config["max_combined_rec_ref_size"] 
    skip_ingests = config["skip_ingests"]

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

    # Add TDR SA to original dataset
    logging.info(f"Adding TDR general SA ({tdr_sa_to_use}) to original dataset: {source_dataset_id}")
    try:
        resp = datasets_api.add_dataset_policy_member(id=source_dataset_id, policy_name="steward", policy_member={"email": tdr_sa_to_use}) 
        logging.info("TDR SA added successfully.")
    except:
        error_str = f"Error adding TDR SA to dataset {source_dataset_id}: {str(e)}"
        logging.error(error_str)
        config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
        return

    # Collect details from original dataset to build inventory of tables to migrate
    logging.info(f"Retrieving dataset details from original dataset: {source_dataset_id}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=source_dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
        config["bigquery_project"] = dataset_details["access_information"]["big_query"]["project_id"]
        config["bigquery_dataset"] = dataset_details["access_information"]["big_query"]["dataset_name"]
        fileref_col_dict = {}
        array_col_dict = {}
        float_col_dict = {}
        for table_entry in dataset_details["schema"]["tables"]:
            fileref_list = []
            array_list = []
            float_list = []
            for idx, column_entry in enumerate(table_entry["columns"]):
                if column_entry["datatype"] == "fileref":
                    fileref_list.append(column_entry["name"])
                elif column_entry["datatype"] == "float":
                    float_list.append(column_entry["name"])
                if column_entry["array_of"] == True:
                    array_list.append(column_entry["name"])
            fileref_col_dict[table_entry["name"]] = fileref_list
            array_col_dict[table_entry["name"]] = array_list
            float_col_dict[table_entry["name"]] = float_list
    except Exception as e:
        error_str = f"Error retrieving details from dataset {source_dataset_id}: {str(e)}"
        logging.error(error_str)
        config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
        return

    # Read in existing datarepo_row_id crosswalk, if one exists
    logging.info("Fetching existing datarepo_row_id crosswalk (if one exists).")
    xwalk_json_file_name = f"{source_dataset_id}_{target_dataset_id}_rowid_xwalk.json"
    try:
        with open(xwalk_json_file_name,"r") as file:
            datarepo_row_id_xwalk = json.load(file)
    except:
        datarepo_row_id_xwalk = {}
        logging.warning(f"No datarepo_row_id crosswalk file name '{xwalk_json_file_name}' found.")

    # Order tables for ingestion
    logging.info("Ordering tables and pulling current record counts for validation.")
    table_rank_dict = {}
    for table in fileref_col_dict.keys():
        if table == "file_inventory":
            table_rank_dict[table] = 1
        elif "anvil_" not in table:
            table_rank_dict[table] = 2
        else:
            table_rank_dict[table] = 3
    ordered_table_list = sorted(table_rank_dict, key= lambda key: table_rank_dict[key])

    # Fetch total record counts for all tables
    populated_table_dict = {}
    for table in ordered_table_list:
        api_client = refresh_tdr_api_client(tdr_host)
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        attempt_counter = 0
        while True:
            payload = {
              "offset": 0,
              "limit": 10,
              "sort": "datarepo_row_id",
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=source_dataset_id, table=table, query_data_request_model=payload).to_dict()
                total_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if attempt_counter < 5:
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    total_record_count = -1
                    break
        if total_record_count == -1:
            error_str = f"Error retrieving current record counts for tables in dataset {source_dataset_id}: {str(e)}"
            logging.error(error_str)
            config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
            return
        elif total_record_count > 0:
            populated_table_dict[table] = total_record_count

    # Loop through and process tables for ingestion
    logging.info("Processing dataset ingestion requests.")
    pop_fss_table_cnt = 0
    for table in ordered_table_list:

        # Determine whether table should be processed, and skip if not
        logging.info(f"Processing dataset ingestion for table '{table}'.")
        total_record_count = 0
        if tables_to_ingest and table not in tables_to_ingest:
            msg_str = f"Table '{table}' not listed in the tables_to_ingest parameter. Skipping."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
            continue
        elif table not in populated_table_dict.keys():
            msg_str = f"No records found for table '{table}' in original dataset. Continuing to next table/record set."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
            continue
        elif "anvil_" in table:
            # Confirm all non-FSS tables are present in datarepo_row_id_xwalk
            pop_fss_table_cnt += 1
            missing_tab_list = []
            for tab in populated_table_dict.keys():
                if "anvil_" not in tab and tab not in datarepo_row_id_xwalk.keys():
                    missing_tab_list.append(tab)
            if len(missing_tab_list) > 0:
                missing_tab_string = ", ".join(missing_tab_list)
                msg_str = f"Populated non-FSS tables missing from datarepo_row_id crosswalk: {missing_tab_string}. Skipping FSS table '{table}'."
                logging.info(msg_str)
                config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
                continue
        
        # Aggregate datarepo_row_id crosswalk informatino for us in FSS table processing
        if pop_fss_table_cnt == 1:
            dr_row_id_xwalk = {}
            for key in datarepo_row_id_xwalk.keys():
                dr_row_id_xwalk.update(datarepo_row_id_xwalk[key])
            config["dr_row_id_xwalk"] = dr_row_id_xwalk 
            
        # Chunk table records as necessary, then loop through and process each chunk
        total_record_count = populated_table_dict.get(table)
        if skip_ingests:
            msg_str = f"Parameter 'skip_ingests' set to true. Skipping ingestion for table '{table}'."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
        else:
            if fileref_col_dict[table]:
                ref_chunk_size = math.floor(max_combined_rec_ref_size / len(fileref_col_dict[table]))
                table_chunk_size = min(chunk_size, ref_chunk_size)
                logging.info(f"Table '{table}' contains fileref columns. Will use a chunk size of {table_chunk_size} rows per ingestion request, to keep the number of file references per chunk below {max_combined_rec_ref_size}.")
            else:
                table_chunk_size = chunk_size
                logging.info(f"Table '{table}' does not contain fileref columns. Will use a chunk size of {table_chunk_size} rows per ingestion request.")
            start_row = 1
            end_row = min((table_chunk_size), total_record_count)
            while start_row <= total_record_count:
                if end_row > total_record_count:
                    end_row = total_record_count
                ingest_table_data(config, target_dataset_id, array_col_dict, float_col_dict, table, start_row, end_row)    
                start_row += table_chunk_size
                end_row += table_chunk_size

        # Build datarepo_row_id crosswalk for the table, add to datarepo_row_id_xwalk dict, and write out updated dict to file
        if "anvil_" not in table: 
            logging.info("Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.")
            temp_dr_xwalk = {}
            api_client = refresh_tdr_api_client(tdr_host)
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            max_page_size = 1000
            records_fetched = 0
            retrieval_error = False
            while records_fetched < total_record_count and not retrieval_error:
                row_start = records_fetched
                attempt_counter = 0
                while True:
                    payload = {
                      "offset": row_start,
                      "limit": max_page_size,
                      "sort": "datarepo_row_id",
                      "direction": "asc",
                      "filter": ""
                    }
                    try:
                        dataset_results = datasets_api.query_dataset_data_by_id(id=target_dataset_id, table=table, query_data_request_model=payload).to_dict() 
                        if len(dataset_results["result"]) == 0:
                            warn_str = f"No records found for '{table}' table, which prevents the proper building of the datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream."
                            logging.warning(warn_str)
                            retrieval_error = True
                            break  
                        else:
                            for record in dataset_results["result"]:
                                key = table + ":" + record["orig_datarepo_row_id"]
                                val = table + ":" + record["datarepo_row_id"]
                                temp_dr_xwalk[key] = val
                                records_fetched += 1
                            break
                    except Exception as e:
                        if attempt_counter < 5:
                            sleep(10)
                            attempt_counter += 1
                            continue
                        else:
                            warn_str = f"Error retrieving records for '{table}' table to build datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream."
                            logging.warning(warn_str)
                            retrieval_error = True
                            break
            if not retrieval_error:
                datarepo_row_id_xwalk[table] = temp_dr_xwalk
                with open(xwalk_json_file_name, 'w') as file:
                    json.dump(datarepo_row_id_xwalk, file)
        
        # Fetch total record count for the new table
        api_client = refresh_tdr_api_client(tdr_host)
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        attempt_counter = 0
        while True:
            payload = {
              "offset": 0,
              "limit": 10,
              "sort": "datarepo_row_id",
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=target_dataset_id, table=table, query_data_request_model=payload).to_dict()
                new_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if attempt_counter < 5:
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    new_record_count = -1
                    break
        if new_record_count == -1:
            err_str = f"Error retrieving record count for table '{table}' in new dataset. Skipping validation and continuing to next table."
            logging.error(err_str)
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Failure", err_str])
            continue 

        # Validate the new table against the old table, with extra scrutiny given to the file_inventory table for AnVIL migrations
        logging.info(f"Validating table '{table}' in new dataset vs. original dataset.")
        if new_record_count == total_record_count:
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Success", f"{new_record_count} records found in both new and original table."])
        else:
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Failure", f"{new_record_count} records found in new table doesn't match {total_record_count} records in original table."])

    # Display results
    pipeline_results = pd.DataFrame(config["migration_results"], columns = ["Task", "Step", "Status", "Message"])
    failures = pipeline_results[pipeline_results["Status"].str.contains("Failure")]
    logging.info("Migration Pipeline Results:")
    display(pipeline_results)
    logging.info(f"\nPipeline finished with {len(failures)} failures.")
    return len(failures)

# Function for creating a snapshot for the new dataset
def recreate_snapshot(config):
    
    # Extract parameters from config
    target_dataset_id = config["target_dataset_id"] 
    azure_billing_profile = config["azure_billing_profile"] 
    tdr_host = config["tdr_host"] 
    anvil_schema = config["anvil_schema"] 
    
    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    
    # Retrieve new dataset details
    logging.info(f"Retrieving dataset details from prod environment. UUID:  {target_dataset_id}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=target_dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
        dataset_name = dataset_details["name"]
        phs_id = dataset_details["phs_id"]
        consent_name = dataset_details["properties"]["consent_name"]
        auth_domains = dataset_details["properties"]["auth_domains"]
        current_datetime = datetime.datetime.now()
        current_datetime_string = current_datetime.strftime("%Y%m%d%H%M")
        snapshot_name = dataset_name + "_" + anvil_schema + "_" + current_datetime_string
    except Exception as e:
        error_str = f"Error retrieving details from dataset: {str(e)}"
        logging.error(error_str)
    # Build config and submit snapshot job
    snapshot_config = {
        "profile_id": azure_billing_profile,
        "snapshot_readers_list": ["azul-anvil-prod@firecloud.org", "auth-domain"],
        "anvil_schema_versin": anvil_schema,
        "ws_bucket": os.environ["WORKSPACE_BUCKET"],
        "dataset_id": entry[1],
        "dataset_name": dataset_name,
        "phs_id": phs_id,
        "consent_name": consent_name,
        "auth_domains": auth_domains,
        "pipeline_results": [],
        "snapshot_name": snapshot_name
    }
    utils.create_and_share_snapshot(snapshot_config)
    int_df_results = pd.DataFrame(snapshot_config["pipeline_results"], columns = ["Dataset", "Time", "Step", "Task", "Status", "Message"])
    errors = int_df_results[int_df_results["Status"].str.contains("Error")]
    if len(errors) > 0:
        logging.error("Errors reported in snapshotting. See logs for details.")
        status = "Failure"
        message = f"{len(errors)} failures reported. See log for details."
        snapshot_id = ""
        snapshot_name = ""
    else:
        status = "Success"
        message = ""
        snapshot_id = re.search("{'id': '([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'", str(int_df_results[int_df_results["Task"]=="Create and Share Snapshot"]["Message"]))[1]
        snapshot_name = re.search("'name': '([a-zA-Z0-9_\-]+)'", str(int_df_results[int_df_results["Task"]=="Create and Share Snapshot"]["Message"]))[1]
    return status, message, snapshot_id, snapshot_name
        
#############################################
## Input Parameters
#############################################

# Specify migration pairs: [Source GCP Dataset, Target Azure Dataset]
migration_list = [
    #["gcp_dataset_id", "az_dataset_id"]
    ['dd2cb8fc-42a6-482f-898e-ef6125feccb8', '245020b9-7355-4002-95db-12e7234070c5'],
    ['92382848-f5e9-426c-b7dc-f2841ae97018', '8a90137a-7aed-4e8c-bd99-1399f1c550fd'],
    ['4999a410-990e-484b-b4f3-d636f894a741', '79abc50c-6a4e-47e0-962a-4bfa7cb8e321'],
    ['1f534eb4-701f-4182-9895-64c5e5b52d82', '1bf4d70f-db98-4d07-b48f-d177efd25ae4'],
    ['039dd3d6-0cb5-4cd1-86b3-e9579c9b5218', '12c0a3ee-4a21-4be9-a09f-762e2737da1c'],
    ['e68d1d39-99df-4cd7-8053-1b298f03eabb', 'cfcb0f71-1157-4dbb-a76b-926c0cd40ea2'],
    ['d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20', '5da7394c-1432-4ee0-add5-851280e32d24'],
    ['7427b2eb-a84f-413c-bfb0-7d2e36b0628f', '4fd72248-6778-4f7c-880a-e61773531d0d'],
    ['2ef4530a-cc36-4f32-9a1a-63a555346587', 'd28cb1d4-2300-4cd7-882b-99ce59305ce0'],
    ['65793118-3c88-4185-9172-2354850e6056', '183ec762-f867-46c5-bb19-8b2b3417f7b2'],
    ['36bdd59f-4f5b-43cd-8d34-a21ef87bbf30', '933d1603-8c61-4ff2-8489-7f774ac15e97'],
    ['3abfc362-7e73-4663-9dcf-07b78b9aa2d4', '5dd1128b-6a23-486e-8950-47c7d9f687a8'],
    ['b60b4737-c646-4299-85a0-520890e830b7', '757191b0-9db3-4d18-b4ad-97bead5f3221'],
]

# Run parameters
azure_billing_profile = "9ee23bed-b46c-4561-9103-d2a723113f7f"
anvil_schema = "ANV5"
run_data_migration = True
skip_ingests = False # Set to True to build datarepo_row_id xwalk and run validation w/o ingesting more records
tables_to_ingest = [] # Leave empty for all
run_snapshot_creation = True

#############################################
## Execution
#############################################

# Set up logging
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
logs_stream_file_path = "processing_details_" + current_datetime_string + ".log"
while logging.root.handlers:
    logging.root.removeHandler(logging.root.handlers[-1])
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO, handlers=[logging.FileHandler(logs_stream_file_path), logging.StreamHandler(sys.stdout)])

# Loop through migration list and process entries
results = []
for entry in migration_list:
    
    # Run cross-cloud ingestion, if specified
    failure_count = 0
    if run_data_migration:
        logging.info(f"\nMigrating tabular data from TDR dataset {entry[0]} to TDR dataset {entry[1]}.")
        # Build config and submit migration job
        config = {
            "source_dataset_id": entry[0], 
            "target_dataset_id": entry[1],
            "tables_to_ingest": tables_to_ingest,
            "tdr_host": "https://data.terra.bio",
            "tdr_sa_to_use": "datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com",
            "tar_tdr_billing_profile": azure_billing_profile,
            "chunk_size": 250000,
            "max_combined_rec_ref_size": 40000,
            "migration_results": [],
            "dr_row_id_xwalk": {},
            "skip_ingests": skip_ingests
        }
        failure_count = migrate_tabular_data(config)
        status = "Failure" if failure_count > 0 else "Success"
        msg = f"{failure_count} failures reported. See log for details." if failure_count > 0 else ""
        results.append([entry[0], entry[1], "Data Ingestion", status, msg, "", ""])

    # Run snapshotting, if specified and no upstream errors detected
    if run_snapshot_creation:
        logging.info(f"Creating a snapshot for TDR dataset {entry[1]}.")
        # Build config and submit snapshot job
        config = { 
            "target_dataset_id": entry[1],
            "tdr_host": "https://data.terra.bio",
            "azure_billing_profile": azure_billing_profile,
            "anvil_schema": anvil_schema
        }
        if failure_count > 0:
            logging.error("Failures noted in upstream data processing. Skipping snapshotting.")
            results.append([entry[0], entry[1], "Data Snapshotting", "Skipped", "Failures noted in upstream data processing.", "", ""])
        else:
            status, message, snapshot_id, snapshot_name = recreate_snapshot(config)
            results.append([entry[0], entry[1], "Data Snapshotting", status, message, snapshot_id, snapshot_name])
            
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Source Dataset ID", "Target Dataset ID", "Processing Step", "Status", "Message", "Snapshot ID", "Snapshot Name"])
display(results_df)


09/18/2024 08:13:43 PM - INFO: 
Migrating tabular data from TDR dataset dd2cb8fc-42a6-482f-898e-ef6125feccb8 to TDR dataset 245020b9-7355-4002-95db-12e7234070c5.
09/18/2024 08:13:43 PM - INFO: Adding TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) to original dataset: dd2cb8fc-42a6-482f-898e-ef6125feccb8
09/18/2024 08:13:43 PM - INFO: TDR SA added successfully.
09/18/2024 08:13:43 PM - INFO: Retrieving dataset details from original dataset: dd2cb8fc-42a6-482f-898e-ef6125feccb8
09/18/2024 08:13:44 PM - INFO: Fetching existing datarepo_row_id crosswalk (if one exists).
09/18/2024 08:13:44 PM - INFO: Ordering tables and pulling current record counts for validation.
09/18/2024 08:13:59 PM - INFO: Processing dataset ingestion requests.
09/18/2024 08:13:59 PM - INFO: Processing dataset ingestion for table 'file_inventory'.
09/18/2024 08:13:59 PM - INFO: Table 'file_inventory' contains fileref columns. Will use a chunk size of 40000 rows per ingestion requ

09/18/2024 08:28:10 PM - ERROR: Maximum number of retries exceeded. Logging error.
09/18/2024 08:28:10 PM - INFO: Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.
09/18/2024 08:28:23 PM - INFO: Validating table 'sample' in new dataset vs. original dataset.
09/18/2024 08:28:23 PM - INFO: Processing dataset ingestion for table 'participant'.
09/18/2024 08:28:23 PM - INFO: Table 'participant' does not contain fileref columns. Will use a chunk size of 250000 rows per ingestion request.
09/18/2024 08:28:23 PM - INFO: Fetching rows 1-5031 from table 'participant' in the original dataset (dd2cb8fc-42a6-482f-898e-ef6125feccb8).
09/18/2024 08:28:27 PM - INFO: Submitting ingestion request to new dataset (245020b9-7355-4002-95db-12e7234070c5).
TDR Job ID: _JiX9C-dSIOev1jXPA3P-A
09/18/2024 08:29:50 PM - INFO: Ingest succeeded: {'dataset_id': '245020b9-7355-4002-95db-12e7234070c5', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_BioVU_HMB_GSO_WES_20240307', 'table': 'p

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-15101,Success,"{'dataset_id': '245020b9-7355-4002-95db-12e7234070c5', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_BioVU_HMB_GSO_WES_20240307', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for 245020b9-7355-4002-95db-12e7234070c5', 'row_count': 15101, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,15101 records found in both new and original table.
2,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-27,Success,"{'dataset_id': '245020b9-7355-4002-95db-12e7234070c5', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_BioVU_HMB_GSO_WES_20240307', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for 245020b9-7355-4002-95db-12e7234070c5', 'row_count': 27, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: workspace_attributes,Success,27 records found in both new and original table.
4,Dataset Ingestion,Table: subject -- Rows: 1-5031,Success,"{'dataset_id': '245020b9-7355-4002-95db-12e7234070c5', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_BioVU_HMB_GSO_WES_20240307', 'table': 'subject', 'path': None, 'load_tag': 'Ingest for 245020b9-7355-4002-95db-12e7234070c5', 'row_count': 5031, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: subject,Success,5031 records found in both new and original table.
6,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': '245020b9-7355-4002-95db-12e7234070c5', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_BioVU_HMB_GSO_WES_20240307', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for 245020b9-7355-4002-95db-12e7234070c5', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
7,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.
8,Dataset Ingestion,Table: sample -- Rows: 1-5031,Failure,"Error on ingest: Job njlNkfV4SPW6jVNoCWKfVA failed: (500)\nReason: Internal Server Error\nHTTP response headers: HTTPHeaderDict({'Date': 'Wed, 18 Sep 2024 20:28:10 GMT', 'Server': 'Apache', 'X-Frame-Options': 'SAMEORIGIN', 'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization,Accept,Referer,X-App-Id,Origin', 'Access-Control-Allow-Methods': 'GET,POST,DELETE,PUT,PATCH,OPTIONS,HEAD', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000;includeSubDomains', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'X-Request-ID': 'a5JyGd4o', 'Content-Type': 'application/json', 'Vary': 'Origin', 'Via': '1.1 google', 'Alt-Svc': 'h3="":443""; ma=2592000,h3-29="":443""; ma=2592000', 'Transfer-Encoding': 'chunked'})\nHTTP response body: {""timestamp"":1726691290865,""status"":500,""error"":""Internal Server Error"",""path"":""/api/repository/v1/jobs/njlNkfV4SPW6jVNoCWKfVA/result""}\n"
9,Dataset Validation,Table: sample,Failure,0 records found in new table doesn't match 5031 records in original table.


09/18/2024 08:30:10 PM - INFO: 
Pipeline finished with 2 failures.
09/18/2024 08:30:10 PM - INFO: Creating a snapshot for TDR dataset 245020b9-7355-4002-95db-12e7234070c5.
09/18/2024 08:30:10 PM - ERROR: Failures noted in upstream data processing. Skipping snapshotting.
09/18/2024 08:30:10 PM - INFO: 
Migrating tabular data from TDR dataset 92382848-f5e9-426c-b7dc-f2841ae97018 to TDR dataset 8a90137a-7aed-4e8c-bd99-1399f1c550fd.
09/18/2024 08:30:10 PM - INFO: Adding TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) to original dataset: 92382848-f5e9-426c-b7dc-f2841ae97018
09/18/2024 08:30:10 PM - INFO: TDR SA added successfully.
09/18/2024 08:30:10 PM - INFO: Retrieving dataset details from original dataset: 92382848-f5e9-426c-b7dc-f2841ae97018
09/18/2024 08:30:10 PM - INFO: Fetching existing datarepo_row_id crosswalk (if one exists).
09/18/2024 08:30:10 PM - INFO: Ordering tables and pulling current record counts for validation.
09/18/2024 08:30:26 P

09/18/2024 08:36:54 PM - ERROR: Maximum number of retries exceeded. Logging error.
09/18/2024 08:36:54 PM - INFO: Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.
09/18/2024 08:38:04 PM - INFO: Validating table 'sample' in new dataset vs. original dataset.
09/18/2024 08:38:04 PM - INFO: Processing dataset ingestion for table 'workspace_attributes'.
09/18/2024 08:38:04 PM - INFO: Table 'workspace_attributes' does not contain fileref columns. Will use a chunk size of 250000 rows per ingestion request.
09/18/2024 08:38:04 PM - INFO: Fetching rows 1-30 from table 'workspace_attributes' in the original dataset (92382848-f5e9-426c-b7dc-f2841ae97018).
09/18/2024 08:38:06 PM - INFO: Submitting ingestion request to new dataset (8a90137a-7aed-4e8c-bd99-1399f1c550fd).
TDR Job ID: n5f3NUkVRaWDu0yUf2C6tw
09/18/2024 08:38:47 PM - INFO: Ingest succeeded: {'dataset_id': '8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_EAST_DS_WES_202

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-4696,Success,"{'dataset_id': '8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_EAST_DS_WES_20240220', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for 8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'row_count': 4696, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,4696 records found in both new and original table.
2,Dataset Ingestion,Table: participant -- Rows: 1-1565,Success,"{'dataset_id': '8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_EAST_DS_WES_20240220', 'table': 'participant', 'path': None, 'load_tag': 'Ingest for 8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'row_count': 1565, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: participant,Success,1565 records found in both new and original table.
4,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': '8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_EAST_DS_WES_20240220', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for 8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.
6,Dataset Ingestion,Table: sample -- Rows: 1-1565,Failure,"Error on ingest: Job poe1HYYuSweBq3NhXnACKg failed: (500)\nReason: Internal Server Error\nHTTP response headers: HTTPHeaderDict({'Date': 'Wed, 18 Sep 2024 20:36:54 GMT', 'Server': 'Apache', 'X-Frame-Options': 'SAMEORIGIN', 'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization,Accept,Referer,X-App-Id,Origin', 'Access-Control-Allow-Methods': 'GET,POST,DELETE,PUT,PATCH,OPTIONS,HEAD', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000;includeSubDomains', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'X-Request-ID': 'pLGbMb0J', 'Content-Type': 'application/json', 'Vary': 'Origin', 'Via': '1.1 google', 'Alt-Svc': 'h3="":443""; ma=2592000,h3-29="":443""; ma=2592000', 'Transfer-Encoding': 'chunked'})\nHTTP response body: {""timestamp"":1726691814736,""status"":500,""error"":""Internal Server Error"",""path"":""/api/repository/v1/jobs/poe1HYYuSweBq3NhXnACKg/result""}\n"
7,Dataset Validation,Table: sample,Failure,0 records found in new table doesn't match 1565 records in original table.
8,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-30,Success,"{'dataset_id': '8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_EAST_DS_WES_20240220', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for 8a90137a-7aed-4e8c-bd99-1399f1c550fd', 'row_count': 30, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: workspace_attributes,Success,30 records found in both new and original table.


09/18/2024 08:40:44 PM - INFO: 
Pipeline finished with 2 failures.
09/18/2024 08:40:44 PM - INFO: Creating a snapshot for TDR dataset 8a90137a-7aed-4e8c-bd99-1399f1c550fd.
09/18/2024 08:40:44 PM - ERROR: Failures noted in upstream data processing. Skipping snapshotting.
09/18/2024 08:40:44 PM - INFO: 
Migrating tabular data from TDR dataset 4999a410-990e-484b-b4f3-d636f894a741 to TDR dataset 79abc50c-6a4e-47e0-962a-4bfa7cb8e321.
09/18/2024 08:40:44 PM - INFO: Adding TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) to original dataset: 4999a410-990e-484b-b4f3-d636f894a741
09/18/2024 08:40:45 PM - INFO: TDR SA added successfully.
09/18/2024 08:40:45 PM - INFO: Retrieving dataset details from original dataset: 4999a410-990e-484b-b4f3-d636f894a741
09/18/2024 08:40:45 PM - INFO: Fetching existing datarepo_row_id crosswalk (if one exists).
09/18/2024 08:40:45 PM - INFO: Ordering tables and pulling current record counts for validation.
09/18/2024 08:40:58 P

09/18/2024 08:46:58 PM - INFO: Populated non-FSS tables missing from datarepo_row_id crosswalk: subject. Skipping FSS table 'anvil_dataset'.
09/18/2024 08:46:58 PM - INFO: Processing dataset ingestion for table 'anvil_variantcallingactivity'.
09/18/2024 08:46:58 PM - INFO: No records found for table 'anvil_variantcallingactivity' in original dataset. Continuing to next table/record set.
09/18/2024 08:46:58 PM - INFO: Processing dataset ingestion for table 'anvil_alignmentactivity'.
09/18/2024 08:46:58 PM - INFO: No records found for table 'anvil_alignmentactivity' in original dataset. Continuing to next table/record set.
09/18/2024 08:46:58 PM - INFO: Processing dataset ingestion for table 'anvil_antibody'.
09/18/2024 08:46:58 PM - INFO: No records found for table 'anvil_antibody' in original dataset. Continuing to next table/record set.
09/18/2024 08:46:58 PM - INFO: Processing dataset ingestion for table 'anvil_assayactivity'.
09/18/2024 08:46:58 PM - INFO: No records found for table

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-1911,Success,"{'dataset_id': '79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_Arrays_20240220', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for 79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'row_count': 1911, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,1911 records found in both new and original table.
2,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-29,Success,"{'dataset_id': '79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_Arrays_20240220', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for 79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'row_count': 29, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: workspace_attributes,Success,29 records found in both new and original table.
4,Dataset Ingestion,Table: participant -- Rows: 1-381,Success,"{'dataset_id': '79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_Arrays_20240220', 'table': 'participant', 'path': None, 'load_tag': 'Ingest for 79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'row_count': 381, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: participant,Success,381 records found in both new and original table.
6,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': '79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_Arrays_20240220', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for 79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
7,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.
8,Dataset Ingestion,Table: subject -- Rows: 1-379,Success,"{'dataset_id': '79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_Arrays_20240220', 'table': 'subject', 'path': None, 'load_tag': 'Ingest for 79abc50c-6a4e-47e0-962a-4bfa7cb8e321', 'row_count': 379, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: subject,Success,379 records found in both new and original table.


09/18/2024 08:46:58 PM - INFO: 
Pipeline finished with 0 failures.
09/18/2024 08:46:58 PM - INFO: Creating a snapshot for TDR dataset 79abc50c-6a4e-47e0-962a-4bfa7cb8e321.
09/18/2024 08:46:58 PM - INFO: Retrieving dataset details from prod environment. UUID:  79abc50c-6a4e-47e0-962a-4bfa7cb8e321
09/18/2024 08:47:00 PM - INFO: Creating full-view snapshot.
09/18/2024 08:47:00 PM - INFO: Attempting to lookup consent code using PHS: 1062 and Consent Name: TBD.
09/18/2024 08:47:01 PM - INFO: Submitting snapshot request.
TDR Job ID: XESZL5gYQnCTn8iKkNbG3w
09/18/2024 08:48:12 PM - INFO: Snapshot Creation succeeded: {'id': 'bf5e1ef2-f9b0-42b1-9f2e-deb8dcd67b81', 'name': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_Arrays_20240220_ANV5_202409182047', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_Arrays_20240220', 'createdDate': '2024-09-18T20:47:01.709174Z', 'profileId': '9ee23bed-b46c-4561-9103-d2a723113f7f', 'storage': [{'region': 'southcentralus', 'cloudResource': 'applic

09/18/2024 08:54:08 PM - INFO: Ingest succeeded: {'dataset_id': '1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_DS_WES_20240220', 'table': 'subject', 'path': None, 'load_tag': 'Ingest for 1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'row_count': 489, 'bad_row_count': 0, 'load_result': None}
09/18/2024 08:54:08 PM - INFO: Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.
09/18/2024 08:55:17 PM - INFO: Validating table 'subject' in new dataset vs. original dataset.
09/18/2024 08:55:17 PM - INFO: Processing dataset ingestion for table 'anvil_file'.
09/18/2024 08:55:17 PM - INFO: Populated non-FSS tables missing from datarepo_row_id crosswalk: sample, subject. Skipping FSS table 'anvil_file'.
09/18/2024 08:55:17 PM - INFO: Processing dataset ingestion for table 'anvil_diagnosis'.
09/18/2024 08:55:17 PM - INFO: No records found for table 'anvil_diagnosis' in original dataset. Continuing to next table/record set.
09/1

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-1468,Success,"{'dataset_id': '1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_DS_WES_20240220', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for 1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'row_count': 1468, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,1468 records found in both new and original table.
2,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': '1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_DS_WES_20240220', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for 1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.
4,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-30,Success,"{'dataset_id': '1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_DS_WES_20240220', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for 1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'row_count': 30, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: workspace_attributes,Success,30 records found in both new and original table.
6,Dataset Ingestion,Table: sample -- Rows: 1-489,Failure,"Error on ingest: Job oGmBGjlZR1aGEfYFz0dumg failed: (500)\nReason: Internal Server Error\nHTTP response headers: HTTPHeaderDict({'Date': 'Wed, 18 Sep 2024 20:52:47 GMT', 'Server': 'Apache', 'X-Frame-Options': 'SAMEORIGIN', 'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization,Accept,Referer,X-App-Id,Origin', 'Access-Control-Allow-Methods': 'GET,POST,DELETE,PUT,PATCH,OPTIONS,HEAD', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000;includeSubDomains', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'X-Request-ID': 'pPLYJnKz', 'Content-Type': 'application/json', 'Vary': 'Origin', 'Via': '1.1 google', 'Alt-Svc': 'h3="":443""; ma=2592000,h3-29="":443""; ma=2592000', 'Transfer-Encoding': 'chunked'})\nHTTP response body: {""timestamp"":1726692767807,""status"":500,""error"":""Internal Server Error"",""path"":""/api/repository/v1/jobs/oGmBGjlZR1aGEfYFz0dumg/result""}\n"
7,Dataset Validation,Table: sample,Failure,0 records found in new table doesn't match 489 records in original table.
8,Dataset Ingestion,Table: participant -- Rows: 1-489,Success,"{'dataset_id': '1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_DS_WES_20240220', 'table': 'participant', 'path': None, 'load_tag': 'Ingest for 1bf4d70f-db98-4d07-b48f-d177efd25ae4', 'row_count': 489, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: participant,Success,489 records found in both new and original table.


09/18/2024 08:55:17 PM - INFO: 
Pipeline finished with 2 failures.
09/18/2024 08:55:17 PM - INFO: Creating a snapshot for TDR dataset 1bf4d70f-db98-4d07-b48f-d177efd25ae4.
09/18/2024 08:55:17 PM - ERROR: Failures noted in upstream data processing. Skipping snapshotting.
09/18/2024 08:55:17 PM - INFO: 
Migrating tabular data from TDR dataset 039dd3d6-0cb5-4cd1-86b3-e9579c9b5218 to TDR dataset 12c0a3ee-4a21-4be9-a09f-762e2737da1c.
09/18/2024 08:55:17 PM - INFO: Adding TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) to original dataset: 039dd3d6-0cb5-4cd1-86b3-e9579c9b5218
09/18/2024 08:55:17 PM - INFO: TDR SA added successfully.
09/18/2024 08:55:17 PM - INFO: Retrieving dataset details from original dataset: 039dd3d6-0cb5-4cd1-86b3-e9579c9b5218
09/18/2024 08:55:17 PM - INFO: Fetching existing datarepo_row_id crosswalk (if one exists).
09/18/2024 08:55:17 PM - INFO: Ordering tables and pulling current record counts for validation.
09/18/2024 08:55:31 P

09/18/2024 09:06:45 PM - INFO: No records found for table 'anvil_antibody' in original dataset. Continuing to next table/record set.
09/18/2024 09:06:45 PM - INFO: Processing dataset ingestion for table 'anvil_biosample'.
09/18/2024 09:06:45 PM - INFO: Populated non-FSS tables missing from datarepo_row_id crosswalk: subject. Skipping FSS table 'anvil_biosample'.
09/18/2024 09:06:45 PM - INFO: Processing dataset ingestion for table 'anvil_diagnosis'.
09/18/2024 09:06:45 PM - INFO: No records found for table 'anvil_diagnosis' in original dataset. Continuing to next table/record set.
09/18/2024 09:06:45 PM - INFO: Processing dataset ingestion for table 'anvil_activity'.
09/18/2024 09:06:45 PM - INFO: Populated non-FSS tables missing from datarepo_row_id crosswalk: subject. Skipping FSS table 'anvil_activity'.
09/18/2024 09:06:45 PM - INFO: Processing dataset ingestion for table 'anvil_project'.
09/18/2024 09:06:45 PM - INFO: Populated non-FSS tables missing from datarepo_row_id crosswalk:

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-10411,Success,"{'dataset_id': '12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_Arrays_20240220', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for 12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'row_count': 10411, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,10411 records found in both new and original table.
2,Dataset Ingestion,Table: sample -- Rows: 1-2081,Success,"{'dataset_id': '12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_Arrays_20240220', 'table': 'sample', 'path': None, 'load_tag': 'Ingest for 12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'row_count': 2081, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: sample,Success,2081 records found in both new and original table.
4,Dataset Ingestion,Table: subject -- Rows: 1-2079,Success,"{'dataset_id': '12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_Arrays_20240220', 'table': 'subject', 'path': None, 'load_tag': 'Ingest for 12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'row_count': 2079, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: subject,Success,2079 records found in both new and original table.
6,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': '12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_Arrays_20240220', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for 12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
7,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.
8,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-29,Success,"{'dataset_id': '12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_Arrays_20240220', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for 12c0a3ee-4a21-4be9-a09f-762e2737da1c', 'row_count': 29, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: workspace_attributes,Success,29 records found in both new and original table.


09/18/2024 09:06:45 PM - INFO: 
Pipeline finished with 0 failures.
09/18/2024 09:06:45 PM - INFO: Creating a snapshot for TDR dataset 12c0a3ee-4a21-4be9-a09f-762e2737da1c.
09/18/2024 09:06:45 PM - INFO: Retrieving dataset details from prod environment. UUID:  12c0a3ee-4a21-4be9-a09f-762e2737da1c
09/18/2024 09:06:47 PM - INFO: Creating full-view snapshot.
09/18/2024 09:06:47 PM - INFO: Attempting to lookup consent code using PHS: 2236 and Consent Name: TBD.
09/18/2024 09:06:47 PM - INFO: Submitting snapshot request.
TDR Job ID: 5asqUKrGRy6wTd0XDLwHZQ
09/18/2024 09:08:59 PM - INFO: Snapshot Creation succeeded: {'id': '1a253008-05b2-43b2-9d42-37a8f32d8a1a', 'name': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_Arrays_20240220_ANV5_202409182106', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_Arrays_20240220', 'createdDate': '2024-09-18T21:06:48.271898Z', 'profileId': '9ee23bed-b46c-4561-9103-d2a723113f7f', 'storage': [{'region': 'southcentralus', 'cloudResource': 'applic

09/18/2024 09:14:29 PM - INFO: Submitting ingestion request to new dataset (cfcb0f71-1157-4dbb-a76b-926c0cd40ea2).
TDR Job ID: ZO4XwoT3Q76SHW2M67CqsQ
09/18/2024 09:15:21 PM - INFO: Ingest succeeded: {'dataset_id': 'cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_WES_20240307', 'table': 'participant', 'path': None, 'load_tag': 'Ingest for cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'row_count': 2150, 'bad_row_count': 0, 'load_result': None}
09/18/2024 09:15:21 PM - INFO: Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.
09/18/2024 09:15:31 PM - INFO: Validating table 'participant' in new dataset vs. original dataset.
09/18/2024 09:15:31 PM - INFO: Processing dataset ingestion for table 'subject'.
09/18/2024 09:15:31 PM - INFO: Table 'subject' does not contain fileref columns. Will use a chunk size of 250000 rows per ingestion request.
09/18/2024 09:15:31 PM - INFO: Fetching rows 1-2150 from table 'subject' in the

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-6454,Success,"{'dataset_id': 'cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_WES_20240307', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'row_count': 6454, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,6454 records found in both new and original table.
2,Dataset Ingestion,Table: sample -- Rows: 1-2150,Failure,"Error on ingest: Job hDXYxD65RT-GaZpJnqOd8g failed: (500)\nReason: Internal Server Error\nHTTP response headers: HTTPHeaderDict({'Date': 'Wed, 18 Sep 2024 21:14:21 GMT', 'Server': 'Apache', 'X-Frame-Options': 'SAMEORIGIN', 'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization,Accept,Referer,X-App-Id,Origin', 'Access-Control-Allow-Methods': 'GET,POST,DELETE,PUT,PATCH,OPTIONS,HEAD', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000;includeSubDomains', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'X-Request-ID': 'e48qGOXV', 'Content-Type': 'application/json', 'Vary': 'Origin', 'Via': '1.1 google', 'Alt-Svc': 'h3="":443""; ma=2592000,h3-29="":443""; ma=2592000', 'Transfer-Encoding': 'chunked'})\nHTTP response body: {""timestamp"":1726694062122,""status"":500,""error"":""Internal Server Error"",""path"":""/api/repository/v1/jobs/hDXYxD65RT-GaZpJnqOd8g/result""}\n"
3,Dataset Validation,Table: sample,Failure,0 records found in new table doesn't match 2150 records in original table.
4,Dataset Ingestion,Table: participant -- Rows: 1-2150,Success,"{'dataset_id': 'cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_WES_20240307', 'table': 'participant', 'path': None, 'load_tag': 'Ingest for cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'row_count': 2150, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: participant,Success,2150 records found in both new and original table.
6,Dataset Ingestion,Table: subject -- Rows: 1-2150,Success,"{'dataset_id': 'cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_WES_20240307', 'table': 'subject', 'path': None, 'load_tag': 'Ingest for cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'row_count': 2150, 'bad_row_count': 0, 'load_result': None}"
7,Dataset Validation,Table: subject,Success,2150 records found in both new and original table.
8,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-27,Success,"{'dataset_id': 'cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_GAPP_DS_MDS_WES_20240307', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for cfcb0f71-1157-4dbb-a76b-926c0cd40ea2', 'row_count': 27, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: workspace_attributes,Success,27 records found in both new and original table.


09/18/2024 09:18:36 PM - INFO: 
Pipeline finished with 2 failures.
09/18/2024 09:18:36 PM - INFO: Creating a snapshot for TDR dataset cfcb0f71-1157-4dbb-a76b-926c0cd40ea2.
09/18/2024 09:18:36 PM - ERROR: Failures noted in upstream data processing. Skipping snapshotting.
09/18/2024 09:18:36 PM - INFO: 
Migrating tabular data from TDR dataset d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20 to TDR dataset 5da7394c-1432-4ee0-add5-851280e32d24.
09/18/2024 09:18:36 PM - INFO: Adding TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) to original dataset: d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20
09/18/2024 09:18:36 PM - INFO: TDR SA added successfully.
09/18/2024 09:18:36 PM - INFO: Retrieving dataset details from original dataset: d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20
09/18/2024 09:18:36 PM - INFO: Fetching existing datarepo_row_id crosswalk (if one exists).
09/18/2024 09:18:36 PM - INFO: Ordering tables and pulling current record counts for validation.
09/18/2024 09:18:49 P

09/18/2024 09:24:11 PM - INFO: No records found for table 'anvil_sequencingactivity' in original dataset. Continuing to next table/record set.
09/18/2024 09:24:11 PM - INFO: Processing dataset ingestion for table 'anvil_antibody'.
09/18/2024 09:24:11 PM - INFO: No records found for table 'anvil_antibody' in original dataset. Continuing to next table/record set.
09/18/2024 09:24:11 PM - INFO: Processing dataset ingestion for table 'anvil_variantcallingactivity'.
09/18/2024 09:24:11 PM - INFO: No records found for table 'anvil_variantcallingactivity' in original dataset. Continuing to next table/record set.
09/18/2024 09:24:11 PM - INFO: Processing dataset ingestion for table 'anvil_alignmentactivity'.
09/18/2024 09:24:11 PM - INFO: No records found for table 'anvil_alignmentactivity' in original dataset. Continuing to next table/record set.
09/18/2024 09:24:11 PM - INFO: Processing dataset ingestion for table 'anvil_project'.
09/18/2024 09:24:11 PM - INFO: Populated non-FSS tables missi

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-784,Success,"{'dataset_id': '5da7394c-1432-4ee0-add5-851280e32d24', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_Arrays_20240220', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for 5da7394c-1432-4ee0-add5-851280e32d24', 'row_count': 784, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,784 records found in both new and original table.
2,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-30,Success,"{'dataset_id': '5da7394c-1432-4ee0-add5-851280e32d24', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_Arrays_20240220', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for 5da7394c-1432-4ee0-add5-851280e32d24', 'row_count': 30, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: workspace_attributes,Success,30 records found in both new and original table.
4,Dataset Ingestion,Table: sample -- Rows: 1-156,Success,"{'dataset_id': '5da7394c-1432-4ee0-add5-851280e32d24', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_Arrays_20240220', 'table': 'sample', 'path': None, 'load_tag': 'Ingest for 5da7394c-1432-4ee0-add5-851280e32d24', 'row_count': 156, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: sample,Success,156 records found in both new and original table.
6,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': '5da7394c-1432-4ee0-add5-851280e32d24', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_Arrays_20240220', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for 5da7394c-1432-4ee0-add5-851280e32d24', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
7,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.
8,Dataset Ingestion,Table: subject -- Rows: 1-156,Success,"{'dataset_id': '5da7394c-1432-4ee0-add5-851280e32d24', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_Arrays_20240220', 'table': 'subject', 'path': None, 'load_tag': 'Ingest for 5da7394c-1432-4ee0-add5-851280e32d24', 'row_count': 156, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: subject,Success,156 records found in both new and original table.


09/18/2024 09:24:11 PM - INFO: 
Pipeline finished with 0 failures.
09/18/2024 09:24:11 PM - INFO: Creating a snapshot for TDR dataset 5da7394c-1432-4ee0-add5-851280e32d24.
09/18/2024 09:24:11 PM - INFO: Retrieving dataset details from prod environment. UUID:  5da7394c-1432-4ee0-add5-851280e32d24
09/18/2024 09:24:12 PM - INFO: Creating full-view snapshot.
09/18/2024 09:24:12 PM - INFO: Attempting to lookup consent code using PHS: 1933 and Consent Name: TBD.
09/18/2024 09:24:13 PM - INFO: Submitting snapshot request.
TDR Job ID: qH401eeZRl2S83DA6H42og
09/18/2024 09:25:14 PM - INFO: Snapshot Creation succeeded: {'id': '5d0e29fc-3454-4827-ab4b-e5660a3cad39', 'name': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_Arrays_20240220_ANV5_202409182124', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_Arrays_20240220', 'createdDate': '2024-09-18T21:24:13.500191Z', 'profileId': '9ee23bed-b46c-4561-9103-d2a723113f7f', 'storage': [{'region': 'southcentralus', 'cloudResource': 'applic

09/18/2024 09:30:41 PM - INFO: Table 'participant' does not contain fileref columns. Will use a chunk size of 250000 rows per ingestion request.
09/18/2024 09:30:41 PM - INFO: Fetching rows 1-598 from table 'participant' in the original dataset (7427b2eb-a84f-413c-bfb0-7d2e36b0628f).
09/18/2024 09:30:44 PM - INFO: Submitting ingestion request to new dataset (4fd72248-6778-4f7c-880a-e61773531d0d).
TDR Job ID: F3KCarCMRNOKW6Nrc_QJ7Q
09/18/2024 09:31:15 PM - INFO: Ingest succeeded: {'dataset_id': '4fd72248-6778-4f7c-880a-e61773531d0d', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_HMB_WES_20240220', 'table': 'participant', 'path': None, 'load_tag': 'Ingest for 4fd72248-6778-4f7c-880a-e61773531d0d', 'row_count': 598, 'bad_row_count': 0, 'load_result': None}
09/18/2024 09:31:15 PM - INFO: Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.
09/18/2024 09:31:20 PM - INFO: Validating table 'participant' in new dataset vs. original dataset.
09/18/2024 09

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-1795,Success,"{'dataset_id': '4fd72248-6778-4f7c-880a-e61773531d0d', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_HMB_WES_20240220', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for 4fd72248-6778-4f7c-880a-e61773531d0d', 'row_count': 1795, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,1795 records found in both new and original table.
2,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': '4fd72248-6778-4f7c-880a-e61773531d0d', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_HMB_WES_20240220', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for 4fd72248-6778-4f7c-880a-e61773531d0d', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.
4,Dataset Ingestion,Table: subject -- Rows: 1-599,Success,"{'dataset_id': '4fd72248-6778-4f7c-880a-e61773531d0d', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_HMB_WES_20240220', 'table': 'subject', 'path': None, 'load_tag': 'Ingest for 4fd72248-6778-4f7c-880a-e61773531d0d', 'row_count': 599, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: subject,Success,599 records found in both new and original table.
6,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-29,Success,"{'dataset_id': '4fd72248-6778-4f7c-880a-e61773531d0d', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_HMB_WES_20240220', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for 4fd72248-6778-4f7c-880a-e61773531d0d', 'row_count': 29, 'bad_row_count': 0, 'load_result': None}"
7,Dataset Validation,Table: workspace_attributes,Success,29 records found in both new and original table.
8,Dataset Ingestion,Table: sample -- Rows: 1-598,Success,"{'dataset_id': '4fd72248-6778-4f7c-880a-e61773531d0d', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_HMB_WES_20240220', 'table': 'sample', 'path': None, 'load_tag': 'Ingest for 4fd72248-6778-4f7c-880a-e61773531d0d', 'row_count': 598, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: sample,Success,598 records found in both new and original table.


09/18/2024 09:31:20 PM - INFO: 
Pipeline finished with 0 failures.
09/18/2024 09:31:20 PM - INFO: Creating a snapshot for TDR dataset 4fd72248-6778-4f7c-880a-e61773531d0d.
09/18/2024 09:31:20 PM - INFO: Retrieving dataset details from prod environment. UUID:  4fd72248-6778-4f7c-880a-e61773531d0d
09/18/2024 09:31:21 PM - INFO: Creating full-view snapshot.
09/18/2024 09:31:21 PM - INFO: Attempting to lookup consent code using PHS: 1933 and Consent Name: NA.
09/18/2024 09:31:22 PM - INFO: Submitting snapshot request.
TDR Job ID: wnlM7fXsRCe59WxjNZHrdA
09/18/2024 09:32:23 PM - INFO: Snapshot Creation succeeded: {'id': '716619bc-d99d-44e2-8ff9-1b0c0f36ccac', 'name': 'ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_HMB_WES_20240220_ANV5_202409182131', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_HMB_WES_20240220', 'createdDate': '2024-09-18T21:31:23.084948Z', 'profileId': '9ee23bed-b46c-4561-9103-d2a723113f7f', 'storage': [{'region': 'southcentralus', 'cloudResource': 'appli

TDR Job ID: 9ZpA_mtfRCS89KrG9vkWag
09/18/2024 09:48:25 PM - INFO: Ingest succeeded: {'dataset_id': 'd28cb1d4-2300-4cd7-882b-99ce59305ce0', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_PEGASUS_HMB_WES_20240313', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for d28cb1d4-2300-4cd7-882b-99ce59305ce0', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}
09/18/2024 09:48:25 PM - INFO: Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.
09/18/2024 09:48:31 PM - INFO: Validating table 'sample_set' in new dataset vs. original dataset.
09/18/2024 09:48:31 PM - INFO: Processing dataset ingestion for table 'workspace_attributes'.
09/18/2024 09:48:31 PM - INFO: Table 'workspace_attributes' does not contain fileref columns. Will use a chunk size of 250000 rows per ingestion request.
09/18/2024 09:48:31 PM - INFO: Fetching rows 1-29 from table 'workspace_attributes' in the original dataset (2ef4530a-cc36-4f32-9a1a-63a555346587).
09/18/2024 09:48:34 PM - 

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-22450,Success,"{'dataset_id': 'd28cb1d4-2300-4cd7-882b-99ce59305ce0', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_PEGASUS_HMB_WES_20240313', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for d28cb1d4-2300-4cd7-882b-99ce59305ce0', 'row_count': 22450, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,22450 records found in both new and original table.
2,Dataset Ingestion,Table: participant -- Rows: 1-7483,Success,"{'dataset_id': 'd28cb1d4-2300-4cd7-882b-99ce59305ce0', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_PEGASUS_HMB_WES_20240313', 'table': 'participant', 'path': None, 'load_tag': 'Ingest for d28cb1d4-2300-4cd7-882b-99ce59305ce0', 'row_count': 7483, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: participant,Success,7483 records found in both new and original table.
4,Dataset Ingestion,Table: sample -- Rows: 1-7483,Failure,"Error on ingest: Job WlNLyUnrQMCq2Mh9yk-jhA failed: (500)\nReason: Internal Server Error\nHTTP response headers: HTTPHeaderDict({'Date': 'Wed, 18 Sep 2024 21:46:35 GMT', 'Server': 'Apache', 'X-Frame-Options': 'SAMEORIGIN', 'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization,Accept,Referer,X-App-Id,Origin', 'Access-Control-Allow-Methods': 'GET,POST,DELETE,PUT,PATCH,OPTIONS,HEAD', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000;includeSubDomains', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'X-Request-ID': '6M8lPVrK', 'Content-Type': 'application/json', 'Vary': 'Origin', 'Via': '1.1 google', 'Alt-Svc': 'h3="":443""; ma=2592000,h3-29="":443""; ma=2592000', 'Transfer-Encoding': 'chunked'})\nHTTP response body: {""timestamp"":1726695995255,""status"":500,""error"":""Internal Server Error"",""path"":""/api/repository/v1/jobs/WlNLyUnrQMCq2Mh9yk-jhA/result""}\n"
5,Dataset Validation,Table: sample,Failure,0 records found in new table doesn't match 7483 records in original table.
6,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': 'd28cb1d4-2300-4cd7-882b-99ce59305ce0', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_PEGASUS_HMB_WES_20240313', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for d28cb1d4-2300-4cd7-882b-99ce59305ce0', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
7,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.
8,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-29,Success,"{'dataset_id': 'd28cb1d4-2300-4cd7-882b-99ce59305ce0', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_PEGASUS_HMB_WES_20240313', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for d28cb1d4-2300-4cd7-882b-99ce59305ce0', 'row_count': 29, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: workspace_attributes,Success,29 records found in both new and original table.


09/18/2024 09:53:29 PM - INFO: 
Pipeline finished with 2 failures.
09/18/2024 09:53:29 PM - INFO: Creating a snapshot for TDR dataset d28cb1d4-2300-4cd7-882b-99ce59305ce0.
09/18/2024 09:53:29 PM - ERROR: Failures noted in upstream data processing. Skipping snapshotting.
09/18/2024 09:53:29 PM - INFO: 
Migrating tabular data from TDR dataset 65793118-3c88-4185-9172-2354850e6056 to TDR dataset 183ec762-f867-46c5-bb19-8b2b3417f7b2.
09/18/2024 09:53:29 PM - INFO: Adding TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) to original dataset: 65793118-3c88-4185-9172-2354850e6056
09/18/2024 09:53:29 PM - INFO: TDR SA added successfully.
09/18/2024 09:53:29 PM - INFO: Retrieving dataset details from original dataset: 65793118-3c88-4185-9172-2354850e6056
09/18/2024 09:53:29 PM - INFO: Fetching existing datarepo_row_id crosswalk (if one exists).
09/18/2024 09:53:29 PM - INFO: Ordering tables and pulling current record counts for validation.
09/18/2024 09:53:43 P

09/18/2024 10:12:10 PM - INFO: Processing dataset ingestion for table 'anvil_variantcallingactivity'.
09/18/2024 10:12:10 PM - INFO: No records found for table 'anvil_variantcallingactivity' in original dataset. Continuing to next table/record set.
09/18/2024 10:12:10 PM - INFO: Processing dataset ingestion for table 'anvil_file'.
09/18/2024 10:12:10 PM - INFO: Populated non-FSS tables missing from datarepo_row_id crosswalk: subject. Skipping FSS table 'anvil_file'.
09/18/2024 10:12:10 PM - INFO: Processing dataset ingestion for table 'anvil_donor'.
09/18/2024 10:12:10 PM - INFO: Populated non-FSS tables missing from datarepo_row_id crosswalk: subject. Skipping FSS table 'anvil_donor'.
09/18/2024 10:12:10 PM - INFO: Processing dataset ingestion for table 'anvil_alignmentactivity'.
09/18/2024 10:12:10 PM - INFO: No records found for table 'anvil_alignmentactivity' in original dataset. Continuing to next table/record set.
09/18/2024 10:12:10 PM - INFO: Processing dataset ingestion for ta

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-22311,Success,"{'dataset_id': '183ec762-f867-46c5-bb19-8b2b3417f7b2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Swiss_Cases_DS_MDS_Arrays_20240220', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for 183ec762-f867-46c5-bb19-8b2b3417f7b2', 'row_count': 22311, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,22311 records found in both new and original table.
2,Dataset Ingestion,Table: sample -- Rows: 1-4461,Success,"{'dataset_id': '183ec762-f867-46c5-bb19-8b2b3417f7b2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Swiss_Cases_DS_MDS_Arrays_20240220', 'table': 'sample', 'path': None, 'load_tag': 'Ingest for 183ec762-f867-46c5-bb19-8b2b3417f7b2', 'row_count': 4461, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: sample,Success,4461 records found in both new and original table.
4,Dataset Ingestion,Table: participant -- Rows: 1-4461,Success,"{'dataset_id': '183ec762-f867-46c5-bb19-8b2b3417f7b2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Swiss_Cases_DS_MDS_Arrays_20240220', 'table': 'participant', 'path': None, 'load_tag': 'Ingest for 183ec762-f867-46c5-bb19-8b2b3417f7b2', 'row_count': 4461, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: participant,Success,4461 records found in both new and original table.
6,Dataset Ingestion,Table: subject -- Rows: 1-4448,Success,"{'dataset_id': '183ec762-f867-46c5-bb19-8b2b3417f7b2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Swiss_Cases_DS_MDS_Arrays_20240220', 'table': 'subject', 'path': None, 'load_tag': 'Ingest for 183ec762-f867-46c5-bb19-8b2b3417f7b2', 'row_count': 4448, 'bad_row_count': 0, 'load_result': None}"
7,Dataset Validation,Table: subject,Success,4448 records found in both new and original table.
8,Dataset Ingestion,Table: sample_set -- Rows: 1-1,Success,"{'dataset_id': '183ec762-f867-46c5-bb19-8b2b3417f7b2', 'dataset': 'ANVIL_CCDG_Broad_CVD_AF_Swiss_Cases_DS_MDS_Arrays_20240220', 'table': 'sample_set', 'path': None, 'load_tag': 'Ingest for 183ec762-f867-46c5-bb19-8b2b3417f7b2', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
9,Dataset Validation,Table: sample_set,Success,1 records found in both new and original table.


09/18/2024 10:12:10 PM - INFO: 
Pipeline finished with 0 failures.
09/18/2024 10:12:10 PM - INFO: Creating a snapshot for TDR dataset 183ec762-f867-46c5-bb19-8b2b3417f7b2.
09/18/2024 10:12:11 PM - INFO: Retrieving dataset details from prod environment. UUID:  183ec762-f867-46c5-bb19-8b2b3417f7b2
09/18/2024 10:12:12 PM - INFO: Creating full-view snapshot.
09/18/2024 10:12:12 PM - INFO: Attempting to lookup consent code using PHS: 2242 and Consent Name: TBD.
09/18/2024 10:12:12 PM - INFO: Submitting snapshot request.
TDR Job ID: MR2DNbyTQkevohBDOiF6nQ


### Manual Ingest Scratch

In [None]:
tdr_host = "https://data.terra.bio"
api_client = refresh_tdr_api_client(tdr_host)
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
float_col_dict = {}
float_col_dict["sample"] = ['fold_80_base_penalty', 'fold_enrichment', 'het_snp_sensitivity', 'library_1_mean_insert_size', 'library_1_pct_exc_dupe', 'library_1_percent_duplication', 'mean_bait_coverage', 'mean_insert_size', 'mean_target_coverage', 'on_bait_vs_selected', 'pct_chimeras', 'pct_contamination', 'pct_exc_baseq', 'pct_exc_dupe', 'pct_exc_mapq', 'pct_exc_off_target', 'pct_exc_overlap', 'pct_off_bait', 'pct_pf_reads_aligned', 'pct_reads_aligned_in_pairs', 'pct_selected_bases', 'pct_target_bases_100x', 'pct_target_bases_10x', 'pct_target_bases_20x', 'pct_target_bases_2x', 'pct_target_bases_30x', 'pct_target_bases_50x', 'pct_usable_bases_on_bait', 'pct_usable_bases_on_target', 'pf_hq_error_rate', 'strand_balance', 'zero_cvg_targets_pct', 'library_2_mean_insert_size', 'library_2_pct_exc_dupe', 'library_2_percent_duplication']
table = "sample"

# Pull ingested samples
payload = {
  "offset": 0,
  "limit": 1000,
  "sort": "datarepo_row_id",
  "direction": "asc",
  "filter": ""
}
ingested_records = datasets_api.query_dataset_data_by_id(id="5f4ece3e-d76e-4d78-99e0-e62a24cd163d", table=table, query_data_request_model=payload).to_dict()
already_processed_samples = [rec["sample_id"] for rec in ingested_records["result"]]

# Pull samples to ingest
payload = {
  "offset": 0,
  "limit": 1000,
  "sort": "datarepo_row_id",
  "direction": "asc",
  "filter": ""
}
records_orig = datasets_api.query_dataset_data_by_id(id="85dbde76-c130-40b2-8a8a-ba815ba499da", table=table, query_data_request_model=payload).to_dict()
records_processed = []
for record in records_orig["result"]:
    int_record = record.copy()
    for fcol in float_col_dict[table]:
        if int_record[fcol]:
            int_record[fcol] = float(int_record[fcol])
    if int_record["sample_id"] not in already_processed_samples:
        records_processed.append(int_record)

# Build ingest request
ingest_request = {
    "table": table,
    "profile_id": "9ee23bed-b46c-4561-9103-d2a723113f7f",
    "ignore_unknown_values": True,
    "resolve_existing_files": True,
    "updateStrategy": "append",
    "format": "array",
    "load_tag": "Ingest for 5f4ece3e-d76e-4d78-99e0-e62a24cd163d",
    "records": records_processed[0:100]
}

In [None]:
len(already_processed_samples)

In [None]:
len(records_processed)

In [None]:
len(ingest_request["records"])

In [None]:
json.dumps(ingest_request)

## Validation

### Pull and Compare Tabular Data between TDR Datasets

In [6]:
#############################################
## Functions
#############################################

def compare_row_counts(dataset_1_id, dataset_2_id):
    
    # Setup/refresh TDR clients
    logging.info(f"Comparing tabular data record counts between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Pull table list across datasets
    logging.info(f"Pulling the superset of tables across the two datasets.")
    try:
        dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
        dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
    except Exception as e:
        error_str = f"Error retrieving details from datasets: {str(e)}"
        logging.error(error_str)
    table_set = set()
    for table_entry in dataset_1_details["schema"]["tables"]:
        table_set.add(table_entry["name"])
    for table_entry in dataset_2_details["schema"]["tables"]:
        table_set.add(table_entry["name"])   

    # For each table in the table list, pull record counts from the two datasets and compare
    results = []
    payload = {
      "offset": 0,
      "limit": 10,
      "sort": "datarepo_row_id",
      "direction": "asc",
      "filter": ""
    }
    for table in table_set:
        logging.info(f"Comparing record counts for table '{table}'")
        # Pulling record counts for dataset 1
        ds1_table_present = "True"
        attempt_counter = 0
        while True:
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_1_id, table=table, query_data_request_model=payload).to_dict()
                ds1_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if "No dataset table exists" in str(e):
                    ds1_record_count = 0
                    ds1_table_present = "False"
                    break
                else:
                    if attempt_counter < 2:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        ds1_record_count = 0
                        ds1_table_present = "Unknown"
                        break
        # Pulling record counts for dataset 2
        ds2_table_present = "True"
        attempt_counter = 0
        while True:
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_2_id, table=table, query_data_request_model=payload).to_dict()
                ds2_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if "No dataset table exists" in str(e):
                    ds2_record_count = 0
                    ds2_table_present = "False"
                    break
                else:
                    if attempt_counter < 2:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        ds2_record_count = 0
                        ds2_table_present = "Unknown"
                        break
        # Build table comparison
        if ds1_table_present == "Unknown" or ds2_table_present == "Unknown":
            status = "Fail"
            error_reason = "Error retrieving table data from dataset(s)"
        elif ds1_table_present == "False" or ds2_table_present == "False":
            status = "Fail"
            error_reason = "Table presence mismatch between datasets"
        elif ds1_record_count != ds2_record_count:
            status = "Fail"
            error_reason = "Difference in record count"
        else:
            status = "Pass"
            error_reason = ""
        results.append([dataset_1_id, dataset_2_id, table, ds1_table_present, ds1_record_count, ds2_table_present, ds2_record_count, status, error_reason])

    # Display detailed results
    print("\nResults:")
    results_df = pd.DataFrame(results, columns = ["Dataset 1 ID", "Dataset 2 ID", "Table", "Table in DS1", "DS1 Record Count", "Table in DS2", "DS2 Record Count", "Status", "Message"])
    display(results_df)

    # Return final aggregated results
    status = "Pass"
    failed_tables = []
    for entry in results:
        if entry[7] == "Fail":
            failed_tables.append(entry[2])
            status = "Fail"
    return status, sorted(failed_tables)
        
def compare_contents_sample(dataset_1_id, dataset_2_id, sample_size, fields_to_ignore):
    # Pull schema, record first column in each table (for ordering)
    # Setup/refresh TDR clients
    logging.info(f"Comparing tabular data record counts between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Pull table list across datasets
    logging.info(f"Pulling the superset of tables across the two datasets.")
    try:
        dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
        dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
    except Exception as e:
        error_str = f"Error retrieving details from datasets: {str(e)}"
        logging.error(error_str)
    table_set = {}
    for table_entry in dataset_1_details["schema"]["tables"]:
        table_set.add(table_entry["name"])
    for table_entry in dataset_2_details["schema"]["tables"]:
        table_set.add(table_entry["name"])  
    
    
    # Loop through tables, pull xxx records (by sample size), ordering by first column
    # Drop fields_to_ignore
    # Compare --> How to best do this
    pass
    
#############################################
## Input Parameters
#############################################

# Specify the list of dataset pairs to compare
dataset_id_pairs_list = [
    #["gcp_dataset_id", "az_dataset_id"]
    ['dd2cb8fc-42a6-482f-898e-ef6125feccb8', '245020b9-7355-4002-95db-12e7234070c5'],
    ['92382848-f5e9-426c-b7dc-f2841ae97018', '8a90137a-7aed-4e8c-bd99-1399f1c550fd'],
    ['4999a410-990e-484b-b4f3-d636f894a741', '79abc50c-6a4e-47e0-962a-4bfa7cb8e321'],
    ['1f534eb4-701f-4182-9895-64c5e5b52d82', '1bf4d70f-db98-4d07-b48f-d177efd25ae4'],
    ['039dd3d6-0cb5-4cd1-86b3-e9579c9b5218', '12c0a3ee-4a21-4be9-a09f-762e2737da1c'],
    ['e68d1d39-99df-4cd7-8053-1b298f03eabb', 'cfcb0f71-1157-4dbb-a76b-926c0cd40ea2'],
    ['d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20', '5da7394c-1432-4ee0-add5-851280e32d24'],
    ['7427b2eb-a84f-413c-bfb0-7d2e36b0628f', '4fd72248-6778-4f7c-880a-e61773531d0d'],
    ['2ef4530a-cc36-4f32-9a1a-63a555346587', 'd28cb1d4-2300-4cd7-882b-99ce59305ce0'],
    ['65793118-3c88-4185-9172-2354850e6056', '183ec762-f867-46c5-bb19-8b2b3417f7b2'],
    ['36bdd59f-4f5b-43cd-8d34-a21ef87bbf30', '933d1603-8c61-4ff2-8489-7f774ac15e97'],
    ['3abfc362-7e73-4663-9dcf-07b78b9aa2d4', '5dd1128b-6a23-486e-8950-47c7d9f687a8'],
    ['b60b4737-c646-4299-85a0-520890e830b7', '757191b0-9db3-4d18-b4ad-97bead5f3221'],
]

# Specify whether row comparison checks should run
run_row_count_comparison = True

# Specify whether table content checks should run, the size of the sample to use (if so), and which fields should be excluded from comparison
run_contents_sample_comparison = False
contents_sample_comparison_size = 1000
fields_to_ignore = ["datarepo_row_id", "orig_datarepo_row_id", "orig_file_ref", "source_datarepo_row_ids", "uri"]

#############################################
## Execution
#############################################

# Run validation
results = []
for dataset_id_pair in dataset_id_pairs_list:
    if run_row_count_comparison:
        status, failed_tables = compare_row_counts(dataset_id_pair[0], dataset_id_pair[1])
        results.append([dataset_id_pair[0], dataset_id_pair[1], "Record Count Comparison", status, ', '.join(failed_tables)])

# Display final results
print("\nFinal Validation Results:")
results_df = pd.DataFrame(results, columns = ["Dataset 1 ID", "Dataset 2 ID", "Validation Type", "Status", "Failed Tables"])
display(results_df)   


09/19/2024 12:46:18 PM - INFO: Comparing tabular data record counts between TDR dataset dd2cb8fc-42a6-482f-898e-ef6125feccb8 and TDR dataset 245020b9-7355-4002-95db-12e7234070c5.
09/19/2024 12:46:18 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:46:19 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:46:24 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:46:38 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:46:41 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:46:49 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:46:52 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:46:56 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:46:59 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
09/19/2024 12:47:03 PM - INFO: Comparing record counts for table 'anvil_assayac

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,anvil_variantcallingactivity,True,0,True,0,Pass,
2,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,sample,True,5031,True,0,Fail,Difference in record count
3,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,subject,True,5031,True,5031,Pass,
4,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,anvil_activity,True,15095,True,0,Fail,Difference in record count
5,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,anvil_antibody,True,0,True,0,Pass,
6,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,file_inventory,True,15101,True,15101,Pass,
7,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,anvil_diagnosis,True,0,True,0,Pass,
8,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,anvil_assayactivity,True,0,True,0,Pass,
9,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,anvil_project,True,1,True,0,Fail,Difference in record count


09/19/2024 12:47:35 PM - INFO: Comparing tabular data record counts between TDR dataset 92382848-f5e9-426c-b7dc-f2841ae97018 and TDR dataset 8a90137a-7aed-4e8c-bd99-1399f1c550fd.
09/19/2024 12:47:35 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:47:35 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:47:38 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:47:42 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:47:45 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:47:49 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:47:52 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:47:55 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:47:58 PM - INFO: Comparing record counts for table 'anvil_project'
09/19/2024 12:48:01 PM - INFO: Comparing record counts for table 'sub

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,sample,True,1565,True,0,Fail,Difference in record count
2,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,anvil_variantcallingactivity,True,0,True,0,Pass,
3,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,anvil_antibody,True,0,True,0,Pass,
4,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,anvil_activity,True,4695,True,0,Fail,Difference in record count
5,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,anvil_assayactivity,True,0,True,0,Pass,
6,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,file_inventory,True,4696,True,4696,Pass,
7,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,anvil_project,True,1,True,0,Fail,Difference in record count
8,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,subject,True,1565,True,1565,Pass,
9,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,anvil_diagnosis,True,0,True,0,Pass,


09/19/2024 12:48:32 PM - INFO: Comparing tabular data record counts between TDR dataset 4999a410-990e-484b-b4f3-d636f894a741 and TDR dataset 79abc50c-6a4e-47e0-962a-4bfa7cb8e321.
09/19/2024 12:48:32 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:48:32 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:48:36 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:48:38 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:48:41 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:48:45 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:48:49 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
09/19/2024 12:48:52 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:48:55 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:48:59 PM - INFO: Comparing record counts for table 'anvil_pr

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,sample,True,381,True,381,Pass,
2,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,subject,True,379,True,379,Pass,
3,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,anvil_variantcallingactivity,True,0,True,0,Pass,
4,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,anvil_antibody,True,0,True,0,Pass,
5,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,anvil_diagnosis,True,0,True,0,Pass,
6,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,file_inventory,True,1911,True,1911,Pass,
7,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,anvil_assayactivity,True,0,True,0,Pass,
8,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,anvil_project,True,1,True,0,Fail,Difference in record count
9,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,anvil_sequencingactivity,True,0,True,0,Pass,


09/19/2024 12:49:27 PM - INFO: Comparing tabular data record counts between TDR dataset 1f534eb4-701f-4182-9895-64c5e5b52d82 and TDR dataset 1bf4d70f-db98-4d07-b48f-d177efd25ae4.
09/19/2024 12:49:27 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:49:28 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:49:31 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:49:34 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:49:40 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:49:44 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:49:48 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
09/19/2024 12:49:51 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:49:54 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:49:58 PM - INFO: Comparing record counts for table 'a

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,sample,True,489,True,0,Fail,Difference in record count
2,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,anvil_variantcallingactivity,True,0,True,0,Pass,
3,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,anvil_antibody,True,0,True,0,Pass,
4,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,anvil_activity,True,1467,True,0,Fail,Difference in record count
5,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,anvil_diagnosis,True,0,True,0,Pass,
6,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,file_inventory,True,1468,True,1468,Pass,
7,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,anvil_assayactivity,True,0,True,0,Pass,
8,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,anvil_sequencingactivity,True,0,True,0,Pass,
9,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,subject,True,489,True,489,Pass,


09/19/2024 12:50:29 PM - INFO: Comparing tabular data record counts between TDR dataset 039dd3d6-0cb5-4cd1-86b3-e9579c9b5218 and TDR dataset 12c0a3ee-4a21-4be9-a09f-762e2737da1c.
09/19/2024 12:50:29 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:50:29 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:50:33 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:50:36 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:50:39 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:50:42 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:50:46 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
09/19/2024 12:50:50 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:50:53 PM - INFO: Comparing record counts for table 'anvil_project'
09/19/2024 12:50:56 PM - INFO: Comparing record counts for table 'file_inventory

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,sample,True,2081,True,2081,Pass,
2,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,subject,True,2079,True,2079,Pass,
3,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,anvil_antibody,True,0,True,0,Pass,
4,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,anvil_activity,True,4163,True,0,Fail,Difference in record count
5,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,anvil_diagnosis,True,0,True,0,Pass,
6,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,anvil_variantcallingactivity,True,0,True,0,Pass,
7,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,anvil_project,True,1,True,0,Fail,Difference in record count
8,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,file_inventory,True,10411,True,10411,Pass,
9,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,anvil_assayactivity,True,0,True,0,Pass,


09/19/2024 12:51:27 PM - INFO: Comparing tabular data record counts between TDR dataset e68d1d39-99df-4cd7-8053-1b298f03eabb and TDR dataset cfcb0f71-1157-4dbb-a76b-926c0cd40ea2.
09/19/2024 12:51:27 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:51:28 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:51:31 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:51:34 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:51:38 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:51:41 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:51:45 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:51:48 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:51:52 PM - INFO: Comparing record counts for table 'anvil_sequencingactivity'
09/19/2024 12:51:55 PM - INFO: Comparing record counts for table 

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,sample,True,2150,True,0,Fail,Difference in record count
2,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,subject,True,2150,True,2150,Pass,
3,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,anvil_variantcallingactivity,True,0,True,0,Pass,
4,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,anvil_antibody,True,0,True,0,Pass,
5,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,anvil_assayactivity,True,0,True,0,Pass,
6,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,file_inventory,True,6454,True,6454,Pass,
7,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,anvil_sequencingactivity,True,0,True,0,Pass,
8,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,anvil_project,True,1,True,0,Fail,Difference in record count
9,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,anvil_activity,True,6451,True,0,Fail,Difference in record count


09/19/2024 12:52:27 PM - INFO: Comparing tabular data record counts between TDR dataset d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20 and TDR dataset 5da7394c-1432-4ee0-add5-851280e32d24.
09/19/2024 12:52:27 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:52:28 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:52:31 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:52:34 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:52:37 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:52:41 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:52:44 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
09/19/2024 12:52:47 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:52:50 PM - INFO: Comparing record counts for table 'anvil_sequencingactivity'
09/19/2024 12:52:54 PM - INFO: Comparing record counts for tab

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,sample,True,156,True,156,Pass,
2,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,anvil_variantcallingactivity,True,0,True,0,Pass,
3,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,anvil_antibody,True,0,True,0,Pass,
4,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,anvil_activity,True,312,True,0,Fail,Difference in record count
5,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,anvil_diagnosis,True,0,True,0,Pass,
6,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,file_inventory,True,784,True,784,Pass,
7,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,anvil_sequencingactivity,True,0,True,0,Pass,
8,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,anvil_project,True,1,True,0,Fail,Difference in record count
9,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,anvil_assayactivity,True,0,True,0,Pass,


09/19/2024 12:53:22 PM - INFO: Comparing tabular data record counts between TDR dataset 7427b2eb-a84f-413c-bfb0-7d2e36b0628f and TDR dataset 4fd72248-6778-4f7c-880a-e61773531d0d.
09/19/2024 12:53:22 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:53:23 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:53:26 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:53:31 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:53:34 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:53:37 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:53:41 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:53:44 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
09/19/2024 12:53:47 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:53:51 PM - INFO: Comparing record counts for table 'anvil_pr

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,sample,True,598,True,598,Pass,
2,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,subject,True,599,True,599,Pass,
3,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,anvil_activity,True,1794,True,0,Fail,Difference in record count
4,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,anvil_variantcallingactivity,True,0,True,0,Pass,
5,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,file_inventory,True,1795,True,1795,Pass,
6,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,anvil_diagnosis,True,0,True,0,Pass,
7,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,anvil_assayactivity,True,0,True,0,Pass,
8,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,anvil_project,True,1,True,0,Fail,Difference in record count
9,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,anvil_antibody,True,0,True,0,Pass,


09/19/2024 12:55:23 PM - INFO: Comparing tabular data record counts between TDR dataset 2ef4530a-cc36-4f32-9a1a-63a555346587 and TDR dataset d28cb1d4-2300-4cd7-882b-99ce59305ce0.
09/19/2024 12:55:23 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:55:24 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:55:27 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:55:30 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:55:34 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:55:38 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:55:41 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:55:45 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:55:49 PM - INFO: Comparing record counts for table 'anvil_project'
09/19/2024 12:55:52 PM - INFO: Comparing record counts for table 'anvil_acti

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,sample,True,7483,True,0,Fail,Difference in record count
2,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,anvil_variantcallingactivity,True,0,True,0,Pass,
3,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,anvil_antibody,True,0,True,0,Pass,
4,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,subject,True,7483,True,7483,Pass,
5,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,file_inventory,True,22450,True,22450,Pass,
6,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,anvil_assayactivity,True,0,True,0,Pass,
7,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,anvil_project,True,1,True,0,Fail,Difference in record count
8,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,anvil_activity,True,22449,True,0,Fail,Difference in record count
9,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,anvil_sequencingactivity,True,0,True,0,Pass,


09/19/2024 12:56:25 PM - INFO: Comparing tabular data record counts between TDR dataset 65793118-3c88-4185-9172-2354850e6056 and TDR dataset 183ec762-f867-46c5-bb19-8b2b3417f7b2.
09/19/2024 12:56:25 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:56:25 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:56:28 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:56:31 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:56:35 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:56:38 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:56:41 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:56:45 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:56:49 PM - INFO: Comparing record counts for table 'anvil_project'
09/19/2024 12:56:52 PM - INFO: Comparing record counts for table 'anvil_sequ

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,sample,True,4461,True,4461,Pass,
2,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,anvil_variantcallingactivity,True,0,True,0,Pass,
3,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,anvil_antibody,True,0,True,0,Pass,
4,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,subject,True,4448,True,4448,Pass,
5,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,file_inventory,True,22311,True,22311,Pass,
6,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,anvil_assayactivity,True,0,True,0,Pass,
7,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,anvil_project,True,1,True,0,Fail,Difference in record count
8,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,anvil_sequencingactivity,True,0,True,0,Pass,
9,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,anvil_activity,True,8923,True,0,Fail,Difference in record count


09/19/2024 12:57:23 PM - INFO: Comparing tabular data record counts between TDR dataset 36bdd59f-4f5b-43cd-8d34-a21ef87bbf30 and TDR dataset 933d1603-8c61-4ff2-8489-7f774ac15e97.
09/19/2024 12:57:23 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:57:24 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:57:27 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:57:30 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:57:33 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:57:36 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:57:40 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:57:42 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
09/19/2024 12:57:47 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:57:51 PM - INFO: Comparing record counts for table 'file_inv

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,anvil_variantcallingactivity,True,0,True,0,Pass,
2,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,sample,True,4546,True,0,Fail,Difference in record count
3,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,subject,True,4546,True,0,Fail,Difference in record count
4,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,anvil_antibody,True,0,True,0,Pass,
5,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,anvil_activity,True,13639,True,0,Fail,Difference in record count
6,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,anvil_diagnosis,True,0,True,0,Pass,
7,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,anvil_assayactivity,True,0,True,0,Pass,
8,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,file_inventory,True,13642,True,0,Fail,Difference in record count
9,36bdd59f-4f5b-43cd-8d34-a21ef87bbf30,933d1603-8c61-4ff2-8489-7f774ac15e97,anvil_sequencingactivity,True,0,True,0,Pass,


09/19/2024 12:58:23 PM - INFO: Comparing tabular data record counts between TDR dataset 3abfc362-7e73-4663-9dcf-07b78b9aa2d4 and TDR dataset 5dd1128b-6a23-486e-8950-47c7d9f687a8.
09/19/2024 12:58:23 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:58:23 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:58:26 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:58:30 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:58:33 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:58:36 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:58:38 PM - INFO: Comparing record counts for table 'anvil_antibody'
09/19/2024 12:58:42 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
09/19/2024 12:58:46 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:58:49 PM - INFO: Comparing record counts for table 'file_inv

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,anvil_variantcallingactivity,True,0,True,0,Pass,
2,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,sample,True,1384,True,0,Fail,Difference in record count
3,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,subject,True,1375,True,0,Fail,Difference in record count
4,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,anvil_activity,True,2769,True,0,Fail,Difference in record count
5,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,anvil_antibody,True,0,True,0,Pass,
6,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,anvil_diagnosis,True,0,True,0,Pass,
7,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,anvil_assayactivity,True,0,True,0,Pass,
8,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,file_inventory,True,6926,True,0,Fail,Difference in record count
9,3abfc362-7e73-4663-9dcf-07b78b9aa2d4,5dd1128b-6a23-486e-8950-47c7d9f687a8,anvil_sequencingactivity,True,0,True,0,Pass,


09/19/2024 12:59:19 PM - INFO: Comparing tabular data record counts between TDR dataset b60b4737-c646-4299-85a0-520890e830b7 and TDR dataset 757191b0-9db3-4d18-b4ad-97bead5f3221.
09/19/2024 12:59:19 PM - INFO: Pulling the superset of tables across the two datasets.
09/19/2024 12:59:20 PM - INFO: Comparing record counts for table 'anvil_dataset'
09/19/2024 12:59:23 PM - INFO: Comparing record counts for table 'sample'
09/19/2024 12:59:26 PM - INFO: Comparing record counts for table 'subject'
09/19/2024 12:59:29 PM - INFO: Comparing record counts for table 'anvil_activity'
09/19/2024 12:59:33 PM - INFO: Comparing record counts for table 'anvil_variantcallingactivity'
09/19/2024 12:59:36 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
09/19/2024 12:59:40 PM - INFO: Comparing record counts for table 'file_inventory'
09/19/2024 12:59:42 PM - INFO: Comparing record counts for table 'anvil_project'
09/19/2024 12:59:45 PM - INFO: Comparing record counts for table 'anvil_sequ

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,anvil_dataset,True,1,True,0,Fail,Difference in record count
1,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,sample,True,1548,True,0,Fail,Difference in record count
2,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,subject,True,1548,True,0,Fail,Difference in record count
3,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,anvil_activity,True,4644,True,0,Fail,Difference in record count
4,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,anvil_variantcallingactivity,True,0,True,0,Pass,
5,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,anvil_assayactivity,True,0,True,0,Pass,
6,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,file_inventory,True,4645,True,0,Fail,Difference in record count
7,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,anvil_project,True,1,True,0,Fail,Difference in record count
8,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,anvil_sequencingactivity,True,0,True,0,Pass,
9,b60b4737-c646-4299-85a0-520890e830b7,757191b0-9db3-4d18-b4ad-97bead5f3221,anvil_antibody,True,0,True,0,Pass,



Final Validation Results:


Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Validation Type,Status,Failed Tables
0,dd2cb8fc-42a6-482f-898e-ef6125feccb8,245020b9-7355-4002-95db-12e7234070c5,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project, sample"
1,92382848-f5e9-426c-b7dc-f2841ae97018,8a90137a-7aed-4e8c-bd99-1399f1c550fd,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project, sample"
2,4999a410-990e-484b-b4f3-d636f894a741,79abc50c-6a4e-47e0-962a-4bfa7cb8e321,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project"
3,1f534eb4-701f-4182-9895-64c5e5b52d82,1bf4d70f-db98-4d07-b48f-d177efd25ae4,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project, sample"
4,039dd3d6-0cb5-4cd1-86b3-e9579c9b5218,12c0a3ee-4a21-4be9-a09f-762e2737da1c,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project"
5,e68d1d39-99df-4cd7-8053-1b298f03eabb,cfcb0f71-1157-4dbb-a76b-926c0cd40ea2,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project, sample"
6,d0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20,5da7394c-1432-4ee0-add5-851280e32d24,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project"
7,7427b2eb-a84f-413c-bfb0-7d2e36b0628f,4fd72248-6778-4f7c-880a-e61773531d0d,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project"
8,2ef4530a-cc36-4f32-9a1a-63a555346587,d28cb1d4-2300-4cd7-882b-99ce59305ce0,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project, sample"
9,65793118-3c88-4185-9172-2354850e6056,183ec762-f867-46c5-bb19-8b2b3417f7b2,Record Count Comparison,Fail,"anvil_activity, anvil_biosample, anvil_dataset, anvil_donor, anvil_file, anvil_project"


In [None]:
# Parameters
dataset_1_id = "b12fb9be-2ce0-4bfd-8503-732fabba06ab"
dataset_2_id = "744c85cc-13d2-4f90-9d2e-d3143cb01edb"
contents_sample_comparison_size = 1000
fields_to_ignore = ["datarepo_row_id", "orig_datarepo_row_id", "orig_file_ref", "source_datarepo_row_ids", "uri"]

# Setup/refresh TDR clients
logging.info(f"Comparing a sample of tabular data content between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
api_client = refresh_tdr_api_client("https://data.terra.bio")
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Pull table list across datasets
logging.info(f"Pulling the superset of tables across the two datasets.")
try:
    dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
    dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
except Exception as e:
    error_str = f"Error retrieving details from datasets: {str(e)}"
    logging.error(error_str)
table_set = {}
for table_entry in dataset_1_details["schema"]["tables"]:
    table_set[table_entry["name"]] = table_entry["columns"][0]["name"]
for table_entry in dataset_2_details["schema"]["tables"]:
    table_set[table_entry["name"]] = table_entry["columns"][0]["name"]
    
# For each table in the table list, pull sample records from the two datasets and compare
results = []
for table in ["file_inventory"]: #table_set.keys():
    logging.info(f"Comparing sample records for table '{table}'")
    # Pulling sample records for dataset 1
    ds1_table_present = "True"
    max_page_size = 1000
    total_records_fetched = 0
    ds1_final_records = []
    attempt_counter = 0
    while True:
        offset = total_records_fetched
        page_size = min(max_page_size, contents_sample_comparison_size - total_records_fetched)
        attempt_counter = 0
        while True:
            payload = {
              "offset": offset,
              "limit": page_size,
              "sort": table_set[table],
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_1_id, table=table, query_data_request_model=payload).to_dict() 
                break
            except Exception as e:
                if "No dataset table exists" in str(e):
                    record_results = []
                    ds1_table_present = "False"
                    break
                else:
                    if attempt_counter < 2:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        record_results = []
                        ds1_table_present = "Unknown"
                        break
        if record_results["result"]:
            ds1_final_records.extend(record_results["result"])
            total_records_fetched += len(record_results["result"])
        else:
            break
        if total_records_fetched >= contents_sample_comparison_size:
            break
    # Pulling sample records for dataset 2
    ds2_table_present = "True"
    max_page_size = 1000
    total_records_fetched = 0
    ds2_final_records = []
    attempt_counter = 0
    while True:
        offset = total_records_fetched
        page_size = min(max_page_size, contents_sample_comparison_size - total_records_fetched)
        attempt_counter = 0
        while True:
            payload = {
              "offset": offset,
              "limit": page_size,
              "sort": table_set[table],
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_2_id, table=table, query_data_request_model=payload).to_dict() 
                break
            except Exception as e:
                if "No dataset table exists" in str(e):
                    record_results = []
                    ds2_table_present = "False"
                    break
                else:
                    if attempt_counter < 2:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        record_results = []
                        ds2_table_present = "Unknown"
                        break
        if record_results["result"]:
            ds2_final_records.extend(record_results["result"])
            total_records_fetched += len(record_results["result"])
        else:
            break
        if total_records_fetched >= contents_sample_comparison_size:
            break

In [None]:
df_ds1_records_int = pd.DataFrame.from_dict(ds1_final_records)
df_ds2_records_int = pd.DataFrame.from_dict(ds2_final_records)
cols = df_ds1_records_int.columns.tolist()
for field in fields_to_ignore:
    if field in cols:
        cols.remove(field)
df_ds1_records = df_ds1_records_int[cols]
df_ds2_records = df_ds2_records_int[cols]

In [None]:
diff = df_ds1_records.compare(df_ds2_records)

In [None]:
if df_ds1_records.equals(df_ds2_records):
    print("True")
else:
    print("False")

### Pull and Compare File Counts and Sizes between TDR Datasets

In [None]:
#############################################
## Functions
#############################################

def collect_file_stats(dataset_id_pairs_list):
    
    results = []
    for dataset_id_pair in dataset_id_pairs_list:

            # Setup/refresh TDR clients
            logging.info(f"Processing dataset_id_pair: {dataset_id_pair}")
            api_client = refresh_tdr_api_client("https://data.terra.bio")
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

            # Initialize variables
            dataset_id_1 = dataset_id_pair[0]
            file_count_1 = 0
            total_file_size_1 = 0
            max_file_size_1 = 0
            status_1 = "Success"
            message_1 = ""
            dataset_id_2 = dataset_id_pair[1]
            file_count_2 = 0
            total_file_size_2 = 0
            max_file_size_2 = 0
            status_2 = "Success"
            message_2 = ""
            validation_status = "Passed"
            validation_message = ""

            # For dataset_id_1, loop through dataset files and record information
            logging.info(f"Retrieving files from dataset_id {dataset_id_1}...")
            try:
                max_page_size = 1000
                total_records_fetched = 0
                attempt_counter = 0
                while True:
                    try:
                        row_start = total_records_fetched
                        dataset_file_results = datasets_api.list_files(id=dataset_id_1, offset=row_start, limit=max_page_size)
                        if dataset_file_results:
                            total_records_fetched += len(dataset_file_results)
                            for entry in dataset_file_results:
                                file_count_1 += 1
                                total_file_size_1 += entry.size
                                if entry.size > max_file_size_1:
                                    max_file_size_1 = entry.size
                            logging.info(f"{total_records_fetched} records fetched...")
                            attempt_counter = 0
                        else:
                            break
                    except Exception as e:
                        attempt_counter += 1
                        if attempt_counter <= 10:
                            logging.info(f"Failure in file retrieval (attempt #{attempt_counter}). Trying again...")
                            continue
                        else:
                            status_1 = "Failure"
                            message_1 = str(e)
                            logging.error(f"Failure in file retrieval: {message_1}")
                            break
                if status_1 == "Success":
                    logging.info(f"File retrieval complete!")
            except Exception as e:
                status_1 = "Failure"
                message_1 = str(e)
                logging.error(f"Failure in file retrieval: {message_1}")
            
            # For dataset_id_2, loop through dataset files and record information
            logging.info(f"Retrieving files from dataset_id {dataset_id_2}...")
            try:
                max_page_size = 1000
                total_records_fetched = 0
                attempt_counter = 0
                while True:
                    try:
                        row_start = total_records_fetched
                        dataset_file_results = datasets_api.list_files(id=dataset_id_2, offset=row_start, limit=max_page_size)
                        if dataset_file_results:
                            total_records_fetched += len(dataset_file_results)
                            for entry in dataset_file_results:
                                file_count_2 += 1
                                total_file_size_2 += entry.size
                                if entry.size > max_file_size_2:
                                    max_file_size_2 = entry.size
                            logging.info(f"{total_records_fetched} records fetched...")
                            attempt_counter = 0
                        else:
                            break
                    except Exception as e:
                        attempt_counter += 1
                        if attempt_counter <= 10:
                            logging.info(f"Failure in file retrieval (attempt #{attempt_counter}). Trying again...")
                            continue
                        else:
                            status_2 = "Failure"
                            message_2 = str(e)
                            logging.error(f"Failure in file retrieval: {message_2}")
                            break
                if status_2 == "Success":
                    logging.info(f"File retrieval complete!")
            except Exception as e:
                status_2 = "Failure"
                message_2 = str(e)
                logging.error(f"Failure in file retrieval: {message_2}")
                
            # Record and display interim results
            file_count_diff = file_count_1 - file_count_2
            total_file_size_diff = total_file_size_1 - total_file_size_2
            max_file_size_diff = max_file_size_1 - max_file_size_2
            if status_1 == "Failure" or status_2 == "Failure":
                validation_status = "Failed"
                validation_message = "Errors pulling counts for one or more datasets."
            elif file_count_diff > 0 or total_file_size_diff > 0 or max_file_size_diff > 0:
                validation_status = "Failed"
                validation_message = "Difference in counts between datasets."
            results.append([dataset_id_1, dataset_id_2, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, file_count_1, total_file_size_1, max_file_size_1, status_1, message_1, file_count_2, total_file_size_2, max_file_size_2, status_2, message_2])
            int_results_df = pd.DataFrame([[dataset_id_1, dataset_id_2, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, file_count_1, total_file_size_1, max_file_size_1, status_1, message_1, file_count_2, total_file_size_2, max_file_size_2, status_2, message_2]], columns = ["Dataset ID 1", "Dataset ID 2", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "File Count 1", "Total File Size (Bytes) 1", "Max File Size (Bytes) 1", "Status 1 ", "Message 1", "File Count 2", "Total File Size (Bytes) 2", "Max File Size (Bytes) 2", "Status 2 ", "Message 2"])
            logging.info("Results recorded:")
            display(int_results_df)
        
    # Display final results
    logging.info("Aggregating results...")
    ws_bucket = os.environ["WORKSPACE_BUCKET"]
    destination_dir = "ingest_pipeline/resources/azure_migration"
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_file = f"validation_results_{current_datetime_string}.tsv"
    results_df = pd.DataFrame(results, columns = ["Dataset ID 1", "Dataset ID 2", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "File Count 1", "Total File Size (Bytes) 1", "Max File Size (Bytes) 1", "Status 1 ", "Message 1", "File Count 2", "Total File Size (Bytes) 2", "Max File Size (Bytes) 2", "Status 2 ", "Message 2"])
    results_df.to_csv(output_file, index=False, sep="\t")
    !gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
    !rm $output_file
    print("\nAggregated Validation Results:")
    display(results_df)   
    

#############################################
## Input Parameters
#############################################

# Specify the list of dataset IDs
dataset_id_pairs_list = [
#    ['bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8', 'c7206e9a-78ad-4c9d-927f-3ca76646227d'],
#    ['902596ce-714e-49b3-8271-f3dfece52309', 'e091028e-a6b1-4989-9477-498e7ea206f0'],
    ['bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8', 'c7206e9a-78ad-4c9d-927f-3ca76646227d'],
]

#############################################
## Execution
#############################################

collect_file_stats(dataset_id_pairs_list)


# Migrating Workspaces

## Pre-Connector Processing
For each GCP Workspace - Azure Workspace pair:
1. Build a manifest of files to be copied from the GCP Workspace to the Azure Workspace. 
2. Write the manifest to BigQuery for consumption by downstream processes.

Pre-run steps:
1. Use the anvil_ingest_tools notebook to create the Azure workspaces. 
2. Use the anvil_ingest_tools notebook to add the TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) as a reader on the source GCP workspaces and a writer on the target Azure workspaces.

Post-run steps:
1. Use the anvil_ingest_tools notebook to remove the TDR general SA from the GCP and Azure workspaces. 


In [None]:
#############################################
## Functions
#############################################

# Function to create file transfer details
def output_file_details(source_ws_project, source_ws_name, target_ws_project, target_ws_name, file_bigquery_table, target_bigquery_table, delete_existing_records):
    
    # Establish credentials and clients
    client = bigquery.Client()
    creds, project = google.auth.default(scopes=['https://www.googleapis.com/auth/cloud-platform', 'openid', 'email', 'profile'])
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)

    # Pull bucket from source workspace
    try:
        ws_attributes = requests.get(
            url=f"https://api.firecloud.org/api/workspaces/{source_ws_project}/{source_ws_name}?fields=workspace.bucketName",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        ws_bucket = ws_attributes["workspace"]["bucketName"]
    except:
        err_str = "Error retrieving workspace attributes for source workspace."
        logging.error(err_str)
        raise Exception(err_str)

    # Pull storage container from target workspace
    try:
        ws_attributes = requests.get(
            url=f"https://api.firecloud.org/api/workspaces/{target_ws_project}/{target_ws_name}?fields=workspace.workspaceId",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        ws_id = ws_attributes["workspace"]["workspaceId"] 
        ws_resources = requests.get(
            url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{ws_id}/resources?offset=0&limit=10&resource=AZURE_STORAGE_CONTAINER",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        resource_id = ""
        for resource_entry in ws_resources["resources"]:
            if resource_entry["resourceAttributes"]["azureStorageContainer"]["storageContainerName"][0:3] == "sc-":
                resource_id = resource_entry["metadata"]["resourceId"]
                break
        if resource_id:
            sas_response = requests.post(
                url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{ws_id}/resources/controlled/azure/storageContainer/{resource_id}/getSasToken?sasExpirationDuration=86400",
                headers={"Authorization": f"Bearer {creds.token}", "accept": "application/json"}
            ).json()
            base_url = sas_response["url"]
            ws_storage_container = re.search("^[a-z0-9:\/=\-\.]+", base_url, re.IGNORECASE).group(0)
        else:
            err_str = "Error retrieving resource information for target workspace."
            logging.error(err_str)
            raise Exception(err_str)
    except:
        err_str = "Error retrieving workspace attributes for target workspace."
        logging.error(err_str)
        raise Exception(err_str)

    # Clear records from target BQ table (if specified)
    if delete_existing_records:
        logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
        delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE gcp_ws_project = '{source_ws_project}' and gcp_ws_name = '{source_ws_name}'"""
        try:
            delete_query_job = client.query(delete_query)
            delete_query_job.result()
        except Exception as e:
            logging.warning("Error deleting records for the original dataset from the target BQ table.") 

    # Write the query to pull files into a dataframe
    logging.info(f"Building manifest of files to copy from the source '{source_ws_project}.{source_ws_name}' workspace to the target '{target_ws_project}.{target_ws_name}' workspace.")
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    query = f"""SELECT '{source_ws_project}' AS gcp_ws_project, '{source_ws_name}' AS gcp_ws_name, 
                '{target_ws_project}' AS az_ws_project, '{target_ws_name}' AS az_ws_name, 
                 'gs://{ws_bucket}/'||name AS source_path, '{ws_storage_container}/'||name AS target_path, 
                 size AS size_in_bytes, md5Hash AS md5_hash, '{current_datetime_string}' AS date_added
                FROM `{file_bigquery_table}` 
                WHERE bucket = '{ws_bucket}'
                AND name NOT LIKE '%/'"""
    attempt_counter = 0
    while True:
        try:
            df = client.query(query).result().to_dataframe()
            job = client.load_table_from_dataframe(df, target_bigquery_table, job_config=job_config)
            logging.info("Records recorded successfully.")
            break
        except Exception as e:
            if attempt_counter < 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_str = f"Error building and writing file manifest: {str(e)}."
                logging.error(err_str)
                raise Exception(err_str)

            
#############################################
## Input Parameters
#############################################

# General parameters
file_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_inventory.object_metadata_26_02_2024__17_14_55"
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list_workspaces"

# Specify migration pairs: Source GCP Workspace - Target Azure Workspace
migration_list = [
    #{"gcp_ws_project": "anvil-datastorage", "gcp_ws_name": "<name>", "az_ws_project": "AnVILDataStorage_Azure", "az_ws_name": "<name>"}
    {'gcp_ws_project': 'anvil-datastorage', 'gcp_ws_name': 'AnVIL_CCDG_WGS_HAIL_Phased-data', 'az_ws_project': 'AnVILDataStorage_Azure', 'az_ws_name': 'AnVIL_CCDG_WGS_HAIL_Phased-data_Azure'},
]

# Specify whether existing records in the azure_migration_file_list_workspaces table should be deleted before running
delete_existing_records = True


#############################################
## Execution
#############################################

# Loop through migration list and process entries
results = []
for entry in migration_list:
    logging.info(f"Processing Migration List Entry: {str(entry)}")
    try:
        output_file_details(entry["gcp_ws_project"], entry["gcp_ws_name"], entry["az_ws_project"], entry["az_ws_name"], file_bigquery_table, target_bigquery_table, delete_existing_records)
        results.append([entry["gcp_ws_name"], entry["az_ws_name"], "Success", ""])
    except Exception as e:
        results.append([entry["gcp_ws_name"], entry["az_ws_name"], "Failure", str(e)])
        
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Source Workspace Name", "Target Workspace Name", "Status", "Message"])
display(results_df)


## Validation

### Pull and Compare File Counts and Sizes between Workspace Buckets

In [None]:
#############################################
## Functions
#############################################

def collect_file_stats(storage_pairs_list):
    
    results = []
    for storage_pair in storage_pairs_list:

            # Initialize variables
            logging.info(f"Processing storage pair: {storage_pair}")
            gcs_storage_location = storage_pair[0]
            gcs_file_count = 0
            gcs_total_file_size = 0
            gcs_max_file_size = 0
            gcs_status = "Success"
            gcs_message = ""
            az_storage_location = storage_pair[1]
            az_file_count = 0
            az_total_file_size = 0
            az_max_file_size = 0
            az_status = "Success"
            az_message = ""
            validation_status = "Passed"
            validation_message = ""

            # For gcs_storage_location, loop through files and record information
            logging.info("Pulling and parsing GCP bucket contents to create a list of existing files.")
            existing_gcs_files = []
            try:
                cmd = f"gsutil ls -L '{gcs_storage_location}/**'"
                output = subprocess.check_output(cmd, shell=True, universal_newlines=True)
                file_name = ""
                file_size = ""
                for line in output.split("\n"):
                    if line[0:2] == "gs":
                        if file_name and file_size and file_name[-1] != "/":
                            existing_gcs_files.append([file_name, file_size])
                        file_name = re.sub(":$", "", line)
                    else:
                        if "Content-Length:" in line:
                            file_size = re.match("\s*Content-Length:\s*([0-9]+)", line).group(1)
                if file_name and file_size and file_name[-1] != "/":
                    existing_gcs_files.append([file_name, file_size])
                for entry in existing_gcs_files:
                    gcs_file_count += 1
                    entry_file_size = int(entry[1])
                    gcs_total_file_size += entry_file_size
                    if entry_file_size > gcs_max_file_size:
                        gcs_max_file_size = entry_file_size
            except Exception as e:
                gcs_status = "Failure"
                gcs_message = str(e)
                logging.error(f"Failure in file retrieval: {gcs_message}")
                
            # For az_storage_location, loop through files and record information
            logging.info("Pulling and parsing target Azure container contents to create a list of existing files.")
            cmd = f"azcopy_linux_amd64_10.24.0/azcopy list '{az_storage_location}' --machine-readable"
            output = subprocess.check_output(cmd, shell=True, universal_newlines=True)
            existing_az_files = []
            try:
                for line in output.split("\n"):
                    if line:
                        file_name = re.match(r"^INFO: (.*);", line).group(1)
                        file_size = re.match(r".*Content Length: ([0-9\.]+).*", line).group(1)
                        existing_az_files.append([file_name, file_size])
                for entry in existing_az_files:
                    az_file_count += 1
                    entry_file_size = int(entry[1])
                    az_total_file_size += entry_file_size
                    if entry_file_size > az_max_file_size:
                        az_max_file_size = entry_file_size
            except Exception as e:
                az_status = "Failure"
                az_message = str(e)
                logging.error(f"Failure in file retrieval: {az_message}")
                
            # Record and display interim results
            file_count_diff = gcs_file_count - az_file_count
            total_file_size_diff = gcs_total_file_size - az_total_file_size
            max_file_size_diff = gcs_max_file_size - az_max_file_size
            if gcs_status == "Failure" or az_status == "Failure":
                validation_status = "Failed"
                validation_message = "Errors pulling counts for one or more storage locations."
            elif file_count_diff > 0 or total_file_size_diff > 0 or max_file_size_diff > 0:
                validation_status = "Failed"
                validation_message = "Difference in counts between storage locations."
            results.append([gcs_storage_location, az_storage_location, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, gcs_file_count, gcs_total_file_size, gcs_max_file_size, gcs_status, gcs_message, az_file_count, az_total_file_size, az_max_file_size, az_status, az_message])
            int_results_df = pd.DataFrame([[gcs_storage_location, az_storage_location, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, gcs_file_count, gcs_total_file_size, gcs_max_file_size, gcs_status, gcs_message, az_file_count, az_total_file_size, az_max_file_size, az_status, az_message]], columns = ["GCS Storage Location", "AZ Storage Location", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "GCS File Count", "GCS Total File Size (Bytes)", "GCS Max File Size (Bytes)", "GCS Status", "GCS Message", "AZ File Count", "AZ Total File Size (Bytes)", "AZ Max File Size (Bytes)", "AZ Status", "AZ Message"])
            logging.info("Results recorded:")
            display(int_results_df)
        
    # Display final results
    logging.info("Aggregating results...")
    ws_bucket = os.environ["WORKSPACE_BUCKET"]
    destination_dir = "ingest_pipeline/resources/azure_migration"
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_file = f"workspace_validation_results_{current_datetime_string}.tsv"
    results_df = pd.DataFrame(results, columns = ["GCS Storage Location", "AZ Storage Location", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "GCS File Count", "GCS Total File Size (Bytes)", "GCS Max File Size (Bytes)", "GCS Status", "GCS Message", "AZ File Count", "AZ Total File Size (Bytes)", "AZ Max File Size (Bytes)", "AZ Status", "AZ Message"])
    results_df.to_csv(output_file, index=False, sep="\t")
    !gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
    !rm $output_file
    print("\nAggregated Validation Results:")
    display(results_df)   
    

#############################################
## Input Parameters
#############################################

# Specify the list of dataset IDs
storage_pairs_list = [
#     ["gcs_bucket_path", "azure_storage_container_sas_url"]
    ['gs://fc-secure-0932b76c-22e6-4321-94f7-9726ad4aeb76', 'https://lzb34bb58bfb122730765416.blob.core.windows.net/sc-0ef0b0b4-92b6-462e-8b4f-498f1cb7983b?sv=2023-11-03&spr=https&st=2024-04-23T13%3A45%3A11Z&se=2024-04-23T22%3A00%3A11Z&sr=c&sp=racwdlt&sig=7Usayb1DzV4LEcQYheVZrSrvEkiaiot9wEZGmnEO3BM%3D&rscd=2661442731880e5cbc2c9'],
]

#############################################
## Execution
#############################################

collect_file_stats(storage_pairs_list)


# Utility

## Dataset Deletion

In [None]:
# Function to delete a specific TDR Snapshot
def delete_snapshot(snapshot_id):
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete snapshot = {}".format(snapshot_id))
    try:
        delete_snapshot_result, job_id = wait_for_tdr_job(snapshots_api.delete_snapshot(id=snapshot_id), "https://data.terra.bio")
        print("Result: {}".format(delete_snapshot_result))
    except Exception as e:
        print("Error: {}".format(e))

# Function to delete a specific TDR Dataset
def delete_dataset(dataset_id):
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    print("Attempting to delete dataset = {}".format(dataset_id))
    try:
        delete_dataset_result, job_id = wait_for_tdr_job(datasets_api.delete_dataset(id=dataset_id), "https://data.terra.bio")
        print("Result: {}".format(delete_dataset_result))
    except Exception as e:
        print("Error: {}".format(e))

# Function to delete a specific TDR Dataset and all of its Snapshots
def delete_dataset_and_all_snapshots(dataset_id):
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete dataset = {} and all associated snapshots".format(dataset_id))
    dataset_id_list = [dataset_id]
    # Delete snapshots
    snapshot_list = snapshots_api.enumerate_snapshots(dataset_ids=dataset_id_list)
    if snapshot_list.items:
        for snapshot in snapshot_list.items:
            snapshot_id = str(snapshot.id)
            delete_snapshot(snapshot_id)
            sleep(10)
    # Delete dataset
    delete_dataset(dataset_id)

# Delete snapshots
# snapshot_id_list = [
# '1234',
# ]
# for snapshot_id in snapshot_id_list:
#     delete_snapshot(snapshot_id)

# Delete datasets and all their associated snapshots
dataset_id_list = [
'1be5b5e6-019e-419a-9248-6e80d067d697',
]
for dataset_id in dataset_id_list:
    delete_dataset_and_all_snapshots(dataset_id)

## Update Migration File List Table

In [None]:
# General parameters
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list"

# Update parameters
update_list = [
    {"az_dataset_id": "6007151f-45bc-4111-8e9a-b667bc722a6a", "new_gcp_dataset_id": "b22c71b2-2cb2-4b27-a49b-9a2a83d432e8", "new_gcp_dataset_name": "ANVIL_1000G_PRIMED_data_model_20240301"},
    {"az_dataset_id": "a28e4ab5-a07b-4316-b743-7f5f9cc88211", "new_gcp_dataset_id": "3a89c170-2939-4c12-9940-f32d96fa9e55", "new_gcp_dataset_name": "ANVIL_CMH_GAFK_GS_long_read_20240301"}
]

# Execute updates
client = bigquery.Client()
for entry in update_list:
    logging.info(f"Running update for entry: {str(entry)}")
    az_dataset_id = entry["az_dataset_id"]
    gcp_dataset_id = entry["new_gcp_dataset_id"]
    gcp_dataset_name = entry["new_gcp_dataset_name"]
    update_query = f"""UPDATE `{target_bigquery_table}` 
                       SET gcp_dataset_id = '{gcp_dataset_id}', gcp_dataset_name = '{gcp_dataset_name}'
                       WHERE az_dataset_id = '{az_dataset_id}'"""
    try:
        update_query_job = client.query(update_query)
        update_query_job.result()
        logging.info("Update complete.")
    except Exception as e:
        logging.info("Error running update.")
