# Imports and Common Functions

In [None]:
#!pip install --upgrade data_repo_client
# !wget https://aka.ms/downloadazcopy-v10-linux
# !tar -xvf downloadazcopy-v10-linux

In [2]:
# Imports
import import_ipynb
import data_repo_client
import google.auth
import datetime
import os
import sys
import logging
from time import sleep
from google.cloud import bigquery
from google.cloud import storage
import ingest_pipeline_utilities as utils
import pandas as pd
import json
import re
import math
import requests
import subprocess
from azure.storage.blob import BlobServiceClient
import base64

# Configure logging format
while logging.root.handlers:
    logging.root.removeHandler(logging.root.handlers[-1])
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)

# Function to refresh TDR API client
def refresh_tdr_api_client(host):
    creds, project = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    config = data_repo_client.Configuration()
    config.host = host
    config.access_token = creds.token
    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Function to wait for TDR job completion
def wait_for_tdr_job(job_model, host):
    result = job_model
    print("TDR Job ID: " + job_model.id)
    counter = 0
    job_state = "UNKNOWN"
    while True:
        # Re-establish credentials and API clients every 30 minutes
        if counter == 0 or counter%180 == 0:
            api_client = refresh_tdr_api_client(host)
            jobs_api = data_repo_client.JobsApi(api_client=api_client)
        # Check for TDR connectivity issues and raise exception if the issue persists
        conn_err_counter = 0
        while job_state == "UNKNOWN":
            conn_err_counter += 1
            if conn_err_counter >= 10:
                raise Exception("Error interacting with TDR: {}".format(result.status_code)) 
            elif result == None or result.status_code in ["500", "502", "503", "504"]:
                sleep(10)
                counter += 1
                attempt_counter = 0
                while True:
                    try:
                        result = jobs_api.retrieve_job(job_model.id)
                        break
                    except Exception as e:
                        if attempt_counter < 5:
                            attempt_counter += 1
                            sleep(10)
                            continue
                        else:
                            raise Exception("Error retrieving job status from TDR: {}".format(str(e)))
            else:
                job_state = "KNOWN"
        # Check if job is still running, and sleep/re-check if so
        if job_state == "KNOWN" and result.job_status == "running":
            sleep(10)
            counter += 1
            attempt_counter = 0
            while True:
                try:
                    result = jobs_api.retrieve_job(job_model.id)
                    break
                except Exception as e:
                    if attempt_counter < 5:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        raise Exception("Error retrieving job status from TDR: {}".format(str(e)))
        # If job has returned as failed, confirm this is the correct state and retrieve result if so
        elif job_state == "KNOWN" and result.job_status == "failed":
            fail_counter = 0
            while True:
                attempt_counter = 0
                while True:
                    try:
                        result = jobs_api.retrieve_job(job_model.id)
                        if result.job_status == "failed":
                            fail_counter += 1
                        break
                    except Exception as e:
                        if attempt_counter < 5:
                            sleep(10)
                            attempt_counter += 1
                            continue
                        else:
                            raise Exception("Error retrieving job status from TDR: {}".format(str(e)))
                if fail_counter >= 3:
                    try:
                        fail_result = jobs_api.retrieve_job_result(job_model.id)
                        raise Exception("Job " + job_model.id + " failed: " + fail_result)
                    except Exception as e:
                        raise Exception("Job " + job_model.id + " failed: " + str(e))
        # If a job has returned as succeeded, retrieve result
        elif job_state == "KNOWN" and result.job_status == "succeeded":
            attempt_counter = 0
            while True:
                try:
                    return jobs_api.retrieve_job_result(job_model.id), job_model.id
                except Exception as e:
                    if attempt_counter < 3:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        return "Job succeeded, but error retrieving job result: {}".format(str(e)), job_model.id
        else:
            raise Exception("Unrecognized job state: {}".format(result.job_status))

# Migrating TDR Datasets

## Step 1: Pre-Connector Processing
For the list of GCP TDR datasets provided:
1. Extract the schema
2. Create an Azure TDR dataset using the extracted schema
3. Build a manifest of files to be copied from the GCP dataset to the Azure dataset and write to BigQuery.

In [None]:
#############################################
## Functions
#############################################

# Function to build default target TDR dataset name
def format_dataset_name(input_str):
    current_datetime = datetime.datetime.now()
    current_date_string = current_datetime.strftime("%Y%m%d")
    input_str = input_str[:-9]
    output_str = "ANVIL_" + re.sub("^ANVIL[_]?", "", input_str, flags=re.IGNORECASE) + "_" + current_date_string
    output_str = re.sub("[^a-zA-Z0-9_]", "_", output_str)
    return output_str

# Function to create a new TDR dataset from an existing TDR dataset
def create_dataset_from_dataset(src_tdr_object_uuid, tar_tdr_object_uuid, billing_profile):

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Retrieve original dataset details
    logging.info(f"Retrieving original dataset details from prod environment. UUID:  {src_tdr_object_uuid}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=src_tdr_object_uuid, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_dataset = dataset_details["access_information"]["big_query"]["dataset_name"]
        orig_object_name = dataset_details["name"]
    except Exception as e:
        error_str = f"Error retrieving details from dataset {src_tdr_object_uuid} in TDR prod environment: {str(e)}"
        logging.error(error_str)
        return None, None, None, None, None

    # If target dataset specified, retrieve name
    if tar_tdr_object_uuid:
        new_dataset_id = tar_tdr_object_uuid
        logging.info(f"Retrieving new dataset details from prod environment. UUID:  {tar_tdr_object_uuid}")
        try:
            dataset_details = datasets_api.retrieve_dataset(id=tar_tdr_object_uuid, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
            new_object_name = dataset_details["name"]
        except Exception as e:
            error_str = f"Error retrieving details from dataset {tar_tdr_object_uuid} in TDR prod environment: {str(e)}"
            logging.error(error_str)
            return None, None, None, None, None 
    else:
        # Build new dataset schema
        apply_anvil_transforms = True
        new_schema_dict = {"tables": [], "relationships": [], "assets": []}
        for table_entry in dataset_details["schema"]["tables"]:
            int_table_dict = table_entry.copy()
            int_table_dict["primaryKey"] = int_table_dict.pop("primary_key")
            for key in ["partition_mode", "date_partition_options", "int_partition_options", "row_count"]:
                del int_table_dict[key]
            for idx, column_entry in enumerate(table_entry["columns"]):
                if column_entry["datatype"] == "integer":
                    table_entry["columns"][idx]["datatype"] = "int64"
            if apply_anvil_transforms:
                if table_entry["name"] == "file_inventory":
                    int_table_dict["columns"].append({"name": "orig_file_ref", "datatype": "string", "array_of": False, "required": False})
                    int_table_dict["columns"].append({"name": "orig_datarepo_row_id", "datatype": "string", "array_of": False, "required": False})
                elif "anvil_" not in table_entry["name"]:
                    int_table_dict["columns"].append({"name": "orig_datarepo_row_id", "datatype": "string", "array_of": False, "required": False})
            new_schema_dict["tables"].append(int_table_dict)
        for rel_entry in dataset_details["schema"]["relationships"]:
            int_rel_dict = rel_entry.copy()
            int_rel_dict["from"] = int_rel_dict.pop("_from")
            new_schema_dict["relationships"].append(int_rel_dict)
        for asset_entry in dataset_details["schema"]["assets"]:
            int_asset_dict = asset_entry.copy()
            int_asset_dict["rootTable"] = int_asset_dict.pop("root_table")
            int_asset_dict["rootColumn"] = int_asset_dict.pop("root_column")
            new_schema_dict["assets"].append(int_asset_dict)

        # Retrieve original dataset policies
        try:
            dataset_policies = datasets_api.retrieve_dataset_policies(id=src_tdr_object_uuid).to_dict()
            for policy in dataset_policies["policies"]:
                if policy["name"] == "steward":
                    stewards_list = policy["members"]
                elif policy["name"] == "custodian":
                    custodians_list = policy["members"]
                elif policy["name"] == "snapshot_creator":
                    snapshot_creators_list = policy["members"]
        except:
            logging.info("Error retrieving original dataset policies. Skipping policy copy.")
            stewards_list = []
            custodians_list = []
            snapshot_creators_list = []
        policies = {
            "stewards": stewards_list,
            "custodians": custodians_list,
            "snapshotCreators": snapshot_creators_list
        }

        # Determine dataset properties
        new_object_name = format_dataset_name(orig_object_name)
        new_description = dataset_details["description"] + f"\n\nCopy of dataset {orig_object_name} from TDR prod."
        self_hosted = False
        dedicated_ingest_sa = False
        phs_id = dataset_details["phs_id"]
        predictable_file_ids = dataset_details["predictable_file_ids"]
        secure_monitoring_enabled = dataset_details["secure_monitoring_enabled"]
        properties = dataset_details["properties"]
        tags = dataset_details["tags"]

        # Create new TDR dataset
        logging.info("Submitting dataset creation request.")
        dataset_request = {
            "name": new_object_name,
            "description": new_description,
            "defaultProfileId": billing_profile,
            "cloudPlatform": "azure",
            "region": "southcentralus",
            "phsId": phs_id,
            "experimentalSelfHosted": self_hosted,
            "experimentalPredictableFileIds": predictable_file_ids,
            "dedicatedIngestServiceAccount": dedicated_ingest_sa,
            "enableSecureMonitoring": secure_monitoring_enabled,
            "properties": properties,
            "tags": tags,
            "policies": policies,
            "schema": new_schema_dict
        }
        attempt_counter = 1
        while True:
            try:
                create_dataset_result, job_id = wait_for_tdr_job(datasets_api.create_dataset(dataset=dataset_request), "https://data.terra.bio")
                logging.info("Dataset Creation succeeded: {}".format(create_dataset_result))
                new_dataset_id = create_dataset_result["id"]
                break
            except Exception as e:
                error_str = f"Error on Dataset Creation: {str(e)}"
                logging.error(error_str)
                if attempt_counter < 3:
                    logging.info("Retrying Dataset Creation (attempt #{})...".format(str(attempt_counter)))
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    logging.error("Maximum number of retries exceeded. Exiting job.")
                    return None, None, None, None, None
        
    # Exit function
    return orig_object_name, new_dataset_id, new_object_name, bq_project, bq_dataset

# Function to create file transfer details
def output_file_details(orig_dataset_id, orig_dataset_name, new_dataset_id, new_dataset_name, bq_project, bq_dataset, public_flag, target_bigquery_table, delete_existing_records):
    
    # Setup/refresh TDR clients (and BQ client)
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    client = bigquery.Client()
    
    # Clear records from target BQ table
    if delete_existing_records:
        logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
        delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE gcp_dataset_id = '{orig_dataset_id}'"""
        try:
            delete_query_job = client.query(delete_query)
            delete_query_job.result()
        except Exception as e:
            logging.info("Error deleting records for the original dataset from the target BQ table.")
    
    # Retrieve table data from the original dataset and write to target BQ table
    logging.info(f"Fetching and recording all rows from table 'file_inventory' in the original dataset ({orig_dataset_id}). BQ Project = '{bq_project}' and BQ Dataset = '{bq_dataset}'.")
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    job_config = bigquery.QueryJobConfig()
    job_config.destination = target_bigquery_table
    job_config.write_disposition = "WRITE_APPEND"
    query = f"""WITH drlh_deduped AS
                (
                  SELECT DISTINCT file_id, target_path, source_name 
                  FROM 
                  (
                    SELECT *, ROW_NUMBER() OVER (PARTITION BY source_name ORDER BY load_time DESC) AS rn
                    --SELECT *, ROW_NUMBER() OVER (PARTITION BY source_name, target_path ORDER BY load_time DESC) AS rn
                    FROM `{bq_project}.{bq_dataset}.datarepo_load_history`
                    WHERE state = "succeeded" 
                  )
                  WHERE rn = 1
                ),
                file_records AS
                (
                  SELECT '{orig_dataset_id}' AS gcp_dataset_id, '{orig_dataset_name}' AS gcp_dataset_name, 
                  '{new_dataset_id}' AS az_dataset_id, '{new_dataset_name}' AS az_dataset_name, 
                  b.source_name AS source_path, b.target_path, a.size_in_bytes, a.md5_hash, a.file_ref AS orig_tdr_file_id,
                  '{current_datetime_string}' AS date_added, '{public_flag}' AS public_flag, ROW_NUMBER() OVER (PARTITION BY a.file_ref ORDER BY b.source_name) AS rn
                  FROM `{bq_project}.{bq_dataset}.file_inventory` a
                      LEFT JOIN drlh_deduped b
                      ON a.uri = b.source_name
                      LEFT JOIN `broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list` c
                      ON a.file_ref = c.orig_tdr_file_id AND c.az_dataset_id = '{new_dataset_id}'
                  WHERE c.source_path IS NULL
                )
                SELECT * EXCEPT(rn)
                FROM file_records
                WHERE rn = 1"""
    #print(query)
    attempt_counter = 0
    while True:
        try:
            query_job = client.query(query, job_config=job_config)
            query_job.result()
            logging.info("Records recorded successfully.")
            return
        except Exception as e:
            if attempt_counter < 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_str = f"Error recording records for all rows of table 'file_inventory': {str(e)}."
                logging.error(err_str)
                return
    

#############################################
## Input Parameters
#############################################

# General parameters
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list"
azure_billing_profile = "9ee23bed-b46c-4561-9103-d2a723113f7f"

# Specify the list of datasets to process, leaving the target Azure dataset ID empty to create a new one
migration_list = [
    #["src_gcp_dataset_id", "tar_az_dataset_id", "open_access (Y/N)"]
    ['902596ce-714e-49b3-8271-f3dfece52309', 'e091028e-a6b1-4989-9477-498e7ea206f0', 'N'],
]

# Specify whether existing records in the azure_migration_file_list table should be deleted before running
delete_existing_records = False


#############################################
## Execution
#############################################

# Loop through migration list and process entries
results = []
for entry in migration_list:
    logging.info(f"Processing Migration List Entry: {str(entry)}")
    dataset_name, new_dataset_id, new_dataset_name, bq_project, bq_dataset = create_dataset_from_dataset(entry[0], entry[1], azure_billing_profile)
    if new_dataset_id:
        output_file_details(entry[0], dataset_name, new_dataset_id, new_dataset_name, bq_project, bq_dataset, entry[2], target_bigquery_table, delete_existing_records)
        results.append([entry[0], dataset_name, "Success", new_dataset_id, new_dataset_name])
    else:
        results.append([entry[0], dataset_name, "Failure", new_dataset_id, new_dataset_name])
        
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Source Dataset ID", "Source Dataset Name", "Status", "New Dataset ID", "New Dataset Name"])
display(results_df)
            

## Step 2: Post-Connector Processing
For each GCP Dataset - Azure Dataset pair:
1. Retrieve the source GCP Dataset for the Snapshot
2. Extract, pre-process, and ingest tabular data from the GCP Dataset to the Azure Dataset
3. Create a new Azure snapshot based on the GCP snapshot

In [4]:
#############################################
## Functions
#############################################

# Function to fetch data from BigQuery
def fetch_source_records_bigquery(config, new_dataset_id, array_col_dict, table, start_row, end_row):
    # Extract parameters from config
    src_tdr_object_uuid = config["source_dataset_id"]
    src_tdr_object_type = "dataset"
    tdr_host = config["tdr_host"]
    files_already_ingested = True
    datarepo_row_ids_to_ingest = []
    apply_anvil_transforms = True
    bq_project = config["bigquery_project"]
    bq_dataset = config["bigquery_dataset"]
    
    # Setup/refresh TDR clients (and BQ client)
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    client = bigquery.Client()
    
    # Retrieve table data from the original dataset
    logging.info(f"Fetching rows {str(start_row)}-{str(end_row)} from table '{table}' in the original {src_tdr_object_type} ({src_tdr_object_uuid}).")
    table_recs_str = f"Table: {table} -- Rows: {str(start_row)}-{str(end_row)}"
    final_records = []
    if apply_anvil_transforms and "anvil_" not in table:
        if table == "file_inventory":
            if files_already_ingested == False:
                file_ref_sql = "TO_JSON_STRING(STRUCT(source_name AS sourcePath, target_path AS targetPath, 'Ingest of '||source_name AS description, COALESCE(content_type, 'application/octet-stream') AS mimeType))"
            else:
                file_ref_sql = "file_ref"
            rec_fetch_query = f"""WITH drlh_deduped AS
                            (
                              SELECT DISTINCT file_id, target_path, source_name
                              FROM `{bq_project}.{bq_dataset}.datarepo_load_history`
                              WHERE state = "succeeded" 
                            )
                            SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT datarepo_row_id, datarepo_row_id AS orig_datarepo_row_id, a.file_id, name, path, target_path AS uri, content_type, full_extension, size_in_bytes, crc32c, md5_hash, ingest_provenance,
                              file_ref AS orig_file_ref, {file_ref_sql} AS file_ref,
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}` a
                                  LEFT JOIN drlh_deduped b
                                  ON a.file_ref = b.file_id
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""
        else:
            rec_fetch_query = f"""SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT *, datarepo_row_id AS orig_datarepo_row_id,
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}`
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""
    else:
        rec_fetch_query = f"""SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT *, 
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}`
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""
    attempt_counter = 0
    while True:
        try:
            df = client.query(rec_fetch_query).result().to_dataframe()
            df = df.astype(object).where(pd.notnull(df),None)
            for column in array_col_dict[table]:
                df[column] = df[column].apply(lambda x: list(x))
            if apply_anvil_transforms and table == "file_inventory" and files_already_ingested == False: 
                df["file_ref"] = df.apply(lambda x: json.loads(x["file_ref"].replace("\'", "\"")), axis=1)
            final_records = df.to_dict(orient="records")
            break
        except Exception as e:
            if attempt_counter < 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_str = f"Error retrieving records for rows {str(start_row)}-{str(end_row)} of table {table}: {str(e)}."
                logging.error(err_str)
                config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
                return {}
    
    # Filter retrieved data if necessary and return as dict of records
    if final_records:
        df_temp = pd.DataFrame.from_dict(final_records)
        if datarepo_row_ids_to_ingest:
            df_orig = df_temp[df_temp["datarepo_row_id"].isin(datarepo_row_ids_to_ingest)].copy()
        else:
            df_orig = df_temp.copy()
        del df_temp
        df_orig.drop(columns=["datarepo_row_id"], inplace=True, errors="ignore")
        df_orig = df_orig.astype(object).where(pd.notnull(df_orig),None)
        records_orig = df_orig.to_dict(orient="records")
        if not records_orig:
            msg_str = f"No records found in rows {str(start_row)}-{str(end_row)} of table {table} after filtering based on datarepo_row_ids_to_ingest parameter. Continuing to next record set or table validation."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Skipped", msg_str])
            return records_orig
        elif len(final_records) != len(records_orig):
            logging.info(f"Filtering records to ingest based on the datarepo_row_ids_to_ingest parameter. {str(len(records_orig))} of {str(len(final_records))} records to be ingested.")
            return records_orig
        else:
            return records_orig
    else:
        msg_str = f"No records found for rows {str(start_row)}-{str(end_row)} of table {table} in original {src_tdr_object_type}. Continuing to next record set or table validation."
        logging.info(msg_str)
        config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Skipped", msg_str])
        return final_records

# Function to process ingests for specific table
def ingest_table_data(config, new_dataset_id, array_col_dict, data_type_col_dict, table, start_row, end_row):
    
    # Extract parameters from config
    src_tdr_object_uuid = config["source_dataset_id"]
    src_tdr_object_type = "dataset"
    tdr_host = config["tdr_host"]
    tar_tdr_billing_profile = config["tar_tdr_billing_profile"]
    records_processing_method = "in_memory"
    write_to_cloud_platform = ""
    apply_anvil_transforms = True
    dr_row_id_xwalk = config["dr_row_id_xwalk"]

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    
    # Retrieve table data from the original dataset
    table_recs_str = f"Table: {table} -- Rows: {str(start_row)}-{str(end_row)}"
    records_orig = fetch_source_records_bigquery(config, new_dataset_id, array_col_dict, table, start_row, end_row)
    if not records_orig:
        return

    # Pre-process records before ingest
    if "anvil_" in table:
        try:
            # Pre-process records in AnVIL_ records to use new datarepo_row_ids in the source_datarepo_row_ids field
            logging.info("FSS (anvil_%) table with ingest.apply_anvil_transforms parameter set to 'True'. Pre-processing records before submitting ingestion request.")
            records_processed = []
            for record in records_orig:
                int_record = record.copy()
                new_dr_row_id_list = []
                for row_id in int_record["source_datarepo_row_ids"]:
                    new_row_id = dr_row_id_xwalk.get(row_id)
                    if new_row_id:
                        new_dr_row_id_list.append(new_row_id)
                    else:
                        err_str = f"Failure in pre-processing: row_id '{row_id}'' not found in datarepo_row_id crosswalk."
                        logging.error(err_str)
                        config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
                        return   
                int_record["source_datarepo_row_ids"] = new_dr_row_id_list
                for fcol in data_type_col_dict["float"][table]:
                    if int_record[fcol]:
                        int_record[fcol] = float(int_record[fcol])
                for icol in data_type_col_dict["int"][table]:
                    if int_record[icol]:
                        int_record[icol] = int(int_record[icol])
                records_processed.append(int_record)
        except Exception as e:
            err_str = f"Failure in pre-processing: {str(e)}"
            logging.error(err_str)
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
            return
    else:
        records_processed = []
        for record in records_orig:
            int_record = record.copy()
            for fcol in data_type_col_dict["float"][table]:
                if int_record[fcol]:
                    int_record[fcol] = float(int_record[fcol])
            for icol in data_type_col_dict["int"][table]:
                if int_record[icol]:
                    int_record[icol] = int(int_record[icol])
            records_processed.append(int_record)
    
    # Write out records to cloud, if specified by user
    if records_processing_method == "write_to_cloud":
        logging.info(f"Writing records to a control file in the cloud.")
        if write_to_cloud_platform == "gcp":
            control_file_path = write_records_to_gcp(config, table, records_processed)
        else:
            control_file_path = write_records_to_azure(config, table, records_processed)

    # Build, submit, and monitor ingest request
    logging.info(f"Submitting ingestion request to new dataset ({new_dataset_id}).")
    if records_processing_method == "write_to_cloud":
        ingest_request = {
            "table": table,
            "profile_id": tar_tdr_billing_profile,
            "ignore_unknown_values": True,
            "resolve_existing_files": True,
            "updateStrategy": "append",
            "format": "json",
            "load_tag": "Ingest for {}".format(new_dataset_id),
            "path": control_file_path
        }        
    else:
        ingest_request = {
            "table": table,
            "profile_id": tar_tdr_billing_profile,
            "ignore_unknown_values": True,
            "resolve_existing_files": True,
            "updateStrategy": "append",
            "format": "array",
            "load_tag": "Ingest for {}".format(new_dataset_id),
            "records": records_processed
        }
    attempt_counter = 1
    while True:
        try:
            api_client = refresh_tdr_api_client(tdr_host)
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = wait_for_tdr_job(datasets_api.ingest_dataset(id=new_dataset_id, ingest=ingest_request), tdr_host)
            logging.info("Ingest succeeded: {}".format(str(ingest_request_result)[0:1000]))
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Success", str(ingest_request_result)[0:1000]])
            break
        except Exception as e:
            logging.error("Error on ingest: {}".format(str(e)[0:2500]))
            if attempt_counter < 2:
                logging.info("Retrying ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                attempt_counter += 1
                continue
            else:
                logging.error("Maximum number of retries exceeded. Logging error.")
                err_str = f"Error on ingest: {str(e)[0:2500]}"
                config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])  
                break

    # Remove control file from cloud, if written out
    if records_processing_method == "write_to_cloud":
        logging.info(f"Removing control file from the cloud.")
        if write_to_cloud_platform == "gcp":
            client = storage.Client()
            target_bucket = control_file_path.split("/")[2]
            target_object = "/".join(control_file_path.split("/")[3:])
            bucket = client.bucket(target_bucket)
            blob = bucket.blob(target_object)
            blob.delete()
        else:
            blob = BlobClient.from_blob_url(control_file_path)
            blob.delete_blob()

# Function to orchestration the migration of tabular data
def migrate_tabular_data(config):

    # Extract parameters from config
    source_dataset_id = config["source_dataset_id"]
    target_dataset_id = config["target_dataset_id"] 
    tables_to_ingest = config["tables_to_ingest"] 
    tdr_host = config["tdr_host"] 
    tdr_sa_to_use = config["tdr_sa_to_use"] 
    chunk_size = config["chunk_size"] 
    max_combined_rec_ref_size = config["max_combined_rec_ref_size"] 
    skip_ingests = config["skip_ingests"]

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

    # Add TDR SA to original dataset
    logging.info(f"Adding TDR general SA ({tdr_sa_to_use}) to original dataset: {source_dataset_id}")
    try:
        resp = datasets_api.add_dataset_policy_member(id=source_dataset_id, policy_name="steward", policy_member={"email": tdr_sa_to_use}) 
        logging.info("TDR SA added successfully.")
    except:
        error_str = f"Error adding TDR SA to dataset {source_dataset_id}: {str(e)}"
        logging.error(error_str)
        config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
        return

    # Collect details from original dataset to build inventory of tables to migrate
    logging.info(f"Retrieving dataset details from original dataset: {source_dataset_id}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=source_dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
        config["bigquery_project"] = dataset_details["access_information"]["big_query"]["project_id"]
        config["bigquery_dataset"] = dataset_details["access_information"]["big_query"]["dataset_name"]
        fileref_col_dict = {}
        array_col_dict = {}
        data_type_col_dict = {}
        float_col_dict = {}
        int_col_dict = {}
        for table_entry in dataset_details["schema"]["tables"]:
            fileref_list = []
            array_list = []
            float_list = []
            int_list = []
            for idx, column_entry in enumerate(table_entry["columns"]):
                if column_entry["datatype"] == "fileref":
                    fileref_list.append(column_entry["name"])
                elif column_entry["datatype"] == "float":
                    float_list.append(column_entry["name"])
                elif column_entry["datatype"] == "integer":
                    int_list.append(column_entry["name"])
                if column_entry["array_of"] == True:
                    array_list.append(column_entry["name"])
            fileref_col_dict[table_entry["name"]] = fileref_list
            array_col_dict[table_entry["name"]] = array_list
            float_col_dict[table_entry["name"]] = float_list
            data_type_col_dict["float"] = float_col_dict
            int_col_dict[table_entry["name"]] = int_list
            data_type_col_dict["int"] = int_col_dict
    except Exception as e:
        error_str = f"Error retrieving details from dataset {source_dataset_id}: {str(e)}"
        logging.error(error_str)
        config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
        return

    # Read in existing datarepo_row_id crosswalk, if one exists
    logging.info("Fetching existing datarepo_row_id crosswalk (if one exists).")
    xwalk_json_file_name = f"{source_dataset_id}_{target_dataset_id}_rowid_xwalk.json"
    try:
        with open(xwalk_json_file_name,"r") as file:
            datarepo_row_id_xwalk = json.load(file)
    except:
        datarepo_row_id_xwalk = {}
        logging.warning(f"No datarepo_row_id crosswalk file name '{xwalk_json_file_name}' found.")

    # Order tables for ingestion
    logging.info("Ordering tables and pulling current record counts for validation.")
    table_rank_dict = {}
    for table in fileref_col_dict.keys():
        if table == "file_inventory":
            table_rank_dict[table] = 1
        elif "anvil_" not in table:
            table_rank_dict[table] = 2
        else:
            table_rank_dict[table] = 3
    ordered_table_list = sorted(table_rank_dict, key= lambda key: table_rank_dict[key])

    # Fetch total record counts for all tables
    populated_table_dict = {}
    for table in ordered_table_list:
        api_client = refresh_tdr_api_client(tdr_host)
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        attempt_counter = 0
        while True:
            payload = {
              "offset": 0,
              "limit": 10,
              "sort": "datarepo_row_id",
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=source_dataset_id, table=table, query_data_request_model=payload).to_dict()
                total_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if attempt_counter < 2:
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    total_record_count = -1
                    break
        if total_record_count == -1:
            error_str = f"Error retrieving current record counts for tables in dataset {source_dataset_id}: {str(e)}"
            logging.error(error_str)
            config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
            return
        elif total_record_count > 0:
            populated_table_dict[table] = total_record_count

    # Loop through and process tables for ingestion
    logging.info("Processing dataset ingestion requests.")
    pop_fss_table_cnt = 0
    for table in ordered_table_list:

        # Determine whether table should be processed, and skip if not
        logging.info(f"Processing dataset ingestion for table '{table}'.")
        total_record_count = 0
        if tables_to_ingest and table not in tables_to_ingest:
            msg_str = f"Table '{table}' not listed in the tables_to_ingest parameter. Skipping."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
            continue
        elif table not in populated_table_dict.keys():
            msg_str = f"No records found for table '{table}' in original dataset. Continuing to next table/record set."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
            continue
        elif "anvil_" in table:
            # Confirm all non-FSS tables are present in datarepo_row_id_xwalk
            pop_fss_table_cnt += 1
            missing_tab_list = []
            for tab in populated_table_dict.keys():
                if "anvil_" not in tab and tab not in datarepo_row_id_xwalk.keys():
                    missing_tab_list.append(tab)
            if len(missing_tab_list) > 0:
                missing_tab_string = ", ".join(missing_tab_list)
                msg_str = f"Populated non-FSS tables missing from datarepo_row_id crosswalk: {missing_tab_string}. Skipping FSS table '{table}'."
                logging.info(msg_str)
                config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
                continue
        
        # Aggregate datarepo_row_id crosswalk information for use in FSS table processing
        if pop_fss_table_cnt == 1:
            dr_row_id_xwalk = {}
            for key in datarepo_row_id_xwalk.keys():
                dr_row_id_xwalk.update(datarepo_row_id_xwalk[key])
            config["dr_row_id_xwalk"] = dr_row_id_xwalk 
            
        # Chunk table records as necessary, then loop through and process each chunk
        total_record_count = populated_table_dict.get(table)
        if skip_ingests:
            msg_str = f"Parameter 'skip_ingests' set to true. Skipping ingestion for table '{table}'."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
        else:
            if fileref_col_dict[table]:
                ref_chunk_size = math.floor(max_combined_rec_ref_size / len(fileref_col_dict[table]))
                table_chunk_size = min(chunk_size, ref_chunk_size)
                logging.info(f"Table '{table}' contains fileref columns. Will use a chunk size of {table_chunk_size} rows per ingestion request, to keep the number of file references per chunk below {max_combined_rec_ref_size}.")
            else:
                table_chunk_size = chunk_size
                logging.info(f"Table '{table}' does not contain fileref columns. Will use a chunk size of {table_chunk_size} rows per ingestion request.")
            start_row = 1
            end_row = min((table_chunk_size), total_record_count)
            while start_row <= total_record_count:
                if end_row > total_record_count:
                    end_row = total_record_count
                ingest_table_data(config, target_dataset_id, array_col_dict, data_type_col_dict, table, start_row, end_row)    
                start_row += table_chunk_size
                end_row += table_chunk_size

        # Build datarepo_row_id crosswalk for the table, add to datarepo_row_id_xwalk dict, and write out updated dict to file
        if "anvil_" not in table: 
            logging.info("Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.")
            temp_dr_xwalk = {}
            api_client = refresh_tdr_api_client(tdr_host)
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            max_page_size = 1000
            records_fetched = 0
            retrieval_error = False
            while records_fetched < total_record_count and not retrieval_error:
                row_start = records_fetched
                attempt_counter = 0
                while True:
                    payload = {
                      "offset": row_start,
                      "limit": max_page_size,
                      "sort": "datarepo_row_id",
                      "direction": "asc",
                      "filter": ""
                    }
                    try:
                        dataset_results = datasets_api.query_dataset_data_by_id(id=target_dataset_id, table=table, query_data_request_model=payload).to_dict() 
                        if len(dataset_results["result"]) == 0:
                            warn_str = f"No records found for '{table}' table, which prevents the proper building of the datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream."
                            logging.warning(warn_str)
                            retrieval_error = True
                            break  
                        else:
                            for record in dataset_results["result"]:
                                key = table + ":" + record["orig_datarepo_row_id"]
                                val = table + ":" + record["datarepo_row_id"]
                                temp_dr_xwalk[key] = val
                                records_fetched += 1
                            break
                    except Exception as e:
                        if attempt_counter < 2:
                            sleep(10)
                            attempt_counter += 1
                            continue
                        else:
                            warn_str = f"Error retrieving records for '{table}' table to build datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream. Error: {str(e)}"
                            logging.warning(warn_str)
                            retrieval_error = True
                            break
            if not retrieval_error:
                datarepo_row_id_xwalk[table] = temp_dr_xwalk
                with open(xwalk_json_file_name, 'w') as file:
                    json.dump(datarepo_row_id_xwalk, file)
        
        # Fetch total record count for the new table
        api_client = refresh_tdr_api_client(tdr_host)
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        attempt_counter = 0
        while True:
            payload = {
              "offset": 0,
              "limit": 10,
              "sort": "datarepo_row_id",
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=target_dataset_id, table=table, query_data_request_model=payload).to_dict()
                new_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if attempt_counter < 2:
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    new_record_count = -1
                    break
        if new_record_count == -1:
            err_str = f"Error retrieving record count for table '{table}' in new dataset. Skipping validation and continuing to next table."
            logging.error(err_str)
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Failure", err_str])
            continue 

        # Validate the new table against the old table, with extra scrutiny given to the file_inventory table for AnVIL migrations
        logging.info(f"Validating table '{table}' in new dataset vs. original dataset.")
        if new_record_count == total_record_count:
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Success", f"{new_record_count} records found in both new and original table."])
        else:
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Failure", f"{new_record_count} records found in new table doesn't match {total_record_count} records in original table."])

    # Display results
    pipeline_results = pd.DataFrame(config["migration_results"], columns = ["Task", "Step", "Status", "Message"])
    failures = pipeline_results[pipeline_results["Status"].str.contains("Failure")]
    logging.info("Migration Pipeline Results:")
    display(pipeline_results)
    logging.info(f"\nPipeline finished with {len(failures)} failures.")
    return len(failures)

# Function for creating a snapshot for the new dataset
def recreate_snapshot(config):
    
    # Extract parameters from config
    target_dataset_id = config["target_dataset_id"] 
    azure_billing_profile = config["azure_billing_profile"] 
    tdr_host = config["tdr_host"] 
    anvil_schema = config["anvil_schema"] 
    
    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    
    # Retrieve new dataset details
    logging.info(f"Retrieving dataset details from prod environment. UUID:  {target_dataset_id}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=target_dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
        dataset_name = dataset_details["name"]
        phs_id = dataset_details["phs_id"]
        consent_name = dataset_details["properties"]["consent_name"]
        auth_domains = dataset_details["properties"]["auth_domains"]
        current_datetime = datetime.datetime.now()
        current_datetime_string = current_datetime.strftime("%Y%m%d%H%M")
        snapshot_name = dataset_name + "_" + anvil_schema + "_" + current_datetime_string
    except Exception as e:
        error_str = f"Error retrieving details from dataset: {str(e)}"
        logging.error(error_str)
    # Build config and submit snapshot job
    snapshot_config = {
        "profile_id": azure_billing_profile,
        "snapshot_readers_list": ["azul-anvil-prod@firecloud.org", "auth-domain"],
        "anvil_schema_versin": anvil_schema,
        "ws_bucket": os.environ["WORKSPACE_BUCKET"],
        "dataset_id": entry[1],
        "dataset_name": dataset_name,
        "phs_id": phs_id,
        "consent_name": consent_name,
        "auth_domains": auth_domains,
        "pipeline_results": [],
        "snapshot_name": snapshot_name
    }
    utils.create_and_share_snapshot(snapshot_config)
    int_df_results = pd.DataFrame(snapshot_config["pipeline_results"], columns = ["Dataset", "Time", "Step", "Task", "Status", "Message"])
    errors = int_df_results[int_df_results["Status"].str.contains("Error")]
    if len(errors) > 0:
        logging.error("Errors reported in snapshotting. See logs for details.")
        status = "Failure"
        message = f"{len(errors)} failures reported. See log for details."
        snapshot_id = ""
        snapshot_name = ""
    else:
        status = "Success"
        message = ""
        snapshot_id = re.search("{'id': '([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'", str(int_df_results[int_df_results["Task"]=="Create and Share Snapshot"]["Message"]))[1]
        snapshot_name = re.search("'name': '([a-zA-Z0-9_\-]+)'", str(int_df_results[int_df_results["Task"]=="Create and Share Snapshot"]["Message"]))[1]
    return status, message, snapshot_id, snapshot_name
        
#############################################
## Input Parameters
#############################################

# Specify migration pairs: [Source GCP Dataset, Target Azure Dataset]
migration_list = [
    #["gcp_dataset_id", "az_dataset_id"]
    ['c6f3bd64-ea67-488f-904f-f0bdf6320b5c', 'fbc7f442-585f-4885-9e2e-bdb38425867d'],
]

# Run parameters
azure_billing_profile = "9ee23bed-b46c-4561-9103-d2a723113f7f"
anvil_schema = "ANV5"
run_data_migration = Fa
skip_ingests = False # Set to True to build datarepo_row_id xwalk and run validation w/o ingesting more records
#tables_to_ingest = ["anvil_biosample", "anvil_dataset", "anvil_donor", "anvil_file", "anvil_project"] # Leave empty for all
tables_to_ingest = []
run_snapshot_creation = False


#############################################
## Execution
#############################################

# Set up logging
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
logs_stream_file_path = "processing_details_" + current_datetime_string + ".log"
while logging.root.handlers:
    logging.root.removeHandler(logging.root.handlers[-1])
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO, handlers=[logging.FileHandler(logs_stream_file_path), logging.StreamHandler(sys.stdout)])

# Loop through migration list and process entries
results = []
for entry in migration_list:
    
    # Run cross-cloud ingestion, if specified
    failure_count = 0
    if run_data_migration:
        logging.info(f"\nMigrating tabular data from TDR dataset {entry[0]} to TDR dataset {entry[1]}.")
        # Build config and submit migration job
        config = {
            "source_dataset_id": entry[0], 
            "target_dataset_id": entry[1],
            "tables_to_ingest": tables_to_ingest,
            "tdr_host": "https://data.terra.bio",
            "tdr_sa_to_use": "datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com",
            "tar_tdr_billing_profile": azure_billing_profile,
            "chunk_size": 250000,
            "max_combined_rec_ref_size": 40000,
            "migration_results": [],
            "dr_row_id_xwalk": {},
            "skip_ingests": skip_ingests
        }
        failure_count = migrate_tabular_data(config)
        status = "Failure" if failure_count > 0 else "Success"
        msg = f"{failure_count} failures reported. See log for details." if failure_count > 0 else ""
        results.append([entry[0], entry[1], "Data Ingestion", status, msg, "", ""])

    # Run snapshotting, if specified and no upstream errors detected
    if run_snapshot_creation:
        logging.info(f"Creating a snapshot for TDR dataset {entry[1]}.")
        # Build config and submit snapshot job
        config = { 
            "target_dataset_id": entry[1],
            "tdr_host": "https://data.terra.bio",
            "azure_billing_profile": azure_billing_profile,
            "anvil_schema": anvil_schema
        }
        if failure_count > 0:
            logging.error("Failures noted in upstream data processing. Skipping snapshotting.")
            results.append([entry[0], entry[1], "Data Snapshotting", "Skipped", "Failures noted in upstream data processing.", "", ""])
        else:
            status, message, snapshot_id, snapshot_name = recreate_snapshot(config)
            results.append([entry[0], entry[1], "Data Snapshotting", status, message, snapshot_id, snapshot_name])
            
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Source Dataset ID", "Target Dataset ID", "Processing Step", "Status", "Message", "Snapshot ID", "Snapshot Name"])
display(results_df)


06/16/2025 06:28:29 PM - INFO: 
Migrating tabular data from TDR dataset c6f3bd64-ea67-488f-904f-f0bdf6320b5c to TDR dataset fbc7f442-585f-4885-9e2e-bdb38425867d.
06/16/2025 06:28:29 PM - INFO: Adding TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) to original dataset: c6f3bd64-ea67-488f-904f-f0bdf6320b5c
06/16/2025 06:28:30 PM - INFO: TDR SA added successfully.
06/16/2025 06:28:30 PM - INFO: Retrieving dataset details from original dataset: c6f3bd64-ea67-488f-904f-f0bdf6320b5c
06/16/2025 06:28:30 PM - INFO: Fetching existing datarepo_row_id crosswalk (if one exists).
06/16/2025 06:28:30 PM - INFO: Ordering tables and pulling current record counts for validation.
06/16/2025 06:28:45 PM - INFO: Processing dataset ingestion requests.
06/16/2025 06:28:45 PM - INFO: Processing dataset ingestion for table 'file_inventory'.
06/16/2025 06:28:45 PM - INFO: Table 'file_inventory' contains fileref columns. Will use a chunk size of 40000 rows per ingestion requ

06/16/2025 06:37:25 PM - INFO: Validating table 'anvil_activity' in new dataset vs. original dataset.
06/16/2025 06:37:25 PM - INFO: Processing dataset ingestion for table 'anvil_antibody'.
06/16/2025 06:37:25 PM - INFO: No records found for table 'anvil_antibody' in original dataset. Continuing to next table/record set.
06/16/2025 06:37:25 PM - INFO: Processing dataset ingestion for table 'anvil_diagnosis'.
06/16/2025 06:37:25 PM - INFO: No records found for table 'anvil_diagnosis' in original dataset. Continuing to next table/record set.
06/16/2025 06:37:25 PM - INFO: Processing dataset ingestion for table 'anvil_sequencingactivity'.
06/16/2025 06:37:25 PM - INFO: No records found for table 'anvil_sequencingactivity' in original dataset. Continuing to next table/record set.
06/16/2025 06:37:25 PM - INFO: Processing dataset ingestion for table 'anvil_variantcallingactivity'.
06/16/2025 06:37:25 PM - INFO: No records found for table 'anvil_variantcallingactivity' in original dataset. C

Unnamed: 0,Task,Step,Status,Message
0,Dataset Ingestion,Table: file_inventory -- Rows: 1-8619,Success,"{'dataset_id': 'fbc7f442-585f-4885-9e2e-bdb38425867d', 'dataset': 'ANVIL_GTEx_V9_hg38_20240221', 'table': 'file_inventory', 'path': None, 'load_tag': 'Ingest for fbc7f442-585f-4885-9e2e-bdb38425867d', 'row_count': 8619, 'bad_row_count': 0, 'load_result': None}"
1,Dataset Validation,Table: file_inventory,Success,8619 records found in both new and original table.
2,Dataset Ingestion,Table: workspace_attributes -- Rows: 1-37,Success,"{'dataset_id': 'fbc7f442-585f-4885-9e2e-bdb38425867d', 'dataset': 'ANVIL_GTEx_V9_hg38_20240221', 'table': 'workspace_attributes', 'path': None, 'load_tag': 'Ingest for fbc7f442-585f-4885-9e2e-bdb38425867d', 'row_count': 37, 'bad_row_count': 0, 'load_result': None}"
3,Dataset Validation,Table: workspace_attributes,Success,37 records found in both new and original table.
4,Dataset Ingestion,Table: anvil_file -- Rows: 1-8619,Success,"{'dataset_id': 'fbc7f442-585f-4885-9e2e-bdb38425867d', 'dataset': 'ANVIL_GTEx_V9_hg38_20240221', 'table': 'anvil_file', 'path': None, 'load_tag': 'Ingest for fbc7f442-585f-4885-9e2e-bdb38425867d', 'row_count': 8619, 'bad_row_count': 0, 'load_result': None}"
5,Dataset Validation,Table: anvil_file,Success,8619 records found in both new and original table.
6,Dataset Ingestion,Table: anvil_biosample,Skipped,No records found for table 'anvil_biosample' in original dataset. Continuing to next table/record set.
7,Dataset Ingestion,Table: anvil_project -- Rows: 1-1,Success,"{'dataset_id': 'fbc7f442-585f-4885-9e2e-bdb38425867d', 'dataset': 'ANVIL_GTEx_V9_hg38_20240221', 'table': 'anvil_project', 'path': None, 'load_tag': 'Ingest for fbc7f442-585f-4885-9e2e-bdb38425867d', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"
8,Dataset Validation,Table: anvil_project,Success,1 records found in both new and original table.
9,Dataset Ingestion,Table: anvil_dataset -- Rows: 1-1,Success,"{'dataset_id': 'fbc7f442-585f-4885-9e2e-bdb38425867d', 'dataset': 'ANVIL_GTEx_V9_hg38_20240221', 'table': 'anvil_dataset', 'path': None, 'load_tag': 'Ingest for fbc7f442-585f-4885-9e2e-bdb38425867d', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}"


06/16/2025 06:37:25 PM - INFO: 
Pipeline finished with 0 failures.

Final Results:


Unnamed: 0,Source Dataset ID,Target Dataset ID,Processing Step,Status,Message,Snapshot ID,Snapshot Name
0,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,Data Ingestion,Success,,,


### Manual Ingest Scratch

In [None]:
tdr_host = "https://data.terra.bio"
api_client = refresh_tdr_api_client(tdr_host)
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
float_col_dict = {}
float_col_dict["sample"] = ['fold_80_base_penalty', 'fold_enrichment', 'het_snp_sensitivity', 'library_1_mean_insert_size', 'library_1_pct_exc_dupe', 'library_1_percent_duplication', 'mean_bait_coverage', 'mean_insert_size', 'mean_target_coverage', 'on_bait_vs_selected', 'pct_chimeras', 'pct_contamination', 'pct_exc_baseq', 'pct_exc_dupe', 'pct_exc_mapq', 'pct_exc_off_target', 'pct_exc_overlap', 'pct_off_bait', 'pct_pf_reads_aligned', 'pct_reads_aligned_in_pairs', 'pct_selected_bases', 'pct_target_bases_100x', 'pct_target_bases_10x', 'pct_target_bases_20x', 'pct_target_bases_2x', 'pct_target_bases_30x', 'pct_target_bases_50x', 'pct_usable_bases_on_bait', 'pct_usable_bases_on_target', 'pf_hq_error_rate', 'strand_balance', 'zero_cvg_targets_pct', 'library_2_mean_insert_size', 'library_2_pct_exc_dupe', 'library_2_percent_duplication']
table = "sample"

# Pull ingested samples
payload = {
  "offset": 0,
  "limit": 1000,
  "sort": "datarepo_row_id",
  "direction": "asc",
  "filter": ""
}
ingested_records = datasets_api.query_dataset_data_by_id(id="5f4ece3e-d76e-4d78-99e0-e62a24cd163d", table=table, query_data_request_model=payload).to_dict()
already_processed_samples = [rec["sample_id"] for rec in ingested_records["result"]]

# Pull samples to ingest
payload = {
  "offset": 0,
  "limit": 1000,
  "sort": "datarepo_row_id",
  "direction": "asc",
  "filter": ""
}
records_orig = datasets_api.query_dataset_data_by_id(id="85dbde76-c130-40b2-8a8a-ba815ba499da", table=table, query_data_request_model=payload).to_dict()
records_processed = []
for record in records_orig["result"]:
    int_record = record.copy()
    for fcol in float_col_dict[table]:
        if int_record[fcol]:
            int_record[fcol] = float(int_record[fcol])
    if int_record["sample_id"] not in already_processed_samples:
        records_processed.append(int_record)

# Build ingest request
ingest_request = {
    "table": table,
    "profile_id": "9ee23bed-b46c-4561-9103-d2a723113f7f",
    "ignore_unknown_values": True,
    "resolve_existing_files": True,
    "updateStrategy": "append",
    "format": "array",
    "load_tag": "Ingest for 5f4ece3e-d76e-4d78-99e0-e62a24cd163d",
    "records": records_processed[0:100]
}

In [None]:
len(already_processed_samples)

In [None]:
len(records_processed)

In [None]:
len(ingest_request["records"])

In [None]:
json.dumps(ingest_request)

### Manual datarepo_row_id_xwalk modifications

In [None]:
# Inputs
tdr_host = "https://data.terra.bio"
source_dataset_id = '75fb0984-2124-444f-881b-30a1a6f8b8f7'
target_dataset_id = '7a9eee5d-95c2-4947-93a8-e31d53a2a09a'
table = "subject"
key_field = "subject_id"

# Load existing datarepo_row_id_walk
logging.info("Fetching existing datarepo_row_id crosswalk (if one exists).")
xwalk_json_file_name = f"{source_dataset_id}_{target_dataset_id}_rowid_xwalk.json"
try:
    with open(xwalk_json_file_name,"r") as file:
        datarepo_row_id_xwalk = json.load(file)
except:
    datarepo_row_id_xwalk = {}
    logging.warning(f"No datarepo_row_id crosswalk file name '{xwalk_json_file_name}' found.")

# Fetch records from source dataset
logging.info("Fetching records from source dataset.")
api_client = refresh_tdr_api_client(tdr_host)
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
max_page_size = 1000
total_record_count = 1000
records_fetched = 0
retrieval_error = False
source_records = []
while records_fetched < total_record_count and not retrieval_error:
    row_start = records_fetched
    attempt_counter = 0
    while True:
        payload = {
          "offset": row_start,
          "limit": max_page_size,
          "sort": "datarepo_row_id",
          "direction": "asc",
          "filter": ""
        }
        try:
            dataset_results = datasets_api.query_dataset_data_by_id(id=source_dataset_id, table=table, query_data_request_model=payload).to_dict() 
            total_record_count = dataset_results["total_row_count"]
            if len(dataset_results["result"]) == 0:
                warn_str = f"No records found for '{table}' table, which prevents the proper building of the datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream."
                logging.warning(warn_str)
                retrieval_error = True
                break  
            else:
                for record in dataset_results["result"]:
                    source_records.append([record[key_field], record["datarepo_row_id"]])
                    records_fetched += 1
                break
        except Exception as e:
                if attempt_counter < 0:
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    warn_str = f"Error retrieving records for '{table}' table to build datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream. Error: {str(e)}"
                    logging.warning(warn_str)
                    retrieval_error = True
                    break
    logging.info(f"Records fetched: {str(records_fetched)}")

# Fetch records from target dataset
logging.info("Fetching records from target dataset.")
api_client = refresh_tdr_api_client(tdr_host)
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
max_page_size = 1000
total_record_count = 1000
records_fetched = 0
retrieval_error = False
target_records = []
while records_fetched < total_record_count and not retrieval_error:
    row_start = records_fetched
    attempt_counter = 0
    while True:
        payload = {
          "offset": row_start,
          "limit": max_page_size,
          "sort": "datarepo_row_id",
          "direction": "asc",
          "filter": ""
        }
        try:
            dataset_results = datasets_api.query_dataset_data_by_id(id=target_dataset_id, table=table, query_data_request_model=payload).to_dict() 
            total_record_count = dataset_results["total_row_count"]
            if len(dataset_results["result"]) == 0:
                warn_str = f"No records found for '{table}' table, which prevents the proper building of the datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream."
                logging.warning(warn_str)
                retrieval_error = True
                break  
            else:
                for record in dataset_results["result"]:
                    target_records.append([record[key_field], record["datarepo_row_id"]])
                    records_fetched += 1
                break
        except Exception as e:
                if attempt_counter < 0:
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    warn_str = f"Error retrieving records for '{table}' table to build datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream. Error: {str(e)}"
                    logging.warning(warn_str)
                    retrieval_error = True
                    break
    logging.info(f"Records fetched: {str(records_fetched)}")

# Match records and update datarepo_row_id_xwalk
logging.info("Building records for datarepo_row_id_xwalk")
temp_dr_xwalk = {}
for source_record in source_records:
    for target_record in target_records:
        if source_record[0] == target_record[0]:
            key = table + ":" + source_record[1]
            val = table + ":" + target_record[1]
            temp_dr_xwalk[key] = val
            break
if len(temp_dr_xwalk) == total_record_count:
    datarepo_row_id_xwalk[table] = temp_dr_xwalk
    with open(xwalk_json_file_name, 'w') as file:
        json.dump(datarepo_row_id_xwalk, file)
    logging.info("Processing complete.")
else:
    logging.error("Rows in xwalk doesn't match table record count.")


In [None]:
# Inputs
source_dataset_id = '65793118-3c88-4185-9172-2354850e6056'
target_dataset_id = '183ec762-f867-46c5-bb19-8b2b3417f7b2'

# Load existing datarepo_row_id_walk
logging.info("Fetching existing datarepo_row_id crosswalk (if one exists).")
xwalk_json_file_name = f"{source_dataset_id}_{target_dataset_id}_rowid_xwalk.json"
try:
    with open(xwalk_json_file_name,"r") as file:
        datarepo_row_id_xwalk = json.load(file)
except:
    datarepo_row_id_xwalk = {}
    logging.warning(f"No datarepo_row_id crosswalk file name '{xwalk_json_file_name}' found.")

# Output crosswalk record counts
for key in datarepo_row_id_xwalk.keys():
    length = len(datarepo_row_id_xwalk[key])
    print(f"{key}: {length}")

## Validation

### Pull and Compare Tabular Data between TDR Datasets

In [5]:
#############################################
## Functions
#############################################

def compare_row_counts(dataset_1_id, dataset_2_id):
    
    # Setup/refresh TDR clients
    logging.info(f"Comparing tabular data record counts between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Pull table list across datasets
    logging.info(f"Pulling the superset of tables across the two datasets.")
    try:
        dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
        dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
    except Exception as e:
        error_str = f"Error retrieving details from datasets: {str(e)}"
        logging.error(error_str)
    table_set = set()
    for table_entry in dataset_1_details["schema"]["tables"]:
        table_set.add(table_entry["name"])
    for table_entry in dataset_2_details["schema"]["tables"]:
        table_set.add(table_entry["name"])   

    # For each table in the table list, pull record counts from the two datasets and compare
    results = []
    payload = {
      "offset": 0,
      "limit": 10,
      "sort": "datarepo_row_id",
      "direction": "asc",
      "filter": ""
    }
    for table in table_set:
        logging.info(f"Comparing record counts for table '{table}'")
        # Pulling record counts for dataset 1
        ds1_table_present = "True"
        attempt_counter = 0
        while True:
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_1_id, table=table, query_data_request_model=payload).to_dict()
                ds1_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if "No dataset table exists" in str(e):
                    ds1_record_count = 0
                    ds1_table_present = "False"
                    break
                else:
                    if attempt_counter < 2:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        ds1_record_count = 0
                        ds1_table_present = "Unknown"
                        break
        # Pulling record counts for dataset 2
        ds2_table_present = "True"
        attempt_counter = 0
        while True:
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_2_id, table=table, query_data_request_model=payload).to_dict()
                ds2_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if "No dataset table exists" in str(e):
                    ds2_record_count = 0
                    ds2_table_present = "False"
                    break
                else:
                    if attempt_counter < 2:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        ds2_record_count = 0
                        ds2_table_present = "Unknown"
                        break
        # Build table comparison
        if ds1_table_present == "Unknown" or ds2_table_present == "Unknown":
            status = "Fail"
            error_reason = "Error retrieving table data from dataset(s)"
        elif ds1_table_present == "False" or ds2_table_present == "False":
            status = "Fail"
            error_reason = "Table presence mismatch between datasets"
        elif ds1_record_count != ds2_record_count:
            status = "Fail"
            error_reason = "Difference in record count"
        else:
            status = "Pass"
            error_reason = ""
        results.append([dataset_1_id, dataset_2_id, table, ds1_table_present, ds1_record_count, ds2_table_present, ds2_record_count, status, error_reason])

    # Display detailed results
    print("\nResults:")
    results_df = pd.DataFrame(results, columns = ["Dataset 1 ID", "Dataset 2 ID", "Table", "Table in DS1", "DS1 Record Count", "Table in DS2", "DS2 Record Count", "Status", "Message"])
    display(results_df)

    # Return final aggregated results
    status = "Pass"
    failed_tables = []
    for entry in results:
        if entry[7] == "Fail":
            failed_tables.append(entry[2])
            status = "Fail"
    return status, sorted(failed_tables)
        
def compare_contents_sample(dataset_1_id, dataset_2_id, sample_size, fields_to_ignore):
    # Pull schema, record first column in each table (for ordering)
    # Setup/refresh TDR clients
    logging.info(f"Comparing tabular data record counts between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Pull table list across datasets
    logging.info(f"Pulling the superset of tables across the two datasets.")
    try:
        dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
        dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
    except Exception as e:
        error_str = f"Error retrieving details from datasets: {str(e)}"
        logging.error(error_str)
    table_set = {}
    for table_entry in dataset_1_details["schema"]["tables"]:
        table_set.add(table_entry["name"])
    for table_entry in dataset_2_details["schema"]["tables"]:
        table_set.add(table_entry["name"])  
    
    
    # Loop through tables, pull xxx records (by sample size), ordering by first column
    # Drop fields_to_ignore
    # Compare --> How to best do this
    pass
    
#############################################
## Input Parameters
#############################################

# Specify the list of dataset pairs to compare
dataset_id_pairs_list = [
    #["gcp_dataset_id", "az_dataset_id"]
    ['c6f3bd64-ea67-488f-904f-f0bdf6320b5c', 'fbc7f442-585f-4885-9e2e-bdb38425867d'],
]

# Specify whether row comparison checks should run
run_row_count_comparison = True

# Specify whether table content checks should run, the size of the sample to use (if so), and which fields should be excluded from comparison
run_contents_sample_comparison = False
contents_sample_comparison_size = 1000
fields_to_ignore = ["datarepo_row_id", "orig_datarepo_row_id", "orig_file_ref", "source_datarepo_row_ids", "uri"]

#############################################
## Execution
#############################################

# Run validation
results = []
for dataset_id_pair in dataset_id_pairs_list:
    if run_row_count_comparison:
        status, failed_tables = compare_row_counts(dataset_id_pair[0], dataset_id_pair[1])
        results.append([dataset_id_pair[0], dataset_id_pair[1], "Record Count Comparison", status, ', '.join(failed_tables)])

# Display final results
print("\nFinal Validation Results:")
results_df = pd.DataFrame(results, columns = ["Dataset 1 ID", "Dataset 2 ID", "Validation Type", "Status", "Failed Tables"])
display(results_df)   


06/16/2025 07:33:27 PM - INFO: Comparing tabular data record counts between TDR dataset c6f3bd64-ea67-488f-904f-f0bdf6320b5c and TDR dataset fbc7f442-585f-4885-9e2e-bdb38425867d.
06/16/2025 07:33:27 PM - INFO: Pulling the superset of tables across the two datasets.
06/16/2025 07:33:28 PM - INFO: Comparing record counts for table 'anvil_donor'
06/16/2025 07:33:33 PM - INFO: Comparing record counts for table 'anvil_project'
06/16/2025 07:33:40 PM - INFO: Comparing record counts for table 'anvil_sequencingactivity'
06/16/2025 07:33:45 PM - INFO: Comparing record counts for table 'anvil_biosample'
06/16/2025 07:33:49 PM - INFO: Comparing record counts for table 'anvil_diagnosis'
06/16/2025 07:33:53 PM - INFO: Comparing record counts for table 'anvil_assayactivity'
06/16/2025 07:33:56 PM - INFO: Comparing record counts for table 'anvil_alignmentactivity'
06/16/2025 07:34:00 PM - INFO: Comparing record counts for table 'anvil_dataset'
06/16/2025 07:34:04 PM - INFO: Comparing record counts fo

Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Table,Table in DS1,DS1 Record Count,Table in DS2,DS2 Record Count,Status,Message
0,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_donor,True,0,True,0,Pass,
1,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_project,True,1,True,1,Pass,
2,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_sequencingactivity,True,0,True,0,Pass,
3,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_biosample,True,0,True,0,Pass,
4,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_diagnosis,True,0,True,0,Pass,
5,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_assayactivity,True,0,True,0,Pass,
6,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_alignmentactivity,True,0,True,0,Pass,
7,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_dataset,True,1,True,1,Pass,
8,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_activity,True,1210,True,1210,Pass,
9,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,anvil_antibody,True,0,True,0,Pass,



Final Validation Results:


Unnamed: 0,Dataset 1 ID,Dataset 2 ID,Validation Type,Status,Failed Tables
0,c6f3bd64-ea67-488f-904f-f0bdf6320b5c,fbc7f442-585f-4885-9e2e-bdb38425867d,Record Count Comparison,Pass,


In [None]:
# Parameters
dataset_1_id = "b12fb9be-2ce0-4bfd-8503-732fabba06ab"
dataset_2_id = "744c85cc-13d2-4f90-9d2e-d3143cb01edb"
contents_sample_comparison_size = 1000
fields_to_ignore = ["datarepo_row_id", "orig_datarepo_row_id", "orig_file_ref", "source_datarepo_row_ids", "uri"]

# Setup/refresh TDR clients
logging.info(f"Comparing a sample of tabular data content between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
api_client = refresh_tdr_api_client("https://data.terra.bio")
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Pull table list across datasets
logging.info(f"Pulling the superset of tables across the two datasets.")
try:
    dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
    dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
except Exception as e:
    error_str = f"Error retrieving details from datasets: {str(e)}"
    logging.error(error_str)
table_set = {}
for table_entry in dataset_1_details["schema"]["tables"]:
    table_set[table_entry["name"]] = table_entry["columns"][0]["name"]
for table_entry in dataset_2_details["schema"]["tables"]:
    table_set[table_entry["name"]] = table_entry["columns"][0]["name"]
    
# For each table in the table list, pull sample records from the two datasets and compare
results = []
for table in ["file_inventory"]: #table_set.keys():
    logging.info(f"Comparing sample records for table '{table}'")
    # Pulling sample records for dataset 1
    ds1_table_present = "True"
    max_page_size = 1000
    total_records_fetched = 0
    ds1_final_records = []
    attempt_counter = 0
    while True:
        offset = total_records_fetched
        page_size = min(max_page_size, contents_sample_comparison_size - total_records_fetched)
        attempt_counter = 0
        while True:
            payload = {
              "offset": offset,
              "limit": page_size,
              "sort": table_set[table],
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_1_id, table=table, query_data_request_model=payload).to_dict() 
                break
            except Exception as e:
                if "No dataset table exists" in str(e):
                    record_results = []
                    ds1_table_present = "False"
                    break
                else:
                    if attempt_counter < 2:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        record_results = []
                        ds1_table_present = "Unknown"
                        break
        if record_results["result"]:
            ds1_final_records.extend(record_results["result"])
            total_records_fetched += len(record_results["result"])
        else:
            break
        if total_records_fetched >= contents_sample_comparison_size:
            break
    # Pulling sample records for dataset 2
    ds2_table_present = "True"
    max_page_size = 1000
    total_records_fetched = 0
    ds2_final_records = []
    attempt_counter = 0
    while True:
        offset = total_records_fetched
        page_size = min(max_page_size, contents_sample_comparison_size - total_records_fetched)
        attempt_counter = 0
        while True:
            payload = {
              "offset": offset,
              "limit": page_size,
              "sort": table_set[table],
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_2_id, table=table, query_data_request_model=payload).to_dict() 
                break
            except Exception as e:
                if "No dataset table exists" in str(e):
                    record_results = []
                    ds2_table_present = "False"
                    break
                else:
                    if attempt_counter < 2:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        record_results = []
                        ds2_table_present = "Unknown"
                        break
        if record_results["result"]:
            ds2_final_records.extend(record_results["result"])
            total_records_fetched += len(record_results["result"])
        else:
            break
        if total_records_fetched >= contents_sample_comparison_size:
            break

In [None]:
df_ds1_records_int = pd.DataFrame.from_dict(ds1_final_records)
df_ds2_records_int = pd.DataFrame.from_dict(ds2_final_records)
cols = df_ds1_records_int.columns.tolist()
for field in fields_to_ignore:
    if field in cols:
        cols.remove(field)
df_ds1_records = df_ds1_records_int[cols]
df_ds2_records = df_ds2_records_int[cols]

In [None]:
diff = df_ds1_records.compare(df_ds2_records)

In [None]:
if df_ds1_records.equals(df_ds2_records):
    print("True")
else:
    print("False")

### Pull and Compare File Counts and Sizes between TDR Datasets

In [None]:
#############################################
## Functions
#############################################

def collect_file_stats(dataset_id_pairs_list):
    
    results = []
    for dataset_id_pair in dataset_id_pairs_list:

            # Setup/refresh TDR clients
            logging.info(f"Processing dataset_id_pair: {dataset_id_pair}")
            api_client = refresh_tdr_api_client("https://data.terra.bio")
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

            # Initialize variables
            dataset_id_1 = dataset_id_pair[0]
            file_count_1 = 0
            total_file_size_1 = 0
            max_file_size_1 = 0
            status_1 = "Success"
            message_1 = ""
            dataset_id_2 = dataset_id_pair[1]
            file_count_2 = 0
            total_file_size_2 = 0
            max_file_size_2 = 0
            status_2 = "Success"
            message_2 = ""
            validation_status = "Passed"
            validation_message = ""

            # For dataset_id_1, loop through dataset files and record information
            logging.info(f"Retrieving files from dataset_id {dataset_id_1}...")
            try:
                max_page_size = 1000
                total_records_fetched = 0
                attempt_counter = 0
                while True:
                    try:
                        row_start = total_records_fetched
                        dataset_file_results = datasets_api.list_files(id=dataset_id_1, offset=row_start, limit=max_page_size)
                        if dataset_file_results:
                            total_records_fetched += len(dataset_file_results)
                            for entry in dataset_file_results:
                                file_count_1 += 1
                                total_file_size_1 += entry.size
                                if entry.size > max_file_size_1:
                                    max_file_size_1 = entry.size
                            logging.info(f"{total_records_fetched} records fetched...")
                            attempt_counter = 0
                        else:
                            break
                    except Exception as e:
                        attempt_counter += 1
                        if attempt_counter <= 10:
                            logging.info(f"Failure in file retrieval (attempt #{attempt_counter}). Trying again...")
                            continue
                        else:
                            status_1 = "Failure"
                            message_1 = str(e)
                            logging.error(f"Failure in file retrieval: {message_1}")
                            break
                if status_1 == "Success":
                    logging.info(f"File retrieval complete!")
            except Exception as e:
                status_1 = "Failure"
                message_1 = str(e)
                logging.error(f"Failure in file retrieval: {message_1}")
            
            # For dataset_id_2, loop through dataset files and record information
            logging.info(f"Retrieving files from dataset_id {dataset_id_2}...")
            try:
                max_page_size = 1000
                total_records_fetched = 0
                attempt_counter = 0
                while True:
                    try:
                        row_start = total_records_fetched
                        dataset_file_results = datasets_api.list_files(id=dataset_id_2, offset=row_start, limit=max_page_size)
                        if dataset_file_results:
                            total_records_fetched += len(dataset_file_results)
                            for entry in dataset_file_results:
                                file_count_2 += 1
                                total_file_size_2 += entry.size
                                if entry.size > max_file_size_2:
                                    max_file_size_2 = entry.size
                            logging.info(f"{total_records_fetched} records fetched...")
                            attempt_counter = 0
                        else:
                            break
                    except Exception as e:
                        attempt_counter += 1
                        if attempt_counter <= 10:
                            logging.info(f"Failure in file retrieval (attempt #{attempt_counter}). Trying again...")
                            continue
                        else:
                            status_2 = "Failure"
                            message_2 = str(e)
                            logging.error(f"Failure in file retrieval: {message_2}")
                            break
                if status_2 == "Success":
                    logging.info(f"File retrieval complete!")
            except Exception as e:
                status_2 = "Failure"
                message_2 = str(e)
                logging.error(f"Failure in file retrieval: {message_2}")
                
            # Record and display interim results
            file_count_diff = file_count_1 - file_count_2
            total_file_size_diff = total_file_size_1 - total_file_size_2
            max_file_size_diff = max_file_size_1 - max_file_size_2
            if status_1 == "Failure" or status_2 == "Failure":
                validation_status = "Failed"
                validation_message = "Errors pulling counts for one or more datasets."
            elif file_count_diff > 0 or total_file_size_diff > 0 or max_file_size_diff > 0:
                validation_status = "Failed"
                validation_message = "Difference in counts between datasets."
            results.append([dataset_id_1, dataset_id_2, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, file_count_1, total_file_size_1, max_file_size_1, status_1, message_1, file_count_2, total_file_size_2, max_file_size_2, status_2, message_2])
            int_results_df = pd.DataFrame([[dataset_id_1, dataset_id_2, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, file_count_1, total_file_size_1, max_file_size_1, status_1, message_1, file_count_2, total_file_size_2, max_file_size_2, status_2, message_2]], columns = ["Dataset ID 1", "Dataset ID 2", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "File Count 1", "Total File Size (Bytes) 1", "Max File Size (Bytes) 1", "Status 1 ", "Message 1", "File Count 2", "Total File Size (Bytes) 2", "Max File Size (Bytes) 2", "Status 2 ", "Message 2"])
            logging.info("Results recorded:")
            display(int_results_df)
        
    # Display final results
    logging.info("Aggregating results...")
    ws_bucket = os.environ["WORKSPACE_BUCKET"]
    destination_dir = "ingest_pipeline/resources/azure_migration"
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_file = f"validation_results_{current_datetime_string}.tsv"
    results_df = pd.DataFrame(results, columns = ["Dataset ID 1", "Dataset ID 2", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "File Count 1", "Total File Size (Bytes) 1", "Max File Size (Bytes) 1", "Status 1 ", "Message 1", "File Count 2", "Total File Size (Bytes) 2", "Max File Size (Bytes) 2", "Status 2 ", "Message 2"])
    results_df.to_csv(output_file, index=False, sep="\t")
    !gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
    !rm $output_file
    print("\nAggregated Validation Results:")
    display(results_df)   
    

#############################################
## Input Parameters
#############################################

# Specify the list of dataset IDs
dataset_id_pairs_list = [
#    ['bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8', 'c7206e9a-78ad-4c9d-927f-3ca76646227d'],
#    ['902596ce-714e-49b3-8271-f3dfece52309', 'e091028e-a6b1-4989-9477-498e7ea206f0'],
    ['bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8', 'c7206e9a-78ad-4c9d-927f-3ca76646227d'],
]

#############################################
## Execution
#############################################

collect_file_stats(dataset_id_pairs_list)


# Migrating Workspaces

## Pre-Connector Processing
For each GCP Workspace - Azure Workspace pair:
1. Build a manifest of files to be copied from the GCP Workspace to the Azure Workspace. 
2. Write the manifest to BigQuery for consumption by downstream processes.

Pre-run steps:
1. Use the anvil_ingest_tools notebook to create the Azure workspaces. 
2. Use the anvil_ingest_tools notebook to add the TDR general SA (datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com) as a reader on the source GCP workspaces and a writer on the target Azure workspaces.

Post-run steps:
1. Use the anvil_ingest_tools notebook to remove the TDR general SA from the GCP and Azure workspaces. 


In [None]:
#############################################
## Functions
#############################################

# Function to create file transfer details
def output_file_details(source_ws_project, source_ws_name, target_ws_project, target_ws_name, file_bigquery_table, target_bigquery_table, delete_existing_records):
    
    # Establish credentials and clients
    client = bigquery.Client()
    creds, project = google.auth.default(scopes=['https://www.googleapis.com/auth/cloud-platform', 'openid', 'email', 'profile'])
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)

    # Pull bucket from source workspace
    try:
        ws_attributes = requests.get(
            url=f"https://api.firecloud.org/api/workspaces/{source_ws_project}/{source_ws_name}?fields=workspace.bucketName",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        ws_bucket = ws_attributes["workspace"]["bucketName"]
    except:
        err_str = "Error retrieving workspace attributes for source workspace."
        logging.error(err_str)
        raise Exception(err_str)

    # Pull storage container from target workspace
    try:
        ws_attributes = requests.get(
            url=f"https://api.firecloud.org/api/workspaces/{target_ws_project}/{target_ws_name}?fields=workspace.workspaceId",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        ws_id = ws_attributes["workspace"]["workspaceId"] 
        ws_resources = requests.get(
            url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{ws_id}/resources?offset=0&limit=10&resource=AZURE_STORAGE_CONTAINER",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        resource_id = ""
        for resource_entry in ws_resources["resources"]:
            if resource_entry["resourceAttributes"]["azureStorageContainer"]["storageContainerName"][0:3] == "sc-":
                resource_id = resource_entry["metadata"]["resourceId"]
                break
        if resource_id:
            sas_response = requests.post(
                url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{ws_id}/resources/controlled/azure/storageContainer/{resource_id}/getSasToken?sasExpirationDuration=86400",
                headers={"Authorization": f"Bearer {creds.token}", "accept": "application/json"}
            ).json()
            base_url = sas_response["url"]
            ws_storage_container = re.search("^[a-z0-9:\/=\-\.]+", base_url, re.IGNORECASE).group(0)
        else:
            err_str = "Error retrieving resource information for target workspace."
            logging.error(err_str)
            raise Exception(err_str)
    except:
        err_str = "Error retrieving workspace attributes for target workspace."
        logging.error(err_str)
        raise Exception(err_str)

    # Clear records from target BQ table (if specified)
    if delete_existing_records:
        logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
        delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE gcp_ws_project = '{source_ws_project}' and gcp_ws_name = '{source_ws_name}'"""
        try:
            delete_query_job = client.query(delete_query)
            delete_query_job.result()
        except Exception as e:
            logging.warning("Error deleting records for the original dataset from the target BQ table.") 

    # Write the query to pull files into a dataframe
    logging.info(f"Building manifest of files to copy from the source '{source_ws_project}.{source_ws_name}' workspace to the target '{target_ws_project}.{target_ws_name}' workspace.")
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    query = f"""SELECT '{source_ws_project}' AS gcp_ws_project, '{source_ws_name}' AS gcp_ws_name, 
                '{target_ws_project}' AS az_ws_project, '{target_ws_name}' AS az_ws_name, 
                 'gs://{ws_bucket}/'||name AS source_path, '{ws_storage_container}/'||name AS target_path, 
                 size AS size_in_bytes, md5Hash AS md5_hash, '{current_datetime_string}' AS date_added
                FROM `{file_bigquery_table}` 
                WHERE bucket = '{ws_bucket}'
                AND name NOT LIKE '%/'"""
    attempt_counter = 0
    while True:
        try:
            df = client.query(query).result().to_dataframe()
            job = client.load_table_from_dataframe(df, target_bigquery_table, job_config=job_config)
            logging.info("Records recorded successfully.")
            break
        except Exception as e:
            if attempt_counter < 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_str = f"Error building and writing file manifest: {str(e)}."
                logging.error(err_str)
                raise Exception(err_str)

            
#############################################
## Input Parameters
#############################################

# General parameters
file_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_inventory.object_metadata_26_02_2024__17_14_55"
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list_workspaces"

# Specify migration pairs: Source GCP Workspace - Target Azure Workspace
migration_list = [
    #{"gcp_ws_project": "anvil-datastorage", "gcp_ws_name": "<name>", "az_ws_project": "AnVILDataStorage_Azure", "az_ws_name": "<name>"}
    {'gcp_ws_project': 'anvil-datastorage', 'gcp_ws_name': 'AnVIL_CCDG_WGS_HAIL_Phased-data', 'az_ws_project': 'AnVILDataStorage_Azure', 'az_ws_name': 'AnVIL_CCDG_WGS_HAIL_Phased-data_Azure'},
]

# Specify whether existing records in the azure_migration_file_list_workspaces table should be deleted before running
delete_existing_records = True


#############################################
## Execution
#############################################

# Loop through migration list and process entries
results = []
for entry in migration_list:
    logging.info(f"Processing Migration List Entry: {str(entry)}")
    try:
        output_file_details(entry["gcp_ws_project"], entry["gcp_ws_name"], entry["az_ws_project"], entry["az_ws_name"], file_bigquery_table, target_bigquery_table, delete_existing_records)
        results.append([entry["gcp_ws_name"], entry["az_ws_name"], "Success", ""])
    except Exception as e:
        results.append([entry["gcp_ws_name"], entry["az_ws_name"], "Failure", str(e)])
        
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Source Workspace Name", "Target Workspace Name", "Status", "Message"])
display(results_df)


## Validation

### Pull and Compare File Counts and Sizes between Workspace Buckets

In [None]:
#############################################
## Functions
#############################################

def collect_file_stats(storage_pairs_list):
    
    results = []
    for storage_pair in storage_pairs_list:

            # Initialize variables
            logging.info(f"Processing storage pair: {storage_pair}")
            gcs_storage_location = storage_pair[0]
            gcs_file_count = 0
            gcs_total_file_size = 0
            gcs_max_file_size = 0
            gcs_status = "Success"
            gcs_message = ""
            az_storage_location = storage_pair[1]
            az_file_count = 0
            az_total_file_size = 0
            az_max_file_size = 0
            az_status = "Success"
            az_message = ""
            validation_status = "Passed"
            validation_message = ""

            # For gcs_storage_location, loop through files and record information
            logging.info("Pulling and parsing GCP bucket contents to create a list of existing files.")
            existing_gcs_files = []
            try:
                cmd = f"gsutil ls -L '{gcs_storage_location}/**'"
                output = subprocess.check_output(cmd, shell=True, universal_newlines=True)
                file_name = ""
                file_size = ""
                for line in output.split("\n"):
                    if line[0:2] == "gs":
                        if file_name and file_size and file_name[-1] != "/":
                            existing_gcs_files.append([file_name, file_size])
                        file_name = re.sub(":$", "", line)
                    else:
                        if "Content-Length:" in line:
                            file_size = re.match("\s*Content-Length:\s*([0-9]+)", line).group(1)
                if file_name and file_size and file_name[-1] != "/":
                    existing_gcs_files.append([file_name, file_size])
                for entry in existing_gcs_files:
                    gcs_file_count += 1
                    entry_file_size = int(entry[1])
                    gcs_total_file_size += entry_file_size
                    if entry_file_size > gcs_max_file_size:
                        gcs_max_file_size = entry_file_size
            except Exception as e:
                gcs_status = "Failure"
                gcs_message = str(e)
                logging.error(f"Failure in file retrieval: {gcs_message}")
                
            # For az_storage_location, loop through files and record information
            logging.info("Pulling and parsing target Azure container contents to create a list of existing files.")
            cmd = f"azcopy_linux_amd64_10.24.0/azcopy list '{az_storage_location}' --machine-readable"
            output = subprocess.check_output(cmd, shell=True, universal_newlines=True)
            existing_az_files = []
            try:
                for line in output.split("\n"):
                    if line:
                        file_name = re.match(r"^INFO: (.*);", line).group(1)
                        file_size = re.match(r".*Content Length: ([0-9\.]+).*", line).group(1)
                        existing_az_files.append([file_name, file_size])
                for entry in existing_az_files:
                    az_file_count += 1
                    entry_file_size = int(entry[1])
                    az_total_file_size += entry_file_size
                    if entry_file_size > az_max_file_size:
                        az_max_file_size = entry_file_size
            except Exception as e:
                az_status = "Failure"
                az_message = str(e)
                logging.error(f"Failure in file retrieval: {az_message}")
                
            # Record and display interim results
            file_count_diff = gcs_file_count - az_file_count
            total_file_size_diff = gcs_total_file_size - az_total_file_size
            max_file_size_diff = gcs_max_file_size - az_max_file_size
            if gcs_status == "Failure" or az_status == "Failure":
                validation_status = "Failed"
                validation_message = "Errors pulling counts for one or more storage locations."
            elif file_count_diff > 0 or total_file_size_diff > 0 or max_file_size_diff > 0:
                validation_status = "Failed"
                validation_message = "Difference in counts between storage locations."
            results.append([gcs_storage_location, az_storage_location, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, gcs_file_count, gcs_total_file_size, gcs_max_file_size, gcs_status, gcs_message, az_file_count, az_total_file_size, az_max_file_size, az_status, az_message])
            int_results_df = pd.DataFrame([[gcs_storage_location, az_storage_location, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, gcs_file_count, gcs_total_file_size, gcs_max_file_size, gcs_status, gcs_message, az_file_count, az_total_file_size, az_max_file_size, az_status, az_message]], columns = ["GCS Storage Location", "AZ Storage Location", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "GCS File Count", "GCS Total File Size (Bytes)", "GCS Max File Size (Bytes)", "GCS Status", "GCS Message", "AZ File Count", "AZ Total File Size (Bytes)", "AZ Max File Size (Bytes)", "AZ Status", "AZ Message"])
            logging.info("Results recorded:")
            display(int_results_df)
        
    # Display final results
    logging.info("Aggregating results...")
    ws_bucket = os.environ["WORKSPACE_BUCKET"]
    destination_dir = "ingest_pipeline/resources/azure_migration"
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_file = f"workspace_validation_results_{current_datetime_string}.tsv"
    results_df = pd.DataFrame(results, columns = ["GCS Storage Location", "AZ Storage Location", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "GCS File Count", "GCS Total File Size (Bytes)", "GCS Max File Size (Bytes)", "GCS Status", "GCS Message", "AZ File Count", "AZ Total File Size (Bytes)", "AZ Max File Size (Bytes)", "AZ Status", "AZ Message"])
    results_df.to_csv(output_file, index=False, sep="\t")
    !gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
    !rm $output_file
    print("\nAggregated Validation Results:")
    display(results_df)   
    

#############################################
## Input Parameters
#############################################

# Specify the list of dataset IDs
storage_pairs_list = [
#     ["gcs_bucket_path", "azure_storage_container_sas_url"]
    ['gs://fc-secure-0932b76c-22e6-4321-94f7-9726ad4aeb76', 'https://lzb34bb58bfb122730765416.blob.core.windows.net/sc-0ef0b0b4-92b6-462e-8b4f-498f1cb7983b?sv=2023-11-03&spr=https&st=2024-04-23T13%3A45%3A11Z&se=2024-04-23T22%3A00%3A11Z&sr=c&sp=racwdlt&sig=7Usayb1DzV4LEcQYheVZrSrvEkiaiot9wEZGmnEO3BM%3D&rscd=2661442731880e5cbc2c9'],
]

#############################################
## Execution
#############################################

collect_file_stats(storage_pairs_list)


# Migrating Data Back!?!?!?

## Collecting Files in TDR Datasets

In [None]:
#############################################
## Functions
#############################################

# Function to record file information from TDR datasets
def output_file_details(dataset_id, target_bigquery_table):

    # Delete records for dataset if already in BQ table
    client = bigquery.Client()
    logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
    delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE object_id = '{dataset_id}'"""
    try:
        delete_query_job = client.query(delete_query)
        delete_query_job.result()
    except Exception as e:
        err_msg = f"Error deleting records for the original dataset from the target BQ table: {str(e)}"
        logging.info(err_msg)
        return "Failure", err_msg
    
    # Page through and pull files from dataset
    logging.info(f"Fetching and recording all files found in the original dataset ({dataset_id}).") 
    file_list = []
    total_files_fetched = 0
    page_number = 1
    max_page_size = 1000
    attempt_counter = 0
    while True:
        logging.info(f"Pulling files from page {str(page_number)}...")
        try:
            api_client = refresh_tdr_api_client("https://data.terra.bio")
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            file_results = datasets_api.list_files(id=dataset_id, offset=total_files_fetched, limit=max_page_size)
        except Exception as e:
            if "Invalid UUID string" in str(e) or "Dataset not found" in str(e):
                err_msg = f"Error fetching files: {str(e)}"
                logging.info(err_msg)
                return "Failure", err_msg
            elif attempt_counter <= 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_msg = f"Error fetching files: {str(e)}"
                logging.info(err_msg)
                return "Failure", err_msg
        if file_results:
            total_files_fetched += len(file_results)
            page_number += 1
            for file in file_results:
                file_id = file.file_id
                file_path = file.path
                file_size = file.size
                file_md5 = ""
                for cs in file.checksums:
                    if cs.type == "md5":
                        file_md5 = cs.checksum
                original_url = file.description.replace("Ingest of ", "")
                access_url = file.file_detail.access_url
                file_list.append(["tdr", dataset_id, file_id, file_path, file_size, file_md5, original_url, access_url])
            if len(file_results) < max_page_size:
                break
        else:
            break

    # Convert to dataframe and write to BigQuery
    client = bigquery.Client()
    logging.info(f"{total_files_fetched} files found. Writing results to BigQuery.")
    df_file_list = pipeline_results = pd.DataFrame(file_list, columns = ["object_type", "object_id", "file_id", "file_path", "file_size", "file_md5", "file_original_url", "file_access_url"])
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    job = client.load_table_from_dataframe(df_file_list, target_bigquery_table, job_config=job_config)
    job.result()
    return "Success", ""
    

#############################################
## Input Parameters
#############################################

# General parameters
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.current_azure_inventory"
dataset_id_list = [
    '9fe49126-f4ac-4e46-a231-99820fb0d4c2',
    '20a58869-8e3c-4959-8b04-fb12044ac0d3',
]
#dataset_id_list = ['2c7b4971-a67b-4786-ac86-de56f968cc84']


#############################################
## Execution
#############################################

# Loop through migration list and process entries
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Processing Dataset ID: {dataset_id}")
    status, message = output_file_details(dataset_id, target_bigquery_table)
    results.append([dataset_id, status, message])
        
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Dataset ID", "Status", "Message"])
display(results_df)

## Collecting Files in Azure Workspaces

In [None]:
#############################################
## Functions
#############################################

# Function to record file information from Azure datasets
def output_file_details(workspace, target_bigquery_table):
    
    # Setup/refresh credentials and BQ client
    creds, project = google.auth.default(scopes=['https://www.googleapis.com/auth/cloud-platform', 'openid', 'email', 'profile'])
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    client = bigquery.Client()

    # Delete records for dataset if already in BQ table
#     logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
#     delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE object_id = '{workspace}'"""
#     try:
#         delete_query_job = client.query(delete_query)
#         delete_query_job.result()
#     except Exception as e:
#         err_msg = f"Error deleting records for the original workspace from the target BQ table: {str(e)}"
#         logging.info(err_msg)
#         return "Failure", err_msg

    # Get Workspace ID
    logging.info(f"Fetching and recording all files found in the original workspace ({workspace}).") 
    try:
        workspace_id = ""
        workspace_response = requests.get(
            url=f"https://api.firecloud.org/api/workspaces/AnVILDataStorage_Azure/{workspace}",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        workspace_id = workspace_response["workspace"]["workspaceId"]

        # Get Workspace Resources
        resource_id = ""
        storage_container = ""
        workspace_response = requests.get(
            url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{workspace_id}/resources?offset=0&limit=10&resource=AZURE_STORAGE_CONTAINER",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json() 
        for resource_entry in workspace_response["resources"]:
            storage_container_name = resource_entry["resourceAttributes"]["azureStorageContainer"]["storageContainerName"]
            if storage_container_name.startswith("sc-"):
                resource_id = resource_entry["metadata"]["resourceId"]
                storage_container = storage_container_name

        # Generate SAS URL
        sas_response = requests.post(
            url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{workspace_id}/resources/controlled/azure/storageContainer/{resource_id}/getSasToken?sasExpirationDuration=86400",
            headers={"Authorization": "Bearer " + creds.token, "accept": "application/json"}
        )
        sas_response_json = json.loads(sas_response.text)
        sas_token = sas_response_json.get("token")
        sas_url = sas_response_json.get("url")
        print(sas_url)
        return "Temp", "Temp"
        base_url = sas_url.replace("/" + storage_container + "?" + sas_token, "")

        # Establish Azure Clients and pull files
        logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
        logger.setLevel(logging.WARNING)   
        file_list = []
        blob_service_client = BlobServiceClient(account_url=base_url, credential=sas_token)
        container_client = blob_service_client.get_container_client(storage_container)
        blob_list = container_client.list_blobs(results_per_page=1000)
        paged_list = blob_list.by_page()
        page_number = 0
        for blob_page in paged_list:
            page_number += 1
            logging.info(f"Pulling files from page {str(page_number)}...")
            for blob in blob_page:
                blob_client = container_client.get_blob_client(blob)
                props = blob_client.get_blob_properties()
                if not blob.name.endswith('/') and not blob.deleted:
                    md5_hash = base64.b64encode(props.content_settings.content_md5).decode("utf-8") if props.content_settings.content_md5 else ""
                    full_path = blob_client.url.replace(f"?{sas_token}", "")
                    file_id = ""
                    file_path = "/" + blob.name
                    file_size = props.size
                    file_list.append(["workspace", workspace, file_id, file_path, file_size, md5_hash, "", full_path]) 
        total_files_fetched = len(file_list)
    except Exception as e:
        err_msg = f"Error fetching files: {str(e)}"
        logging.info(err_msg)
        return "Failure", err_msg
                
    # Convert to dataframe and write to BigQuery
    logging.info(f"{total_files_fetched} files found. Writing results to BigQuery.")
    df_file_list = pipeline_results = pd.DataFrame(file_list, columns = ["object_type", "object_id", "file_id", "file_path", "file_size", "file_md5", "file_original_url", "file_access_url"])
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    job = client.load_table_from_dataframe(df_file_list, target_bigquery_table, job_config=job_config)
    job.result()
    return "Success", ""
    

#############################################
## Input Parameters
#############################################

# General parameters
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.current_azure_inventory"
workspace_list = [
    'ANVIL_dbGap_data_conversion_Azure',
]


#############################################
## Execution
#############################################

# Loop through migration list and process entries
results = []
for workspace in workspace_list:
    logging.info(f"Processing Workspace: {workspace}")
    status, message = output_file_details(workspace, target_bigquery_table)
    results.append([workspace, status, message])
        
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Workspace", "Status", "Message"])
display(results_df)

In [None]:
workspace = 'AnVIL_NIA_CARD_LR_WGS_Deposit_Azure'

# Setup/refresh credentials and BQ client
creds, project = google.auth.default(scopes=['https://www.googleapis.com/auth/cloud-platform', 'openid', 'email', 'profile'])
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
client = bigquery.Client()

# # Delete records for dataset if already in BQ table
# logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
# delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE object_id = '{workspace}'"""
# try:
#     delete_query_job = client.query(delete_query)
#     delete_query_job.result()
# except Exception as e:
#     err_msg = f"Error deleting records for the original workspace from the target BQ table: {str(e)}"
#     logging.info(err_msg)
#     return "Failure", err_msg

# Get Workspace ID
logging.info(f"Fetching and recording all files found in the original workspace ({workspace}).") 
try:
    workspace_id = ""
    workspace_response = requests.get(
        url=f"https://api.firecloud.org/api/workspaces/AnVILDataStorage_Azure/{workspace}",
        headers={"Authorization": f"Bearer {creds.token}"}
    ).json()
    workspace_id = workspace_response["workspace"]["workspaceId"]

    # Get Workspace Resources
    resource_id = ""
    storage_container = ""
    workspace_response = requests.get(
        url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{workspace_id}/resources?offset=0&limit=10&resource=AZURE_STORAGE_CONTAINER",
        headers={"Authorization": f"Bearer {creds.token}"}
    ).json() 
    for resource_entry in workspace_response["resources"]:
        storage_container_name = resource_entry["resourceAttributes"]["azureStorageContainer"]["storageContainerName"]
        if storage_container_name.startswith("sc-"):
            resource_id = resource_entry["metadata"]["resourceId"]
            storage_container = storage_container_name

    # Generate SAS URL
    sas_response = requests.post(
        url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{workspace_id}/resources/controlled/azure/storageContainer/{resource_id}/getSasToken?sasExpirationDuration=86400",
        headers={"Authorization": "Bearer " + creds.token, "accept": "application/json"}
    )
    sas_response_json = json.loads(sas_response.text)
    sas_token = sas_response_json["token"]
    sas_url = sas_response_json["url"]
    base_url = sas_url.replace("/" + storage_container + "?" +sas_token, "")

    # Establish Azure Clients and pull files
    logger = logging.getLogger("azure.core.pipeline.policies.http_logging_policy")
    logger.setLevel(logging.WARNING)   
    file_list = []
    blob_service_client = BlobServiceClient(account_url=base_url, credential=sas_token)
    container_client = blob_service_client.get_container_client(storage_container)
    blob_list = container_client.list_blobs(results_per_page=1000, name_starts_with='CARD_cell_line_first_data_release/HG002', include='deleted')
    paged_list = blob_list.by_page()
    page_number = 0
    for blob_page in paged_list:
        page_number += 1
        logging.info(f"Pulling files from page {str(page_number)}...")
        for blob in blob_page:
            print(blob.name)
            print(blob.deleted)
            blob_client = container_client.get_blob_client(blob)
            props = blob_client.get_blob_properties()
            if blob.deleted:
                print(blob.name)
                print(blob.deleted)
                print(type(blob.deleted))
                break
            if not blob.name.endswith('/') and not blob.deleted:
                md5_hash = base64.b64encode(props.content_settings.content_md5).decode("utf-8") if props.content_settings.content_md5 else ""
                full_path = blob_client.url.replace(f"?{sas_token}", "")
                file_id = ""
                file_path = "/" + blob.name
                file_size = props.size
                file_list.append(["workspace", workspace, file_id, file_path, file_size, md5_hash, "", full_path]) 
    total_files_fetched = len(file_list)
except Exception as e:
    err_msg = f"Error fetching files: {str(e)}"
    logging.info(err_msg)
    #return "Failure", err_msg

# # Convert to dataframe and write to BigQuery
# logging.info(f"{total_files_fetched} files found. Writing results to BigQuery.")
# df_file_list = pipeline_results = pd.DataFrame(file_list, columns = ["object_type", "object_id", "file_id", "file_path", "file_size", "file_md5", "file_original_url", "file_access_url"])
# job_config = bigquery.LoadJobConfig()
# job_config.write_disposition = "WRITE_APPEND"
# job = client.load_table_from_dataframe(df_file_list, target_bigquery_table, job_config=job_config)
# job.result()
# return "Success", ""

In [None]:
for page in paged_list:
    for blob in blob_page:
            print(blob.name)

In [None]:
blob_page

## Collecting Files in TDR Snapshots

In [None]:
#############################################
## Functions
#############################################

# Function to record file information from TDR snapshots
def output_file_details(snapshot_id, target_bigquery_table):

    # Delete records for dataset if already in BQ table
    client = bigquery.Client()
    logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
    delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE snapshot_id = '{snapshot_id}'"""
    try:
        delete_query_job = client.query(delete_query)
        delete_query_job.result()
    except Exception as e:
        err_msg = f"Error deleting records for the original snapshot from the target BQ table: {str(e)}"
        logging.info(err_msg)
        return "Failure", err_msg
    
    # Pull snapshot details
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    snapshot_detail = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["SOURCES"])
    phs_id = snapshot_detail.source[0].dataset.phs_id
    consent_code = snapshot_detail.consent_code
    
    # Page through and pull files from snapshot
    logging.info(f"Fetching and recording all files found in the original snapshot ({snapshot_id}).") 
    file_list = []
    total_files_fetched = 0
    page_number = 1
    max_page_size = 1000
    attempt_counter = 0
    while True:
        logging.info(f"Pulling files from page {str(page_number)}...")
        try:
            api_client = refresh_tdr_api_client("https://data.terra.bio")
            snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
            file_results = snapshots_api.list_files(id=snapshot_id, offset=total_files_fetched, limit=max_page_size)
        except Exception as e:
            if "Invalid UUID string" in str(e) or "Snapshot not found" in str(e):
                err_msg = f"Error fetching files: {str(e)}"
                logging.info(err_msg)
                return "Failure", err_msg
            elif attempt_counter <= 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_msg = f"Error fetching files: {str(e)}"
                logging.info(err_msg)
                return "Failure", err_msg
        if file_results:
            total_files_fetched += len(file_results)
            page_number += 1
            for file in file_results:
                file_id = file.file_id
                file_path = file.path
                file_size = file.size
                file_md5 = ""
                for cs in file.checksums:
                    if cs.type == "md5":
                        file_md5 = cs.checksum
                access_url = file.file_detail.access_url
                drs_uri = "drs://drs.anv0:v2_" + file_id
                file_list.append([snapshot_id, phs_id, consent_code, drs_uri, file_id, file_path, file_size, file_md5, access_url])
            if len(file_results) < max_page_size:
                break
        else:
            break

    # Convert to dataframe and write to BigQuery
    client = bigquery.Client()
    logging.info(f"{total_files_fetched} files found. Writing results to BigQuery.")
    df_file_list = pipeline_results = pd.DataFrame(file_list, columns = ["snapshot_id", "phs_id", "consent_code", "drs_uri", "file_id", "file_path", "file_size", "file_md5", "file_access_url"])
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    job = client.load_table_from_dataframe(df_file_list, target_bigquery_table, job_config=job_config)
    job.result()
    return "Success", ""
    

#############################################
## Input Parameters
#############################################

# General parameters
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_inventory.snapshot_drs_export"
snapshot_id_list = [
    '01cf2450-604b-43e5-9f4e-9ec4e0bf0a61',
    '85b0b351-cd0a-4efe-95a4-e39273c42831',
    '2fdba9a4-6593-439a-a7fc-c3a5825c26cd',
    'ad4ed62b-bf63-4dff-ab94-70a6432c161c',
    'b300b5ae-6ca3-4350-bc46-345173f6faba',
    '7bc891a2-a634-4cf2-b41e-0b1e98fce599',
    '9dbac1be-a33c-419c-be92-d1a5452c1292',
    '40d6feec-e6f7-42f1-8e74-a3404e1f9208',
    '95b4c57b-8e88-45f5-9dbb-e2575f4b2a68',
    '6f1d6a31-1997-4b59-a311-f84631ebdcbf',
    '79502d0c-bc1c-4d51-a6de-eb0334b3b660',
    '0af0d35e-1f9a-464d-80fe-474b5dbbd914',
    '79c20af6-5788-47ce-9651-f6a6ae084cbc',
    'f90f565e-0ade-4750-a308-5c8e1677b43d',
    '194c4b14-cb6a-469c-83db-d37f7ec65f29',
    '88b16321-7f0a-44b1-8131-d4b2188d9839',
    '9345adce-2f83-4c02-8859-72ddccb22069',
    '75f5452d-ceca-402e-bfc4-759c8352f4da',
    'cdcfc6ac-6c9f-4d99-a8c3-4d1e5d171261',
    '6441b9e0-ca7b-4ab4-b7e7-9c7c7041ebaa',
    'ae0c27d6-c8e3-4dd5-abf5-06e5f39fc4a0',
    'cb5a6268-c0c8-433d-b62c-7beeeb0a6a92',
    'ad18153b-870c-491a-9d4e-df30d902a03f',
    '461c1b26-7306-4feb-b141-f83c209baf27',
    '6a477149-a7f0-4758-8570-b288a8314fbd',
    '07b0243c-48fc-4eee-a338-c7571cc2df1a',
    '94f79040-68f5-4801-bf41-6f29bc0be8c6',
    '53fd76c8-6745-414e-adbe-62ff72011fc5',
    'f6db6471-03c5-44b6-a463-4976d8fc6350',
    '1985e363-b6da-47ec-8c92-dabcd587e6b6',
    'e8dacaea-d37d-48e5-b0a9-c88777753423',
    '0d3a9994-49e4-45ee-add9-1af8909ed298',
    '50a46bb1-1fd3-4745-999c-0201edd5dcd2',
    'a8febca7-6f5e-4284-b6c6-fd345a01997f',
    'a14a4e7e-5bb9-41e9-a000-728ed7a24418',
    'a29a3406-c176-4129-b778-adf27cdb4ced',
    '694937e1-4919-47f1-aa35-4db860e70763',
    'a00fcb63-6b32-46e6-b31e-39424da76a15',
    'd858e821-cfb9-435e-b1c0-39e95898b6b8',
    '01d7ebf5-b429-403d-aaae-831bba6bc08b',
    'b905ceb2-d9b3-4b0c-8c2c-4d3552aa0a65',
    '14e83711-6650-4649-9921-4f8dc93f20e3',
    '66d53a48-3fe3-4069-a664-9955e5a61f4b',
    'fab48331-5eb9-4546-b24b-4153912fdce3',
    '4cb9a680-c7f6-4b78-915e-6761af08489d',
    '5d2df2e4-e93e-41de-910f-547c29751891',
    'fc8a4c6d-02d8-4bf4-ae4e-db326056c383',
    '35533657-7416-4fcb-b8e6-edc0ced2d845',
    'c24cd039-b55d-486c-88f4-4ad36c732998',
    '7211c075-4d65-43f6-91b0-20afbbf52ae8',
    '53426657-9ee3-4a28-848a-6371e42590a5',
    '27c5d19b-311a-4fa3-aa85-fe577117e835',
    '5e81dcbb-86c1-43d8-9b54-8ec248abfe3b',
    '0fe1fbab-7d97-42d3-bea4-74d8d25d41a3',
    '79e9bf95-531b-4351-a1c0-670af028be26',
    'fcd069d1-c657-4631-a4a5-334b2ab32535',
    '8a57de06-0bb0-4e9f-9fde-3c24582078d8',
    'ee46ffd0-0ccc-401e-b352-13b18cdc5d44',
    '53a55f8d-949d-4f63-be0d-d02f466be469',
    '8f3aaeb7-997e-4378-9ae4-73de81999edb',
    '4f1c52eb-0aff-4d4d-a45b-076998ebb092',
    '6c105846-e057-4378-8670-c9efa53402a6',
    'f41d7fc3-5756-482f-92c3-453e68e211a6',
    '7ea38205-4437-4dab-b9d5-3e05e33a9650',
    'f12683d4-f970-4d19-98b8-51f17bc7cae1',
    '9f317e49-d880-4b34-b192-b22880869efd',
    '09b6df1a-2fe6-4b9a-8134-936bae416497',
    'ee25a947-21d0-4272-bb2b-6f3007ccce9a',
    '4d509d4b-14d5-4d78-a09f-4de74a9cd39a',
    '3eb6bb6f-f7e6-43d4-9f1f-7864d22485f6',
    '7efd80c0-1f52-4866-a0b9-094b6409de83',
    '6ffb37a4-28ce-47d8-be08-3b3c82035a41',
    '47279ceb-fd83-4ad0-bca8-70d54da9422b',
    '4b4299e1-4792-4de8-8fcb-568f46cf8412',
    'd93b025f-5cec-42e1-b3f7-e02f695a14e8',
    'fcb345ff-3371-4cf4-ab0a-174c61162150',
    'f52a7bc0-0caa-478e-87ca-30915002434c',
    '832e5fb4-d584-4c55-8e2d-af3762344194',
    '2feb61bb-2b60-4042-a90a-94687a9954d3',
    '97721323-affa-4d87-9e6d-05002df97338',
    '23189b7e-aed8-4311-92ba-587d1116d749',
    '3dfd6eb9-9a7b-4745-8cba-f8f52565bd4d',
    '5dd57133-db66-41d4-b922-0db2b5645632',
    'e437f3b5-afc9-4c98-a828-9bb8b683ff26',
    '2104eae5-a3e6-4bc2-b558-aa60db07338e',
    '71ff1f40-bb8c-4173-a203-ba23bcd0ee24',
    'b58c7949-510e-48d7-80c3-b5fda1669ecb',
    'acbe4c99-c129-409d-a60b-5033c6053f1f',
    'a9129165-0a5a-4817-a26e-9d68a75870da',
    '0eb173b4-afac-463c-a7dd-3d296fa104da',
    'b97322c6-c769-4c42-8fa2-6724cddf7575',
    '07864c11-e488-43b6-ad56-bf400adbb289',
    '535705ef-bd8f-437d-bc50-31353965d232',
    'b7ef6b4d-efa5-46e4-9f11-5a3f009c9d94',
    'a3f4ad5c-7e26-4fbc-8106-1c0aba10744d',
    'b5ca118d-ef68-4861-bc29-b1245154a57e',
    'f457da62-894b-44be-a3a7-f908be937ab2',
    '18b009a8-6537-4d07-aa3f-93820789bdfc',
    'd7dc181f-ccbf-4715-9bc4-79ee63f395af',
    'e7d96fd3-4e6c-41df-9d01-ad88e084dc6f',
    'dce7657c-14b3-4c9f-b067-eb32163b28dc',
    'e2ae5f40-13f9-4f65-8bb1-32ac544a0fec',
    '3c0c307f-b507-4463-a0bd-6852e2657e53',
    'e7c5a9bf-108f-4e63-85eb-0f12d98342e9',
    'e6eddaa4-d5b4-4cec-bbf1-40242ea3a6e1',
    'bf348a75-eb6d-464a-a705-16047d93b824',
    '76e02df6-28d5-474d-b3df-d9427ab4b5e4',
    'c6b976b2-5473-41ad-96ac-e04e4821cd2d',
    'c6de2268-5d7a-4b0d-a055-8e558dbc60cc',
    '634d44c2-f101-44dd-9568-0ea2e080855d',
    'eb81e0ad-5ba9-46e9-971d-e291b407b441',
    'b26a2be2-beab-4d72-a2ce-de45f6aebfeb',
    '7bda878d-cfdf-42a8-b9ef-f82f39b0c651',
    'f7242460-e413-4d3e-92c9-21e69709f9ae',
    'b4c4452a-d588-457d-877d-bc652a5e0ddc',
    '19826317-e7de-4cf3-9e36-1a34b556e310',
    'c873e8c0-d38f-4b5c-8bc4-582f57bb7811',
    '247b3216-ab1b-4d36-8fab-f6695c6481f9',
    '3c9aea01-f190-4721-99c7-30e84dee1464',
    '6fb0cf8b-09b9-49f7-9ea4-0009f968ddb3',
    'b37faff3-0851-4866-986c-171c663dba66',
    '9da2472f-65b5-4dc4-8dbc-b6ad298bdbed',
    'a744898d-be25-483c-bf08-d3dd4f99346d',
    '5b2d5e9d-07f6-409c-b759-e9e12561df45',
    '0ff4e69c-fd1c-4c6e-83c7-02334364cc2a',
    'de61f82c-f51f-476c-a5cd-14757ceb571d',
    'f291a61b-8d1c-47fc-b4de-09048a6db2ac',
    'f5ad2a09-5885-4164-b160-36bb8d33cd77',
    '82f10584-d7e8-40ea-9347-fdf6437b3db1',
    '3a573fa9-08c4-43a1-b248-1ca97b501718',
    '2c87eb60-e958-4fc3-aa8d-fa393647da11',
    '7f735e63-bbc6-495c-86b4-aa4f8eb3596e',
    'd2a81a67-f615-4dd1-a86a-b877d6eaaa2b',
    '4d1e2bdd-7763-4c33-8e96-2702d81cab59',
    '7baf715d-930d-45b6-afbd-15a9ff360eb2',
    '1a243684-5924-4b26-a1e5-0d62de99b974',
    'c0369447-9bd4-4b92-a426-ec69e432630d',
    'e3a0f135-28ff-4882-9c66-4ddee0561b7f',
    'e038aa3d-3a46-458c-a5af-77cccef31e11',
    'b7cc8070-e616-43e5-8dde-12c7962278dc',
    '0d8d1286-433a-4b1f-8b99-09e3f8066998',
    'ee9f92a0-eee3-4af0-816a-98ab698be2d4',
    '4508cd6a-527c-45b1-b36e-a8253c037c01',
    'b8df672a-23f6-4d4e-b6bd-cdec514b7e1f',
    '66eceadc-7f5d-4951-aa7b-417c477e4e25',
    '7f0397d1-ae43-4201-a36a-f597c64598e3',
    '8df70ead-fa9d-4ec4-b5d4-9f6d9a9afe03',
    '9499831f-712b-4d38-8c5a-491eb5229844',
    '1ee44373-1e44-4a5c-a435-eaa702c946f5',
    '4f3220c9-8cbc-4245-9394-f752fd6063eb',
    'aa955b24-23b8-4310-9c20-38daa20e9881',
    '96375f1f-59a4-4fe5-b58c-bb459b70cc17',
    '8451b97b-fad4-44e1-b9a7-6bd78702a546',
    '2b82d3da-39ed-4814-b497-d7e7c1c8c431',
    'e876fd42-5765-4946-8686-0147084d3b47',
    'f5f576e6-715e-4e42-a53b-5eb2cf59b1c6',
    '2a758bcb-ba02-4be7-ac8e-8b5750a3bfa4',
    'a31e12de-826d-4fc5-8191-e48906179fdc',
    '9ecaeb8e-8e12-4437-8c98-9c57fc151d4a',
    '0cd6e0a3-616a-4993-885e-088f9abee915',
    'fceeb22c-feb3-439a-a457-6b34097a4532',
    '97f2da30-bb21-4e69-a84c-443fcf7ea6bc',
    '05058de3-8f8a-4098-bab8-844427923e6c',
    '8bb724f7-e407-4e9c-8b62-e6ce60a7e4f8',
    'd1d6bcc3-e7d0-4bdb-9f48-a87a222869f4',
    'c2561327-2ba0-4c79-ad74-b393f3b3a933',
    '719337c5-7c4d-4dd8-bb31-462a182781b8',
    '0d98abdd-01ba-4af8-b56a-ed90f89e5bd1',
    '340d35d2-df52-4786-b0c7-dd1b8ee343cf',
    'b9e66305-c7c1-4b84-859b-d716fbb928f6',
    'ce8a56ce-a016-4eda-8cdf-4e5ff3ce5e9f',
    'a90ec7cf-dce8-4fb6-9fb8-8adb130ae518',
    '163d6dc2-2770-4c30-a293-7c8576737f5c',
    '4729a86a-1e08-4392-b5a6-d687ff48cfb2',
    '1e2d99b0-c98f-471e-a177-2e57a83634bc',
    '77ff80cd-bcf9-49b1-9823-ea38665cd8c2',
    '37488483-1ae1-495e-a85b-0aeba08fad39',
    'ce8a8649-d311-4363-a220-eceafaa7615e',
    'fec9dec9-7ce4-4bb2-bd52-243a06c0ecce',
    '0e4c60e0-74c7-478e-9ffe-400697b5217a',
    '43df6fd8-b590-4bf6-afe0-13030d683d8e',
    '81caad3d-22f5-4c75-8a38-96e7b7e740c1',
    '43115bac-0dea-4dcd-b881-414182a37b35',
    '60c90a71-0928-41aa-b7b3-30cc043383be',
    'ccaa2c0f-214d-4ed9-94ec-b8df01351dd8',
    '4616ebc2-2864-41e0-90f2-f05abc3193d8',
    '4b67c08e-85e0-49d5-82ea-76dd3787db5d',
    '16c06f69-2241-4cf1-8a33-a9b0a0c41b7d',
    '51a36b75-41ac-4b3e-8d25-6b3bc4f1ced3',
    '9034e587-f306-4a7a-931d-cb161ce9ddf3',
    'd70ef6b0-de45-46cc-9587-7ebf0c806db8',
    'f1b9f860-46ce-4503-8314-b0d2d33f3f1e',
    '4032ce09-e024-4d4b-8149-54ac5477c8fa',
    'cd6e3a69-dfc1-4166-b128-682996a48798',
    '0ac01a18-0bf7-4e53-acbe-3971234b8899',
    'bf7a5681-ca5d-4e92-8726-071ac409a85c',
    '8433547b-8182-4cab-a3f4-44a1deedbb0f',
    'd5cb020a-b3cc-4569-8873-58924ef9dad7',
    'dd9f78e4-757b-455d-83ac-ba9dfeeed6e1',
    'f7307ad5-7250-483c-a8e2-aeb1c1226b13',
    '9e210e43-61b5-4a85-8d1a-0abc111f60db',
    'c720c477-dfe2-45e0-805f-94ed11d006b4',
    'e4bc2365-282a-4c51-afe3-971899f61f74',
    '2df015fc-06f8-46aa-a9ac-e12bc608b2f3',
    'c68d1d19-aac9-4042-ab62-5f40837fe10d',
    '55d26ed5-f046-414f-8c8b-bdc941fa58aa',
    '5ee23a8c-e9d7-49b9-b23e-ecf1c1cf51a1',
    '39fd1d5b-2e6f-4035-beb3-05322174dd4b',
    'dc9c9587-1ec3-42d5-bcd5-ce425f79e7d0',
    'c10c8878-2b5a-4468-9931-c35a6ee034ae',
    '71a24ce8-9e13-4b16-9812-367d929d6367',
    'a65a95b8-d6ae-4349-9478-f4df9f43801e',
    'e70ce083-4e8e-403a-a222-6bf235521930',
    'f6d1d60f-d5b7-4944-94be-a9dbb17d9057',
    '450cc5ad-47d6-434a-81a6-783fe277a58c',
    'cbfe8c00-7683-4e34-a7ee-23cfb0a1145d',
    'e8afa16f-16de-4bc7-ab91-0a1667f088a3',
    '03ce0d8a-13a8-4b6f-8a80-7c016c583ee2',
    'e4245e63-a141-4547-aa20-bbb5ad7c6f4c',
    'a0be830f-6d75-405c-a14b-95c56ee88f06',
    'a8b8b258-2b61-40a8-95b2-68247cf29eb3',
    '1c6223bf-9665-4b54-bbc3-40847ebaf92c',
    'c0d588bc-5a77-490f-86b0-fcbe3f06654c',
    '4277cb35-8d37-49c7-8bb1-eeb57a68b739',
    'ba20eb5d-3553-4f0c-92eb-5793697da74d',
    '2054866e-b906-42cd-9e1a-d3eaf1b6057e',
    '568ba9a0-95ba-486b-bab1-efc3798e5f41',
    'd5f3d7ce-5a55-4508-b662-e624a83b304d',
    'ce99c021-32fc-4278-bad0-03f9dadeeed8',
    '00c1a1f1-6bfc-478b-8b32-5d5911081638',
    '2283bdf2-82c2-45c8-9e2c-f5855bd6e103',
    '07c0ce25-4c13-443c-b2c0-721620747ec3',
    '29ec75f0-53ac-405f-a973-b034126ae457',
    '9393d37f-8c9d-43fa-a42a-52536a24236d',
    '79bdf4d2-10e0-4736-83fc-9a4d207659d9',
    'ed88c711-d973-4363-8fee-c88439a8dd57',
    '2cff71df-202e-43d1-9f62-997f01bf23ac',
    '97a2190a-ca36-4423-bcef-f9f8be800187',
    '753d9871-495a-4cab-944a-e925ec269282',
    '7397dca5-45a0-4744-9222-50dbcfbc1500',
    '4e5d4393-1693-44bb-ab65-1d7a51c13dff',
    'fd8eab82-8317-4d9c-97aa-9752a0b9340e',
    '7daf63d7-56af-4ddc-9601-4ae7605f0420',
    '583b36e5-8843-4be2-b40f-9fde7151497f',
    '3532b277-89ff-46b3-a6fd-7f002e524bb4',
    '161c78cb-899e-41d8-af6b-a400ec7322f0',
    '82d35b38-8b69-4eaf-b298-ff35fae9c092',
    '9406166c-28ce-454c-9db6-fc2cdafa4913',
    'fde2ddfd-3edf-415f-888b-cc5dba868669',
    'ff8f595d-9b15-419a-9df1-fc3bd94184f0',
    'fde093c5-a4b4-4080-ba36-7663e0de7047',
    'c1009cce-77ff-47cd-b349-109a1bd283d5',
    '03ad9d34-f5f7-428a-8411-44b97815a95c',
    '26edc6c1-f79c-4bb6-84b0-d1492fd64738',
    'b48a3a5c-9d3f-4228-8a24-19f315890f7b',
    '4312ca34-388a-4d28-baa3-b64ec6eaf7af',
    'f71051be-17e3-42bf-b534-5af0e3740937',
    '98a13ebb-0a91-427e-bdd5-8f5833036d81',
    '4a815323-5dc1-4a0c-93c0-1c2333345c4d',
    'db0c9cff-f3f9-4315-addb-6b6c9088b124',
    '3c279159-bd0e-4df9-bcea-9640e49a694b',
    '1cd2be27-fc8b-4779-9398-d06979ce43c3',
    '08568995-f347-4c5a-bd55-d9cc44441e07',
    '4998e043-4d6a-4003-b6d4-0170b0ca31b5',
    '82d7d1b6-15b7-4c1f-b272-b5fa3f1eedbd',
    '3d8ec3f3-cd6e-42c7-bcdb-51baa5113160',
    '16ac2579-72d8-4615-bcc3-c8d0438ded30',
    '8ac300b2-0d66-49ad-b859-b3634aa82a1a',
    'aacb9f15-d921-47ae-bbe0-e5f7ae160fa8',
    '8afa7677-ce77-4ff4-9968-04f8794f26bf',
    '7dc0b534-a3be-40d7-becd-653bc1b0cb96',
    '3a334096-5835-4a7c-b32f-42d5b913f3a2',
    'a8554428-ea43-4f67-9b65-1dbb7555f2cd',
    '8b57c879-6550-48da-85c9-96bfa0011b80',
    '69d0762d-8acd-4962-86eb-b924630858d0',
    '48417de5-c3b9-4a1b-807b-f7cb5ba05fea',
    '5a82adb8-0fd6-4875-b6ab-cb32aea747d6',
    'a1053a9f-f328-4af5-8268-32e1b5b364a0',
    '2a85b62d-863e-4e74-a272-899c21bb5be7',
    '10e413c2-729c-4802-a387-6763a7798d8f',
    'cd9e1b65-cda3-441a-a75c-1c63512c9819',
    '8af6ff06-4720-4e6c-9a44-323c2b3d73a8',
    'f8bd54c4-8d6b-446a-8060-8c5a29720933',
    'a1f2aa06-9139-4b37-8478-5157de27002a',
]


#############################################
## Execution
#############################################

# Loop through migration list and process entries
results = []
for snapshot_id in snapshot_id_list:
    logging.info(f"Processing Snapshot ID: {snapshot_id}")
    status, message = output_file_details(snapshot_id, target_bigquery_table)
    results.append([snapshot_id, status, message])
        
# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Snapshot ID", "Status", "Message"])
display(results_df)

# Utility

## Dataset Deletion

In [None]:
# Function to delete a specific TDR Snapshot
def delete_snapshot(snapshot_id):
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete snapshot = {}".format(snapshot_id))
    try:
        delete_snapshot_result, job_id = wait_for_tdr_job(snapshots_api.delete_snapshot(id=snapshot_id), "https://data.terra.bio")
        print("Result: {}".format(delete_snapshot_result))
    except Exception as e:
        print("Error: {}".format(e))

# Function to delete a specific TDR Dataset
def delete_dataset(dataset_id):
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    print("Attempting to delete dataset = {}".format(dataset_id))
    try:
        delete_dataset_result, job_id = wait_for_tdr_job(datasets_api.delete_dataset(id=dataset_id), "https://data.terra.bio")
        print("Result: {}".format(delete_dataset_result))
    except Exception as e:
        print("Error: {}".format(e))

# Function to delete a specific TDR Dataset and all of its Snapshots
def delete_dataset_and_all_snapshots(dataset_id):
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete dataset = {} and all associated snapshots".format(dataset_id))
    dataset_id_list = [dataset_id]
    # Delete snapshots
    snapshot_list = snapshots_api.enumerate_snapshots(dataset_ids=dataset_id_list)
    if snapshot_list.items:
        for snapshot in snapshot_list.items:
            snapshot_id = str(snapshot.id)
            delete_snapshot(snapshot_id)
            sleep(10)
    # Delete dataset
    delete_dataset(dataset_id)

# Delete snapshots
# snapshot_id_list = [
# '1234',
# ]
# for snapshot_id in snapshot_id_list:
#     delete_snapshot(snapshot_id)

# Delete datasets and all their associated snapshots
dataset_id_list = [
'1be5b5e6-019e-419a-9248-6e80d067d697',
]
for dataset_id in dataset_id_list:
    delete_dataset_and_all_snapshots(dataset_id)

## Update Migration File List Table

In [None]:
# General parameters
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list"

# Update parameters
update_list = [
    {"az_dataset_id": "6007151f-45bc-4111-8e9a-b667bc722a6a", "new_gcp_dataset_id": "b22c71b2-2cb2-4b27-a49b-9a2a83d432e8", "new_gcp_dataset_name": "ANVIL_1000G_PRIMED_data_model_20240301"},
    {"az_dataset_id": "a28e4ab5-a07b-4316-b743-7f5f9cc88211", "new_gcp_dataset_id": "3a89c170-2939-4c12-9940-f32d96fa9e55", "new_gcp_dataset_name": "ANVIL_CMH_GAFK_GS_long_read_20240301"}
]

# Execute updates
client = bigquery.Client()
for entry in update_list:
    logging.info(f"Running update for entry: {str(entry)}")
    az_dataset_id = entry["az_dataset_id"]
    gcp_dataset_id = entry["new_gcp_dataset_id"]
    gcp_dataset_name = entry["new_gcp_dataset_name"]
    update_query = f"""UPDATE `{target_bigquery_table}` 
                       SET gcp_dataset_id = '{gcp_dataset_id}', gcp_dataset_name = '{gcp_dataset_name}'
                       WHERE az_dataset_id = '{az_dataset_id}'"""
    try:
        update_query_job = client.query(update_query)
        update_query_job.result()
        logging.info("Update complete.")
    except Exception as e:
        logging.info("Error running update.")
