# Imports and Common Functions

In [None]:
#!pip install --upgrade data_repo_client

In [3]:
# Imports
import import_ipynb
import data_repo_client
import google.auth
import datetime
import os
import sys
import logging
from time import sleep
from google.cloud import bigquery
from google.cloud import storage
import ingest_pipeline_utilities as utils
import pandas as pd
import json
import re
import math
import requests

# Configure logging format
# Remove any handlers already attached to the root logger (e.g. from an earlier
# run of this cell) before installing the single stdout handler below.
for _stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(_stale_handler)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Pandas display settings: show every row/column untruncated when rendering results
for _option_name, _option_value in (
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.max_colwidth", None),
    ("display.width", 1000),
    ("display.colheader_justify", "center"),
    ("display.precision", 3),
):
    pd.set_option(_option_name, _option_value)

# Function to refresh TDR API client
def refresh_tdr_api_client(host):
    """Build a TDR API client authenticated with freshly refreshed default credentials.

    Args:
        host: Base URL of the TDR instance to talk to (e.g. "https://data.terra.bio").

    Returns:
        A data_repo_client.ApiClient configured for ``host`` with the refreshed
        access token and client-side validation turned off.
    """
    # Import the transport submodule explicitly: the file-level `import google.auth`
    # does not by itself guarantee that `google.auth.transport.requests` is loaded.
    import google.auth.transport.requests

    creds, _ = google.auth.default()
    creds.refresh(google.auth.transport.requests.Request())

    config = data_repo_client.Configuration()
    config.host = host
    config.access_token = creds.token

    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Helper to retrieve the current state of a TDR job, retrying on errors
def _retrieve_tdr_job_with_retry(jobs_api, job_id, max_retries=5, retry_delay=10):
    """Return ``jobs_api.retrieve_job(job_id)``, retrying when the call raises.

    Up to ``max_retries`` retries are made (``max_retries + 1`` attempts in
    total), sleeping ``retry_delay`` seconds before each retry.

    Raises:
        Exception: When every attempt failed; the message carries the last error.
    """
    attempt_counter = 0
    while True:
        try:
            return jobs_api.retrieve_job(job_id)
        except Exception as e:
            if attempt_counter >= max_retries:
                raise Exception("Error retrieving job status from TDR: {}".format(str(e)))
            attempt_counter += 1
            sleep(retry_delay)


# Function to wait for TDR job completion
def wait_for_tdr_job(job_model, host):
    """Poll a TDR job until it finishes and return its result.

    Args:
        job_model: Job object returned by a TDR API call; its ``id``,
            ``status_code`` and ``job_status`` attributes are read.
        host: Base URL of the TDR instance, used to (re)build the API client.

    Returns:
        Tuple ``(job_result, job_id)``. If the job succeeded but its result
        could not be retrieved after retries, ``job_result`` is an explanatory
        string instead.

    Raises:
        Exception: If the job failed, its status could not be retrieved, TDR
            kept returning server errors, or the job reported an unknown state.
    """
    result = job_model
    print("TDR Job ID: " + job_model.id)
    counter = 0
    job_state = "UNKNOWN"
    while True:
        # Re-establish credentials and API clients every 30 minutes
        # (180 polls at 10 seconds each; also true on the first pass, counter == 0)
        if counter % 180 == 0:
            api_client = refresh_tdr_api_client(host)
            jobs_api = data_repo_client.JobsApi(api_client=api_client)

        # Check for TDR connectivity issues and raise exception if the issue persists
        conn_err_counter = 0
        while job_state == "UNKNOWN":
            conn_err_counter += 1
            if conn_err_counter >= 10:
                # getattr: result may be None here, which previously raised AttributeError
                raise Exception("Error interacting with TDR: {}".format(getattr(result, "status_code", None)))
            # NOTE(review): status codes are compared as strings, as in the original.
            # If the client returns integer status codes this branch never matches --
            # confirm before changing, since a failed job may also carry a 5xx code.
            elif result is None or result.status_code in ["500", "502", "503", "504"]:
                sleep(10)
                counter += 1
                result = _retrieve_tdr_job_with_retry(jobs_api, job_model.id)
            else:
                job_state = "KNOWN"

        # From here on job_state is always "KNOWN" (the loop above only exits that way)
        # Check if job is still running, and sleep/re-check if so
        if result.job_status == "running":
            sleep(10)
            counter += 1
            result = _retrieve_tdr_job_with_retry(jobs_api, job_model.id)
        # If job has returned as failed, confirm this is the correct state and retrieve result if so
        elif result.job_status == "failed":
            failure_confirmed = True
            for _ in range(3):
                result = _retrieve_tdr_job_with_retry(jobs_api, job_model.id)
                if result.job_status != "failed":
                    # The state changed under us: go back to the main loop and
                    # re-evaluate rather than spinning here indefinitely.
                    failure_confirmed = False
                    break
            if not failure_confirmed:
                continue
            try:
                fail_detail = jobs_api.retrieve_job_result(job_model.id)
            except Exception as e:
                # The error raised while fetching the result is reported as the failure detail
                fail_detail = e
            raise Exception("Job " + job_model.id + " failed: " + str(fail_detail))
        # If a job has returned as succeeded, retrieve result
        elif result.job_status == "succeeded":
            attempt_counter = 0
            while True:
                try:
                    return jobs_api.retrieve_job_result(job_model.id), job_model.id
                except Exception as e:
                    if attempt_counter < 3:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        return "Job succeeded, but error retrieving job result: {}".format(str(e)), job_model.id
        else:
            raise Exception("Unrecognized job state: {}".format(result.job_status))

# Migrating TDR Datasets

## Step 1: Pre-Connector Processing
For the list of GCP TDR datasets provided:
1. Extract the schema
2. Create an Azure TDR dataset using the extracted schema
3. Build a manifest of files to be copied from the GCP dataset to the Azure dataset and write it to BigQuery.

In [7]:
#############################################
## Functions
#############################################

# Function to build default target TDR dataset name
def format_dataset_name(input_str):
    """Build the default target dataset name from a source dataset name.

    A trailing ``_YYYYMMDD`` date suffix and a leading ``ANVIL``/``ANVIL_``
    prefix (case-insensitive) are removed from ``input_str``; the result is
    wrapped as ``ANVIL_<name>_<today as YYYYMMDD>`` and every character other
    than letters, digits and underscore is replaced with an underscore.

    Args:
        input_str: Name of the source dataset.

    Returns:
        The formatted dataset name.
    """
    current_date_string = datetime.datetime.now().strftime("%Y%m%d")
    # Strip the date suffix only when one is actually present, so names
    # without a "_YYYYMMDD" tail are no longer truncated by nine characters.
    base_name = re.sub(r"_[0-9]{8}\Z", "", input_str)
    base_name = re.sub(r"^ANVIL[_]?", "", base_name, flags=re.IGNORECASE)
    output_str = f"ANVIL_{base_name}_{current_date_string}"
    return re.sub(r"[^a-zA-Z0-9_]", "_", output_str)

# Function to create a new TDR dataset from an existing TDR dataset
def create_dataset_from_dataset(src_tdr_object_uuid, tar_tdr_object_uuid, billing_profile):
    """Resolve, or create, the Azure TDR dataset that corresponds to a source dataset.

    When ``tar_tdr_object_uuid`` is given, that dataset is looked up and its
    name returned. Otherwise a new Azure dataset is created from the source
    dataset's schema, policies and properties.

    Args:
        src_tdr_object_uuid: UUID of the source TDR dataset.
        tar_tdr_object_uuid: UUID of an existing target dataset, or a falsy
            value to have a new dataset created.
        billing_profile: Billing profile ID used as ``defaultProfileId`` for a
            newly created dataset.

    Returns:
        Tuple ``(orig_object_name, new_dataset_id, new_object_name, bq_project,
        bq_dataset)``. All five values are ``None`` if a lookup or the dataset
        creation failed.
    """
    tdr_host = "https://data.terra.bio"
    dataset_includes = ["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Retrieve original dataset details
    logging.info(f"Retrieving original dataset details from prod environment. UUID:  {src_tdr_object_uuid}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=src_tdr_object_uuid, include=dataset_includes).to_dict()
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_dataset = dataset_details["access_information"]["big_query"]["dataset_name"]
        orig_object_name = dataset_details["name"]
    except Exception as e:
        error_str = f"Error retrieving details from dataset {src_tdr_object_uuid} in TDR prod environment: {str(e)}"
        logging.error(error_str)
        return None, None, None, None, None

    # If target dataset specified, retrieve name
    if tar_tdr_object_uuid:
        new_dataset_id = tar_tdr_object_uuid
        logging.info(f"Retrieving new dataset details from prod environment. UUID:  {tar_tdr_object_uuid}")
        try:
            dataset_details = datasets_api.retrieve_dataset(id=tar_tdr_object_uuid, include=dataset_includes).to_dict()
            new_object_name = dataset_details["name"]
        except Exception as e:
            error_str = f"Error retrieving details from dataset {tar_tdr_object_uuid} in TDR prod environment: {str(e)}"
            logging.error(error_str)
            return None, None, None, None, None
    else:
        # Build new dataset schema
        apply_anvil_transforms = True
        new_schema_dict = {"tables": [], "relationships": [], "assets": []}
        for table_entry in dataset_details["schema"]["tables"]:
            int_table_dict = table_entry.copy()
            int_table_dict["primaryKey"] = int_table_dict.pop("primary_key")
            # pop with a default so a missing key does not abort the whole migration
            for key in ["partition_mode", "date_partition_options", "int_partition_options", "row_count"]:
                int_table_dict.pop(key, None)
            # Copy the column definitions so the edits below stay out of dataset_details
            int_table_dict["columns"] = [dict(column_entry) for column_entry in table_entry["columns"]]
            for column_entry in int_table_dict["columns"]:
                if column_entry["datatype"] == "integer":
                    column_entry["datatype"] = "int64"
            if apply_anvil_transforms:
                # Extra columns that carry the source identifiers into the new dataset
                if table_entry["name"] == "file_inventory":
                    int_table_dict["columns"].append({"name": "orig_file_ref", "datatype": "string", "array_of": False, "required": False})
                    int_table_dict["columns"].append({"name": "orig_datarepo_row_id", "datatype": "string", "array_of": False, "required": False})
                elif "anvil_" not in table_entry["name"]:
                    int_table_dict["columns"].append({"name": "orig_datarepo_row_id", "datatype": "string", "array_of": False, "required": False})
            new_schema_dict["tables"].append(int_table_dict)
        for rel_entry in dataset_details["schema"]["relationships"]:
            int_rel_dict = rel_entry.copy()
            int_rel_dict["from"] = int_rel_dict.pop("_from")
            new_schema_dict["relationships"].append(int_rel_dict)
        for asset_entry in dataset_details["schema"]["assets"]:
            int_asset_dict = asset_entry.copy()
            int_asset_dict["rootTable"] = int_asset_dict.pop("root_table")
            int_asset_dict["rootColumn"] = int_asset_dict.pop("root_column")
            new_schema_dict["assets"].append(int_asset_dict)

        # Retrieve original dataset policies
        # Start from empty lists so a policy missing from the response cannot
        # leave one of these names unbound.
        stewards_list = []
        custodians_list = []
        snapshot_creators_list = []
        try:
            dataset_policies = datasets_api.retrieve_dataset_policies(id=src_tdr_object_uuid).to_dict()
            for policy in dataset_policies["policies"]:
                if policy["name"] == "steward":
                    stewards_list = policy["members"]
                elif policy["name"] == "custodian":
                    custodians_list = policy["members"]
                elif policy["name"] == "snapshot_creator":
                    snapshot_creators_list = policy["members"]
        except Exception:
            logging.info("Error retrieving original dataset policies. Skipping policy copy.")
            stewards_list = []
            custodians_list = []
            snapshot_creators_list = []
        policies = {
            "stewards": stewards_list,
            "custodians": custodians_list,
            "snapshotCreators": snapshot_creators_list
        }

        # Determine dataset properties
        new_object_name = format_dataset_name(orig_object_name)
        new_description = dataset_details["description"] + f"\n\nCopy of dataset {orig_object_name} from TDR prod."
        self_hosted = False
        dedicated_ingest_sa = False
        phs_id = dataset_details["phs_id"]
        predictable_file_ids = dataset_details["predictable_file_ids"]
        secure_monitoring_enabled = dataset_details["secure_monitoring_enabled"]
        properties = dataset_details["properties"]
        tags = dataset_details["tags"]

        # Create new TDR dataset
        logging.info("Submitting dataset creation request.")
        dataset_request = {
            "name": new_object_name,
            "description": new_description,
            "defaultProfileId": billing_profile,
            "cloudPlatform": "azure",
            "region": "southcentralus",
            "phsId": phs_id,
            "experimentalSelfHosted": self_hosted,
            "experimentalPredictableFileIds": predictable_file_ids,
            "dedicatedIngestServiceAccount": dedicated_ingest_sa,
            "enableSecureMonitoring": secure_monitoring_enabled,
            "properties": properties,
            "tags": tags,
            "policies": policies,
            "schema": new_schema_dict
        }
        attempt_counter = 1
        while True:
            try:
                create_dataset_result, job_id = wait_for_tdr_job(datasets_api.create_dataset(dataset=dataset_request), tdr_host)
                logging.info("Dataset Creation succeeded: {}".format(create_dataset_result))
                new_dataset_id = create_dataset_result["id"]
                break
            except Exception as e:
                error_str = f"Error on Dataset Creation: {str(e)}"
                logging.error(error_str)
                if attempt_counter < 3:
                    # Increment first so the logged number is the attempt about to start
                    attempt_counter += 1
                    logging.info("Retrying Dataset Creation (attempt #{})...".format(str(attempt_counter)))
                    sleep(10)
                    continue
                else:
                    logging.error("Maximum number of retries exceeded. Exiting job.")
                    return None, None, None, None, None

    # Exit function
    return orig_object_name, new_dataset_id, new_object_name, bq_project, bq_dataset

# Function to create file transfer details
def output_file_details(orig_dataset_id, orig_dataset_name, new_dataset_id, new_dataset_name, bq_project, bq_dataset, public_flag, target_bigquery_table, delete_existing_records):
    """Append the file-copy manifest for one dataset pair to a BigQuery table.

    Rows are built from the source dataset's ``file_inventory`` table joined to
    its ``datarepo_load_history`` table. Files already present in
    ``target_bigquery_table`` for the same target dataset are skipped.

    Args:
        orig_dataset_id: UUID of the source (GCP) dataset.
        orig_dataset_name: Name of the source dataset.
        new_dataset_id: UUID of the target (Azure) dataset.
        new_dataset_name: Name of the target dataset.
        bq_project: BigQuery project holding the source dataset's tables.
        bq_dataset: BigQuery dataset holding the source dataset's tables.
        public_flag: Value written to the ``public_flag`` column.
        target_bigquery_table: Fully qualified table the manifest is appended to.
        delete_existing_records: When truthy, rows for ``orig_dataset_id`` are
            deleted from the target table first (best effort).

    Returns:
        True if the manifest query succeeded, False if it still failed after retries.
    """
    client = bigquery.Client()

    # Clear records from target BQ table
    if delete_existing_records:
        logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
        delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE gcp_dataset_id = '{orig_dataset_id}'"""
        try:
            delete_query_job = client.query(delete_query)
            delete_query_job.result()
        except Exception as e:
            # Best effort: carry on, but surface the reason instead of dropping it
            logging.warning(f"Error deleting records for the original dataset from the target BQ table: {str(e)}")

    # Retrieve table data from the original dataset and write to target BQ table
    logging.info(f"Fetching and recording all rows from table 'file_inventory' in the original dataset ({orig_dataset_id}). BQ Project = '{bq_project}' and BQ Dataset = '{bq_dataset}'.")
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    job_config = bigquery.QueryJobConfig()
    job_config.destination = target_bigquery_table
    job_config.write_disposition = "WRITE_APPEND"
    # The anti-join (table alias c) now reads target_bigquery_table, i.e. the same
    # table the rows are appended to, rather than a hard-coded table name.
    query = f"""WITH drlh_deduped AS
                (
                  SELECT DISTINCT file_id, target_path, source_name
                  FROM
                  (
                    SELECT *, ROW_NUMBER() OVER (PARTITION BY source_name ORDER BY load_time DESC) AS rn
                    --SELECT *, ROW_NUMBER() OVER (PARTITION BY source_name, target_path ORDER BY load_time DESC) AS rn
                    FROM `{bq_project}.{bq_dataset}.datarepo_load_history`
                    WHERE state = "succeeded"
                  )
                  WHERE rn = 1
                ),
                file_records AS
                (
                  SELECT '{orig_dataset_id}' AS gcp_dataset_id, '{orig_dataset_name}' AS gcp_dataset_name,
                  '{new_dataset_id}' AS az_dataset_id, '{new_dataset_name}' AS az_dataset_name,
                  b.source_name AS source_path, b.target_path, a.size_in_bytes, a.md5_hash, a.file_ref AS orig_tdr_file_id,
                  '{current_datetime_string}' AS date_added, '{public_flag}' AS public_flag, ROW_NUMBER() OVER (PARTITION BY a.file_ref ORDER BY b.source_name) AS rn
                  FROM `{bq_project}.{bq_dataset}.file_inventory` a
                      LEFT JOIN drlh_deduped b
                      ON a.uri = b.source_name
                      LEFT JOIN `{target_bigquery_table}` c
                      ON a.file_ref = c.orig_tdr_file_id AND c.az_dataset_id = '{new_dataset_id}'
                  WHERE c.source_path IS NULL
                )
                SELECT * EXCEPT(rn)
                FROM file_records
                WHERE rn = 1"""
    attempt_counter = 0
    while True:
        try:
            query_job = client.query(query, job_config=job_config)
            query_job.result()
            logging.info("Records recorded successfully.")
            return True
        except Exception as e:
            if attempt_counter < 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_str = f"Error recording records for all rows of table 'file_inventory': {str(e)}."
                logging.error(err_str)
                return False
    

#############################################
## Input Parameters
#############################################

# General parameters
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list"
azure_billing_profile = "9ee23bed-b46c-4561-9103-d2a723113f7f"

# Specify the list of datasets to process, leaving the target Azure dataset ID empty to create a new one
migration_list = [
    #["src_gcp_dataset_id", "tar_az_dataset_id", "open_access (Y/N)"]
    ['ec6f49a2-176c-4564-82c5-e751baab46aa', 'fcf41f7a-9de2-4105-af33-48abe616e386', 'Y'],
]

# Specify whether existing records in the azure_migration_file_list table should be deleted before running
delete_existing_records = False


#############################################
## Execution
#############################################

# Loop through migration list and process entries
results = []
for entry in migration_list:
    logging.info(f"Processing Migration List Entry: {str(entry)}")
    # Resolve (or create) the target dataset for this source dataset
    dataset_name, new_dataset_id, new_dataset_name, bq_project, bq_dataset = create_dataset_from_dataset(
        entry[0], entry[1], azure_billing_profile
    )
    # Only build the file manifest when a target dataset ID is available
    if new_dataset_id:
        output_file_details(
            entry[0],
            dataset_name,
            new_dataset_id,
            new_dataset_name,
            bq_project,
            bq_dataset,
            entry[2],
            target_bigquery_table,
            delete_existing_records,
        )
    results.append([
        entry[0],
        dataset_name,
        "Success" if new_dataset_id else "Failure",
        new_dataset_id,
        new_dataset_name,
    ])

# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(
    results,
    columns=["Source Dataset ID", "Source Dataset Name", "Status", "New Dataset ID", "New Dataset Name"],
)
display(results_df)
            

04/12/2024 08:07:35 PM - INFO: Processing Migration List Entry: ['ec6f49a2-176c-4564-82c5-e751baab46aa', 'fcf41f7a-9de2-4105-af33-48abe616e386', 'Y']
04/12/2024 08:07:35 PM - INFO: Retrieving original dataset details from prod environment. UUID:  ec6f49a2-176c-4564-82c5-e751baab46aa
04/12/2024 08:07:36 PM - INFO: Retrieving new dataset details from prod environment. UUID:  fcf41f7a-9de2-4105-af33-48abe616e386
04/12/2024 08:07:37 PM - INFO: Fetching and recording all rows from table 'file_inventory' in the original dataset (ec6f49a2-176c-4564-82c5-e751baab46aa). BQ Project = 'datarepo-bf821d10' and BQ Dataset = 'datarepo_ANVIL_HPRC_20240401'.
04/12/2024 08:07:42 PM - INFO: Records recorded successfully.

Final Results:


Unnamed: 0,Source Dataset ID,Source Dataset Name,Status,New Dataset ID,New Dataset Name
0,ec6f49a2-176c-4564-82c5-e751baab46aa,ANVIL_HPRC_20240401,Success,fcf41f7a-9de2-4105-af33-48abe616e386,ANVIL_HPRC_20240408


## Step 2: Post-Connector Processing
For each GCP Dataset - Azure Dataset pair:
1. Retrieve the source GCP Dataset for the Snapshot
2. Extract, pre-process, and ingest tabular data from the GCP Dataset to the Azure Dataset
3. Create a new Azure snapshot based on the GCP snapshot

In [None]:
#############################################
## Functions
#############################################

# Function to fetch data from BigQuery
def fetch_source_records_bigquery(config, new_dataset_id, array_col_dict, table, start_row, end_row):
    """Fetch one window of rows from a source dataset table in BigQuery.

    Rows are numbered by ``datarepo_row_id`` order and those numbered
    ``start_row`` through ``end_row`` (inclusive) are returned as a list of
    dicts with the ``datarepo_row_id`` column dropped. For tables whose name
    does not contain "anvil_", the original row ID is kept in
    ``orig_datarepo_row_id``. Outcomes other than a plain success are recorded
    in ``config["migration_results"]``.

    Args:
        config: Migration settings; reads "source_dataset_id", "tdr_host",
            "bigquery_project", "bigquery_dataset" and appends to
            "migration_results".
        new_dataset_id: Target dataset ID (not used by this function).
        array_col_dict: Maps table name to the columns converted to Python lists.
        table: Name of the table to read.
        start_row: First row number of the window.
        end_row: Last row number of the window.

    Returns:
        List of record dicts; an empty list when no rows matched; ``{}`` when
        the query still failed after retries.
    """
    # Extract parameters from config
    src_tdr_object_uuid = config["source_dataset_id"]
    src_tdr_object_type = "dataset"
    tdr_host = config["tdr_host"]
    files_already_ingested = True
    datarepo_row_ids_to_ingest = []
    apply_anvil_transforms = True
    bq_project = config["bigquery_project"]
    bq_dataset = config["bigquery_dataset"]

    # Setup/refresh TDR clients (and BQ client)
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    client = bigquery.Client()

    # Retrieve table data from the original dataset
    logging.info(f"Fetching rows {str(start_row)}-{str(end_row)} from table '{table}' in the original {src_tdr_object_type} ({src_tdr_object_uuid}).")
    table_recs_str = f"Table: {table} -- Rows: {str(start_row)}-{str(end_row)}"
    rebuild_file_refs = apply_anvil_transforms and table == "file_inventory" and not files_already_ingested

    # Pick the query shape: untouched copy, copy plus original row ID, or file inventory
    if not apply_anvil_transforms or "anvil_" in table:
        rec_fetch_query = f"""SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT *, 
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}`
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""
    elif table != "file_inventory":
        rec_fetch_query = f"""SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT *, datarepo_row_id AS orig_datarepo_row_id,
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}`
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""
    else:
        if files_already_ingested:
            file_ref_sql = "file_ref"
        else:
            file_ref_sql = "TO_JSON_STRING(STRUCT(source_name AS sourcePath, target_path AS targetPath, 'Ingest of '||source_name AS description, COALESCE(content_type, 'application/octet-stream') AS mimeType))"
        rec_fetch_query = f"""WITH drlh_deduped AS
                            (
                              SELECT DISTINCT file_id, target_path, source_name
                              FROM `{bq_project}.{bq_dataset}.datarepo_load_history`
                              WHERE state = "succeeded" 
                            )
                            SELECT * EXCEPT(rownum)
                            FROM
                            (
                              SELECT datarepo_row_id, datarepo_row_id AS orig_datarepo_row_id, a.file_id, name, path, target_path AS uri, content_type, full_extension, size_in_bytes, crc32c, md5_hash, ingest_provenance,
                              file_ref AS orig_file_ref, {file_ref_sql} AS file_ref,
                              ROW_NUMBER() OVER (ORDER BY datarepo_row_id) AS rownum
                              FROM `{bq_project}.{bq_dataset}.{table}` a
                                  LEFT JOIN drlh_deduped b
                                  ON a.file_ref = b.file_id
                            )
                            WHERE rownum BETWEEN {start_row} AND {end_row}"""

    # Run the query: one initial attempt plus up to five retries, 10 seconds apart
    final_records = []
    for attempt_index in range(6):
        try:
            fetched_df = client.query(rec_fetch_query).result().to_dataframe()
            fetched_df = fetched_df.astype(object).where(pd.notnull(fetched_df), None)
            for array_column in array_col_dict[table]:
                fetched_df[array_column] = fetched_df[array_column].apply(lambda cell: list(cell))
            if rebuild_file_refs:
                fetched_df["file_ref"] = fetched_df.apply(lambda row: json.loads(row["file_ref"].replace("\'", "\"")), axis=1)
            final_records = fetched_df.to_dict(orient="records")
            break
        except Exception as e:
            if attempt_index < 5:
                sleep(10)
                continue
            err_str = f"Error retrieving records for rows {str(start_row)}-{str(end_row)} of table {table}: {str(e)}."
            logging.error(err_str)
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
            return {}

    # Nothing came back for this window: record the skip and stop here
    if not final_records:
        msg_str = f"No records found for rows {str(start_row)}-{str(end_row)} of table {table} in original {src_tdr_object_type}. Continuing to next record set or table validation."
        logging.info(msg_str)
        config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Skipped", msg_str])
        return final_records

    # Filter retrieved data if necessary and return as dict of records
    unfiltered_df = pd.DataFrame.from_dict(final_records)
    if datarepo_row_ids_to_ingest:
        selected_df = unfiltered_df[unfiltered_df["datarepo_row_id"].isin(datarepo_row_ids_to_ingest)].copy()
    else:
        selected_df = unfiltered_df.copy()
    del unfiltered_df
    selected_df.drop(columns=["datarepo_row_id"], inplace=True, errors="ignore")
    selected_df = selected_df.astype(object).where(pd.notnull(selected_df), None)
    records_orig = selected_df.to_dict(orient="records")

    if not records_orig:
        msg_str = f"No records found in rows {str(start_row)}-{str(end_row)} of table {table} after filtering based on datarepo_row_ids_to_ingest parameter. Continuing to next record set or table validation."
        logging.info(msg_str)
        config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Skipped", msg_str])
    elif len(final_records) != len(records_orig):
        logging.info(f"Filtering records to ingest based on the datarepo_row_ids_to_ingest parameter. {str(len(records_orig))} of {str(len(final_records))} records to be ingested.")
    return records_orig

# Function to process ingests for specific table
def ingest_table_data(config, new_dataset_id, fileref_col_dict, array_col_dict, table, start_row, end_row):
    """Ingest one window of rows from a source table into the target dataset.

    Fetches the rows with ``fetch_source_records_bigquery``. For tables whose
    name contains "anvil_", each value in ``source_datarepo_row_ids`` is
    replaced through ``config["dr_row_id_xwalk"]``. The records are then
    submitted as an ingest request, with up to three attempts. The outcome is
    appended to ``config["migration_results"]``.

    Args:
        config: Migration settings; reads "source_dataset_id", "tdr_host",
            "tar_tdr_billing_profile", "dr_row_id_xwalk" and appends to
            "migration_results".
        new_dataset_id: ID of the dataset to ingest into.
        fileref_col_dict: Not used by this function.
        array_col_dict: Passed through to ``fetch_source_records_bigquery``.
        table: Name of the table to ingest.
        start_row: First row number of the window.
        end_row: Last row number of the window.

    Returns:
        None.
    """
    # Extract parameters from config
    src_tdr_object_uuid = config["source_dataset_id"]
    src_tdr_object_type = "dataset"
    tdr_host = config["tdr_host"]
    tar_tdr_billing_profile = config["tar_tdr_billing_profile"]
    records_processing_method = "in_memory"
    write_to_cloud_platform = ""
    apply_anvil_transforms = True
    dr_row_id_xwalk = config["dr_row_id_xwalk"]
    use_control_file = records_processing_method == "write_to_cloud"

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

    # Retrieve table data from the original dataset
    table_recs_str = f"Table: {table} -- Rows: {str(start_row)}-{str(end_row)}"
    records_orig = fetch_source_records_bigquery(config, new_dataset_id, array_col_dict, table, start_row, end_row)
    if not records_orig:
        return

    # Pre-process records before ingest
    if "anvil_" not in table:
        records_processed = records_orig
    else:
        try:
            # Pre-process records in AnVIL_ records to use new datarepo_row_ids in the source_datarepo_row_ids field
            logging.info("FSS (anvil_%) table with ingest.apply_anvil_transforms parameter set to 'True'. Pre-processing records before submitting ingestion request.")
            records_processed = []
            for source_record in records_orig:
                remapped_record = source_record.copy()
                remapped_row_ids = []
                for row_id in remapped_record["source_datarepo_row_ids"]:
                    new_row_id = dr_row_id_xwalk.get(row_id)
                    if not new_row_id:
                        # A row ID without a crosswalk entry aborts the whole window
                        err_str = f"Failure in pre-processing: row_id '{row_id}'' not found in datarepo_row_id crosswalk."
                        logging.error(err_str)
                        config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
                        return
                    remapped_row_ids.append(new_row_id)
                remapped_record["source_datarepo_row_ids"] = remapped_row_ids
                records_processed.append(remapped_record)
        except Exception as e:
            err_str = f"Failure in pre-processing: {str(e)}"
            logging.error(err_str)
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
            return

    # Write out records to cloud, if specified by user
    if use_control_file:
        logging.info(f"Writing records to a control file in the cloud.")
        if write_to_cloud_platform == "gcp":
            control_file_path = write_records_to_gcp(config, table, records_processed)
        else:
            control_file_path = write_records_to_azure(config, table, records_processed)

    # Build, submit, and monitor ingest request
    logging.info(f"Submitting ingestion request to new dataset ({new_dataset_id}).")
    ingest_request = {
        "table": table,
        "profile_id": tar_tdr_billing_profile,
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "append",
        "format": "json" if use_control_file else "array",
        "load_tag": "Ingest for {}".format(new_dataset_id),
    }
    # The payload is either a control file path or the records themselves
    if use_control_file:
        ingest_request["path"] = control_file_path
    else:
        ingest_request["records"] = records_processed

    for attempt_number in (1, 2, 3):
        try:
            api_client = refresh_tdr_api_client(tdr_host)
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = wait_for_tdr_job(datasets_api.ingest_dataset(id=new_dataset_id, ingest=ingest_request), tdr_host)
            result_summary = str(ingest_request_result)[0:1000]
            logging.info("Ingest succeeded: {}".format(result_summary))
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Success", str(ingest_request_result)[0:1000]])
            break
        except Exception as e:
            logging.error("Error on ingest: {}".format(str(e)[0:2500]))
            if attempt_number < 3:
                logging.info("Retrying ingest (attempt #{})...".format(str(attempt_number)))
                sleep(10)
                continue
            logging.error("Maximum number of retries exceeded. Logging error.")
            err_str = f"Error on ingest: {str(e)[0:2500]}"
            config["migration_results"].append(["Dataset Ingestion", table_recs_str, "Failure", err_str])
            break

    # Remove control file from cloud, if written out
    if use_control_file:
        logging.info(f"Removing control file from the cloud.")
        if write_to_cloud_platform == "gcp":
            client = storage.Client()
            path_parts = control_file_path.split("/")
            target_bucket = path_parts[2]
            target_object = "/".join(path_parts[3:])
            bucket = client.bucket(target_bucket)
            blob = bucket.blob(target_object)
            blob.delete()
        else:
            blob = BlobClient.from_blob_url(control_file_path)
            blob.delete_blob()

# Function to orchestrate the migration of tabular data
def migrate_tabular_data(config):

    # Extract parameters from config
    source_dataset_id = config["source_dataset_id"]
    target_dataset_id = config["target_dataset_id"] 
    tables_to_ingest = config["tables_to_ingest"] 
    tdr_host = config["tdr_host"] 
    tdr_sa_to_use = config["tdr_sa_to_use"] 
    chunk_size = config["chunk_size"] 
    max_combined_rec_ref_size = config["max_combined_rec_ref_size"] 
    skip_ingests = config["skip_ingests"]

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

    # Add TDR SA to original dataset
    logging.info(f"Adding TDR general SA ({tdr_sa_to_use}) to original dataset: {source_dataset_id}")
    try:
        resp = datasets_api.add_dataset_policy_member(id=source_dataset_id, policy_name="steward", policy_member={"email": tdr_sa_to_use}) 
        logging.info("TDR SA added successfully.")
    except:
        error_str = f"Error adding TDR SA to dataset {source_dataset_id}: {str(e)}"
        logging.error(error_str)
        config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
        return

    # Collect details from original dataset to build inventory of tables to migrate
    logging.info(f"Retrieving dataset details from original dataset: {source_dataset_id}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=source_dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
        config["bigquery_project"] = dataset_details["access_information"]["big_query"]["project_id"]
        config["bigquery_dataset"] = dataset_details["access_information"]["big_query"]["dataset_name"]
        fileref_col_dict = {}
        array_col_dict = {}
        for table_entry in dataset_details["schema"]["tables"]:
            fileref_list = []
            array_list = []
            for idx, column_entry in enumerate(table_entry["columns"]):
                if column_entry["datatype"] == "fileref":
                    fileref_list.append(column_entry["name"])
                if column_entry["array_of"] == True:
                    array_list.append(column_entry["name"])
            fileref_col_dict[table_entry["name"]] = fileref_list
            array_col_dict[table_entry["name"]] = array_list
    except Exception as e:
        error_str = f"Error retrieving details from dataset {source_dataset_id}: {str(e)}"
        logging.error(error_str)
        config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
        return

    # Read in existing datarepo_row_id crosswalk, if one exists
    logging.info("Fetching existing datarepo_row_id crosswalk (if one exists).")
    xwalk_json_file_name = f"{source_dataset_id}_{target_dataset_id}_rowid_xwalk.json"
    try:
        with open(xwalk_json_file_name,"r") as file:
            datarepo_row_id_xwalk = json.load(file)
    except:
        datarepo_row_id_xwalk = {}
        logging.warning(f"No datarepo_row_id crosswalk file name '{xwalk_json_file_name}' found.")

    # Order tables for ingestion
    logging.info("Ordering tables and pulling current record counts for validation.")
    table_rank_dict = {}
    for table in fileref_col_dict.keys():
        if table == "file_inventory":
            table_rank_dict[table] = 1
        elif "anvil_" not in table:
            table_rank_dict[table] = 2
        else:
            table_rank_dict[table] = 3
    ordered_table_list = sorted(table_rank_dict, key= lambda key: table_rank_dict[key])

    # Fetch total record counts for all tables
    populated_table_dict = {}
    for table in ordered_table_list:
        api_client = refresh_tdr_api_client(tdr_host)
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        attempt_counter = 0
        while True:
            payload = {
              "offset": 0,
              "limit": 10,
              "sort": "datarepo_row_id",
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=source_dataset_id, table=table, query_data_request_model=payload).to_dict()
                total_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if attempt_counter < 5:
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    total_record_count = -1
                    break
        if total_record_count == -1:
            error_str = f"Error retrieving current record counts for tables in dataset {source_dataset_id}: {str(e)}"
            logging.error(error_str)
            config["migration_results"].append(["Dataset Ingestion", "All Tables", "Failure", error_str])
            return
        elif total_record_count > 0:
            populated_table_dict[table] = total_record_count

    # Loop through and process tables for ingestion
    logging.info("Processing dataset ingestion requests.")
    pop_fss_table_cnt = 0
    for table in ordered_table_list:

        # Determine whether table should be processed, and skip if not
        logging.info(f"Processing dataset ingestion for table '{table}'.")
        total_record_count = 0
        if tables_to_ingest and table not in tables_to_ingest:
            msg_str = f"Table '{table}' not listed in the tables_to_ingest parameter. Skipping."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
            continue
        elif table not in populated_table_dict.keys():
            msg_str = f"No records found for table '{table}' in original dataset. Continuing to next table/record set."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
            continue
        elif "anvil_" in table:
            # Confirm all non-FSS tables are present in datarepo_row_id_xwalk
            pop_fss_table_cnt += 1
            missing_tab_list = []
            for tab in populated_table_dict.keys():
                if "anvil_" not in tab and tab not in datarepo_row_id_xwalk.keys():
                    missing_tab_list.append(tab)
            if len(missing_tab_list) > 0:
                missing_tab_string = ", ".join(missing_tab_list)
                msg_str = f"Populated non-FSS tables missing from datarepo_row_id crosswalk: {missing_tab_string}. Skipping FSS table '{table}'."
                logging.info(msg_str)
                config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
                continue
        
        # Aggregate datarepo_row_id crosswalk informatino for us in FSS table processing
        if pop_fss_table_cnt == 1:
            dr_row_id_xwalk = {}
            for key in datarepo_row_id_xwalk.keys():
                dr_row_id_xwalk.update(datarepo_row_id_xwalk[key])
            config["dr_row_id_xwalk"] = dr_row_id_xwalk 
            
        # Chunk table records as necessary, then loop through and process each chunk
        total_record_count = populated_table_dict.get(table)
        if skip_ingests:
            msg_str = f"Parameter 'skip_ingests' set to true. Skipping ingestion for table '{table}'."
            logging.info(msg_str)
            config["migration_results"].append(["Dataset Ingestion", f"Table: {table}", "Skipped", msg_str])
        else:
            if fileref_col_dict[table]:
                ref_chunk_size = math.floor(max_combined_rec_ref_size / len(fileref_col_dict[table]))
                table_chunk_size = min(chunk_size, ref_chunk_size)
                logging.info(f"Table '{table}' contains fileref columns. Will use a chunk size of {table_chunk_size} rows per ingestion request, to keep the number of file references per chunk below {max_combined_rec_ref_size}.")
            else:
                table_chunk_size = chunk_size
                logging.info(f"Table '{table}' does not contain fileref columns. Will use a chunk size of {table_chunk_size} rows per ingestion request.")
            start_row = 1
            end_row = min((table_chunk_size), total_record_count)
            while start_row <= total_record_count:
                if end_row > total_record_count:
                    end_row = total_record_count
                ingest_table_data(config, target_dataset_id, fileref_col_dict, array_col_dict, table, start_row, end_row)    
                start_row += table_chunk_size
                end_row += table_chunk_size

        # Build datarepo_row_id crosswalk for the table, add to datarepo_row_id_xwalk dict, and write out updated dict to file
        if "anvil_" not in table: 
            logging.info("Fetching ingested records and building datarepo_row_id lookup for use in AnVIL transforms.")
            temp_dr_xwalk = {}
            api_client = refresh_tdr_api_client(tdr_host)
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            max_page_size = 1000
            records_fetched = 0
            retrieval_error = False
            while records_fetched < total_record_count and not retrieval_error:
                row_start = records_fetched
                attempt_counter = 0
                while True:
                    payload = {
                      "offset": row_start,
                      "limit": max_page_size,
                      "sort": "datarepo_row_id",
                      "direction": "asc",
                      "filter": ""
                    }
                    try:
                        dataset_results = datasets_api.query_dataset_data_by_id(id=target_dataset_id, table=table, query_data_request_model=payload).to_dict() 
                        if len(dataset_results["result"]) == 0:
                            warn_str = f"No records found for '{table}' table, which prevents the proper building of the datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream."
                            logging.warning(warn_str)
                            retrieval_error = True
                            break  
                        else:
                            for record in dataset_results["result"]:
                                key = table + ":" + record["orig_datarepo_row_id"]
                                val = table + ":" + record["datarepo_row_id"]
                                temp_dr_xwalk[key] = val
                                records_fetched += 1
                            break
                    except Exception as e:
                        if attempt_counter < 5:
                            sleep(10)
                            attempt_counter += 1
                            continue
                        else:
                            warn_str = f"Error retrieving records for '{table}' table to build datarepo_row_id_xwalk. Note that this may cause failures in FSS table ingestion requests downstream."
                            logging.warning(warn_str)
                            retrieval_error = True
                            break
            if not retrieval_error:
                datarepo_row_id_xwalk[table] = temp_dr_xwalk
                with open(xwalk_json_file_name, 'w') as file:
                    json.dump(datarepo_row_id_xwalk, file)
        
        # Fetch total record count for the new table
        api_client = refresh_tdr_api_client(tdr_host)
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        attempt_counter = 0
        while True:
            payload = {
              "offset": 0,
              "limit": 10,
              "sort": "datarepo_row_id",
              "direction": "asc",
              "filter": ""
            }
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=target_dataset_id, table=table, query_data_request_model=payload).to_dict()
                new_record_count = record_results["total_row_count"]
                break
            except Exception as e:
                if attempt_counter < 5:
                    sleep(10)
                    attempt_counter += 1
                    continue
                else:
                    new_record_count = -1
                    break
        if new_record_count == -1:
            err_str = f"Error retrieving record count for table '{table}' in new dataset. Skipping validation and continuing to next table."
            logging.error(err_str)
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Failure", err_str])
            continue 

        # Validate the new table against the old table, with extra scrutiny given to the file_inventory table for AnVIL migrations
        logging.info(f"Validating table '{table}' in new dataset vs. original dataset.")
        if new_record_count == total_record_count:
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Success", f"{new_record_count} records found in both new and original table."])
        else:
            config["migration_results"].append(["Dataset Validation", f"Table: {table}", "Failure", f"{new_record_count} records found in new table doesn't match {total_record_count} records in original table."])

    # Display results
    pipeline_results = pd.DataFrame(config["migration_results"], columns = ["Task", "Step", "Status", "Message"])
    failures = pipeline_results[pipeline_results["Status"].str.contains("Failure")]
    logging.info("Migration Pipeline Results:")
    display(pipeline_results)
    logging.info(f"\nPipeline finished with {len(failures)} failures.")
    return len(failures)

# Function for creating a snapshot for the new dataset
def recreate_snapshot(config):
    """Create and share a snapshot of the target dataset.

    Reads "target_dataset_id", "azure_billing_profile", "tdr_host" and
    "anvil_schema" from config, retrieves the dataset details from TDR, and
    hands a snapshot configuration to utils.create_and_share_snapshot.

    Returns:
        A (status, message, snapshot_id, snapshot_name) tuple. status is
        "Success" or "Failure"; snapshot_id and snapshot_name are empty
        strings when the snapshot failed or when they could not be parsed
        from the reported results.
    """

    # Extract parameters from config
    target_dataset_id = config["target_dataset_id"]
    azure_billing_profile = config["azure_billing_profile"]
    tdr_host = config["tdr_host"]
    anvil_schema = config["anvil_schema"]

    # Setup/refresh TDR clients
    api_client = refresh_tdr_api_client(tdr_host)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Retrieve new dataset details
    logging.info(f"Retrieving dataset details from prod environment. UUID:  {target_dataset_id}")
    try:
        dataset_details = datasets_api.retrieve_dataset(id=target_dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES", "DATA_PROJECT", "STORAGE"]).to_dict()
        dataset_name = dataset_details["name"]
        phs_id = dataset_details["phs_id"]
        consent_name = dataset_details["properties"]["consent_name"]
        auth_domains = dataset_details["properties"]["auth_domains"]
        current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M")
        snapshot_name = dataset_name + "_" + anvil_schema + "_" + current_datetime_string
    except Exception as e:
        # Without the dataset details the snapshot request cannot be built, so report the failure and stop
        error_str = f"Error retrieving details from dataset: {str(e)}"
        logging.error(error_str)
        return "Failure", error_str, "", ""

    # Build config and submit snapshot job
    snapshot_config = {
        "profile_id": azure_billing_profile,
        "snapshot_readers_list": ["azul-anvil-prod@firecloud.org", "auth-domain"],
        # NOTE(review): key is spelled "versin"; left unchanged because its consumer in utils is not visible here - confirm
        "anvil_schema_versin": anvil_schema,
        "ws_bucket": os.environ["WORKSPACE_BUCKET"],
        # Use the dataset this function was asked to snapshot rather than a module-level loop variable
        "dataset_id": target_dataset_id,
        "dataset_name": dataset_name,
        "phs_id": phs_id,
        "consent_name": consent_name,
        "auth_domains": auth_domains,
        "pipeline_results": [],
        "snapshot_name": snapshot_name
    }
    utils.create_and_share_snapshot(snapshot_config)
    int_df_results = pd.DataFrame(snapshot_config["pipeline_results"], columns = ["Dataset", "Time", "Step", "Task", "Status", "Message"])
    errors = int_df_results[int_df_results["Status"].str.contains("Error")]
    if len(errors) > 0:
        logging.error("Errors reported in snapshotting. See logs for details.")
        return "Failure", f"{len(errors)} failures reported. See log for details.", "", ""

    # Pull the snapshot ID and name out of the message(s) recorded for the snapshot creation task
    status = "Success"
    message = ""
    snapshot_message = str(int_df_results[int_df_results["Task"]=="Create and Share Snapshot"]["Message"])
    id_match = re.search(r"{'id': '([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'", snapshot_message)
    name_match = re.search(r"'name': '([a-zA-Z0-9_\-]+)'", snapshot_message)
    snapshot_id = id_match[1] if id_match else ""
    snapshot_name = name_match[1] if name_match else ""
    if not (id_match and name_match):
        # No errors were reported, so keep the status but flag that the details could not be extracted
        message = "Snapshot ID and/or name could not be parsed from the snapshot results. See log for details."
        logging.warning(message)
    return status, message, snapshot_id, snapshot_name
        
#############################################
## Input Parameters
#############################################

# Specify migration pairs: [Source GCP Dataset, Target Azure Dataset]
migration_list = [
    #["gcp_dataset_id", "az_dataset_id"]
    ["09642596-d33a-4261-8bf7-eb1dbb37d572", "89d9ed6a-28c6-4b1c-97a1-10b1a26382be"],
    ["75119ed5-b8aa-4f45-bdef-e3c673bbe44c", "cb009d23-9a05-44a9-82ac-82ef0722ab81"],
]

# Run parameters
azure_billing_profile = "9ee23bed-b46c-4561-9103-d2a723113f7f"
anvil_schema = "ANV5"
run_data_migration = True
skip_ingests = False # Set to True to build datarepo_row_id xwalk and run validation w/o ingesting more records
tables_to_ingest = [] # Leave empty for all
run_snapshot_creation = True

#############################################
## Execution
#############################################

# Set up logging (to both a timestamped log file and stdout)
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
logs_stream_file_path = "processing_details_" + current_datetime_string + ".log"
while logging.root.handlers:
    logging.root.removeHandler(logging.root.handlers[-1])
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO, handlers=[logging.FileHandler(logs_stream_file_path), logging.StreamHandler(sys.stdout)])

# Loop through migration list and process entries
results = []
for entry in migration_list:

    # Run cross-cloud ingestion, if specified
    failure_count = 0
    if run_data_migration:
        logging.info(f"\nMigrating tabular data from TDR dataset {entry[0]} to TDR dataset {entry[1]}.")
        # Build config and submit migration job
        config = {
            "source_dataset_id": entry[0],
            "target_dataset_id": entry[1],
            "tables_to_ingest": tables_to_ingest,
            "tdr_host": "https://data.terra.bio",
            "tdr_sa_to_use": "datarepo-jade-api@terra-datarepo-production.iam.gserviceaccount.com",
            "tar_tdr_billing_profile": azure_billing_profile,
            "chunk_size": 250000,
            "max_combined_rec_ref_size": 40000,
            "migration_results": [],
            "dr_row_id_xwalk": {},
            "skip_ingests": skip_ingests
        }
        failure_count = migrate_tabular_data(config)
        if failure_count is None:
            # Guard against a None return (an exit without a failure count) by counting the
            # failures recorded in the results list, so the comparisons below cannot raise
            failure_count = sum(1 for result in config["migration_results"] if "Failure" in str(result[2]))
        status = "Failure" if failure_count > 0 else "Success"
        msg = f"{failure_count} failures reported. See log for details." if failure_count > 0 else ""
        results.append([entry[0], entry[1], "Data Ingestion", status, msg, "", ""])

    # Run snapshotting, if specified and no upstream errors detected
    if run_snapshot_creation:
        logging.info(f"Creating a snapshot for TDR dataset {entry[1]}.")
        # Build config and submit snapshot job
        config = {
            "target_dataset_id": entry[1],
            "tdr_host": "https://data.terra.bio",
            "azure_billing_profile": azure_billing_profile,
            "anvil_schema": anvil_schema
        }
        if failure_count > 0:
            logging.error("Failures noted in upstream data processing. Skipping snapshotting.")
            results.append([entry[0], entry[1], "Data Snapshotting", "Skipped", "Failures noted in upstream data processing.", "", ""])
        else:
            status, message, snapshot_id, snapshot_name = recreate_snapshot(config)
            results.append([entry[0], entry[1], "Data Snapshotting", status, message, snapshot_id, snapshot_name])

# Display final results
print("\nFinal Results:")
results_df = pd.DataFrame(results, columns = ["Source Dataset ID", "Target Dataset ID", "Processing Step", "Status", "Message", "Snapshot ID", "Snapshot Name"])
display(results_df)


## Validation

### Pull and Compare Tabular Data between TDR Datasets

In [None]:
#############################################
## Functions
#############################################

# Helper to pull the record count of one table from one dataset, retrying on errors
def _fetch_table_record_count(datasets_api, dataset_id, table, payload, max_retries=2):
    """Return (record_count, table_present) for a table in a dataset.

    table_present is the string "True" when the query succeeded, "False" when
    the error text contains "No dataset table exists", and "Unknown" when the
    query kept failing after max_retries retries (10 seconds apart). The
    record count is 0 in the latter two cases.
    """
    attempt_counter = 0
    while True:
        try:
            record_results = datasets_api.query_dataset_data_by_id(id=dataset_id, table=table, query_data_request_model=payload).to_dict()
            return record_results["total_row_count"], "True"
        except Exception as e:
            if "No dataset table exists" in str(e):
                return 0, "False"
            if attempt_counter >= max_retries:
                return 0, "Unknown"
            sleep(10)
            attempt_counter += 1


def compare_row_counts(dataset_1_id, dataset_2_id):
    """Compare per-table record counts between two TDR datasets.

    Builds the superset of table names across both dataset schemas, pulls the
    total row count of every table from each dataset, and displays a
    per-table comparison.

    Returns:
        A (status, failed_tables) tuple: status is "Pass" or "Fail" and
        failed_tables is the sorted list of table names that failed. When the
        dataset schemas cannot be retrieved the result is ("Fail", []).
    """

    # Setup/refresh TDR clients
    logging.info(f"Comparing tabular data record counts between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Pull table list across datasets
    logging.info("Pulling the superset of tables across the two datasets.")
    try:
        dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
        dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
    except Exception as e:
        # Nothing can be compared without the schemas, so report the failure instead of continuing
        error_str = f"Error retrieving details from datasets: {str(e)}"
        logging.error(error_str)
        return "Fail", []
    table_set = {table_entry["name"] for table_entry in dataset_1_details["schema"]["tables"]}
    table_set.update(table_entry["name"] for table_entry in dataset_2_details["schema"]["tables"])

    # For each table in the table list, pull record counts from the two datasets and compare
    results = []
    payload = {
        "offset": 0,
        "limit": 10,
        "sort": "datarepo_row_id",
        "direction": "asc",
        "filter": ""
    }
    # Iterate in sorted order so the report is stable between runs
    for table in sorted(table_set):
        logging.info(f"Comparing record counts for table '{table}'")
        ds1_record_count, ds1_table_present = _fetch_table_record_count(datasets_api, dataset_1_id, table, payload)
        ds2_record_count, ds2_table_present = _fetch_table_record_count(datasets_api, dataset_2_id, table, payload)

        # Build table comparison
        if ds1_table_present == "Unknown" or ds2_table_present == "Unknown":
            status = "Fail"
            error_reason = "Error retrieving table data from dataset(s)"
        elif ds1_table_present == "False" or ds2_table_present == "False":
            status = "Fail"
            error_reason = "Table presence mismatch between datasets"
        elif ds1_record_count != ds2_record_count:
            status = "Fail"
            error_reason = "Difference in record count"
        else:
            status = "Pass"
            error_reason = ""
        results.append([dataset_1_id, dataset_2_id, table, ds1_table_present, ds1_record_count, ds2_table_present, ds2_record_count, status, error_reason])

    # Display detailed results
    print("\nResults:")
    results_df = pd.DataFrame(results, columns = ["Dataset 1 ID", "Dataset 2 ID", "Table", "Table in DS1", "DS1 Record Count", "Table in DS2", "DS2 Record Count", "Status", "Message"])
    display(results_df)

    # Return final aggregated results (index 2 is the table name, index 7 the per-table status)
    failed_tables = sorted(entry[2] for entry in results if entry[7] == "Fail")
    status = "Fail" if failed_tables else "Pass"
    return status, failed_tables
        
def compare_contents_sample(dataset_1_id, dataset_2_id, sample_size, fields_to_ignore):
    """Unfinished placeholder for comparing a sample of table contents between two TDR datasets.

    Currently this only builds the superset of table names across the two
    dataset schemas; sample_size and fields_to_ignore are not used yet and no
    comparison is performed. Always returns None.
    """
    # Setup/refresh TDR clients
    logging.info(f"Comparing a sample of tabular data content between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Pull table list across datasets
    logging.info("Pulling the superset of tables across the two datasets.")
    try:
        dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
        dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
    except Exception as e:
        # Without the schemas there is nothing to compare; stop here rather than hit undefined names below
        error_str = f"Error retrieving details from datasets: {str(e)}"
        logging.error(error_str)
        return None
    # Must be a set, not a dict literal: names are collected with .add()
    table_set = set()
    for table_entry in dataset_1_details["schema"]["tables"]:
        table_set.add(table_entry["name"])
    for table_entry in dataset_2_details["schema"]["tables"]:
        table_set.add(table_entry["name"])

    # TODO: Pull schema, record first column in each table (for ordering)
    # TODO: Loop through tables, pull xxx records (by sample size), ordering by first column
    # TODO: Drop fields_to_ignore
    # TODO: Compare --> How to best do this
    return None
    

#############################################
## Input Parameters
#############################################

# Dataset pairs to validate against each other
dataset_id_pairs_list = [
    #["gcp_dataset_id", "az_dataset_id"]
    ["a9212179-efd5-4c53-a57a-93016eff9017", "00832d1d-0145-4e04-88dc-2bad4c5d87bf"],
]

# Toggle for the per-table row count comparison
run_row_count_comparison = True

# Settings for the table content sample comparison: on/off toggle, sample size, and fields excluded
# from the comparison (none of these are read by the execution code in this cell)
run_contents_sample_comparison = False
contents_sample_comparison_size = 1000
fields_to_ignore = [
    "datarepo_row_id",
    "orig_datarepo_row_id",
    "orig_file_ref",
    "source_datarepo_row_ids",
    "uri",
]

#############################################
## Execution
#############################################

# Validate each dataset pair, collecting one summary row per comparison that ran
results = []
for dataset_id_pair in dataset_id_pairs_list:
    if not run_row_count_comparison:
        continue
    status, failed_tables = compare_row_counts(dataset_id_pair[0], dataset_id_pair[1])
    results.append(
        [
            dataset_id_pair[0],
            dataset_id_pair[1],
            "Record Count Comparison",
            status,
            ", ".join(failed_tables),
        ]
    )

# Show the summary table
print("\nFinal Validation Results:")
results_df = pd.DataFrame(
    results,
    columns=["Dataset 1 ID", "Dataset 2 ID", "Validation Type", "Status", "Failed Tables"],
)
display(results_df)


In [None]:
# Parameters
dataset_1_id = "b12fb9be-2ce0-4bfd-8503-732fabba06ab"
dataset_2_id = "744c85cc-13d2-4f90-9d2e-d3143cb01edb"
contents_sample_comparison_size = 1000
fields_to_ignore = ["datarepo_row_id", "orig_datarepo_row_id", "orig_file_ref", "source_datarepo_row_ids", "uri"]

# Helper to pull up to sample_size records from one table of one dataset, paging as needed
def _fetch_sample_records(datasets_api, dataset_id, table, sort_column, sample_size, max_page_size=1000):
    """Return (records, table_present) for a sample of a dataset table.

    Records are requested in ascending order of sort_column, one page at a
    time, until sample_size records are collected or a page comes back empty.
    table_present is the string "True" normally, "False" when the error text
    contains "No dataset table exists", and "Unknown" when a page kept failing
    after two retries (10 seconds apart); in the latter two cases paging
    stops and the records collected so far are returned.
    """
    table_present = "True"
    final_records = []
    while True:
        payload = {
            "offset": len(final_records),
            "limit": min(max_page_size, sample_size - len(final_records)),
            "sort": sort_column,
            "direction": "asc",
            "filter": ""
        }
        attempt_counter = 0
        while True:
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_id, table=table, query_data_request_model=payload).to_dict()
                break
            except Exception as e:
                # Failure paths use an empty page of the same shape as a real response,
                # so the "result" lookup below works and simply ends the paging
                if "No dataset table exists" in str(e):
                    record_results = {"result": []}
                    table_present = "False"
                    break
                if attempt_counter < 2:
                    sleep(10)
                    attempt_counter += 1
                    continue
                record_results = {"result": []}
                table_present = "Unknown"
                break
        if not record_results["result"]:
            break
        final_records.extend(record_results["result"])
        if len(final_records) >= sample_size:
            break
    return final_records, table_present

# Setup/refresh TDR clients
logging.info(f"Comparing a sample of tabular data content between TDR dataset {dataset_1_id} and TDR dataset {dataset_2_id}.")
api_client = refresh_tdr_api_client("https://data.terra.bio")
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Pull table list across datasets
logging.info("Pulling the superset of tables across the two datasets.")
try:
    dataset_1_details = datasets_api.retrieve_dataset(id=dataset_1_id, include=["SCHEMA"]).to_dict()
    dataset_2_details = datasets_api.retrieve_dataset(id=dataset_2_id, include=["SCHEMA"]).to_dict()
except Exception as e:
    error_str = f"Error retrieving details from datasets: {str(e)}"
    logging.error(error_str)
    # Stop the cell: continuing would fail on undefined names or reuse details left over from an earlier run
    raise

# Map each table name to the name of its first column, which is used as the sort column
table_set = {}
for table_entry in dataset_1_details["schema"]["tables"]:
    table_set[table_entry["name"]] = table_entry["columns"][0]["name"]
for table_entry in dataset_2_details["schema"]["tables"]:
    table_set[table_entry["name"]] = table_entry["columns"][0]["name"]

# For each table in the table list, pull sample records from the two datasets and compare
results = []
for table in ["file_inventory"]: #table_set.keys():
    logging.info(f"Comparing sample records for table '{table}'")
    # Pulling sample records for dataset 1
    ds1_final_records, ds1_table_present = _fetch_sample_records(datasets_api, dataset_1_id, table, table_set[table], contents_sample_comparison_size)
    # Pulling sample records for dataset 2
    ds2_final_records, ds2_table_present = _fetch_sample_records(datasets_api, dataset_2_id, table, table_set[table], contents_sample_comparison_size)

In [None]:
# Load both record samples into DataFrames
df_ds1_records_int = pd.DataFrame.from_dict(ds1_final_records)
df_ds2_records_int = pd.DataFrame.from_dict(ds2_final_records)

# Keep dataset 1's columns, minus the ignored fields, and apply the same selection to both frames
cols = [col for col in df_ds1_records_int.columns.tolist() if col not in fields_to_ignore]
df_ds1_records = df_ds1_records_int[cols]
df_ds2_records = df_ds2_records_int[cols]

In [None]:
# Cell-level differences between the two filtered samples.
# NOTE(review): per the pandas docs, DataFrame.compare only accepts identically-labeled frames,
# so this presumably raises when the two samples differ in shape or labels - confirm.
diff = df_ds1_records.compare(df_ds2_records)

In [None]:
# Print "True" when the two filtered samples are identical, "False" otherwise
print("True" if df_ds1_records.equals(df_ds2_records) else "False")

### Pull and Compare File Counts and Sizes between TDR Datasets

In [2]:
#############################################
## Functions
#############################################

def collect_file_stats(dataset_id_pairs_list):
    
    results = []
    for dataset_id_pair in dataset_id_pairs_list:

            # Setup/refresh TDR clients
            logging.info(f"Processing dataset_id_pair: {dataset_id_pair}")
            api_client = refresh_tdr_api_client("https://data.terra.bio")
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

            # Initialize variables
            dataset_id_1 = dataset_id_pair[0]
            file_count_1 = 0
            total_file_size_1 = 0
            max_file_size_1 = 0
            status_1 = "Success"
            message_1 = ""
            dataset_id_2 = dataset_id_pair[1]
            file_count_2 = 0
            total_file_size_2 = 0
            max_file_size_2 = 0
            status_2 = "Success"
            message_2 = ""
            validation_status = "Passed"
            validation_message = ""

            # For dataset_id_1, loop through dataset files and record information
            logging.info(f"Retrieving files from dataset_id {dataset_id_1}...")
            try:
                max_page_size = 1000
                total_records_fetched = 0
                while True:
                    row_start = total_records_fetched
                    dataset_file_results = datasets_api.list_files(id=dataset_id_1, offset=row_start, limit=max_page_size)
                    if dataset_file_results:
                        total_records_fetched += len(dataset_file_results)
                        for entry in dataset_file_results:
                            file_count_1 += 1
                            total_file_size_1 += entry.size
                            if entry.size > max_file_size_1:
                                max_file_size_1 = entry.size
                        logging.info(f"{total_records_fetched} records fetched...")
                    else:
                        break
                logging.info(f"File retrieval complete!")
            except Exception as e:
                status_1 = "Failure"
                message_1 = str(e)
                logging.error(f"Failure in file retrieval: {message_1}")
            
            # For dataset_id_2, loop through dataset files and record information
            logging.info(f"Retrieving files from dataset_id {dataset_id_2}...")
            try:
                max_page_size = 1000
                total_records_fetched = 0
                while True:
                    row_start = total_records_fetched
                    dataset_file_results = datasets_api.list_files(id=dataset_id_2, offset=row_start, limit=max_page_size)
                    if dataset_file_results:
                        total_records_fetched += len(dataset_file_results)
                        for entry in dataset_file_results:
                            file_count_2 += 1
                            total_file_size_2 += entry.size
                            if entry.size > max_file_size_2:
                                max_file_size_2 = entry.size
                        logging.info(f"{total_records_fetched} records fetched...")
                    else:
                        break
                logging.info(f"File retrieval complete!")
            except Exception as e:
                status_2 = "Failure"
                message_2 = str(e)
                logging.error(f"Failure in file retrieval: {message_2}")
                
            # Record and display interim results
            file_count_diff = file_count_1 - file_count_2
            total_file_size_diff = total_file_size_1 - total_file_size_2
            max_file_size_diff = max_file_size_1 - max_file_size_2
            if status_1 == "Failure" or status_2 == "Failure":
                validation_status = "Failed"
                validation_message = "Errors pulling counts for one or more datasets."
            elif file_count_diff > 0 or total_file_size_diff > 0 or max_file_size_diff > 0:
                validation_status = "Failed"
                validation_message = "Difference in counts between datasets."
            results.append([dataset_id_1, dataset_id_2, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, file_count_1, total_file_size_1, max_file_size_1, status_1, message_1, file_count_2, total_file_size_2, max_file_size_2, status_2, message_2])
            int_results_df = pd.DataFrame([[dataset_id_1, dataset_id_2, validation_status, validation_message, file_count_diff, total_file_size_diff, max_file_size_diff, file_count_1, total_file_size_1, max_file_size_1, status_1, message_1, file_count_2, total_file_size_2, max_file_size_2, status_2, message_2]], columns = ["Dataset ID 1", "Dataset ID 2", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "File Count 1", "Total File Size (Bytes) 1", "Max File Size (Bytes) 1", "Status 1 ", "Message 1", "File Count 2", "Total File Size (Bytes) 2", "Max File Size (Bytes) 2", "Status 2 ", "Message 2"])
            logging.info("Results recorded:")
            display(int_results_df)
        
    # Display final results
    destination_dir = "ingest_pipeline/resources/azure_migration"
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_file = f"validation_results_{current_datetime_string}.tsv"
    results_df = pd.DataFrame(results, columns = ["Dataset ID 1", "Dataset ID 2", "Validation Status", "Validation Message", "File Count Diff", "Total File Size (Bytes) Diff", "Max File Size (Bytes) Diff", "File Count 1", "Total File Size (Bytes) 1", "Max File Size (Bytes) 1", "Status 1 ", "Message 1", "File Count 2", "Total File Size (Bytes) 2", "Max File Size (Bytes) 2", "Status 2 ", "Message 2"])
    results_df.to_csv(output_file, index=False, sep="\t")
    !gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
    !rm $output_file
    print("\nAggregated Validation Results:")
    display(results_df)   
    

#############################################
## Input Parameters
#############################################

# Specify the dataset ID pairs to compare. Each entry is a two-element list:
# [dataset_id_1, dataset_id_2]; file counts and sizes are compared within each pair.
dataset_id_pairs_list = [
    ['8de6dae2-55ff-4287-9b75-5b2a950c1f44', 'e1fdd1b9-fe56-42ca-8e86-b4bb32d9bbce'],
    ['bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8', 'c7206e9a-78ad-4c9d-927f-3ca76646227d'],
    ['d306000b-88c1-4220-8d7e-933c0118a983', 'e5dc1a69-cb2c-4626-9799-6bb5fae7b147'],
]

#############################################
## Execution
#############################################

# Collect and compare file statistics for every pair, then write and display the aggregated results
collect_file_stats(dataset_id_pairs_list)


04/12/2024 12:54:10 PM - INFO: Processing dataset_id_pair: ['8de6dae2-55ff-4287-9b75-5b2a950c1f44', 'e1fdd1b9-fe56-42ca-8e86-b4bb32d9bbce']
04/12/2024 12:54:10 PM - INFO: Retrieving files from dataset_id 8de6dae2-55ff-4287-9b75-5b2a950c1f44...
04/12/2024 12:54:12 PM - INFO: 86 records fetched...
04/12/2024 12:54:12 PM - INFO: File retrieval complete!
04/12/2024 12:54:12 PM - INFO: Retrieving files from dataset_id e1fdd1b9-fe56-42ca-8e86-b4bb32d9bbce...
04/12/2024 12:54:15 PM - INFO: 86 records fetched...
04/12/2024 12:54:16 PM - INFO: File retrieval complete!
04/12/2024 12:54:17 PM - INFO: Results recorded:


Unnamed: 0,Dataset ID 1,Dataset ID 2,Validation Status,Validation Message,File Count Diff,Total File Size (Bytes) Diff,Max File Size (Bytes) Diff,File Count 1,Total File Size (Bytes) 1,Max File Size (Bytes) 1,Status 1,Message 1,File Count 2,Total File Size (Bytes) 2,Max File Size (Bytes) 2,Status 2,Message 2
0,8de6dae2-55ff-4287-9b75-5b2a950c1f44,e1fdd1b9-fe56-42ca-8e86-b4bb32d9bbce,Passed,,0,0,0,86,3632191049,349377430,Success,,86,3632191049,349377430,Success,


04/12/2024 12:54:17 PM - INFO: Processing dataset_id_pair: ['bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8', 'c7206e9a-78ad-4c9d-927f-3ca76646227d']
04/12/2024 12:54:17 PM - INFO: Retrieving files from dataset_id bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8...
04/12/2024 12:54:19 PM - INFO: 1000 records fetched...
04/12/2024 12:54:21 PM - INFO: 2000 records fetched...
04/12/2024 12:54:23 PM - INFO: 3000 records fetched...
04/12/2024 12:54:25 PM - INFO: 4000 records fetched...
04/12/2024 12:54:28 PM - INFO: 5000 records fetched...
04/12/2024 12:54:31 PM - INFO: 6000 records fetched...
04/12/2024 12:54:33 PM - INFO: 7000 records fetched...
04/12/2024 12:54:35 PM - INFO: 8000 records fetched...
04/12/2024 12:54:38 PM - INFO: 9000 records fetched...
04/12/2024 12:54:40 PM - INFO: 10000 records fetched...
04/12/2024 12:54:42 PM - INFO: 11000 records fetched...
04/12/2024 12:54:45 PM - INFO: 12000 records fetched...
04/12/2024 12:54:48 PM - INFO: 13000 records fetched...
04/12/2024 12:54:50 PM - INFO: 14000 

Unnamed: 0,Dataset ID 1,Dataset ID 2,Validation Status,Validation Message,File Count Diff,Total File Size (Bytes) Diff,Max File Size (Bytes) Diff,File Count 1,Total File Size (Bytes) 1,Max File Size (Bytes) 1,Status 1,Message 1,File Count 2,Total File Size (Bytes) 2,Max File Size (Bytes) 2,Status 2,Message 2
0,bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8,c7206e9a-78ad-4c9d-927f-3ca76646227d,Failed,Difference in counts between datasets.,25203,120403205483569,0,50408,259299049376285,111500944870,Success,,25205,138895843892716,111500944870,Success,


04/12/2024 01:01:25 PM - INFO: Processing dataset_id_pair: ['d306000b-88c1-4220-8d7e-933c0118a983', 'e5dc1a69-cb2c-4626-9799-6bb5fae7b147']
04/12/2024 01:01:25 PM - INFO: Retrieving files from dataset_id d306000b-88c1-4220-8d7e-933c0118a983...
04/12/2024 01:01:28 PM - INFO: 1000 records fetched...
04/12/2024 01:01:31 PM - INFO: 2000 records fetched...
04/12/2024 01:01:33 PM - INFO: 3000 records fetched...
04/12/2024 01:01:35 PM - INFO: 4000 records fetched...
04/12/2024 01:01:38 PM - INFO: 5000 records fetched...
04/12/2024 01:01:41 PM - INFO: 6000 records fetched...
04/12/2024 01:01:43 PM - INFO: 7000 records fetched...
04/12/2024 01:01:46 PM - INFO: 8000 records fetched...
04/12/2024 01:01:49 PM - INFO: 9000 records fetched...
04/12/2024 01:01:52 PM - INFO: 10000 records fetched...
04/12/2024 01:01:55 PM - INFO: 11000 records fetched...
04/12/2024 01:01:58 PM - INFO: 12000 records fetched...
04/12/2024 01:02:01 PM - INFO: 13000 records fetched...
04/12/2024 01:02:04 PM - INFO: 14000 

Unnamed: 0,Dataset ID 1,Dataset ID 2,Validation Status,Validation Message,File Count Diff,Total File Size (Bytes) Diff,Max File Size (Bytes) Diff,File Count 1,Total File Size (Bytes) 1,Max File Size (Bytes) 1,Status 1,Message 1,File Count 2,Total File Size (Bytes) 2,Max File Size (Bytes) 2,Status 2,Message 2
0,d306000b-88c1-4220-8d7e-933c0118a983,e5dc1a69-cb2c-4626-9799-6bb5fae7b147,Passed,,0,0,0,14416,493485549019,35643261478,Success,,14416,493485549019,35643261478,Success,



Aggregated Validation Results:


Unnamed: 0,Dataset ID 1,Dataset ID 2,Validation Status,Validation Message,File Count Diff,Total File Size (Bytes) Diff,Max File Size (Bytes) Diff,File Count 1,Total File Size (Bytes) 1,Max File Size (Bytes) 1,Status 1,Message 1,File Count 2,Total File Size (Bytes) 2,Max File Size (Bytes) 2,Status 2,Message 2
0,8de6dae2-55ff-4287-9b75-5b2a950c1f44,e1fdd1b9-fe56-42ca-8e86-b4bb32d9bbce,Passed,,0,0,0,86,3632191049,349377430,Success,,86,3632191049,349377430,Success,
1,bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8,c7206e9a-78ad-4c9d-927f-3ca76646227d,Failed,Difference in counts between datasets.,25203,120403205483569,0,50408,259299049376285,111500944870,Success,,25205,138895843892716,111500944870,Success,
2,d306000b-88c1-4220-8d7e-933c0118a983,e5dc1a69-cb2c-4626-9799-6bb5fae7b147,Passed,,0,0,0,14416,493485549019,35643261478,Success,,14416,493485549019,35643261478,Success,


# Migrating Workspaces

## Pre-Connector Processing
For each GCP Workspace - Azure Workspace pair:
1. Build a manifest of files to be copied from the GCP Workspace to the Azure Workspace. 
2. Write the manifest to BigQuery for consumption by downstream processes.
3. Add the appropriate service accounts (SAs) to the source and destination workspaces to facilitate the copy.

In [None]:
#############################################
## Functions
#############################################

# Function to create file transfer details
def output_file_details(source_ws_project, source_ws_name, target_ws_project, target_ws_name, file_bigquery_table, target_bigquery_table, delete_existing_records):
    """Build the file copy manifest for one GCP -> Azure workspace pair and append it to BigQuery.

    Looks up the source workspace's bucket and the target workspace's Azure storage
    container, optionally deletes existing manifest rows for the source workspace,
    then selects the source bucket's objects from ``file_bigquery_table`` and appends
    them (with source and target paths) to ``target_bigquery_table``.

    Args:
        source_ws_project: Billing project/namespace of the source (GCP) workspace.
        source_ws_name: Name of the source (GCP) workspace.
        target_ws_project: Billing project/namespace of the target (Azure) workspace.
        target_ws_name: Name of the target (Azure) workspace.
        file_bigquery_table: Fully-qualified BigQuery table holding object metadata
            (the query reads its ``bucket``, ``name``, ``size`` and ``md5Hash`` columns).
        target_bigquery_table: Fully-qualified BigQuery table the manifest is appended to.
        delete_existing_records: If truthy, rows for the source workspace are deleted
            from ``target_bigquery_table`` before the new manifest is written.

    Raises:
        Exception: If the source or target workspace details cannot be retrieved, or
            if building/writing the manifest still fails after the retries are exhausted.
    """

    # Establish credentials and clients
    client = bigquery.Client()
    creds, _ = google.auth.default(scopes=['https://www.googleapis.com/auth/cloud-platform', 'openid', 'email', 'profile'])
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)

    # Pull bucket from source workspace
    try:
        ws_attributes = requests.get(
            url=f"https://api.firecloud.org/api/workspaces/{source_ws_project}/{source_ws_name}?fields=workspace.bucketName",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        ws_bucket = ws_attributes["workspace"]["bucketName"]
    except Exception as e:
        err_str = "Error retrieving workspace attributes for source workspace."
        logging.error(f"{err_str} Details: {str(e)}")
        raise Exception(err_str) from e

    # Pull storage container from target workspace
    resource_id = ""
    ws_storage_container = None
    try:
        ws_attributes = requests.get(
            url=f"https://api.firecloud.org/api/workspaces/{target_ws_project}/{target_ws_name}?fields=workspace.workspaceId",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        ws_id = ws_attributes["workspace"]["workspaceId"]
        ws_resources = requests.get(
            url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{ws_id}/resources?offset=0&limit=10&resource=AZURE_STORAGE_CONTAINER",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        # Use the first storage container whose name starts with "sc-"
        for resource_entry in ws_resources["resources"]:
            if resource_entry["resourceAttributes"]["azureStorageContainer"]["storageContainerName"][0:3] == "sc-":
                resource_id = resource_entry["metadata"]["resourceId"]
                break
        if resource_id:
            sas_response = requests.post(
                url=f"https://workspace.dsde-prod.broadinstitute.org/api/workspaces/v1/{ws_id}/resources/controlled/azure/storageContainer/{resource_id}/getSasToken?sasExpirationDuration=86400",
                headers={"Authorization": f"Bearer {creds.token}", "accept": "application/json"}
            ).json()
            base_url = sas_response["url"]
            # Keep the leading run of URL characters; matching stops at the first character
            # outside the class (presumably the "?" that starts the SAS token -- confirm).
            ws_storage_container = re.search(r"^[a-z0-9:\/=\-\.]+", base_url, re.IGNORECASE).group(0)
    except Exception as e:
        err_str = "Error retrieving workspace attributes for target workspace."
        logging.error(f"{err_str} Details: {str(e)}")
        raise Exception(err_str) from e
    if not resource_id:
        # Raised outside the try block so this specific message is not replaced by the generic one
        err_str = "Error retrieving resource information for target workspace."
        logging.error(err_str)
        raise Exception(err_str)

    # Clear records from target BQ table (if specified)
    if delete_existing_records:
        logging.info(f"Preparing target BQ table ({target_bigquery_table}).")
        delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE gcp_ws_project = '{source_ws_project}' and gcp_ws_name = '{source_ws_name}'"""
        try:
            delete_query_job = client.query(delete_query)
            delete_query_job.result()
        except Exception as e:
            # Best-effort: a failed delete does not stop the manifest from being written
            logging.warning(f"Error deleting records for the original dataset from the target BQ table. Details: {str(e)}")

    # Write the query to pull files into a dataframe
    logging.info(f"Building manifest of files to copy from the source '{source_ws_project}.{source_ws_name}' workspace to the target '{target_ws_project}.{target_ws_name}' workspace.")
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_APPEND"
    # Object names ending in "/" are excluded from the manifest
    query = f"""SELECT '{source_ws_project}' AS gcp_ws_project, '{source_ws_name}' AS gcp_ws_name, 
                '{target_ws_project}' AS az_ws_project, '{target_ws_name}' AS az_ws_name, 
                 'gs://{ws_bucket}/'||name AS source_path, '{ws_storage_container}/'||name AS target_path, 
                 size AS size_in_bytes, md5Hash AS md5_hash, '{current_datetime_string}' AS date_added
                FROM `{file_bigquery_table}` 
                WHERE bucket = '{ws_bucket}'
                AND name NOT LIKE '%/'"""
    attempt_counter = 0
    while True:
        try:
            df = client.query(query).result().to_dataframe()
            job = client.load_table_from_dataframe(df, target_bigquery_table, job_config=job_config)
            # Wait for the load job to finish so load errors surface here (and trigger a retry)
            job.result()
            logging.info("Records recorded successfully.")
            break
        except Exception as e:
            # Retry up to 5 times, 10 seconds apart, before giving up
            if attempt_counter < 5:
                sleep(10)
                attempt_counter += 1
                continue
            else:
                err_str = f"Error building and writing file manifest: {str(e)}."
                logging.error(err_str)
                raise Exception(err_str) from e

    # TODO: Add SAs where needed (not yet implemented)
    pass

            
#############################################
## Input Parameters
#############################################

# General parameters
# Source table of object metadata and destination table for the workspace file manifest
file_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_inventory.object_metadata_26_02_2024__17_14_55"
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list_workspaces"

# Migration pairs, one dict per Source GCP Workspace - Target Azure Workspace
migration_list = [
    #{"gcp_ws_project": "anvil-datastorage", "gcp_ws_name": "<name>", "az_ws_project": "AnVILDataStorage_Azure", "az_ws_name": "<name>"}
    {"gcp_ws_project": "anvil-datastorage", "gcp_ws_name": "AnVIL_GTEx_Deposit", "az_ws_project": "AnVILDataStorage_Azure", "az_ws_name": "AnVIL_GTEx_Deposit_Azure"}
]

# When True, existing rows for each source workspace are deleted from the target table first
delete_existing_records = True


#############################################
## Execution
#############################################

# Process each migration pair, capturing a Success/Failure row per entry
results = []
for entry in migration_list:
    logging.info(f"Processing Migration List Entry: {str(entry)}")
    try:
        output_file_details(
            source_ws_project=entry["gcp_ws_project"],
            source_ws_name=entry["gcp_ws_name"],
            target_ws_project=entry["az_ws_project"],
            target_ws_name=entry["az_ws_name"],
            file_bigquery_table=file_bigquery_table,
            target_bigquery_table=target_bigquery_table,
            delete_existing_records=delete_existing_records,
        )
        outcome = ["Success", ""]
    except Exception as e:
        outcome = ["Failure", str(e)]
    results.append([entry["gcp_ws_name"], entry["az_ws_name"], *outcome])

# Tabulate and show the per-entry outcomes
results_df = pd.DataFrame(results, columns = ["Source Workspace Name", "Target Workspace Name", "Status", "Message"])
print("\nFinal Results:")
display(results_df)


In [None]:
# Workspace pair (source GCP workspace, target Azure workspace) for the SA step below
source_ws_project = "anvil-datastorage"
source_ws_name = "AnVIL_GTEx_Deposit"
target_ws_project = "AnVILDataStorage_Azure"
target_ws_name = "AnVIL_GTEx_Deposit_Azure"

# TODO: add the service account (SA) to the workspace resources -- not yet implemented


## Post-Connector Processing
1. Remove access for any service accounts (SAs) added in the pre-connector step.

In [None]:
# TO DO

# Deleting Workspaces

In [None]:
# TO DO

# Utility

## Dataset Deletion

In [None]:
# Function to delete a specific TDR Snapshot
def delete_snapshot(snapshot_id):
    """Submit a TDR snapshot deletion, wait for the job, and print the outcome.

    Any exception raised while submitting or waiting is caught and printed
    rather than propagated.
    """
    tdr_host = "https://data.terra.bio"
    snapshots_client = data_repo_client.SnapshotsApi(api_client=refresh_tdr_api_client(tdr_host))
    print("Attempting to delete snapshot = {}".format(snapshot_id))
    try:
        deletion_job = snapshots_client.delete_snapshot(id=snapshot_id)
        outcome, _ = wait_for_tdr_job(deletion_job, tdr_host)
        print("Result: {}".format(outcome))
    except Exception as err:
        print("Error: {}".format(err))

# Function to delete a specific TDR Dataset
def delete_dataset(dataset_id):
    """Submit a TDR dataset deletion, wait for the job, and print the outcome.

    Any exception raised while submitting or waiting is caught and printed
    rather than propagated.
    """
    tdr_host = "https://data.terra.bio"
    datasets_client = data_repo_client.DatasetsApi(api_client=refresh_tdr_api_client(tdr_host))
    print("Attempting to delete dataset = {}".format(dataset_id))
    try:
        deletion_job = datasets_client.delete_dataset(id=dataset_id)
        outcome, _ = wait_for_tdr_job(deletion_job, tdr_host)
        print("Result: {}".format(outcome))
    except Exception as err:
        print("Error: {}".format(err))

# Function to delete a specific TDR Dataset and all of its Snapshots
def delete_dataset_and_all_snapshots(dataset_id):
    """Delete every snapshot of a TDR dataset, then delete the dataset itself.

    Snapshot IDs are collected page by page before any deletion starts, so all
    snapshots are covered rather than only the first page returned by the API.
    Individual deletions are delegated to ``delete_snapshot`` / ``delete_dataset``,
    which print their own results and errors.

    Args:
        dataset_id: ID of the dataset to delete along with its snapshots.
    """
    api_client = refresh_tdr_api_client("https://data.terra.bio")
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete dataset = {} and all associated snapshots".format(dataset_id))
    dataset_id_list = [dataset_id]

    # Collect the IDs of all snapshots built on the dataset (paged enumeration)
    page_size = 1000
    offset = 0
    snapshot_ids = []
    while True:
        snapshot_list = snapshots_api.enumerate_snapshots(dataset_ids=dataset_id_list, offset=offset, limit=page_size)
        page_items = snapshot_list.items or []
        snapshot_ids.extend(str(snapshot.id) for snapshot in page_items)
        if len(page_items) < page_size:
            # A short (or empty) page means there is nothing left to enumerate
            break
        offset += page_size

    # Delete snapshots, pausing between deletions
    for snapshot_id in snapshot_ids:
        delete_snapshot(snapshot_id)
        sleep(10)

    # Delete dataset
    delete_dataset(dataset_id)

# Delete snapshots (disabled; uncomment and fill in IDs to delete individual snapshots)
# snapshot_id_list = [
# '1234',
# ]
# for snapshot_id in snapshot_id_list:
#     delete_snapshot(snapshot_id)

# Delete datasets and all their associated snapshots.
# WARNING: destructive -- every dataset ID listed here is deleted along with its snapshots.
dataset_id_list = [
'1be5b5e6-019e-419a-9248-6e80d067d697',
]
for dataset_id in dataset_id_list:
    delete_dataset_and_all_snapshots(dataset_id)

## Update Migration File List Table

In [None]:
# General parameters
# NOTE: this rebinds target_bigquery_table to the dataset-level migration file list table
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_azure_migration.azure_migration_file_list"

# Update parameters: for each Azure dataset ID, the new GCP dataset ID and name to record
update_list = [
    {"az_dataset_id": "6007151f-45bc-4111-8e9a-b667bc722a6a", "new_gcp_dataset_id": "b22c71b2-2cb2-4b27-a49b-9a2a83d432e8", "new_gcp_dataset_name": "ANVIL_1000G_PRIMED_data_model_20240301"},
    {"az_dataset_id": "a28e4ab5-a07b-4316-b743-7f5f9cc88211", "new_gcp_dataset_id": "3a89c170-2939-4c12-9940-f32d96fa9e55", "new_gcp_dataset_name": "ANVIL_CMH_GAFK_GS_long_read_20240301"}
]

# Execute updates (one UPDATE statement per entry; a failed entry does not stop the rest)
client = bigquery.Client()
for entry in update_list:
    logging.info(f"Running update for entry: {str(entry)}")
    az_dataset_id = entry["az_dataset_id"]
    gcp_dataset_id = entry["new_gcp_dataset_id"]
    gcp_dataset_name = entry["new_gcp_dataset_name"]
    update_query = f"""UPDATE `{target_bigquery_table}` 
                       SET gcp_dataset_id = '{gcp_dataset_id}', gcp_dataset_name = '{gcp_dataset_name}'
                       WHERE az_dataset_id = '{az_dataset_id}'"""
    try:
        update_query_job = client.query(update_query)
        update_query_job.result()
        logging.info("Update complete.")
    except Exception as e:
        # Report failures at ERROR level with the underlying cause so they are not missed
        logging.error(f"Error running update: {str(e)}")
