In [None]:
# Version History
#print("Version 1.0.0: 09/15/2022 2:06pm - Nate Calvanese - First version created")
#print("Version 1.0.1: 09/16/2022 3:10pm - Nate Calvanese - Shifted from transform to mapping compatibility")
#print("Version 1.0.2: 10/14/2022 7:40pm - Nate Calvanese - Added compatibility evaluation and support for multiple mapping specs")
#print("Version 1.0.3: 10/18/2022 1:33pm - Nate Calvanese - Encoded column names to match mapping specifications")
#print("Version 1.0.4: 10/20/2022 11:50am - Nate Calvanese - Added ability to pull schemas for workspaces missing from workspace_schemas.csv")
print("Version 1.0.4: 10/20/2022 11:50am - Nate Calvanese - Improved algorithm and added a compatibility score")


In [1]:
#!pip install --upgrade import_ipynb

# Main Script

## Imports and Helpers

In [1]:
## Imports and environment variables
# imports
import import_ipynb
import pandas as pd
import json
import re
import os
from google.cloud import storage
from firecloud import api as fapi
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq
import data_repo_client

# Configure pandas display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)

# workspace environment variables
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
ws_bucket_name = re.sub('^gs://', '', ws_bucket)


importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.41: 6/24/2024 9:48am - Nate Calvanese - Updated snapshot creation requests to include data access control groups.
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.4: 4/12/2024 2:30pm - Nate Calvanese - Fixed target path logic to remove unsupported characters
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.10: 1/12/2024 11:25am - Nate Calvanese - Made max_combined_rec_ref_size configurable
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.14: 6/24/2024 1:07pm - Nate Calvanese - Added support for vocab mapping array fields
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.7: 12/13/2023 1:13pm -- Replaced deprecated df append with pd.concat
importing Jupyter notebook from resolve_dangl

## Parameters

In [7]:
## Inputs:

# Mapping specification to evaluate
mapping_target = "anvil"
mapping_target_spec_list = ["cmg_ext_2", "cmg_ext_3", "cmg_ext_4", "gtex_ext_2", "gtex_ext_3", "gregor_1"]

# Any known data_file_refs, so file ref fields can be properly evaluated
data_file_refs = {   
}

# Dataset IDs to evaluate
dataset_id_list = [
    'cefc1a79-446c-40d2-b140-ba8d8b1c0712',
    '4e699ead-bbb5-460d-9b32-2b1b322c601b',
    'cefc1a79-446c-40d2-b140-ba8d8b1c0712',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'bafbf771-1cd2-44fc-9b38-5a4bbead8ab2',
    'a36eeaf7-d6dd-4887-bdbd-e435a07ba156',
    '8de6dae2-55ff-4287-9b75-5b2a950c1f44',
    'ce6692aa-0f97-48fa-8628-b8fa3eab4726',
    '31433635-91d4-431d-8d26-bc54e84c8e8c',
    '0b06619d-39d9-4437-8c42-2e415faa634c',
    '12ffb586-5f6a-4f0a-a353-d2f34599f4cc',
    'e642bca0-52fb-4ab3-ab3a-acaab83deda7',
    'b7fb531e-25a4-427c-9679-b7bdc3d03535',
    '3615e063-f24b-47f7-87cb-430e8aca8d0c',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    '9f9fc99a-b867-49a9-a3dc-8a39efbd5fa2',
    'ce58654d-b7d3-466b-99ba-b203d527a543',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    'cba804c9-0bdd-4219-a53e-98c8db6334a0',
    'd239dd7b-8d10-4960-aa91-8f8ede641e25',
    '5c6a1c4f-ccd3-48a8-ac00-e18e5ecaa0bb',
    '19e2c8ab-853a-4204-86c3-f591125fbf63',
    '7cf0d3d0-f79b-4bfe-bfc8-e4e6c33dd4c3',
    '3a72e4b8-afb4-4299-98ec-a9ba9606be06',
    'f3c89298-0dd2-40da-8627-3baea553b34a',
    '9a32e23e-840d-4ba3-8cd9-392f48b8e9d2',
    'c5d967fd-09ce-4b02-97dd-ac3abf6f79fa',
    '5069fc2c-b957-4130-adca-6eabae943867',
    '173e56f7-b813-4c41-89ff-09a824e1407f',
    '80312f74-bd56-4938-96ba-e9bed95d1f3b',
    '017445d7-d56e-4e2e-b480-b4879b51e944',
    '13b2076a-cfe1-49ec-ac61-bad1af9a52ea',
    '175dd803-02c7-4823-81d5-9e0621652ace',
    '15492baa-05ed-47bc-b50c-e587679ae51a',
    '33705ce9-b2b3-4edc-9b47-f54283e193cf',
    '92486440-3a46-44dd-b853-b300ef75b31e',
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '4e99b8e1-40b9-4fb2-90a0-d85e926ef31e',
    '128332b6-5060-4ec4-b6a6-f53b54a810be',
    '06f05f58-3c83-4f5c-bddd-bed7d2d1d147',
    '51e9935f-ec18-4832-801a-6d9186537572',
    'd6291444-8c3c-470c-b28c-7cf1d5c7aad8',
    '41cb9f29-4ba6-4690-821c-cb085e6b0f2f',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'b252e3ac-4a8c-48e0-9999-5ee0c9a5842d',
    '7ea006d9-1e19-4678-b2e6-d4a1ea327f74',
    '34fd3b22-ac73-47d2-8849-5877158ec072',
    'a08dc7a6-f8ce-4205-95d2-83f614c2c32f',
    '577f36fe-8154-4c82-ac87-b2a64cb68f35',
    '7ce3270e-b2f2-47f4-a288-639751b2f87f',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    '41d12dc1-8718-4439-b409-26cc23573107',
    'c4c49fcd-0c20-4cff-841a-cb58f5689c5b',
    '6b40557c-ddc3-4e7e-8a45-1761e7fcb8b5',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    '719f7581-21db-4aec-8c46-4a5811832710',
    '3725b660-1106-4173-9c4b-0a15926becf5',
    '318a75f4-ac50-4944-81b0-70a1323e7497',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    '38fd20ce-affd-4791-9810-7f5a7fe876d0',
    '8b8185d3-ba5c-4832-af23-3ff8ca6ed016',
    '140797da-dc94-4fc2-8b0b-f2e1dec7bd43',
    'ec97fa0f-e174-40fe-a6b8-ee240bdf4318',
    '5488d7c1-5195-4ebc-b0f0-31033fa06dc9',
    '56f9888f-e623-4a1a-b2b4-46378a6cd6fe',
    'b2e7f15b-65d5-4812-abfd-b2dbc6d18850',
    '69f8d7c2-2e14-48e6-b838-7881016313fb',
    '809fa952-3178-46b7-bb82-8a476ef32e67',
    'dfb14a1f-38b2-4668-b98f-59b5b5b53ca3',
    'f553b765-1c9c-464c-a8fa-07700a1691c5',
    'b108dfd0-711d-4bc1-aab5-1b312226c8ad',
    '38eca26c-d79e-4447-99d3-1889d20ade21',
    'd5a0e24d-689a-4854-92c7-9a39f980b523',
    'ecd0606f-4fa7-4e57-b6e8-eea377e65d5b',
    '0c6bc810-5ae0-4926-a56b-2bc2fe7dbe6d',
    '9dc31133-c882-4f39-903a-a25f316bb560',
    '23a0ede0-4f97-46af-9f04-bd2805050980',
    '7593c1c2-3680-4bf5-8a65-dce5f96a3b59',
    '02661394-2886-4ef7-aff1-d53225c82025',
    'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
    '267cf516-dd33-4640-a71a-78bd8f5db9d8',
    'df06ff22-6a2d-4934-aac9-c8368efbea1a',
    'dcdefb14-f6de-4c46-ac7e-842b273416bf',
    '28208cc6-50bf-4864-9a48-981632066640',
    'ccfe264d-a35e-44f8-9b2b-241a0f8327cc',
    'f177843d-47fb-46ae-83be-73c92ee85081',
    'd049d487-1a69-4358-8dad-0e6fa6c06fdd',
    'da29226b-e856-4014-8c8b-c4268d0df2cc',
    '714dcdbd-8d17-40b0-8246-0e941af8175d',
    '53ce7d12-facc-4412-a710-f535efb209a2',
    'b9842819-5fd6-40c5-9668-aae1ea44a308',
    '7c056125-3ed8-459c-b73e-edfa3f80cc27',
    'ed1215f1-787c-40f5-9d77-4b5bc2dfbb84',
    '7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    'd911e57a-ebb8-4be8-876b-d8e5790ddce3',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    '325f3ee8-2adb-4092-bd78-1b5ea5b0d1d6',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    'e6771964-50e9-482f-9d23-18c22cd89ab8',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '71f94dff-fbe8-4881-af1f-4987b67d5181',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd00353de-f6f9-42d9-8a8f-f88b3d880dbf',
    '3be57453-9325-4c2e-b73a-832139b61778',
    'fcc60ac9-0d20-4a7c-97e4-e3c8d3aa8f76',
    'dc5f85d8-333b-4b68-b160-ad9856233887',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '1f2d14d4-1bd8-46fc-9d35-1a415e5f326a',
    '64fd39fc-b32e-4b0a-8f83-4bf11b197462',
    'a77a2c65-38fe-4bf7-9ea6-0a2dc65eb21f',
    '25248cd8-2e98-4a83-9ccf-af7214fa71d6',
    '158ebecd-4596-4541-b832-a137232b7036',
    '1ccb95c3-1901-428e-b7bb-34495f41f4d2',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '0144b0d3-a809-46df-8c67-7ce42bdd579a',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    '35064fc1-6c52-4005-8e99-cb0d6afd3f8c',
    '5cf859f6-990c-4b04-8609-35d5c57920f0',
    '62cfdce6-2d4d-415c-a11e-5ab60131c668',
    '2d07dd45-a263-440d-a339-9ccbab93aba8',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '296f653a-91a8-4139-9bab-e6ae13afe99c',
    '633dc1aa-084d-43bd-9b17-bc6e57f81d48',
    '9320b3b5-3944-4bd2-913a-23b72bccd86c',
    '86ab4d3b-86ce-422b-ae6f-1ec6968a874d',
    '4124010f-7308-4831-80d7-ea14343249ab',
    '3037caeb-fa7a-4924-b399-7e4c7173b3b8',
    '146b72bc-1dcc-4e3b-bcda-d3dd25418012',
    '1d575e14-c3b1-4ead-a63b-a21c08c6a14d',
    'c5b1e333-7203-41ce-b8f7-3ef3a3bd721f',
    'bf519ea2-afe1-486a-9954-7362f10b6b60',
    '254ffffc-2bd6-4b2e-905b-a8c54c348cd0',
    'bb65d291-a673-4e4d-8a37-ab1f7401a902',
    '3a9604d7-456a-453d-a46b-40408624a07e',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    '3f278de3-f201-4344-9639-d35cd7a62adb',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    '1d2f5472-ab6a-4a9b-ba53-520858cf79db',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    'b724164c-712c-4615-97b7-529a108a753a',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '42965913-4223-484a-9b3d-abc0002d277d',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    '24470eb6-97c2-4cd4-b484-87a7d634c5b3',
    '3a781e70-cf6e-41c2-8d68-2326f16986e7',
    '048afc84-cdd2-4b39-8ea5-7351f4699761',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '0447c960-bbfe-4e42-a95b-dd3d1d9a368e',
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '483d3454-54da-4243-bbeb-98cbf1d088d0',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '1b4a324a-5621-4399-85d2-f91aa03418b5',
    '09642596-d33a-4261-8bf7-eb1dbb37d572',
    '75119ed5-b8aa-4f45-bdef-e3c673bbe44c',
    '04a874df-c57b-40fc-9139-bc3a05129115',
    '2b8ad26a-e66e-4b03-a65a-5b504cecacfd',
    'a7226f10-bdba-4284-97b3-0738a5912770',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    'f9224ea2-dd31-421d-80d4-f35082ef8d68',
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    'e917c83d-c482-442d-81ce-869de7d20903',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    '21384132-1697-4e9b-b863-a6492d13285d',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    '00bd45f9-beb2-4fb0-8680-bd30e392975a',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '487016d8-ea02-4b20-a45f-7382139aa865',
    '677f0bdf-6c5c-462b-8294-3666f777bbc5',
    '34da5c11-bbe8-4e55-8d89-9ef8a1c66200',
    '9a4d9d5f-72aa-4d7d-90f1-6d1181ee984c',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '06421648-dfcb-4460-b93b-c7d6804dddbb',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    '95788aa7-c897-4ae8-9166-4b8fc1fc5342',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '9102024d-58c0-4bb9-aa55-12c00d98b6cd',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '49022563-1be1-4e42-a11c-01743cd5c94d',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '095728d6-4ea1-4909-8a74-a8f3fa7f86cb',
    '8309cd89-a912-462a-90ad-f13ae0d7aa6c',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '07c3a7f4-1e59-4dcb-a244-2fd3d084e2b0',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '5243df74-712d-49a8-989b-528d15088e8f',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '58a1d168-8290-4c69-bf01-17ba3a084365',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '2dc01a50-ea7b-4d9b-be57-1ffbdd98b27b',
    '5b6676dc-f46e-43a8-b87c-e431e369e53c',
    'eb35085f-0cbf-4829-a3ad-acaa53a250b5',
    '36fa2d20-622d-4cca-80b0-683672c94170',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    'bfb202bc-4078-4df0-82b9-9218dbc1f1a1',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    'fda7c4b9-9f35-482b-9eff-be7f11058d94',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '8945794e-174a-49f9-a2d4-4242f9bf3833',
    '529343b4-698a-4b36-ac55-db8a6965ad3f',
    '0f949ee9-0986-42b2-af5d-0f4c8338c664',
    'cb1f06fa-b916-477d-8ab6-fb4b3f24efd3',
    '05253b3c-e8a3-4db4-8a6d-014eac7b3d94',
    '4807db90-b0f7-441d-b489-932f9b341f74',
    'c33b1f32-6021-4d1c-a4d5-fc3d501107f4',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '239a484f-67c2-4ba3-a3d0-d6e4c2b27475',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '6d18aafc-0240-499c-902e-a72a5b98ff0a',
    '263ab7c9-bd69-45dd-abb7-bbf35b9786ed',
    'e0c7877e-75d7-47d8-b5e9-5dd677d03353',
    '49a97523-0a7a-4d5a-ae20-496f86de2032',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    'dbb4df81-9115-45d1-b51d-875e0669edc4',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    '4d01e12e-503e-4447-8e49-8c2b77ffb00d',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    '703c4bc2-81bf-435a-87fa-21dc9278bad6',
    'ab76b5ca-e464-4063-b949-853f61036370',
    '672b617f-936e-440a-a735-80f94798aed1',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    'c9986260-0c1b-4fd3-8132-6fa7353046e6',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
]

## Script Execution

In [11]:
# Establish API clients
api_client = utils.refresh_tdr_api_client()
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through the dataset_ids to evaluate and pull out the existing TDR schemas
input_datasets_dict = {}
for dataset_id in dataset_id_list:
    
    # Retrieve source schema
    dataset_table_list = []
    dataset_rels_list = []
    try:
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        for table in response["schema"]["tables"]:
            if "anvil_" in table["name"]:
                continue
            table_dict = {}
            table_dict["name"] = table["name"]
            table_dict["columns"] = []
            for column in table["columns"]:
                table_dict["columns"].append({"name": column["name"]})
            dataset_table_list.append(table_dict)
        input_datasets_dict[dataset_id] = {}
        input_datasets_dict[dataset_id]["tables"] = dataset_table_list
        for relationship in response["schema"]["relationships"]:
            if "anvil_" in relationship["_from"]["table"]:
                continue
            dataset_rels_list.append([relationship["_from"]["table"], relationship["to"]["table"]])
        input_datasets_dict[dataset_id]["relationships"] = dataset_rels_list
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))
#print(input_datasets_dict)

# Read in target schema
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
target_schema_dict = {}
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
    target_schema_dict = json.loads(blob.download_as_string(client=None))
except Exception as e:
    print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
#print(json.dumps(target_schema_dict))

# Loop through mapping specifications for evaluation
spec_dict = {}
for mapping_target_spec in mapping_target_spec_list:


    # Read in mapping specification
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    mapping_spec = {}
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", "Dataset").replace("$PROJECT_NAME", "Project") #UPDATE WITH REAL PARAMETERS
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))
    #print(json.dumps(mapping_spec))

    # Loop through datasets and evaluate mapping compatibility 
    detail_dict = {}
    for ds_key in input_datasets_dict:
        entity_list = []
        for entity in mapping_spec["entities"]:
            entity_dict = {}
            record_set_list = []
            target_table = {}
            for table in target_schema_dict["tables"]:
                if table["name"] == entity["name"]:
                    target_table = table
            if target_table:
                for record_set in entity["record_sets"]:
                    record_set_dict = {}
                    record_set_dict["record_set"] = record_set["name"]
                    if bmq.validate_record_set(record_set, input_datasets_dict[ds_key], target_table):
                        record_set_dict["can_run"] = True
                    else:
                        record_set_dict["can_run"] = False
                    record_set_dict["total_attrs"] = len(record_set["attributes"])
                    valid_attr_count = 0
                    invalid_attrs_set = set()
                    base_table = "Unspecified"
                    for attribute in record_set["attributes"]:
                        if attribute["name"] == target_table["name"].replace("anvil_", "") + "_id":
                            try:
                                base_table = attribute["source"]["fields"][0].split(".")[0]
                            except:
                                pass
                        if bmq.validate_attribute(attribute, input_datasets_dict[ds_key], target_table):
                            valid_attr_count += 1
                        else:
                            invalid_attrs_set.add(target_table["name"] + "." + attribute["name"])
                    record_set_dict["valid_attrs"] = valid_attr_count
                    record_set_dict["invalid_attrs_set"] = list(invalid_attrs_set)
                    record_set_dict["base_table"] = base_table
                    record_set_list.append(record_set_dict)
                entity_dict[entity["name"]] = record_set_list
                entity_list.append(entity_dict)
        detail_dict[ds_key] = {}
        detail_dict[ds_key]["entities"] = entity_list
    #print(json.dumps(detail_dict))
    spec_dict[mapping_target_spec] = {}
    spec_dict[mapping_target_spec]["detail_dict"] = detail_dict

    # Collect target tables and columns not in mapping specification
    missing_table_set = set()
    missing_column_set = set()
    entity_table_list = [val["name"] for val in mapping_spec["entities"]]
    entity_column_list = []
    for entity in mapping_spec["entities"]:
        entity_name = entity["name"]
        for record_set in entity["record_sets"]:
            for attribute in record_set["attributes"]:
                attribute_name = entity_name + "." + attribute["name"]
                entity_column_list.append(attribute_name)
    for table_entry in target_schema_dict["tables"]:
        if table_entry["name"] not in entity_table_list:
            missing_table_set.add(table_entry["name"])
        else:
            for column_entry in table_entry["columns"]:
                column_name = table_entry["name"] + "." + column_entry["name"]
                if column_name not in entity_column_list:
                    missing_column_set.add(column_name)
    spec_dict[mapping_target_spec]["missing_table_set"] = missing_table_set
    spec_dict[mapping_target_spec]["missing_column_set"] = missing_column_set

# Summarize mapping compatibility
results_list = []
for spec_key, spec_val in spec_dict.items():
    for ds_key, value in spec_val["detail_dict"].items():
        dataset_results_list = []
        can_run_set = set()
        can_run_fully_set = set()
        sum_valid_attrs = 0
        sum_total_attrs = 0
        invalid_attrs_list = []
        seqactivity_source = set()
        activity_source = set()
        biosample_source = set()
        donor_source = set()
        diagnosis_source = set()
        seqactivity_biosample_ri = False
        activity_biosample_ri = False
        biosample_donor_ri = False
        diagnosis_donor_ri = False
        for entities in value["entities"]:
            max_valid_attrs = 0
            max_total_attrs = 0
            for key, val in entities.items():
                invalid_attrs_set = set(val[0]["invalid_attrs_set"])
                for record_sets in val:
                    if record_sets["can_run"] == True:
                        can_run_set.add(key)
                        if key == "anvil_sequencingactivity":
                            seqactivity_source.add(record_sets["base_table"])
                        elif key == "anvil_activity":
                            activity_source.add(record_sets["base_table"])
                        elif key == "anvil_biosample":
                            biosample_source.add(record_sets["base_table"])
                        elif key == "anvil_donor":
                            donor_source.add(record_sets["base_table"])
                        elif key == "anvil_diagnosis":
                            diagnosis_source.add(record_sets["base_table"])
                    if record_sets["total_attrs"] == record_sets["valid_attrs"]:
                        can_run_fully_set.add(key)
                    if record_sets["valid_attrs"] > max_valid_attrs:
                        max_valid_attrs = record_sets["valid_attrs"]
                    if record_sets["total_attrs"] > max_total_attrs:
                        max_total_attrs = record_sets["total_attrs"]
                    invalid_attrs_set = invalid_attrs_set.union(set(record_sets["invalid_attrs_set"]))
                sum_valid_attrs += max_valid_attrs
                sum_total_attrs += max_total_attrs
                invalid_attrs_list.extend(list(invalid_attrs_set))  
        percent_valid_attrs = round(sum_valid_attrs/sum_total_attrs,2)
        # Check RI between fields
        if seqactivity_source and biosample_source:
            for sa in seqactivity_source:
                for bio in biosample_source:
                    for rel_pair in input_datasets_dict[ds_key]["relationships"]:
                        if sa == bio or (sa in rel_pair and bio in rel_pair):
                            seqactivity_biosample_ri = True
                            break
                    if seqactivity_biosample_ri:
                        break
                if seqactivity_biosample_ri:
                    break
        if activity_source and biosample_source:
            for act in activity_source:
                for bio in biosample_source:
                    for rel_pair in input_datasets_dict[ds_key]["relationships"]:
                        if act == bio or (act in rel_pair and bio in rel_pair):
                            activity_biosample_ri = True
                            break
                    if activity_biosample_ri:
                        break
                if activity_biosample_ri:
                    break
        if biosample_source and donor_source:
            for bio in biosample_source:
                for don in donor_source:
                    for rel_pair in input_datasets_dict[ds_key]["relationships"]:
                        if don == bio or (don in rel_pair and bio in rel_pair):
                            biosample_donor_ri = True
                            break
                    if biosample_donor_ri:
                        break
                if biosample_donor_ri:
                    break
        if diagnosis_source and donor_source:
            for diag in diagnosis_source:
                for don in donor_source:
                    for rel_pair in input_datasets_dict[ds_key]["relationships"]:
                        if diag == don or (diag in rel_pair and don in rel_pair):
                            diagnosis_donor_ri = True
                            break
                    if diagnosis_donor_ri:
                        break
                if diagnosis_donor_ri:
                    break
        # Score compatibility
        compatibility_score = 0
        if "anvil_donor" in can_run_set:
            compatibility_score += 25
        if "anvil_biosample" in can_run_set:
            compatibility_score += 5
            if "anvil_biosample.donor_id" not in invalid_attrs_list and biosample_donor_ri:
                compatibility_score += 20
        if "anvil_diagnosis" in can_run_set:
            compatibility_score += 5
            if "anvil_diagnosis.donor_id" not in invalid_attrs_list and diagnosis_donor_ri:
                compatibility_score += 20
        if "anvil_activity" in can_run_set or "anvil_sequencingactivity" in can_run_set:
            compatibility_score += 5
            if ("anvil_activity" in can_run_set and "anvil_activity.used_biosample_id" not in invalid_attrs_list and activity_biosample_ri) or ("anvil_sequencingactivity" in can_run_set and "anvil_sequencingactivity.used_biosample_id" not in invalid_attrs_list and seqactivity_biosample_ri):
                compatibility_score += 20
        dataset_results_list.append(ds_key)
        dataset_results_list.append(spec_key)
        dataset_results_list.append(compatibility_score)
        dataset_results_list.append(len(can_run_set))
        dataset_results_list.append(can_run_set)
        dataset_results_list.append(len(can_run_fully_set))
        dataset_results_list.append(can_run_fully_set)
        dataset_results_list.append(sum_valid_attrs)
        dataset_results_list.append(percent_valid_attrs)
        dataset_results_list.append(invalid_attrs_list)
        dataset_results_list.append(seqactivity_source)
        dataset_results_list.append(activity_source)
        dataset_results_list.append(biosample_source)
        dataset_results_list.append(donor_source)
        dataset_results_list.append(diagnosis_source)
        dataset_results_list.append(seqactivity_biosample_ri)
        dataset_results_list.append(activity_biosample_ri)
        dataset_results_list.append(biosample_donor_ri)
        dataset_results_list.append(diagnosis_donor_ri)
        results_list.append(dataset_results_list)

results_df = pd.DataFrame(results_list, columns = ['dataset_id', 'mapping_spec', 'compatibility_score', 'can_run_count', 'can_run_entities', 'can_fully_run_count', 'can_fully_run_entities', 'cnt_valid_attrs', 'perc_valid_attrs', 'invalid_attr_list', 'seqactivity_source', 'activity_source', 'biosample_source', 'donor_source', 'diagnosis_source', 'seqactivity_biosample_ri', 'activity_biosample_ri', 'biosample_donor_ri', 'diagnosis_donor_ri'])

# Sort results dataframe and write out to tsv
destination_dir = "ingest_pipeline/resources/mapping_compatibility/output"
sorted_df = results_df.sort_values(['dataset_id', 'compatibility_score', 'can_run_count', 'perc_valid_attrs'], ascending=[True, False, False, False], ignore_index=True)
output_file = "mapping_compatibility_results.tsv"
sorted_df.to_csv(output_file, index=False, sep="\t")
!gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
!rm $output_file

# Aggregate compatible mapping specs and write out to tsv
agg_df = results_df[results_df["compatibility_score"] >= 60].sort_values(["dataset_id", "can_fully_run_count", "can_run_count", "perc_valid_attrs", "mapping_spec"], ascending=[True, False, False, False, True]).groupby('dataset_id').agg(compatible_mapping_specs=('mapping_spec', 'unique')).reset_index()
output_file = "mapping_compatibility_aggregation.tsv"
agg_df.to_csv(output_file, index=False, sep="\t")
!gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
!rm $output_file

# Output results to the user
print("------------------------------------------------------------------------------------------------------")
print("Mapping Compatibility Results for Mapping Target Specifications:")
print("------------------------------------------------------------------------------------------------------")
print("Target tables not included in specifications:")
for spec_key, spec_val in spec_dict.items():
    print("\t" + spec_key + ": " + ", ".join(sorted(list(spec_val["missing_table_set"]))))
print("\n")
print("Target fields not included in specification:")
for spec_key, spec_val in spec_dict.items():
    print("\t" + spec_key + ": " + ", ".join(sorted(list(spec_val["missing_column_set"]))))
print("\n")
print("Dataset evaluation against specifications:")
display(sorted_df)

------------------------------------------------------------------------------------------------------
Mapping Compatibility Results for Mapping Target Specifications:
------------------------------------------------------------------------------------------------------
Target tables not included in specifications:
	cmg_ext_2: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	cmg_ext_3: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	cmg_ext_4: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_variantcallingactivity
	gtex_ext_2: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_sequencingactivity, anvil_variantcallingactivity
	gtex_ext_3: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_sequencingactivity, anvil_variantcallingactivity
	gregor_1: anvil_alignmentactivity, anvil_antibody, anvil_assayactivity, anvil_diagnosis, anvil_sequencingactivity

Unnamed: 0,dataset_id,mapping_spec,compatibility_score,can_run_count,can_run_entities,can_fully_run_count,can_fully_run_entities,cnt_valid_attrs,perc_valid_attrs,invalid_attr_list,seqactivity_source,activity_source,biosample_source,donor_source,diagnosis_source,seqactivity_biosample_ri,activity_biosample_ri,biosample_donor_ri,diagnosis_donor_ri
0,00bd45f9-beb2-4fb0-8680-bd30e392975a,gtex_ext_2,75,6,"{anvil_dataset, anvil_project, anvil_file, anvil_activity, anvil_donor, anvil_biosample}",4,"{anvil_activity, anvil_dataset, anvil_project, anvil_file}",27,0.77,"[anvil_donor.phenotypic_sex, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_upper_bound, anvil_activity.generated_file_id]",{},{sample},{sample},{participant},{},False,True,True,False
1,00bd45f9-beb2-4fb0-8680-bd30e392975a,gtex_ext_3,75,6,"{anvil_dataset, anvil_project, anvil_file, anvil_activity, anvil_donor, anvil_biosample}",4,"{anvil_activity, anvil_dataset, anvil_project, anvil_file}",27,0.77,"[anvil_donor.phenotypic_sex, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_upper_bound, anvil_activity.generated_file_id]",{},{sample},{sample},{participant},{},False,True,True,False
2,00bd45f9-beb2-4fb0-8680-bd30e392975a,cmg_ext_2,30,5,"{anvil_project, anvil_dataset, anvil_file, anvil_activity, anvil_biosample}",4,"{anvil_activity, anvil_dataset, anvil_project, anvil_file}",25,0.57,"[anvil_donor.reported_ethnicity, anvil_donor.phenotypic_sex, anvil_donor.donor_id, anvil_biosample.donor_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_upper_bound, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_unit, anvil_diagnosis.donor_id, anvil_sequencingactivity.sequencingactivity_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.used_biosample_id]",{},{sample},{sample},{},{},False,True,False,False
3,00bd45f9-beb2-4fb0-8680-bd30e392975a,cmg_ext_3,30,5,"{anvil_project, anvil_dataset, anvil_file, anvil_activity, anvil_biosample}",4,"{anvil_activity, anvil_dataset, anvil_project, anvil_file}",25,0.57,"[anvil_donor.reported_ethnicity, anvil_donor.phenotypic_sex, anvil_donor.donor_id, anvil_biosample.donor_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_upper_bound, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_unit, anvil_diagnosis.donor_id, anvil_sequencingactivity.sequencingactivity_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.used_biosample_id]",{},{sample},{sample},{},{},False,True,False,False
4,00bd45f9-beb2-4fb0-8680-bd30e392975a,cmg_ext_4,30,5,"{anvil_project, anvil_dataset, anvil_file, anvil_activity, anvil_biosample}",4,"{anvil_activity, anvil_dataset, anvil_project, anvil_file}",25,0.57,"[anvil_donor.reported_ethnicity, anvil_donor.phenotypic_sex, anvil_donor.donor_id, anvil_biosample.donor_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_upper_bound, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_unit, anvil_diagnosis.donor_id, anvil_sequencingactivity.sequencingactivity_id, anvil_sequencingactivity.generated_file_id, anvil_sequencingactivity.used_biosample_id]",{},{sample},{sample},{},{},False,True,False,False
5,00bd45f9-beb2-4fb0-8680-bd30e392975a,gregor_1,25,4,"{anvil_dataset, anvil_donor, anvil_project, anvil_file}",3,"{anvil_dataset, anvil_project, anvil_file}",23,0.62,"[anvil_donor.phenotypic_sex, anvil_donor.reported_ethnicity, anvil_biosample.donor_id, anvil_biosample.biosample_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.donor_age_at_collection_upper_bound, anvil_activity.used_biosample_id, anvil_activity.activity_id, anvil_activity.generated_file_id, anvil_variantcallingactivity.variantcallingactivity_id, anvil_variantcallingactivity.used_file_id, anvil_variantcallingactivity.generated_file_id]",{},{},{},{participant},{},False,False,False,False
6,00c11c7e-8530-4bfc-abd7-8c10f4c602d3,cmg_ext_4,80,7,"{anvil_dataset, anvil_project, anvil_file, anvil_sequencingactivity, anvil_donor, anvil_biosample, anvil_diagnosis}",4,"{anvil_sequencingactivity, anvil_dataset, anvil_project, anvil_file}",35,0.8,"[anvil_donor.reported_ethnicity, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.donor_age_at_collection_upper_bound, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_unit, anvil_activity.generated_file_id]",{sequencing},{},{sample},{subject},{subject},False,False,True,True
7,00c11c7e-8530-4bfc-abd7-8c10f4c602d3,cmg_ext_2,80,7,"{anvil_dataset, anvil_project, anvil_file, anvil_sequencingactivity, anvil_donor, anvil_biosample, anvil_diagnosis}",3,"{anvil_dataset, anvil_project, anvil_file}",34,0.77,"[anvil_donor.reported_ethnicity, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.donor_age_at_collection_upper_bound, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_unit, anvil_sequencingactivity.used_biosample_id, anvil_activity.generated_file_id]",{sequencing},{},{sample},{subject},{subject},False,False,True,True
8,00c11c7e-8530-4bfc-abd7-8c10f4c602d3,cmg_ext_3,80,7,"{anvil_dataset, anvil_project, anvil_file, anvil_sequencingactivity, anvil_donor, anvil_biosample, anvil_diagnosis}",3,"{anvil_dataset, anvil_project, anvil_file}",34,0.77,"[anvil_donor.reported_ethnicity, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.donor_age_at_collection_upper_bound, anvil_diagnosis.onset_age_lower_bound, anvil_diagnosis.onset_age_upper_bound, anvil_diagnosis.phenotype, anvil_diagnosis.onset_age_unit, anvil_sequencingactivity.used_biosample_id, anvil_activity.generated_file_id]",{sequencing},{},{sample},{subject},{subject},False,False,True,True
9,00c11c7e-8530-4bfc-abd7-8c10f4c602d3,gtex_ext_2,30,5,"{anvil_dataset, anvil_project, anvil_file, anvil_donor, anvil_biosample}",3,"{anvil_dataset, anvil_project, anvil_file}",26,0.74,"[anvil_donor.phenotypic_sex, anvil_diagnosis.disease, anvil_diagnosis.diagnosis_id, anvil_biosample.donor_id, anvil_biosample.donor_age_at_collection_unit, anvil_biosample.biosample_type, anvil_biosample.donor_age_at_collection_lower_bound, anvil_biosample.anatomical_site, anvil_biosample.donor_age_at_collection_upper_bound, anvil_activity.generated_file_id]",{},{},{sample},{participant},{},False,False,False,False


# Utility Scripts

In [None]:
## Print detailed results for specific dataset
print(json.dumps(detail_dict["anvil_cmg_uwash_ds-hfa"], indent=2))

## Pull results for specific datasets

In [None]:
## Print workspace dict for specific workspace
print(json.dumps(workspace_dict["anvil_gtex_bcm_gru_corsivs"], indent=2))

## Checking workspaces for fileref fields

In [None]:
## Checking for fileref fields
ws_project = "anvil-datastorage"
data_file_refs = {}
ws_name_list = [
"1000G-high-coverage-2019"
]
file_ref_set = set()
for ws_name in ws_name_list:
    file_ref_list = []
    try:
        ws_attributes = utils.get_workspace_attributes(ws_project, ws_name)
        workspace_bucket = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else "" 
        file_ref_list, data_file_refs, remote_list = utils.find_and_add_fileref_fields(ws_project, ws_name, workspace_bucket, data_file_refs)
        for entry in file_ref_list:
            file_ref_set.add(entry)
    except:
        continue
print(sorted(list(file_ref_set)))



## Collecting schemas for workspaces

In [2]:
## Collecting the schema for a specific workspace
from firecloud import api as fapi
ws_project = "anvil-datastorage"
ws_name_list = [
'ANVIL_CMG_BROAD_BRAIN_ENGLE_WES',
'ANVIL_CMG_Broad_Genitourinary_Sinclair_WES',
'ANVIL_CMG_Broad_Orphan_Jueppner_WES',
'AnVIL_CMG_Broad_Genitourinary_Hirschhorn_WES',
'AnVIL_CMG_Broad_Orphan_Scott_WES',
'AnVIL_CMG_Broad_Orphan_VCGS-White_WES',
'AnVIL_CMG_Broad_Stillbirth_Wilkins-Haug_WES',
]
schema_fields = []

# Loop through workspaces
for ws_name in ws_name_list:

    try:
        # Collect and record all entity types in workspace
        response_etypes = fapi.list_entity_types(ws_project, ws_name)
        dict_all_etypes = json.loads(response_etypes.text)
        etypes_list = [key for key in dict_all_etypes.keys()]

        # Loop through entity types and parse result to build schema
        if etypes_list:
            for etype in etypes_list:
                column_set = set()
                column_set.add(dict_all_etypes[etype]["idName"])
                for attr_key in dict_all_etypes[etype]["attributeNames"]:
                    column_set.add(attr_key)
                for column in column_set:
                    column_entry = []
                    column_entry = [ws_name.lower(), etype.lower(), column.lower()]
                    schema_fields.append(column_entry)
    except:
        pass

# Convert to dataframe and display
df = pd.DataFrame(schema_fields, columns = ["workspace_name", "table_name", "column_name"])
display(df)

Unnamed: 0,workspace_name,table_name,column_name
0,anvil_cmg_broad_brain_engle_wes,subject,19-disease_description
1,anvil_cmg_broad_brain_engle_wes,subject,01-subject_id
2,anvil_cmg_broad_brain_engle_wes,subject,subject_id
3,anvil_cmg_broad_brain_engle_wes,subject,22-age_of_onset
4,anvil_cmg_broad_brain_engle_wes,subject,07-multiple_datasets
5,anvil_cmg_broad_brain_engle_wes,subject,20-affected_status
6,anvil_cmg_broad_brain_engle_wes,subject,11-twin_id
7,anvil_cmg_broad_brain_engle_wes,subject,14-ancestry
8,anvil_cmg_broad_brain_engle_wes,subject,02-prior_testing
9,anvil_cmg_broad_brain_engle_wes,subject,12-proband_relationship
