# pyAnVIL FHIR extract and QA

## Modifed 5-24-2021 to test latest PFB export from Gen3

## overview
![image](https://user-images.githubusercontent.com/47808/102566809-16b1fc00-4095-11eb-8cf8-f78952ba0464.png)


## dependencies

> Ensure latest version of pyAnVIL installed

In [1]:
# Install a pip package in the current Jupyter kernel
import sys
# !{sys.executable} -m pip uninstall  -y pyanvil  
# !{sys.executable} -m pip install   pyAnVIL==0.0.10rc3 --upgrade
!{sys.executable} -m pip show pyanvil
# # >>> 0.0.10



Name: pyAnVIL
Version: 0.0.10rc2
Summary: AnVIL client library. Combines gen3, terra client APIs with single signon and data harmonization use cases.
Home-page: https://github.com/anvilproject/client-apis
Author: The AnVIL project
Author-email: walsbr@ohsu.edu
License: UNKNOWN
Location: /home/jupyter/notebooks/packages
Requires: firecloud, attrdict, Click, gen3, google-cloud-storage, xmltodict, fastavro
Required-by: 


In [2]:
import os 
import sqlite3

def drop_and_create_connection(db_file):
    """ create a database connection to a SQLite database """
    try:
        os.unlink(db_file)
    except FileNotFoundError as e:
        pass    

    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print(e)
    finally:
        if conn:
            conn.close()

# drop_and_create_connection('/tmp/terra.sqlite')
# drop_and_create_connection('/tmp/terra-graph.sqlite')
# drop_and_create_connection('/tmp/pyanvil-cache.sqlite')
# drop_and_create_connection('/tmp/gen3-drs.sqlite')

 
!ls -l /tmp/*.sqlite

-rw-r--r-- 1 jupyter users 1636052992 Sep 29 22:25 /tmp/gen3-drs.sqlite
-rw-r--r-- 1 jupyter users 1585963008 Sep 29 23:18 /tmp/pyanvil-cache.sqlite
-rw-r--r-- 1 jupyter users          0 Sep 29 20:42 /tmp/terra-graph.sqlite
-rw-r--r-- 1 jupyter users 5074403328 Sep 29 21:59 /tmp/terra.sqlite


> Ensure PFB extract available
![image](https://user-images.githubusercontent.com/47808/99719432-21ab4980-2a61-11eb-8377-6cbd6ab156ed.png)


In [3]:
import os
import anvil

# AVRO_PATH = "/tmp/export_2021-05-24T22_04_46.avro"
AVRO_PATH = "/tmp/export_2021-09-29T18_43_01.avro"

if not os.path.isfile(AVRO_PATH):
    # !gsutil cp $WORKSPACE_BUCKET/export_2021-05-24T22_04_46.avro /tmp/export_2021-05-24T22_04_46.avro
    !gsutil cp $WORKSPACE_BUCKET/export_2021-09-29T18_43_01.avro /tmp/export_2021-09-29T18_43_01.avro

assert os.path.isfile(AVRO_PATH), f"{AVRO_PATH} should exist. Please export PFB from https://gen3.theanvil.io/" 

# extract

> Extract all meta data, write terra, gen3 sqlite databases and dashboard summary

In [4]:
import os
import logging
import json

from anvil.terra.reconciler import Reconciler, Entities
from anvil.util.reconciler import aggregate, DEFAULT_NAMESPACE
from anvil.transformers.fhir.transformer import FhirTransformer
from anvil.dbgap.api import get_accession, get_study

import pandas as pd
import sqlite3

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
DASHBOARD_OUTPUT_PATH = "/tmp"
TERRA_SUMMARY = f"{DASHBOARD_OUTPUT_PATH}/terra_summary.json"
DASHBOARD_OUTPUT_FILE = f"{DASHBOARD_OUTPUT_PATH}/data_dashboard.json"


In [5]:
"""Extract all workspaces."""


def harvest_workspaces(consortiums):
    """Harvest all workspaces, return list of workspace_name. Create detailed sqlite graph and summary dashboard."""
    logging.info("Starting aggregation for all AnVIL workspaces, this will take several minutes.")

    with open(DASHBOARD_OUTPUT_FILE, 'w') as outs:
        views = [v for v in aggregate(namespace=DEFAULT_NAMESPACE,
                 user_project=os.environ['GOOGLE_PROJECT'],
                 consortium=consortiums, avro_path=AVRO_PATH)]
        json.dump({
            'projects': [v for v in views if 'problems' in v],
            'consortiums': [v for v in views if 'problems' not in v]
        }, outs)

    assert os.path.isfile(DASHBOARD_OUTPUT_FILE), f"{DASHBOARD_OUTPUT_FILE} should exist."
    assert os.path.isfile('/tmp/terra.sqlite'), f"'/tmp/terra.sqlite' should exist."

    entities = Entities(path='/tmp/terra.sqlite')
    entities.index()
    return [workspace.name for workspace in entities.get_by_name('workspace')]


def summarize_workspaces():
    """Aggregate harvested workspaces."""
    entities = Entities(path=f'{DASHBOARD_OUTPUT_PATH}/terra.sqlite') 
    # created sql indices
    entities.index()
    emitter = open(TERRA_SUMMARY, "w")
    for workspace in entities.get_by_name('workspace'):
        for subject in workspace.subjects:
            for sample in subject.samples:
                for property, blob in sample.blobs.items():
                    json.dump(
                        {
                            "workspace_id": workspace.id,
                            "subject_id": subject.id,
                            "sample_id": sample.id,
                            "blob": blob['name'],
                        },
                        emitter,
                        separators=(',', ':')
                    )
                    emitter.write('\n')
    emitter.close()    

def write_fhir(workspace_names):
    """Write all fhir objects."""
    entities = Entities(path=f'{DASHBOARD_OUTPUT_PATH}/terra.sqlite')

    for name in workspace_names:
        emitters = {}
        entity = entities.get(name)
        workspace = entity['vertex']
        logging.info(f"Transforming {name}")
        if 'subject' not in entity['edges']:
            logging.error(f"{name} missing subject edges")
            continue
        workspace._subjects = entity['edges']['subject']
        warned_missing_samples = False
        for subject in workspace.subjects:
            entity = entities.get(subject.id)
            if 'sample' not in entity['edges']:
                if not warned_missing_samples:
                    logging.warning(f"{subject.id} missing sample edges")
                warned_missing_samples = True
                continue
            subject.samples = entity['edges']['sample']
            for sample in subject.samples:
                entity = entities.get(sample.id)
                _blobs = entity['edges'].get('blob', None)
                if _blobs:
                    sample.blobs = {b['property_name']: b for b in _blobs}
        transformer = FhirTransformer(workspace=workspace)
        # namespace = workspace.attributes.workspace.namespace
        reconciler_name = workspace.attributes.reconciler_name
        for item in transformer.transform():
            for entity in item.entity():
                resourceType = entity['resourceType']
                focus_reference = None
                dir_path = f"{DASHBOARD_OUTPUT_PATH}/{reconciler_name}/{name}"
                file_path = f"{dir_path}/{resourceType}.json"
                if 'focus' in entity:
                    focus_reference = entity['focus'][0]['reference']
                    if resourceType == 'Observation' and 'ResearchStudy' in focus_reference:
                        file_path = f"{dir_path}/ResearchStudyObservation.json"
                emitter = emitters.get(file_path, None)
                if emitter is None:
                    os.makedirs(dir_path, exist_ok=True)
                    emitter = open(file_path, "w")
                    logging.debug(f"Writing {file_path}")
                    emitters[file_path] = emitter
                json.dump(entity, emitter, separators=(',', ':'))
                emitter.write('\n')
        for stream in emitters.values():
            stream.close()


## extract & validate

In [6]:

consortiums = (
    ('CMG', 'AnVIL_CMG_.*'),
    ('CCDG', 'AnVIL_CCDG_.*'),
    ('GTEx', '^AnVIL_GTEx_V8_hg38$'),
    ('ThousandGenomes', '^1000G-high-coverage-2019$'),
    ('NHGRI', '^AnVIL_NHGRI'),
    ('NIMH', '^AnVIL_NIMH'),
    ('PAGE', '^AnVIL_PAGE'),     
#     ('CSER', '^AnVIL_CSER'),    
#     ('HPRC', '^AnVIL_HPRC$'),
)    

# consortiums = (
#     ('CCDG', 'AnVIL_CCDG_Broad_CVD_EOCAD_PartnersBiobank_HMB_WES'),
# )    
workspace_names = [n for n in harvest_workspaces(consortiums)]
print(len(workspace_names))

2021-09-29 21:34:18,339 INFO     Starting aggregation for all AnVIL workspaces, this will take several minutes.
2021-09-29 21:34:18,367 INFO     Loading AnVIL_CMG_Broad_Muscle_KNC_WGS
2021-09-29 21:34:18,397 INFO     Loading /tmp/export_2021-09-29T18_43_01.avro
2021-09-29 21:34:18,405 INFO     Already indexed /tmp/export_2021-09-29T18_43_01.avro
2021-09-29 21:34:18,524 INFO     Loading AnVIL_CMG_BaylorHopkins_HMB-NPU_WES
2021-09-29 21:34:19,645 INFO     Loading ANVIL_CMG_UWASH_DS-HFA
2021-09-29 21:34:19,732 INFO     Loading ANVIL_CMG_Broad_Muscle_Laing_WES
2021-09-29 21:34:19,808 INFO     Loading AnVIL_CMG_Broad_Orphan_VCGS-White_WES
2021-09-29 21:34:23,199 INFO     Loading AnVIL_CMG_Broad_Muscle_Myoseq_WES
2021-09-29 21:34:39,019 INFO     Loading AnVIL_CMG_UWASH_HMB
2021-09-29 21:34:39,481 INFO     Loading AnVIL_CMG_Broad_Heart_Ware_WES
2021-09-29 21:34:39,523 INFO     Loading AnVIL_CMG_Broad_Muscle_Beggs_WES
2021-09-29 21:34:40,602 INFO     Loading AnVIL_CMG_Broad_Blood_Sankaran_WGS


AnVIL_CCDG_Broad_CVD_AF_EAST_WES fc-282a8e0b-df88-42de-9059-2b7447d9f9c7 403 GET https://storage.googleapis.com/storage/v1/b/fc-282a8e0b-df88-42de-9059-2b7447d9f9c7/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 21:38:37,309 INFO     Loading AnVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES
2021-09-29 21:38:39,017 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMR_GRU-NPU_GSA-MD
2021-09-29 21:38:41,798 INFO     Loading AnVIL_CCDG_NYGC_NP_Autism_ACE2_DS-MDS_WGS
2021-09-29 21:38:41,866 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USAMSS_DS_EP_NEURO_MDS_GSA-MD
2021-09-29 21:38:42,417 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_CANUTN_DS-EP_GSA-MD
2021-09-29 21:38:43,008 INFO     Loading anvil_ccdg_broad_ai_ibd_niddk_daly_brant_wes
2021-09-29 21:38:45,700 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_WES
2021-09-29 21:38:46,264 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_WES
2021-09-29 21:38:46,828 INFO     Loading AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES


AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408 403 GET https://storage.googleapis.com/storage/v1/b/fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 21:38:49,063 INFO     Loading AnVIL_CCDG_Broad_CVD_AF_VAFAR_WES
2021-09-29 21:38:52,836 INFO     Loading AnVIL_CCDG_Broad_CVD_AFib_MGH_WGS
2021-09-29 21:38:53,390 INFO     Loading AnVIL_CCDG_Broad_CVD_AF_Rienstra_WES
2021-09-29 21:38:58,461 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_kolevzon_asd_wgs
2021-09-29 21:38:58,852 INFO     Loading AnVIL_CCDG_Baylor_CVD_Oregon_SUDS_GRU_WGS
2021-09-29 21:39:02,538 INFO     Loading AnVIL_CCDG_NYGC_AI_Asthma_Gala2_WGS
2021-09-29 21:39:02,978 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_GSA-MD
2021-09-29 21:39:04,081 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_newberry_share_wes
2021-09-29 21:39:06,416 INFO     Loading AnVIL_CCDG_WashU_CVD_PAGE_HMB-NPU_WGS
2021-09-29 21:39:06,796 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_GBRUNL_GRU_WES
2021-09-29 21:39:07,245 INFO     Loading AnVIL_CCDG_Broad_MI_BRAVE_GRU_WES
2021-09-29 21:39:10,918 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMR_GRU_NPU_WES
2021-09-29 21:3

AnVIL_CCDG_Broad_CVD_AF_Natale_TCAI_WES fc-4f070061-0bc2-4f9a-9fe9-869a739c9817 403 GET https://storage.googleapis.com/storage/v1/b/fc-4f070061-0bc2-4f9a-9fe9-869a739c9817/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 21:40:11,482 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_DEUPUM_HMB_MDS_WES
2021-09-29 21:40:12,632 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USACCH_DS_NEURO_MDS_WES
2021-09-29 21:40:13,813 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_DEUUTB_HMB-NPU_MDS_GSA-MD
2021-09-29 21:40:18,030 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_menashe_asd_exome
2021-09-29 21:40:20,697 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_GSA-MD
2021-09-29 21:40:21,331 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_AGRE_asd_exome
2021-09-29 21:40:23,714 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_GBRUCL_DS_EARET_MDS_GSA-MD
2021-09-29 21:40:25,915 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_GHAKNT_GRU_WES
2021-09-29 21:40:27,852 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_domenici_asd_exome
2021-09-29 21:40:29,967 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_rioux_niddk_wes
2021-09-29 21:40:34,996 INFO     Loading AnVIL_CCDG_Baylor_CVD_HemStroke_ERICH_WGS
2021-0

AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_MDS_NPU_WES fc-82bbaf50-f3d4-48e9-bd76-3874638fa714 403 GET https://storage.googleapis.com/storage/v1/b/fc-82bbaf50-f3d4-48e9-bd76-3874638fa714/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 21:43:57,369 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_NZLUTO_EPIL_BC_ID_MDS_GSA-MD
2021-09-29 21:43:58,507 INFO     Loading AnVIL_CCDG_Freeze2_VCFs
2021-09-29 21:43:59,149 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_HKGHKK_HMB_MDS_WES
2021-09-29 21:44:01,758 INFO     Loading AnVIL_CCDG_Broad_NP_Autism_State-Sanders_WGS
2021-09-29 21:44:03,946 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_barbosa_asd_exome
2021-09-29 21:44:04,992 INFO     Loading anvil_ccdg_broad_ai_ibd_niddk_daly_duerr_wes
2021-09-29 21:44:12,300 INFO     Loading AnVIL_CCDG_WashU_CVD_EOCAD_WashU-CAD_DS_WGS
2021-09-29 21:44:12,397 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS-EAED-MDS_GSA-MD
2021-09-29 21:44:14,009 INFO     Loading AnVIL_CCDG_Baylor_CVD_HemStroke_BNI_HMB_WGS
2021-09-29 21:44:14,781 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKL_HMB_WES
2021-09-29 21:44:15,836 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_herman_asd_exome
2021-09-29 21:44:16,335 INFO     Loadin

AnVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_GSRS_WES fc-8da94069-2edc-4e37-8c96-5a25740aeb32 403 GET https://storage.googleapis.com/storage/v1/b/fc-8da94069-2edc-4e37-8c96-5a25740aeb32/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 21:44:18,156 INFO     Loading AnVIL_CCDG_WashU_CVD_EOCAD_BioVu_WGS
2021-09-29 21:44:20,019 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKB_HMB_NPU_MDS_WES
2021-09-29 21:44:29,681 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_rioux_igenomed_wes


anvil_ccdg_broad_ai_ibd_daly_rioux_igenomed_wes fc-9197a911-c2f8-4f5f-91f9-389d191626d0 403 GET https://storage.googleapis.com/storage/v1/b/fc-9197a911-c2f8-4f5f-91f9-389d191626d0/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 21:44:30,548 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_mcgovern_niddk_wes
2021-09-29 21:45:03,584 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_FINKPH_EPIL_MDS_GSA-MD
2021-09-29 21:45:07,046 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_GSA-MD
2021-09-29 21:45:13,068 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_ITAUBG_DS-EPI-NPU-MDS_GSA-MD
2021-09-29 21:45:15,902 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB-NPU-MDS_GSA-MD
2021-09-29 21:45:16,783 INFO     Loading AnVIL_CCDG_NYGC_NP_Autism_CAG_DS_WGS
2021-09-29 21:45:19,188 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB-NPU-ADULTS_GSA-MD
2021-09-29 21:45:23,342 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB_NPU_MDS_WES
2021-09-29 21:45:24,360 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_palotie_asd_exome
2021-09-29 21:45:25,247 INFO     Loading AnVIL_CCDG_Broad_CVD_AFib_Intermountain_WGS
2021-09-29 21:45:27,324 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_

AnVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_WES fc-e7051891-25c8-4776-80ed-26b1af860277 403 GET https://storage.googleapis.com/storage/v1/b/fc-e7051891-25c8-4776-80ed-26b1af860277/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 21:53:00,562 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_GBRUCL_DS_EARET_MDS_WES
2021-09-29 21:53:04,260 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_franchimont_wes
2021-09-29 21:53:34,689 INFO     Loading AnVIL_CCDG_Broad_CVD_EOCAD_PartnersBiobank_HMB_WES
2021-09-29 21:54:32,516 INFO     Loading AnVIL_CCDG_WashU_CVD_EOCAD_Cleveland_WGS
2021-09-29 21:54:34,149 INFO     Loading AnVIL_CCDG_NYGC_NP_Autism_SPARK_GRU_WGS
2021-09-29 21:55:28,202 INFO     Loading AnVIL_CCDG_Broad_CVD_AFib_Duke_WGS
2021-09-29 21:55:29,428 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES
2021-09-29 21:55:30,236 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_CHOP_GRU_GSA-MD
2021-09-29 21:55:31,849 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USAVANcontrols_HMB-GSO_WES
2021-09-29 21:55:42,612 INFO     Loading AnVIL_CCDG_TOPMed_WashU_CVD_Afib_Penn_WGS
2021-09-29 21:55:44,956 INFO     Loading AnVIL_CCDG_NYGC_NP_Autism_AGRE_WGS
2021-09-29 21:55:46,704 INFO     Loading AnVIL_CCD

409


# QA Report 

> Show reconciliation with terra, gen3

##  Issues/Questions arising from Terra

In [8]:
# python json serializer setup

import datetime
import json
import os
from anvil.util.reconciler import flatten
import pandas as pd

def json_serial(obj):
    """JSON serializer for objects not serializable by default json code."""
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))


# validate output summary and 
assert os.path.isfile(DASHBOARD_OUTPUT_FILE), "dashboard should exist"
with open(DASHBOARD_OUTPUT_FILE, 'r') as inputs:
    dashboard_data = json.load(inputs)
    
# Flatten dashboard into tsv

(flattened, column_names) = flatten(dashboard_data['projects'])
df = pd.DataFrame(flattened)  
df.columns = column_names
# Print the data  (all rows, all columns)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# export create a tsv from dataframe
df.to_csv("/tmp/data_dashboard.tsv", sep="\t")
df

Unnamed: 0,source,workspace,accession,Bai,Bam,Bedpe,Crai,Cram,Csv,Gtc,Idat,Loupe,Md5,Tbi,Txt,Vcf,size,Project,Samples,Subject,dbgap_sample_count_mismatch,inconsistent_entityName,inconsistent_subject,missing_accession,missing_blobs,missing_samples,missing_schema,missing_sequence,missing_subjects
0,CMG,AnVIL_CMG_Broad_Muscle_KNC_WGS,phs001272.v1.p1,,,,,,,,,,,,,,0,1.0,14.0,14.0,,,,,,,,True,
1,CMG,AnVIL_CMG_BaylorHopkins_HMB-NPU_WES,,,,,,,,,,,,,,,0,1.0,2054.0,0.0,,,,,,,,,True
2,CMG,ANVIL_CMG_UWASH_DS-HFA,,,,,,,,,,,,,,,0,1.0,83.0,83.0,,,True,,,,,True,
3,CMG,ANVIL_CMG_Broad_Muscle_Laing_WES,phs001272.v1.p1,,,,,,,,,,,,,,0,1.0,31.0,31.0,,,,,,,,True,
4,CMG,AnVIL_CMG_Broad_Orphan_VCGS-White_WES,phs001272.v1.p1,,,,32044685.0,254721517221.0,,,,,7360.0,,,,254753569266,1.0,677.0,230.0,,,True,,,,,True,
5,CMG,AnVIL_CMG_Broad_Muscle_Myoseq_WES,phs001272.v1.p1,,,,,,,,,,,,,,0,1.0,1280.0,1280.0,,,,,,,,True,
6,CMG,AnVIL_CMG_UWASH_HMB,,,,,,,,,,,,,,,0,1.0,419.0,406.0,,,True,,,,,True,
7,CMG,AnVIL_CMG_Broad_Heart_Ware_WES,phs001272.v1.p1,,,,,,,,,,,,,,0,1.0,10.0,10.0,,,,,,,,True,
8,CMG,AnVIL_CMG_Broad_Muscle_Beggs_WES,phs001272.v1.p1,,,,24384977.0,202250201656.0,,,,,5280.0,,,,202274591913,1.0,439.0,439.0,,,True,,,,,True,
9,CMG,AnVIL_CMG_Broad_Blood_Sankaran_WGS,,,,,162800807.0,1609719841636.0,,,,,3072.0,,,,1609882645515,1.0,96.0,96.0,,,True,,,,,,


## summarize terra exceptions

> Extract the list of data transformation problems encountered [see more on dashboard exceptions](https://github.com/anvilproject/client-apis/wiki/dashboard-exceptions)

In [9]:
_projects = [project for project in dashboard_data['projects'] if 'problems' in  project]
flattened = []
problems = set([problem for project in _projects for problem in project['problems']])
for problem in problems:
    projects = [project['project_id'] for project in _projects if problem in project['problems']]
    flattened.append([problem, ','.join(projects)])

# Print the data  (all rows, all columns)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.colheader_justify', 'left')

df = pd.DataFrame(flattened)  
df.columns = ['problem', 'affected_workspaces']
df = df.style.set_properties(**{'text-align': 'left'})
df 

Unnamed: 0,problem,affected_workspaces
0,inconsistent_entityName,"AnVIL_CCDG_WashU_AI_T1D_T1DGC_WGS,AnVIL_CCDG_WashU_CVD_EOCAD_Harvard-Costa-Rica_WGS,AnVIL_CCDG_NYGC_NP_Alz_EFIGA_WGS,AnVIL_CCDG_WashU_CVD-NP-AI_Controls_VCControls_WGS,AnVIL_NIMH_Broad_WGSPD1_McCarroll_Pato_GRU_10XLRGenomes,AnVIL_NIMH_Broad_WGSPD1_McCarroll_Braff_DS_10XLRGenomes,AnVIL_PAGE_BioMe_GRU_WGS,AnVIL_PAGE_Stanford_Global_Reference_Panel_GRU_WGS,AnVIL_PAGE_MEC_GRU_WGS,AnVIL_PAGE_WHI_HMB-IRB_WGS,AnVIL_PAGE_SoL_HMB_WGS"
1,missing_subjects,"AnVIL_CMG_BaylorHopkins_HMB-NPU_WES,AnVIL_CMG_Broad_Blood_Gazda_WGS,AnVIL_CMG_Broad_Brain_Gleeson_WGS,AnVIL_CMG_Broad_Muscle_Beggs_WGS,AnVIL_CMG_Broad_Blood_Fleming_WES,AnVIL_CCDG_Freeze2_VCFAggregation,anvil_ccdg_broad_ai_ibd_daly_franchimont_gsa,AnVIL_CCDG_WashU_CVD_EOCAD_BioImage_WGS,AnVIL_CCDG_WASHU_PAGE,anvil_ccdg_broad_ai_ibd_daly_duerr_niddk_gsa,anvil_ccdg_broad_ai_ibd_daly_cho_niddk_gsa,AnVIL_CCDG_Freeze2_VCFs,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_NIMH_Broad_WGSPD1_McCarroll_Pato_GRU_WGS"
2,missing_samples,"AnVIL_CMG_Broad_Blood_Gazda_WGS,AnVIL_CMG_UWash_GRU,AnVIL_CMG_Broad_Brain_Gleeson_WGS,AnVIL_CMG_Broad_Muscle_Beggs_WGS,AnVIL_CMG_BaylorHopkins_HMB-IRB-NPU_WES,ANVIL_CMG_YALE_DS-MC,AnVIL_CMG_Broad_Blood_Fleming_WES,anvil_ccdg_broad_ai_ibd_daly_niddk_cho_wes,AnVIL_CCDG_WashU_CVD_Eufam_WGS,AnVIL_CCDG_WashU_CVD_Wisconsin_WGS,anvil_ccdg_broad_ai_ibd_daly_franchimont_gsa,AnVIL_CCDG_WashU_CVD_EOCAD_WashU-CAD_GRU-IRB_WGS,AnVIL_CCDG_Baylor_CVD_HemStroke_Yale_HMB_WGS,AnVIL_CCDG_WashU_CVD_EOCAD_METSIM_WGS,AnVIL_CCDG_NYGC_NP_Autism_HFA_DS_WGS,AnVIL_CCDG_NYGC_NP_Alz_LOAD_WGS,AnVIL_CCDG_NYGC_NP_Autism_SSC_WGS,AnVIL_CCDG_WashU_CVD_Kanazawa_WGS,AnVIL_CCDG_NYGC_NP_Autism_ACE2_DS-MDS_WGS,AnVIL_CCDG_Baylor_CVD_Oregon_SUDS_GRU_WGS,AnVIL_CCDG_NYGC_AI_Asthma_Gala2_WGS,AnVIL_CCDG_WashU_CVD_PAGE_HMB-NPU_WGS,AnVIL_CCDG_WashU_CVD_Brazil-CVD_WGS,AnVIL_CCDG_WASHU_PAGE,anvil_ccdg_broad_ai_ibd_daly_duerr_niddk_gsa,AnVIL_CCDG_Broad_AI_IBD_Cho_WGS,AnVIL_CCDG_NYGC_NP_Autism_SAGE_WGS,AnVIL_CCDG_NYGC_NP_Autism_PELPHREY_ACE_DS_WGS,anvil_ccdg_broad_ai_ibd_daly_cho_niddk_gsa,AnVIL_CCDG_Baylor_CVD_HemStroke_ERICH_WGS,AnVIL_CCDG_Baylor_CVD_EOCAD_SoL_WGS,AnVIL_CCDG_Baylor_CVD_AFib_BioVU_WGS,AnVIL_CCDG_WashU_CVD_Indiana_WGS,AnVIL_CCDG_Baylor_CVD_HemStroke_WashU_DS_WGS,AnVIL_CCDG_Baylor_CVD_HemStroke_Duke_DS_WGS,AnVIL_CCDG_NYGC_NP_Autism_PELPHREY_ACE_GRU_WGS,AnVIL_CCDG_WashU_CVD_SCCS_WGS,AnVIL_CCDG_Freeze2_VCFs,AnVIL_CCDG_WashU_CVD_EOCAD_WashU-CAD_DS_WGS,AnVIL_CCDG_Baylor_CVD_HemStroke_BNI_HMB_WGS,AnVIL_CCDG_NYGC_NP_Autism_CAG_DS_WGS,AnVIL_CCDG_Baylor_CVD_Ventura_Presto_GRU-IRB_WGS,AnVIL_CCDG_Baylor_CVD_HemStroke_GERFHS_HMB_WGS,AnVIL_CCDG_NYGC_NP_Autism_TASC_WGS,AnVIL_CCDG_WashU_CVD_EOCAD_Emory_WGS,AnVIL_CCDG_NYGC_NP_Autism_SEARCHLIGHT_DS_WGS,AnVIL_CCDG_Baylor_CVD_TexGen_DS_WGS,AnVIL_CCDG_NYGC_NP_Autism_ACE2_GRU-MDS_WGS,AnVIL_CCDG_Baylor_CVD_HemStroke_GOCHA_DS_WGS,AnVIL_CCDG_WashU_CVD_MultiEthnic_WGS,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_CCDG_Baylor_CVD_EOCAD_BioMe_WGS,AnVIL_CCDG_NYGC_NP_Alz_WHICAP_WGS,AnVIL_CCDG_WashU_CVD_Corogene_WGS,AnVIL_CCDG_NYGC_NP_Autism_HMCA_WGS,AnVIL_CCDG_Baylor_CVD_HHRC_Brownsville_GRU_WGS,AnVIL_CCDG_NYGC_NP_Autism_GASD_GRU_WGS,AnVIL_CCDG_WashU_CVD_WHI_WGS,AnVIL_CCDG_Broad_CVD_EOCAD_PartnersBiobank_HMB_Arrays,AnVIL_CCDG_Broad_CVD_EOCAD_PartnersBiobank_HMB_WES,AnVIL_CCDG_NYGC_NP_Autism_SPARK_GRU_WGS,AnVIL_CCDG_TOPMed_WashU_CVD_Afib_Penn_WGS,AnVIL_CCDG_NYGC_NP_Autism_AGRE_WGS,AnVIL_CCDG_NYGC_NP_Autism_AFS_DS_WGS,AnVIL_CCDG_Baylor_CVD_ARIC,AnVIL_CCDG_Baylor_CVD_AFib_Groningen_WGS,AnVIL_CCDG_Broad_Deposit,AnVIL_CCDG_Baylor_CVD_HemStroke_Regards_DS_WGS,AnVIL_GTEx_V8_hg38,AnVIL_NIMH_Broad_WGSPD1_McCarroll_Pato_GRU_WGS"
3,dbgap_sample_count_mismatch,"phs001272.v1.p1,phs000693.v6.p2,phs001489.v2.p2,phs001642.v1.p1,phs001222.v1.p1,phs001601.v2.p2,phs001227.v1.p1,phs001259.v1.p1,phs001543.v2.p1,phs001544.v2.p1,phs000160.v1.p1,phs001676.v1.p1,phs001502.v1.p1,phs001062.v5.p2,phs001180.v2.p1,phs000496.v1.p1,phs001395.v2.p1,phs001624.v2.p2,phs001545.v2.p1,phs000997.v5.p2,phs002018.v1.p1,phs001398.v1.p1,phs001600.v2.p2,phs001901.v1.p1"
4,inconsistent_subject,"ANVIL_CMG_UWASH_DS-HFA,AnVIL_CMG_Broad_Orphan_VCGS-White_WES,AnVIL_CMG_UWASH_HMB,AnVIL_CMG_Broad_Muscle_Beggs_WES,AnVIL_CMG_Broad_Blood_Sankaran_WGS,ANVIL_CMG_UWASH_DS-BDIS,AnVIL_CMG_UWash_DS-EP,AnVIL_CMG_UWash_GRU-IRB,AnVIL_CMG_Broad_Muscle_Topf_WES,ANVIL_CMG_UWASH_DS-NBIA,AnVIL_CMG_Broad_Blood_Sankaran_WES,AnVIL_CMG_Broad_Brain_Walsh_WES,AnVIL_CMG_Broad_Muscle_OGrady_WES,AnVIL_CMG_UWash_GRU,ANVIL_CMG_UWASH_HMB-IRB,ANVIL_CMG_YALE_DS-RARED,AnVIL_CMG_Broad_Blood_Gazda_WES,ANVIL_CMG_Yale_GRU,AnVIL_CMG_BaylorHopkins_HMB-IRB-NPU_WES,AnVIL_CMG_Broad_Stillbirth_Wilkins-Haug_WES,ANVIL_CMG_YALE_DS-MC,AnVIL_CMG_Broad_Brain_Engle_WGS,AnVIL_CMG_Broad_Kidney_Pollak_WES,AnVIL_CMG_Yale_HMB,AnVIL_CMG_Broad_Orphan_Scott_WES,AnVIL_CMG_Broad_Muscle_KNC_WES,ANVIL_CMG_UWASH_DS-BAV-IRB-PUB-RD,AnVIL_CMG_Yale_HMB-GSO,AnVIL_CCDG_WashU_CVD-NP-AI_Controls_VCControls_WGS"
5,missing_schema,"AnVIL_CMG_Broad_Blood_Gazda_WGS,AnVIL_CMG_Broad_Brain_Gleeson_WGS,AnVIL_CMG_Broad_Muscle_Beggs_WGS,AnVIL_CMG_Broad_Blood_Fleming_WES,anvil_ccdg_broad_ai_ibd_daly_franchimont_gsa,AnVIL_CCDG_WASHU_PAGE,anvil_ccdg_broad_ai_ibd_daly_duerr_niddk_gsa,anvil_ccdg_broad_ai_ibd_daly_cho_niddk_gsa,AnVIL_CCDG_Freeze2_VCFs,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_NIMH_Broad_WGSPD1_McCarroll_Pato_GRU_WGS"
6,missing_sequence,"AnVIL_CMG_Broad_Muscle_KNC_WGS,ANVIL_CMG_UWASH_DS-HFA,ANVIL_CMG_Broad_Muscle_Laing_WES,AnVIL_CMG_Broad_Orphan_VCGS-White_WES,AnVIL_CMG_Broad_Muscle_Myoseq_WES,AnVIL_CMG_UWASH_HMB,AnVIL_CMG_Broad_Heart_Ware_WES,AnVIL_CMG_Broad_Muscle_Beggs_WES,ANVIL_CMG_UWASH_DS-BDIS,AnVIL_CMG_UWash_DS-EP,AnVIL_CMG_Broad_Orphan_Estonia-Ounap_WES,AnVIL_CMG_Broad_Eye_Pierce_WES,AnVIL_CMG_UWash_GRU-IRB,AnVIL_CMG_Broad_Orphan_Estonia-Ounap_WGS,AnVIL_CMG_Broad_Muscle_Topf_WES,ANVIL_CMG_UWASH_DS-NBIA,AnVIL_CMG_Broad_Blood_Sankaran_WES,AnVIL_CMG_Broad_Brain_Walsh_WES,AnVIL_CMG_Broad_Muscle_Kang_WES,AnVIL_CMG_Broad_Kidney_Hildebrandt_WES,AnVIL_CMG_Broad_Muscle_OGrady_WES,AnVIL_CMG_Broad_Brain_Gleeson_WES,AnVIL_CMG_UWash_GRU,ANVIL_CMG_UWASH_HMB-IRB,AnVIL_CMG_Broad_Kidney_Hildebrandt_WGS,AnVIL_CMG_Broad_Orphan_Manton_WES,AnVIL_CMG_Broad_Heart_PCGC-Tristani_WGS,AnVIL_CMG_Broad_Muscle_KNC_WES,AnVIL_CMG_Broad_Heart_Seidman_WES,AnVIL_CMG_Broad_Muscle_Bonnemann_WGS,AnVIL_CMG_Broad_Muscle_Bonnemann_WES,AnVIL_CMG_Broad_Orphan_Manton_WGS,ANVIL_CMG_UWASH_DS-BAV-IRB-PUB-RD,AnVIL_CMG_Broad_Muscle_Ravenscroft_WES,AnVIL_CMG_Broad_Orphan_VCGS-White_WGS,AnVIL_CMG_Broad_Muscle_Myoseq_WGS,AnVIL_CMG_Broad_Muscle_Kang_WGS,AnVIL_CMG_Broad_Eye_Pierce_WGS"
7,missing_blobs,"AnVIL_CMG_Broad_Blood_Gazda_WGS,AnVIL_CMG_Broad_Brain_Gleeson_WGS,ANVIL_CMG_YALE_DS-RARED,ANVIL_CMG_Yale_GRU,ANVIL_CMG_YALE_DS-MC,AnVIL_CMG_Yale_HMB-GSO,AnVIL_CMG_Broad_Blood_Fleming_WES,anvil_ccdg_broad_ai_ibd_daly_niddk_cho_wes,AnVIL_CCDG_Broad_NP_Epilepsy_DEUUPM_HMB_MDS_WES,anvil_ccdg_broad_ai_ibd_daly_pekow_share_gsa,anvil_ccdg_broad_ai_ibd_daly_sands_msccr_gsa,AnVIL_CCDG_Broad_CVD_AF_EAST_WES,AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES,AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_GRU_NPU_WES,AnVIL_CCDG_Broad_CVD_AF_Natale_TCAI_WES,AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKB_HMB-NPU-MDS_GSA-MD,AnVIL_ccdg_asc_ndd_daly_talkowski_AGRE-FEMF_asd_exome,AnVIL_CCDG_Broad_NP_Epilepsy_HRVUZG_HMB-MDS_GSA-MD,AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_WES,AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_MDS_NPU_WES,AnVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_GSRS_WES,anvil_ccdg_broad_ai_ibd_daly_rioux_igenomed_wes,anvil_ccdg_broad_ai_ibd_daly_mcgovern_niddk_wes,anvil_ccdg_broad_ai_ibd_daly_mcgovern_share_wes,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,anvil_ccdg_broad_ai_ibd_daly_xavier_prism_wes,AnVIL_CCDG_WashU_CVD-NP-AI_Controls_VCControls_WGS,AnVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_GSA-MD,AnVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_WES,anvil_ccdg_broad_ai_ibd_daly_burnstein_gsa,AnVIL_CCDG_Broad_Deposit,AnVIL_PAGE_SoL_HMB_WGS"
8,missing_accession,"AnVIL_CCDG_WashU_CVD_EOCAD_BioMe_WGS,AnVIL_CCDG_WashU_CVD_EOCAD_METSIM_WGS,AnVIL_CCDG_TOPMED_Broad_CVD_EOCAD_PROMIS_WGS,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,AnVIL_CCDG_WashU_CVD_EOCAD_Duke_WGS"


## list consistent terra workspaces

In [10]:
# list consistent workspaces

df = pd.DataFrame([project['project_id'] for project in _projects if len(project['problems']) == 0])  
df.columns = ['workspace']
df = df.style.set_properties(**{'text-align': 'left'})
df 

Unnamed: 0,workspace
0,AnVIL_CCDG_Broad_NP_Epilepsy_ITAUBG_DS_EPI_NPU_MDS_WES
1,AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMC_DS_NEURO_MDS_GSA-MD
2,AnVIL_CCDG_Broad_NP_Epilepsy_USAMGH_MGBB_HMB_MDS_WES
3,AnVIL_CCDG_Broad_NP_Epilepsy_USACRW_EPI_ASZ_MED_MDS_WES
4,AnVIL_CCDG_Broad_NP_Epilepsy_ITAICB_HMB-NPU-MDS_GSA-MD
5,AnVIL_ccdg_asc_ndd_daly_talkowski_chung_asd_exome
6,AnVIL_CCDG_Broad_NP_Epilepsy_DEUPUM_HMB-MDS_GSA-MD
7,AnVIL_CCDG_Broad_CVD_EOCAD_TaiChi_WGS
8,AnVIL_CCDG_Broad_AI_IBD_Brant_DS-IBD_WGS
9,AnVIL_ccdg_asc_ndd_daly_talkowski_ac-boston_asd_exome


## Issues/Questions arising from Gen3 PFB

In [11]:
# create 
summarize_workspaces()


2021-09-29 22:01:59,552 INFO     Indexing


AnVIL_CCDG_Broad_CVD_AF_EAST_WES fc-282a8e0b-df88-42de-9059-2b7447d9f9c7 403 GET https://storage.googleapis.com/storage/v1/b/fc-282a8e0b-df88-42de-9059-2b7447d9f9c7/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.
AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408 403 GET https://storage.googleapis.com/storage/v1/b/fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.
AnVIL_CCDG_Broad_CVD_AF_Natale_TCAI_WES fc-4f070061-0bc2-4f9a-9fe9-869a739c9

In [12]:
logging.getLogger().setLevel(logging.INFO)

conn = sqlite3.connect('/tmp/gen3-drs.sqlite')
cur = conn.cursor()

#
# load the terra dashboard summary into db
#
cur.executescript("""
-- 
drop table if exists terra_details ;
CREATE TABLE IF NOT EXISTS terra_details (
    workspace_id text,
    subject_id text,
    sample_id text,
    blob text
);
""")

conn.commit()

logging.info("created table")

with open(f"{DASHBOARD_OUTPUT_PATH}/terra_summary.json", 'rb') as fo:
    for line in fo.readlines():
        record = json.loads(line)
        cur.execute("REPLACE into terra_details values (?, ?, ?, ?);", (record['workspace_id'], record['subject_id'], record['sample_id'], record['blob'],))
conn.commit()

cur.executescript("""
CREATE UNIQUE INDEX IF NOT EXISTS terra_details_idx ON terra_details(workspace_id, subject_id, sample_id, blob);
""")
conn.commit()

logging.info("created index")

#
# reconcile with gen3
#

sql = """

-- missing sequencing
drop table if exists flattened ;
create table flattened
as
select
    json_extract(su.json, '$.object.project_id') as "project_id",
    json_extract(su.json, '$.object.anvil_project_id') as "anvil_project_id",
    su.name as "subject_type",
    su.key as "subject_id",
    json_extract(su.json, '$.object.participant_id') as "participant_id",
    json_extract(su.json, '$.object.submitter_id') as "subject_submitter_id",
    sa.name as "sample_type",
    sa.key  as "sample_id",
    json_extract(sa.json, '$.object.sample_id') as "sample_sample_id",
    json_extract(sa.json, '$.object.submitter_id') as "sample_submitter_id",
    json_extract(sa.json, '$.object.specimen_id') as "sample_specimen_id",
    'sequencing' as "sequencing_type",
    sequencing_edge.src  as "sequencing_id",
    json_extract(sq.json, '$.object.submitter_id') as "sequencing_submitter_id",
    json_extract(sq.json, '$.object.ga4gh_drs_uri') as "ga4gh_drs_uri"
    from vertices as su 
        join edges as sample_edge on sample_edge.dst = su.key and sample_edge.src_name = 'sample'
            join vertices as sa on sample_edge.src = sa.key  
                left join edges as sequencing_edge on sequencing_edge.dst = sa.key and sequencing_edge.src_name = 'sequencing'
                    join vertices as sq on sequencing_edge.src = sq.key 

    where           
    su.name = 'subject'            ;


drop table if exists summary ;
create table summary
as
 select f.project_id, f.anvil_project_id, 
    count(distinct f.subject_id) as "subject_count", 
    count(distinct f.sample_id) as "sample_count",
    count(distinct m.sequencing_id) as "sequencing_count",
    count(distinct m.ga4gh_drs_uri) as "ga4gh_drs_uri_count"
    from flattened as f
        left join flattened as m on f.project_id = m.project_id and f.anvil_project_id = m.anvil_project_id
    group by f.project_id, f.anvil_project_id;


drop table if exists reconcile_counts;
create table reconcile_counts as 
select w.workspace_id,
    count(distinct w.sample_id) as "terra_sample_id_count",
    count(distinct f.sample_submitter_id) as "gen3_sample_id_count",
    count(distinct w.blob) as "terra_blob_count",
    count(distinct f.ga4gh_drs_uri) as "gen3_drs_uri_count"
    from terra_details as w 
        left join flattened as f on (w.sample_id || '_sample' = f.sample_submitter_id)
group by w.workspace_id    
having gen3_sample_id_count > 0 
UNION
select w.workspace_id,
    count(distinct w.sample_id) as "terra_sample_id_count",
    count(distinct f.sample_submitter_id) as "gen3_sample_id_count",
    count(distinct w.blob) as "terra_blob_count",
    count(distinct f.ga4gh_drs_uri) as "gen3_drs_uri_count"
    from terra_details as w 
        left join flattened as f on (w.sample_id   = f.sample_submitter_id)
group by w.workspace_id    
having gen3_sample_id_count > 0 
UNION
select w.workspace_id,
    count(distinct w.sample_id) as "terra_sample_id_count",
    count(distinct f.sample_submitter_id) as "gen3_sample_id_count",
    count(distinct w.blob) as "terra_blob_count",
    count(distinct f.ga4gh_drs_uri) as "gen3_drs_uri_count"
    from terra_details as w 
        left join flattened as f on (w.sample_id   = f.sample_specimen_id)
group by w.workspace_id    
having gen3_sample_id_count > 0 
;

insert into reconcile_counts
select w.workspace_id,
    count(distinct w.sample_id) as "terra_sample_id_count",
    0 as "gen3_sample_id_count",
    count(distinct w.blob) as "terra_blob_count",
    0 as "gen3_drs_uri_count"
from terra_details  as w
where workspace_id not in ( select distinct workspace_id from reconcile_counts ) 
group by w.workspace_id    ;
;

drop table if exists missing_sequencing;

create table missing_sequencing
as 
select s.key, s.submitter_id  from vertices  as s
where s.name = 'sample' 
and
not EXISTS(
    select q.src from edges as q where q.dst = s.key 
) ;

drop table if exists subjects_missing_sequencing;
create table subjects_missing_sequencing
as
select s.key, s.submitter_id  from vertices  as s
where s.name = 'subject' 
and s.key in
(
    select q.dst from edges as q where q.src in (select ms.key from missing_sequencing as ms)
) ;


"""

cur.executescript(sql)
conn.commit()

logging.info("loaded table")
logging.getLogger().setLevel(logging.ERROR)

2021-09-29 22:21:02,819 INFO     created table
2021-09-29 22:21:17,701 INFO     created index
2021-09-29 22:25:28,826 INFO     loaded table


## PFB contains gen3 projects without anvil(terra) project

In [14]:


conn = sqlite3.connect('/tmp/gen3-drs.sqlite')
cur = conn.cursor()

df = pd.read_sql_query("SELECT * from summary where anvil_project_id is null;", conn)
df

Unnamed: 0,project_id,anvil_project_id,subject_count,sample_count,sequencing_count,ga4gh_drs_uri_count
0,CCDG-phs001259-DS-CARD-MDS-GSO,,2158,2159,0,0
1,CCDG-phs001398-GRU,,496,496,0,0
2,CCDG-phs001487-DS-MULTIPLE_DISEASES-IRB-COL-NPU-RD,,773,773,0,0
3,CCDG-phs001569-GRU,,1136,1136,0,0
4,CCDG-phs001642-DS-GID,,31,31,0,0
5,CCDG-phs001642-DS-IBD,,199,199,0,0
6,CCDG-phs001642-GRU,,1351,1351,0,0
7,CCDG-phs001642-HMB,,1248,1248,0,0
8,CF-GTEx,,981,47068,0,0
9,open_access-1000Genomes,,3202,3202,0,0


## Not all terra projects found in Gen3

In [15]:
df = pd.read_sql_query("SELECT * from reconcile_counts where gen3_sample_id_count = 0;", conn)
df

Unnamed: 0,workspace_id,terra_sample_id_count,gen3_sample_id_count,terra_blob_count,gen3_drs_uri_count
0,1000G-high-coverage-2019,3202,0,9606,0
1,ANVIL_CMG_UWASH_DS-NBIA,107,0,107,0
2,ANVIL_CMG_YALE_DS-MC,695,0,1389,0
3,ANVIL_CMG_YALE_DS-RARED,170,0,170,0
4,ANVIL_CMG_Yale_GRU,1731,0,3461,0
5,AnVIL_CCDG_Broad_AI_IBD_Cho_WGS,344,0,688,0
6,AnVIL_CCDG_Broad_AI_IBD_McCauley_WGS,913,0,1826,0
7,AnVIL_CCDG_Broad_AI_IBD_McGovern_WGS,1633,0,3266,0
8,AnVIL_CCDG_Broad_CVD_AF_BioVU_HMB_GSO_WES,5031,0,10062,0
9,AnVIL_CCDG_Broad_CVD_AF_Darbar_UIC_Cases_WES,304,0,608,0


## Terra / Gen3 samples count mismatch

In [16]:
df = pd.read_sql_query("SELECT * from reconcile_counts where gen3_sample_id_count > 0 and gen3_sample_id_count <> terra_sample_id_count;", conn)
df

Unnamed: 0,workspace_id,terra_sample_id_count,gen3_sample_id_count,terra_blob_count,gen3_drs_uri_count
0,AnVIL_CCDG_Broad_CVD_EOCAD_VIRGO_WGS,2159,2148,4318,4296


## Terra / Gen3 blob/drs count alignment

In [17]:
df = pd.read_sql_query("SELECT * from reconcile_counts where terra_sample_id_count = gen3_sample_id_count and terra_blob_count = gen3_drs_uri_count;", conn)
df

Unnamed: 0,workspace_id,terra_sample_id_count,gen3_sample_id_count,terra_blob_count,gen3_drs_uri_count
0,AnVIL_CCDG_Broad_AI_IBD_Brant_DS-IBD_WGS,199,199,398,398
1,AnVIL_CCDG_Broad_AI_IBD_Brant_HMB_WGS,904,904,1808,1808
2,AnVIL_CCDG_Broad_AI_IBD_Kugathasan_WGS,1351,1351,2702,2702
3,AnVIL_CCDG_Broad_AI_IBD_Newberry_WGS,31,31,62,62
4,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,1136,1136,2272,2272
5,AnVIL_CCDG_Broad_CVD_EOCAD_TaiChi_WGS,773,773,1546,1546
6,AnVIL_CCDG_Broad_CVD_Stroke_BRAVE_WGS,496,496,992,992


### Terra / Gen3 blob/drs count mismatch

In [18]:

df = pd.read_sql_query("SELECT * from reconcile_counts where terra_sample_id_count = gen3_sample_id_count and terra_blob_count <> gen3_drs_uri_count;", conn)
df

Unnamed: 0,workspace_id,terra_sample_id_count,gen3_sample_id_count,terra_blob_count,gen3_drs_uri_count


### Unexpected extra files not in terra [leafcutter, bigWig]

In [19]:
pd.set_option('max_colwidth', 256)
df = pd.read_sql_query("select * from terra_details where blob like '%GTEX-12KS4-1526-SM-5EQ6E%' ;", conn)
df

Unnamed: 0,workspace_id,subject_id,sample_id,blob
0,AnVIL_GTEx_V8_hg38,AnVIL_GTEx_V8_hg38/Su/GTEX-12KS4,GTEx_V8_hg38/Sa/GTEX-12KS4-1526-SM-5EQ6E,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da62108/GTEx_Analysis_2017-06-05_v8_RNAseq_BAM_files/GTEX-12KS4-1526-SM-5EQ6E.Aligned.sortedByCoord.out.patched.md.bam
1,AnVIL_GTEx_V8_hg38,AnVIL_GTEx_V8_hg38/Su/GTEX-12KS4,GTEx_V8_hg38/Sa/GTEX-12KS4-1526-SM-5EQ6E,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da62108/GTEx_Analysis_2017-06-05_v8_RNAseq_BAM_files/GTEX-12KS4-1526-SM-5EQ6E.Aligned.sortedByCoord.out.patched.md.bam.bai


In [20]:
df = pd.read_sql_query("select key, name, submitter_id  from vertices where submitter_id like '%GTEX-12KS4-1526-SM-5EQ6E%' ;", conn)
df

Unnamed: 0,key,name,submitter_id
0,0019be63-8121-4869-bc6f-2c2694d6aefd,sample,GTEX-12KS4-1526-SM-5EQ6E
1,4e090504-a493-44ee-a1d2-db8dd34f29d0,sequencing,GTEX-12KS4-1526-SM-5EQ6E.Aligned.sortedByCoord.out.patched.md.bam.bai
2,663f2cb3-467d-46ec-8a05-908cfedc08e1,sequencing,GTEX-12KS4-1526-SM-5EQ6E.leafcutter.junc.gz
3,8abef4da-24c3-4698-9ee6-d2f71f0347ce,sequencing,GTEX-12KS4-1526-SM-5EQ6E.Aligned.sortedByCoord.out.patched.md.bigWig
4,8bcad8c0-3122-42fd-bdaa-fc912002057a,sequencing,GTEX-12KS4-1526-SM-5EQ6E.Aligned.sortedByCoord.out.patched.md.bam
5,deb14768-af73-44fd-891c-a4206ae94e0e,sequencing,GTEX-12KS4-1526-SM-5EQ6E.SJ.out.tab


## Unexpected Suffixes on gen3 identifiers [_RNASEQ_BAM_FILES, _RNASEQ_BIGWIG]

In [21]:
df = pd.read_sql_query("""select key, name,  json_extract(json, '$.object.submitter_id') as "gen3_submitter_id"   from vertices where gen3_submitter_id like '%_RNASEQ_BAM_FILES' or gen3_submitter_id like '%_BIGWIG'  limit 10;""", conn)
df

Unnamed: 0,key,name,gen3_submitter_id
0,000320ba-55cf-48bf-a08c-f34de815685c,sequencing,GTEX-15SHU-0126-SM-7KUEH.Aligned.sortedByCoord.out.patched.md.bam.bai_RNASEQ_BAM_FILES
1,00309d0c-6d1d-4d58-84b1-8f9f49ecebfe,sequencing,GTEX-1OJC3-1626-SM-E9U65.Aligned.sortedByCoord.out.patched.md.bam.bai_RNASEQ_BAM_FILES
2,003d4214-0ff1-41ee-9677-74b360ea7cfa,sequencing,GTEX-TSE9-2526-SM-4DXUS.Aligned.sortedByCoord.out.patched.md.bigWig_RNASEQ_BIGWIG
3,00420a20-5544-45fa-b12f-9356ccb7d013,sequencing,GTEX-11EM3-0826-SM-5N9CC.Aligned.sortedByCoord.out.patched.md.bam.bai_RNASEQ_BAM_FILES
4,004af85b-1aee-4287-9b22-9e0e514ff6a4,sequencing,GTEX-11DXW-0626-SM-5N9ER.Aligned.sortedByCoord.out.patched.md.bam_RNASEQ_BAM_FILES
5,00a309e1-367b-475a-9c61-a9e5821c2eba,sequencing,GTEX-ZY6K-1526-SM-5GZXE.Aligned.sortedByCoord.out.patched.md.bigWig_RNASEQ_BIGWIG
6,00b132b9-8602-41b0-8c4e-9697195795f8,sequencing,GTEX-1A8G7-1126-SM-731ED.Aligned.sortedByCoord.out.patched.md.bigWig_RNASEQ_BIGWIG
7,00d78b3f-76b5-4803-b694-6b71b210a090,sequencing,GTEX-15CHC-1126-SM-6LLHQ.Aligned.sortedByCoord.out.patched.md.bam_RNASEQ_BAM_FILES
8,00daebe6-352e-4e7f-87ed-c207cc156d20,sequencing,GTEX-13NYS-3126-SM-5KLYV.Aligned.sortedByCoord.out.patched.md.bam_RNASEQ_BAM_FILES
9,00f5fc03-c78f-4b5f-97bb-2b5a018efee2,sequencing,GTEX-Q2AG-0326-SM-48U1O.Aligned.sortedByCoord.out.patched.md.bigWig_RNASEQ_BIGWIG


## subjects without PFB `sequencing` record

In [22]:

df = pd.read_sql_query("select project_id, count(*) from flattened  where subject_id  in (select key from  subjects_missing_sequencing);", conn)
df

Unnamed: 0,project_id,count(*)
0,CF-GTEx,59316


## unexpected suffix on subject identifiers [_subject]

In [23]:
sql = """select project_id, count( distinct subject_id)   from flattened where subject_submitter_id  like '%_subject'"""
df = pd.read_sql_query(sql, conn)
df

Unnamed: 0,project_id,count( distinct subject_id)
0,,0


# Transform 
## write FHIR
> Missing ontologies and/or malformed identifiers will be logged

In [6]:
# rm old json
!rm -r /tmp/CCDG
!rm -r /tmp/CMG
!rm -r /tmp/GTEx
!rm -r /tmp/ThousandGenomes

!rm -r /tmp/NHGRI
!rm -r /tmp/NIMH
!rm -r /tmp/PAGE

rm: cannot remove '/tmp/GTEx': No such file or directory
rm: cannot remove '/tmp/ThousandGenomes': No such file or directory
rm: cannot remove '/tmp/NHGRI': No such file or directory
rm: cannot remove '/tmp/NIMH': No such file or directory
rm: cannot remove '/tmp/PAGE': No such file or directory


In [7]:
logging.getLogger().setLevel(logging.INFO)
entities = Entities(path='/tmp/terra.sqlite') 
workspace_names = [workspace.name for workspace in entities.get_by_name('workspace')]
write_fhir(workspace_names)

2021-09-29 23:19:41,240 INFO     Transforming AnVIL_CMG_Broad_Muscle_KNC_WGS
2021-09-29 23:19:41,248 INFO     Loading /tmp/export_2021-09-29T18_43_01.avro
2021-09-29 23:19:41,250 INFO     Already indexed /tmp/export_2021-09-29T18_43_01.avro
2021-09-29 23:19:41,264 INFO     Transforming AnVIL_CMG_BaylorHopkins_HMB-NPU_WES
2021-09-29 23:19:41,265 ERROR    AnVIL_CMG_BaylorHopkins_HMB-NPU_WES missing subject edges
2021-09-29 23:19:41,273 INFO     Transforming ANVIL_CMG_UWASH_DS-HFA
2021-09-29 23:19:41,357 INFO     Transforming ANVIL_CMG_Broad_Muscle_Laing_WES
2021-09-29 23:19:41,434 INFO     Transforming AnVIL_CMG_Broad_Orphan_VCGS-White_WES
2021-09-29 23:19:45,421 INFO     Transforming AnVIL_CMG_Broad_Muscle_Myoseq_WES
2021-09-29 23:20:01,969 INFO     Transforming AnVIL_CMG_UWASH_HMB
2021-09-29 23:20:02,593 INFO     Transforming AnVIL_CMG_Broad_Heart_Ware_WES
2021-09-29 23:20:02,661 INFO     Transforming AnVIL_CMG_Broad_Muscle_Beggs_WES
2021-09-29 23:20:03,649 INFO     Transforming AnVIL_

AnVIL_CCDG_Broad_CVD_AF_EAST_WES fc-282a8e0b-df88-42de-9059-2b7447d9f9c7 403 GET https://storage.googleapis.com/storage/v1/b/fc-282a8e0b-df88-42de-9059-2b7447d9f9c7/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:23:58,031 INFO     Transforming AnVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES
2021-09-29 23:24:00,950 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMR_GRU-NPU_GSA-MD
2021-09-29 23:24:04,814 INFO     Transforming AnVIL_CCDG_NYGC_NP_Autism_ACE2_DS-MDS_WGS
2021-09-29 23:24:05,028 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_USAMSS_DS_EP_NEURO_MDS_GSA-MD
2021-09-29 23:24:05,849 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_CANUTN_DS-EP_GSA-MD
2021-09-29 23:24:07,629 INFO     Transforming anvil_ccdg_broad_ai_ibd_niddk_daly_brant_wes
2021-09-29 23:24:10,927 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_WES
2021-09-29 23:24:11,868 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_WES
2021-09-29 23:24:13,824 INFO     Transforming AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES


AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408 403 GET https://storage.googleapis.com/storage/v1/b/fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:24:18,522 INFO     Transforming AnVIL_CCDG_Broad_CVD_AF_VAFAR_WES
2021-09-29 23:24:23,461 INFO     Transforming AnVIL_CCDG_Broad_CVD_AFib_MGH_WGS
2021-09-29 23:24:26,034 INFO     Transforming AnVIL_CCDG_Broad_CVD_AF_Rienstra_WES
2021-09-29 23:24:32,212 INFO     Transforming AnVIL_ccdg_asc_ndd_daly_talkowski_kolevzon_asd_wgs
2021-09-29 23:24:32,727 INFO     Transforming AnVIL_CCDG_Baylor_CVD_Oregon_SUDS_GRU_WGS
2021-09-29 23:24:36,742 INFO     Transforming AnVIL_CCDG_NYGC_AI_Asthma_Gala2_WGS
2021-09-29 23:24:37,830 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_GSA-MD
2021-09-29 23:24:39,891 INFO     Transforming anvil_ccdg_broad_ai_ibd_daly_newberry_share_wes
2021-09-29 23:24:42,605 INFO     Transforming AnVIL_CCDG_WashU_CVD_PAGE_HMB-NPU_WGS
2021-09-29 23:24:43,019 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_GBRUNL_GRU_WES
2021-09-29 23:24:44,926 INFO     Transforming AnVIL_CCDG_Broad_MI_BRAVE_GRU_WES
2021-09-29 23:24:50,231 INFO     Transforming AnV

AnVIL_CCDG_Broad_CVD_AF_Natale_TCAI_WES fc-4f070061-0bc2-4f9a-9fe9-869a739c9817 403 GET https://storage.googleapis.com/storage/v1/b/fc-4f070061-0bc2-4f9a-9fe9-869a739c9817/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:26:18,296 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_DEUPUM_HMB_MDS_WES
2021-09-29 23:26:20,112 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_USACCH_DS_NEURO_MDS_WES
2021-09-29 23:26:22,999 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_DEUUTB_HMB-NPU_MDS_GSA-MD
2021-09-29 23:26:29,654 INFO     Transforming AnVIL_ccdg_asc_ndd_daly_talkowski_menashe_asd_exome
2021-09-29 23:26:32,653 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_GSA-MD
2021-09-29 23:26:34,350 INFO     Transforming AnVIL_ccdg_asc_ndd_daly_talkowski_AGRE_asd_exome
2021-09-29 23:26:38,410 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_GBRUCL_DS_EARET_MDS_GSA-MD
2021-09-29 23:26:42,074 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_GHAKNT_GRU_WES
2021-09-29 23:26:45,145 INFO     Transforming AnVIL_ccdg_asc_ndd_daly_talkowski_domenici_asd_exome
2021-09-29 23:26:49,149 INFO     Transforming anvil_ccdg_broad_ai_ibd_daly_rioux_niddk_wes
2021-09-29 23:26:55,524 INFO     Transf

AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_MDS_NPU_WES fc-82bbaf50-f3d4-48e9-bd76-3874638fa714 403 GET https://storage.googleapis.com/storage/v1/b/fc-82bbaf50-f3d4-48e9-bd76-3874638fa714/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:31:37,832 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_NZLUTO_EPIL_BC_ID_MDS_GSA-MD
2021-09-29 23:31:39,735 INFO     Transforming AnVIL_CCDG_Freeze2_VCFs
2021-09-29 23:31:39,737 ERROR    AnVIL_CCDG_Freeze2_VCFs missing subject edges
2021-09-29 23:31:40,419 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_HKGHKK_HMB_MDS_WES
2021-09-29 23:31:43,809 INFO     Transforming AnVIL_CCDG_Broad_NP_Autism_State-Sanders_WGS
2021-09-29 23:31:46,591 INFO     Transforming AnVIL_ccdg_asc_ndd_daly_talkowski_barbosa_asd_exome
2021-09-29 23:31:49,880 INFO     Transforming anvil_ccdg_broad_ai_ibd_niddk_daly_duerr_wes
2021-09-29 23:31:58,222 INFO     Transforming AnVIL_CCDG_WashU_CVD_EOCAD_WashU-CAD_DS_WGS
2021-09-29 23:31:58,785 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS-EAED-MDS_GSA-MD
2021-09-29 23:32:00,927 INFO     Transforming AnVIL_CCDG_Baylor_CVD_HemStroke_BNI_HMB_WGS
2021-09-29 23:32:01,893 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKL_HMB_WES
20

AnVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_GSRS_WES fc-8da94069-2edc-4e37-8c96-5a25740aeb32 403 GET https://storage.googleapis.com/storage/v1/b/fc-8da94069-2edc-4e37-8c96-5a25740aeb32/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:32:06,525 INFO     Transforming AnVIL_CCDG_WashU_CVD_EOCAD_BioVu_WGS
2021-09-29 23:32:10,729 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKB_HMB_NPU_MDS_WES
2021-09-29 23:32:21,963 INFO     Transforming anvil_ccdg_broad_ai_ibd_daly_rioux_igenomed_wes


anvil_ccdg_broad_ai_ibd_daly_rioux_igenomed_wes fc-9197a911-c2f8-4f5f-91f9-389d191626d0 403 GET https://storage.googleapis.com/storage/v1/b/fc-9197a911-c2f8-4f5f-91f9-389d191626d0/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:32:29,818 INFO     Transforming anvil_ccdg_broad_ai_ibd_daly_mcgovern_niddk_wes
2021-09-29 23:33:08,571 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_FINKPH_EPIL_MDS_GSA-MD
2021-09-29 23:33:14,610 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_GSA-MD
2021-09-29 23:33:23,837 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_ITAUBG_DS-EPI-NPU-MDS_GSA-MD
2021-09-29 23:33:27,218 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB-NPU-MDS_GSA-MD
2021-09-29 23:33:28,401 INFO     Transforming AnVIL_CCDG_NYGC_NP_Autism_CAG_DS_WGS
2021-09-29 23:33:32,246 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB-NPU-ADULTS_GSA-MD
2021-09-29 23:33:37,755 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB_NPU_MDS_WES
2021-09-29 23:33:39,383 INFO     Transforming AnVIL_ccdg_asc_ndd_daly_talkowski_palotie_asd_exome
2021-09-29 23:33:40,817 INFO     Transforming AnVIL_CCDG_Broad_CVD_AFib_Intermountain_WGS
2021-09-29 23:33:44,270 I

AnVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_WES fc-e7051891-25c8-4776-80ed-26b1af860277 403 GET https://storage.googleapis.com/storage/v1/b/fc-e7051891-25c8-4776-80ed-26b1af860277/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:43:17,782 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_GBRUCL_DS_EARET_MDS_WES
2021-09-29 23:43:25,806 INFO     Transforming anvil_ccdg_broad_ai_ibd_daly_franchimont_wes
2021-09-29 23:44:01,593 INFO     Transforming AnVIL_CCDG_Broad_CVD_EOCAD_PartnersBiobank_HMB_WES
2021-09-29 23:45:01,669 INFO     Transforming AnVIL_CCDG_WashU_CVD_EOCAD_Cleveland_WGS
2021-09-29 23:45:05,990 INFO     Transforming AnVIL_CCDG_NYGC_NP_Autism_SPARK_GRU_WGS
2021-09-29 23:46:05,731 INFO     Transforming AnVIL_CCDG_Broad_CVD_AFib_Duke_WGS
2021-09-29 23:46:06,926 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES
2021-09-29 23:46:07,887 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_CHOP_GRU_GSA-MD
2021-09-29 23:46:11,398 INFO     Transforming AnVIL_CCDG_Broad_NP_Epilepsy_USAVANcontrols_HMB-GSO_WES
2021-09-29 23:46:24,750 INFO     Transforming AnVIL_CCDG_TOPMed_WashU_CVD_Afib_Penn_WGS
2021-09-29 23:46:28,005 INFO     Transforming AnVIL_CCDG_NYGC_NP_Autism_AGRE

In [8]:
!ls /tmp/CCDG/AnVIL_CCDG_Broad_CVD_EOCAD_PartnersBiobank_HMB_WES
!

Organization.json  Practitioner.json   ResearchStudyObservation.json
Patient.json	   ResearchStudy.json  ResearchSubject.json


## write graph of json vertices

In [9]:
from anvil.terra.workspace_graph import WorkspaceGraph

entities = Entities(path='/tmp/terra.sqlite')   
workspace_graph = WorkspaceGraph(path='/tmp/terra-graph.sqlite')

for workspace in entities.get_by_name('workspace'):
    if 'CSER' in workspace.name:
        continue
    workspace_graph.save(workspace)
workspace_graph.index()    

2021-09-29 23:51:47,525 INFO     Loading AnVIL_CMG_Broad_Muscle_KNC_WGS
2021-09-29 23:51:47,544 INFO     Loading AnVIL_CMG_BaylorHopkins_HMB-NPU_WES
2021-09-29 23:51:47,864 INFO     Loading ANVIL_CMG_UWASH_DS-HFA
2021-09-29 23:51:47,914 INFO     Loading ANVIL_CMG_Broad_Muscle_Laing_WES
2021-09-29 23:51:47,948 INFO     Loading AnVIL_CMG_Broad_Orphan_VCGS-White_WES
2021-09-29 23:51:51,639 INFO     Loading AnVIL_CMG_Broad_Muscle_Myoseq_WES
2021-09-29 23:52:07,902 INFO     Loading AnVIL_CMG_UWASH_HMB
2021-09-29 23:52:08,143 INFO     Loading AnVIL_CMG_Broad_Heart_Ware_WES
2021-09-29 23:52:08,159 INFO     Loading AnVIL_CMG_Broad_Muscle_Beggs_WES
2021-09-29 23:52:08,834 INFO     Loading AnVIL_CMG_Broad_Blood_Sankaran_WGS
2021-09-29 23:52:09,474 INFO     Loading ANVIL_CMG_UWASH_DS-BDIS
2021-09-29 23:52:09,488 INFO     Loading AnVIL_CMG_UWash_DS-EP
2021-09-29 23:52:09,520 INFO     Loading AnVIL_CMG_Broad_Blood_Gazda_WGS
2021-09-29 23:52:10,173 INFO     Loading AnVIL_CMG_Broad_Orphan_Estonia-Oun

AnVIL_CCDG_Broad_CVD_AF_EAST_WES fc-282a8e0b-df88-42de-9059-2b7447d9f9c7 403 GET https://storage.googleapis.com/storage/v1/b/fc-282a8e0b-df88-42de-9059-2b7447d9f9c7/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:54:45,326 INFO     Loading AnVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES
2021-09-29 23:54:46,478 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMR_GRU-NPU_GSA-MD
2021-09-29 23:54:48,811 INFO     Loading AnVIL_CCDG_NYGC_NP_Autism_ACE2_DS-MDS_WGS
2021-09-29 23:54:48,871 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USAMSS_DS_EP_NEURO_MDS_GSA-MD
2021-09-29 23:54:49,391 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_CANUTN_DS-EP_GSA-MD
2021-09-29 23:54:49,943 INFO     Loading anvil_ccdg_broad_ai_ibd_niddk_daly_brant_wes
2021-09-29 23:54:52,217 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_WES
2021-09-29 23:54:52,732 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_WES
2021-09-29 23:54:53,269 INFO     Loading AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES


AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408 403 GET https://storage.googleapis.com/storage/v1/b/fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:54:54,971 INFO     Loading AnVIL_CCDG_Broad_CVD_AF_VAFAR_WES
2021-09-29 23:54:58,518 INFO     Loading AnVIL_CCDG_Broad_CVD_AFib_MGH_WGS
2021-09-29 23:54:59,041 INFO     Loading AnVIL_CCDG_Broad_CVD_AF_Rienstra_WES
2021-09-29 23:55:03,098 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_kolevzon_asd_wgs
2021-09-29 23:55:03,472 INFO     Loading AnVIL_CCDG_Baylor_CVD_Oregon_SUDS_GRU_WGS
2021-09-29 23:55:07,078 INFO     Loading AnVIL_CCDG_NYGC_AI_Asthma_Gala2_WGS
2021-09-29 23:55:07,475 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_GSA-MD
2021-09-29 23:55:08,481 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_newberry_share_wes
2021-09-29 23:55:10,485 INFO     Loading AnVIL_CCDG_WashU_CVD_PAGE_HMB-NPU_WGS
2021-09-29 23:55:10,864 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_GBRUNL_GRU_WES
2021-09-29 23:55:11,331 INFO     Loading AnVIL_CCDG_Broad_MI_BRAVE_GRU_WES
2021-09-29 23:55:14,523 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMR_GRU_NPU_WES
2021-09-29 23:5

AnVIL_CCDG_Broad_CVD_AF_Natale_TCAI_WES fc-4f070061-0bc2-4f9a-9fe9-869a739c9817 403 GET https://storage.googleapis.com/storage/v1/b/fc-4f070061-0bc2-4f9a-9fe9-869a739c9817/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:56:09,427 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_DEUPUM_HMB_MDS_WES
2021-09-29 23:56:10,470 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USACCH_DS_NEURO_MDS_WES
2021-09-29 23:56:11,563 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_DEUUTB_HMB-NPU_MDS_GSA-MD
2021-09-29 23:56:15,079 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_menashe_asd_exome
2021-09-29 23:56:16,931 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_GSA-MD
2021-09-29 23:56:17,754 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_AGRE_asd_exome
2021-09-29 23:56:19,888 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_GBRUCL_DS_EARET_MDS_GSA-MD
2021-09-29 23:56:21,795 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_GHAKNT_GRU_WES
2021-09-29 23:56:23,526 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_domenici_asd_exome
2021-09-29 23:56:25,385 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_rioux_niddk_wes
2021-09-29 23:56:29,659 INFO     Loading AnVIL_CCDG_Baylor_CVD_HemStroke_ERICH_WGS
2021-0

AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_MDS_NPU_WES fc-82bbaf50-f3d4-48e9-bd76-3874638fa714 403 GET https://storage.googleapis.com/storage/v1/b/fc-82bbaf50-f3d4-48e9-bd76-3874638fa714/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:59:37,007 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_NZLUTO_EPIL_BC_ID_MDS_GSA-MD
2021-09-29 23:59:38,054 INFO     Loading AnVIL_CCDG_Freeze2_VCFs
2021-09-29 23:59:38,739 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_HKGHKK_HMB_MDS_WES
2021-09-29 23:59:41,108 INFO     Loading AnVIL_CCDG_Broad_NP_Autism_State-Sanders_WGS
2021-09-29 23:59:43,166 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_barbosa_asd_exome
2021-09-29 23:59:44,168 INFO     Loading anvil_ccdg_broad_ai_ibd_niddk_daly_duerr_wes
2021-09-29 23:59:50,713 INFO     Loading AnVIL_CCDG_WashU_CVD_EOCAD_WashU-CAD_DS_WGS
2021-09-29 23:59:50,802 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS-EAED-MDS_GSA-MD
2021-09-29 23:59:52,282 INFO     Loading AnVIL_CCDG_Baylor_CVD_HemStroke_BNI_HMB_WGS
2021-09-29 23:59:53,044 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKL_HMB_WES
2021-09-29 23:59:54,075 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_herman_asd_exome
2021-09-29 23:59:54,570 INFO     Loadin

AnVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_GSRS_WES fc-8da94069-2edc-4e37-8c96-5a25740aeb32 403 GET https://storage.googleapis.com/storage/v1/b/fc-8da94069-2edc-4e37-8c96-5a25740aeb32/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-29 23:59:56,285 INFO     Loading AnVIL_CCDG_WashU_CVD_EOCAD_BioVu_WGS
2021-09-29 23:59:58,023 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKB_HMB_NPU_MDS_WES
2021-09-30 00:00:06,274 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_rioux_igenomed_wes


anvil_ccdg_broad_ai_ibd_daly_rioux_igenomed_wes fc-9197a911-c2f8-4f5f-91f9-389d191626d0 403 GET https://storage.googleapis.com/storage/v1/b/fc-9197a911-c2f8-4f5f-91f9-389d191626d0/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-30 00:00:06,827 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_mcgovern_niddk_wes
2021-09-30 00:00:35,535 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_FINKPH_EPIL_MDS_GSA-MD
2021-09-30 00:00:38,654 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_GSA-MD
2021-09-30 00:00:44,986 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_ITAUBG_DS-EPI-NPU-MDS_GSA-MD
2021-09-30 00:00:47,343 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB-NPU-MDS_GSA-MD
2021-09-30 00:00:48,214 INFO     Loading AnVIL_CCDG_NYGC_NP_Autism_CAG_DS_WGS
2021-09-30 00:00:50,581 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB-NPU-ADULTS_GSA-MD
2021-09-30 00:00:54,275 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB_NPU_MDS_WES
2021-09-30 00:00:55,214 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_palotie_asd_exome
2021-09-30 00:00:56,049 INFO     Loading AnVIL_CCDG_Broad_CVD_AFib_Intermountain_WGS
2021-09-30 00:00:57,982 INFO     Loading AnVIL_ccdg_asc_ndd_daly_talkowski_

AnVIL_CCDG_Broad_CVD_AF_Marcus_UCSF_WES fc-e7051891-25c8-4776-80ed-26b1af860277 403 GET https://storage.googleapis.com/storage/v1/b/fc-e7051891-25c8-4776-80ed-26b1af860277/o?projection=noAcl&fields=items%28size%2C+etag%2C+crc32c%2C+name%2C+timeCreated%29%2CnextPageToken&userProject=terra-test-bwalsh&prettyPrint=false: pet-110793006573203727769@terra-test-bwalsh.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket.


2021-09-30 00:07:57,810 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_GBRUCL_DS_EARET_MDS_WES
2021-09-30 00:08:01,201 INFO     Loading anvil_ccdg_broad_ai_ibd_daly_franchimont_wes
2021-09-30 00:08:28,303 INFO     Loading AnVIL_CCDG_Broad_CVD_EOCAD_PartnersBiobank_HMB_WES
2021-09-30 00:09:24,207 INFO     Loading AnVIL_CCDG_WashU_CVD_EOCAD_Cleveland_WGS
2021-09-30 00:09:25,735 INFO     Loading AnVIL_CCDG_NYGC_NP_Autism_SPARK_GRU_WGS
2021-09-30 00:10:18,815 INFO     Loading AnVIL_CCDG_Broad_CVD_AFib_Duke_WGS
2021-09-30 00:10:19,806 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES
2021-09-30 00:10:20,383 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_CHOP_GRU_GSA-MD
2021-09-30 00:10:21,932 INFO     Loading AnVIL_CCDG_Broad_NP_Epilepsy_USAVANcontrols_HMB-GSO_WES
2021-09-30 00:10:31,818 INFO     Loading AnVIL_CCDG_TOPMed_WashU_CVD_Afib_Penn_WGS
2021-09-30 00:10:34,148 INFO     Loading AnVIL_CCDG_NYGC_NP_Autism_AGRE_WGS
2021-09-30 00:10:35,586 INFO     Loading AnVIL_CCD

## "load" 

> Copy results to bucket

In [10]:
# dashboard data
!gsutil cp /tmp/data_dashboard.json  $WORKSPACE_BUCKET
!gsutil cp /tmp/data_dashboard.tsv  $WORKSPACE_BUCKET

# 
!gsutil cp /tmp/terra_summary.json  $WORKSPACE_BUCKET


Copying file:///tmp/data_dashboard.json [Content-Type=application/json]...
/ [1 files][323.7 KiB/323.7 KiB]                                                
Operation completed over 1 objects/323.7 KiB.                                    
Copying file:///tmp/data_dashboard.tsv [Content-Type=text/tab-separated-values]...
/ [1 files][ 50.8 KiB/ 50.8 KiB]                                                
Operation completed over 1 objects/50.8 KiB.                                     
Copying file:///tmp/terra_summary.json [Content-Type=application/json]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who do

In [11]:
# sqlite databases
!gsutil cp -r /tmp/*.sqlite $WORKSPACE_BUCKET


Copying file:///tmp/gen3-drs.sqlite [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file:///tmp/pyanvil-cache.sqlite [Content-Type=application/octet-stream]...
Copying file:///tmp/terra-graph.sqlite [Content-Type=application/octet-stream]...
Copying file:///tmp/terra.sqlite [Content-Type=application/octet-stre

In [12]:
# rm existing FHIR data from bucket
!gsutil -m rm $WORKSPACE_BUCKET/CCDG/**
!gsutil -m rm $WORKSPACE_BUCKET/CMG/**
!gsutil -m rm $WORKSPACE_BUCKET/GTEx/**
!gsutil -m rm $WORKSPACE_BUCKET/ThousandGenomes/**
!gsutil -m rm $WORKSPACE_BUCKET/NHGRI/**
!gsutil -m rm $WORKSPACE_BUCKET/NIMH/**
!gsutil -m rm $WORKSPACE_BUCKET/PAGE/**


# copy all to bucket
!gsutil -m cp -r /tmp/CCDG $WORKSPACE_BUCKET/CCDG
!gsutil -m cp -r /tmp/CMG $WORKSPACE_BUCKET/CMG
!gsutil -m cp -r /tmp/GTEx $WORKSPACE_BUCKET/GTEx
!gsutil -m cp -r /tmp/ThousandGenomes $WORKSPACE_BUCKET/ThousandGenomes

!gsutil -m cp -r /tmp/NHGRI $WORKSPACE_BUCKET/NHGRI
!gsutil -m cp -r /tmp/NIMH $WORKSPACE_BUCKET/NIMH
!gsutil -m cp -r /tmp/PAGE $WORKSPACE_BUCKET/PAGE


# # list bucket contents
# !gsutil ls -r $WORKSPACE_BUCKET/CCDG
# !gsutil ls -r $WORKSPACE_BUCKET/CMG
# !gsutil ls -r $WORKSPACE_BUCKET/GTEx
# !gsutil ls -r $WORKSPACE_BUCKET/ThousandGenomes

# !gsutil ls -r  $WORKSPACE_BUCKET/NHGRI
# !gsutil ls -r  $WORKSPACE_BUCKET/NIMH
# !gsutil ls -r  $WORKSPACE_BUCKET/PAGE





Removing gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/CCDG/AnVIL_CCDG_Baylor_CVD_AFib_BioVU_WGS/Observation.json...
Removing gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/CCDG/AnVIL_CCDG_Baylor_CVD_AFib_BioVU_WGS/Organization.json...
Removing gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/CCDG/AnVIL_CCDG_Baylor_CVD_AFib_BioVU_WGS/Practitioner.json...
Removing gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/CCDG/AnVIL_CCDG_Baylor_CVD_AFib_BioVU_WGS/Patient.json...
Removing gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/CCDG/AnVIL_CCDG_Baylor_CVD_AFib_Groningen_WGS/ResearchSubject.json...
Removing gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/CCDG/AnVIL_CCDG_Baylor_CVD_ARIC/Practitioner.json...
Removing gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/CCDG/AnVIL_CCDG_Baylor_CVD_ARIC/Observation.json...
Removing gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/CCDG/AnVIL_CCDG_Baylor_CVD_AFib_Groningen_WGS/Observation.json...
Removing gs://fc-secure-d

## Optional: load bucket contents into FHIR server
![image](https://user-images.githubusercontent.com/47808/102567204-e159de00-4095-11eb-883c-1f36e4790558.png)
![image](https://user-images.githubusercontent.com/47808/102567246-f46cae00-4095-11eb-8090-2fc28f1832e9.png)


# Quick test to test if we can read attributes of workspace we don't have access to

In [None]:
# "broad-genomics-data/CCDG_Ellinor_TIMI_AF_WES"
from anvil.terra.api import get_projects
projects = get_projects(namespaces="broad-genomics-data", project_pattern="CCDG_Ellinor_TIMI_AF_WES")
assert len(projects) > 0, f"Should return at least one project"


