# pyAnVIL dashboard

## installation

> Ensure latest version installed

In [1]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip uninstall  -y pyanvil  
!{sys.executable} -m pip install   pyAnVIL==0.0.4rc4 --upgrade
!{sys.executable} -m pip show pyanvil


Found existing installation: pyAnVIL 0.0.4rc2
Uninstalling pyAnVIL-0.0.4rc2:
  Successfully uninstalled pyAnVIL-0.0.4rc2
Collecting pyAnVIL==0.0.4rc2
  Using cached pyAnVIL-0.0.4rc2-py3-none-any.whl (43 kB)
Collecting google-cloud-storage==1.19.1
  Using cached google_cloud_storage-1.19.1-py2.py3-none-any.whl (69 kB)
Collecting attrdict==2.0.1
  Using cached attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)
Collecting xmltodict==0.12.0
  Using cached xmltodict-0.12.0-py2.py3-none-any.whl (9.2 kB)
Processing /home/jupyter-user/.cache/pip/wheels/3e/34/f1/194eb61d12a8d6aa6f970e3e59ccbd40376cdd261b88c3231e/gen3-2.4.0-py3-none-any.whl
Collecting Click==7.0
  Using cached Click-7.0-py2.py3-none-any.whl (81 kB)
Processing /home/jupyter-user/.cache/pip/wheels/76/a4/8f/2a1dbc1dc45aeb3ccba650a2d02234ce1cc757d7f877689af5/firecloud-0.16.29-py3-none-any.whl
Collecting google-cloud-core<2.0dev,>=1.0.3
  Using cached google_cloud_core-1.4.3-py2.py3-none-any.whl (27 kB)
Collecting google-resumable-media<0.

## validation

> Note:  As a workaround, the tracking spreadsheet is embedded in the python package.  This spreadsheet provides a list of workspaces that should exist and associated dbGap accessions

In [2]:
# check installation, import should work
import os
import time
# embedded data tracking spreadsheet should exist    
# see https://docs.google.com/spreadsheets/d/1UvQimGHggygeJeTIPjIi6Ze3ryxsUdVjjn8BoIFkyho/edit#gid=552844485
from anvil.dbgap.api import DEFAULT_OUTPUT_PATH
assert os.path.isfile(DEFAULT_OUTPUT_PATH), "embedded data tracking spreadsheet should exist"


# extract

> Extract all meta data, reconcile with google bucket and dbGap data

In [3]:
import json
import logging
from anvil.util.reconciler import aggregate, DEFAULT_NAMESPACE

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')

# store aggregated data locally
DASHBOARD_OUTPUT_PATH = '/tmp/data_dashboard.json'


def reconcile_all(user_project, consortiums,  namespace=DEFAULT_NAMESPACE, output_path=DASHBOARD_OUTPUT_PATH):
    """Reconcile and aggregate results.

    e.g. bin/reconciler --user_project <your-billing-project>  --consortium CMG AnVIL_CMG.* --consortium CCDG AnVIL_CCDG.* --consortium GTEx ^AnVIL_GTEx_V8_hg38$ --consortium ThousandGenomes ^1000G-high-coverage-2019$
    """
    with open(output_path, 'w') as outs:
        json.dump({'projects': [v for v in aggregate(namespace, user_project, consortiums)]}, outs)


logging.info("Starting aggregation for all AnVIL workspaces, this will take several minutes.")        
reconcile_all(
    user_project = os.environ['GOOGLE_PROJECT'],
    consortiums = (
        ('CMG', 'AnVIL_CMG.*'),
        ('CCDG', 'AnVIL_CCDG.*'),
        ('GTEx', '^AnVIL_GTEx_V8_hg38$'),
        ('ThousandGenomes', '^1000G-high-coverage-2019$')
    )
)

2020-11-13 15:46:26,078 INFO     Starting aggregation for all AnVIL workspaces, this will take several minutes.




## validate

In [13]:
# python json serializer setup

import datetime
def json_serial(obj):
    """JSON serializer for objects not serializable by default json code."""
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))



# validate output
import json
import os
DASHBOARD_OUTPUT_PATH = '/tmp/data_dashboard.json'
assert os.path.isfile(DASHBOARD_OUTPUT_PATH), "dashboard should exist"
with open(DASHBOARD_OUTPUT_PATH, 'r') as inputs:
    dashboard_data = json.load(inputs)
    
print(dashboard_data)    

{'projects': [{'file_histogram': [{'count': 30, 'size': 168922682295, 'date': '2020-06-10'}, {'count': 12, 'size': 78729105265, 'date': '2019-06-06'}], 'files': [{'count': 14, 'size': 448, 'type': 'Md5'}, {'count': 14, 'size': 20933293, 'type': 'Crai'}, {'count': 14, 'size': 247630853819, 'type': 'Cram'}], 'nodes': [{'type': 'Project', 'count': 1}, {'type': 'Subject', 'count': 14}, {'type': 'Samples', 'count': 14}], 'size': 247651787560, 'project_id': 'AnVIL_CMG_Broad_Muscle_KNC_WGS', 'public': False, 'createdDate': '2019-06-06T17:33:03.378Z', 'lastModified': '2020-07-30T19:08:57.351Z', 'data_type': ['Whole Genome'], 'data_category': ['Raw Sequencing data'], 'problems': [], 'source': 'CMG', 'accession': 'phs001272', 'qualified_accession': 'phs001272.v1.p1', 'dbgap_sample_count': 1079}, {'file_histogram': [{'count': 62, 'size': 78168449416, 'date': '2019-06-24'}], 'files': [{'count': 31, 'size': 7125945, 'type': 'Crai'}, {'count': 31, 'size': 78161323471, 'type': 'Cram'}], 'nodes': [{'t

# transform

> Flatten the results into a table

In [5]:
from anvil.util.reconciler import flatten
import pandas as pd

(flattened, column_names) = flatten(dashboard_data['projects'])
df = pd.DataFrame(flattened)  
df.columns = column_names
# Print the data  (all rows, all columns)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df 



Unnamed: 0,source,workspace,accession,Bai,Bam,Crai,Cram,Md5,Tbi,Vcf,size,Project,Samples,Subject,dbgap_sample_count_mismatch,inconsistent_entityName,inconsistent_subject,missing_accession,missing_blobs,missing_samples,missing_schema,missing_sequence,missing_subjects
0,CMG,AnVIL_CMG_Broad_Muscle_KNC_WGS,phs001272.v1.p1,,,20933293.0,247630853819.0,448.0,,,247651787560,1.0,14.0,14.0,,,,,,,,,
1,CMG,ANVIL_CMG_Broad_Muscle_Laing_WES,phs001272.v1.p1,,,7125945.0,78161323471.0,,,,78168449416,1.0,31.0,31.0,,,,,,,,,
2,CMG,AnVIL_CMG_Broad_Orphan_VCGS-White_WES,phs001272.v1.p1,,,77687124.0,801236028306.0,14304.0,,,801313729734,1.0,447.0,447.0,,,,,,,,,
3,CMG,AnVIL_CMG_Broad_Muscle_Myoseq_WES,phs001272.v1.p1,,,252323449.0,2302789160222.0,39872.0,,,2303041523543,1.0,1280.0,1280.0,,,,,,,,,
4,CMG,AnVIL_CMG_Broad_Heart_Ware_WES,phs001272.v1.p1,,,1604113.0,18167853977.0,320.0,,,18169458410,1.0,10.0,10.0,,,,,,,,,
5,CMG,AnVIL_CMG_Broad_Muscle_Beggs_WES,phs001272.v1.p1,,,19128007.0,193902773220.0,3488.0,,,193921904715,1.0,109.0,109.0,,,,,,,,,
6,CMG,AnVIL_CMG_Broad_Blood_Gazda_WGS,phs001272.v1.p1,,,,,,,,0,1.0,0.0,0.0,,,,,True,True,True,,True
7,CMG,AnVIL_CMG_Broad_Orphan_Estonia-Ounap_WES,phs001272.v1.p1,,,5913845.0,76145872650.0,800.0,,,76151787295,1.0,31.0,31.0,,,,,,,,,
8,CMG,AnVIL_CMG_Broad_Eye_Pierce_WES,phs001272.v1.p1,,,127463057.0,1498688877878.0,19264.0,,,1498816360199,1.0,602.0,602.0,,,,,,,,,
9,CMG,AnVIL_CMG_Broad_Orphan_Estonia-Ounap_WGS,phs001272.v1.p1,,,157841116.0,2032832719304.0,3680.0,,,2032990564100,1.0,115.0,115.0,,,,,,,,,


In [6]:
# export create a tsv from dataframe
df.to_csv("/tmp/data_dashboard.tsv", sep="\t")


## summarize

> problems with comma separated list of workspaces [see more on dashboard exceptions](https://github.com/anvilproject/client-apis/wiki/dashboard-exceptions)

In [7]:
# explore
flattened = []
problems = set([problem for project in dashboard_data['projects'] for problem in project['problems']])
for problem in problems:
    projects = [project['project_id'] for project in dashboard_data['projects'] if problem in project['problems']]
    flattened.append([problem, ','.join(projects)])

# Print the data  (all rows, all columns)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.colheader_justify', 'left')

df = pd.DataFrame(flattened)  
df.columns = ['problem', 'affected_workspaces']
df = df.style.set_properties(**{'text-align': 'left'})
df 
# print (df.to_string (justify='left', index=False))


Unnamed: 0,problem,affected_workspaces
0,missing_blobs,"AnVIL_CMG_Broad_Blood_Gazda_WGS,AnVIL_CMG_Broad_Brain_Gleeson_WGS,AnVIL_CMG_Broad_Brain_Engle_WGS,AnVIL_CMG_Broad_Kidney_Pollak_WES,AnVIL_CMG_Broad_Orphan_Scott_WES,AnVIL_CMG_Broad_Blood_Fleming_WES,AnVIL_CMG_Broad_Muscle_Kang_WGS,AnVIL_CCDG_Freeze2_VCFAggregation,AnVIL_CCDG_Broad_NP_Epilepsy_DEUUPM_HMB_MDS_WES,AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_GRU_NPU_WES,AnVIL_CCDG_Baylor_CVD_EOCAD_SoL_WGS,AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_WES,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_CCDG_WashU_CVD-NP-AI_Controls_VCControls_WGS,AnVIL_ccdg_asc_ndd_daly_talkowski_simons_asd_exome_svip,AnVIL_CCDG_Broad_Deposit"
1,dbgap_sample_count_mismatch,"phs001272.v1.p1,phs001489.v1.p1,phs001642.v1.p1,phs001222.v1.p1,phs001227.v1.p1,phs001259.v1.p1,phs001543.v1.p1,phs001544.v1.p1,phs000160.v1.p1,phs001676.v1.p1,phs001502.v1.p1,phs001062.v4.p2,phs001180.v2.p1,phs000496.v1.p1,phs001395.v1.p1,phs001624.v1.p1,phs001545.v1.p1,phs000997.v4.p2,phs001398.v1.p1"
2,missing_schema,"AnVIL_CMG_Broad_Blood_Gazda_WGS,AnVIL_CMG_Broad_Brain_Gleeson_WGS,AnVIL_CMG_Broad_Brain_Engle_WGS,AnVIL_CMG_Broad_Kidney_Pollak_WES,AnVIL_CMG_Broad_Orphan_Scott_WES,AnVIL_CMG_Broad_Blood_Fleming_WES,AnVIL_CCDG_WASHU_PAGE,AnVIL_CCDG_Freeze2_VCFs,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_CCDG_Baylor_CVD_ARIC,AnVIL_CCDG_Broad_Deposit"
3,missing_samples,"AnVIL_CMG_Broad_Blood_Gazda_WGS,AnVIL_CMG_Broad_Brain_Gleeson_WGS,AnVIL_CMG_Broad_Brain_Engle_WGS,AnVIL_CMG_Broad_Kidney_Pollak_WES,AnVIL_CMG_Broad_Orphan_Scott_WES,AnVIL_CMG_Broad_Blood_Fleming_WES,AnVIL_CCDG_WASHU_PAGE,AnVIL_CCDG_Broad_AI_IBD_Cho_WGS,AnVIL_CCDG_Baylor_CVD_HemStroke_ERICH_WGS,AnVIL_CCDG_Freeze2_VCFs,AnVIL_CCDG_NYGC_NP_Autism_ACE2_GRU-MDS_WGS,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_CCDG_Baylor_CVD_ARIC,AnVIL_CCDG_Broad_Deposit,AnVIL_GTEx_V8_hg38"
4,missing_subjects,"AnVIL_CMG_Broad_Blood_Gazda_WGS,AnVIL_CMG_Broad_Brain_Gleeson_WGS,AnVIL_CMG_Broad_Brain_Engle_WGS,AnVIL_CMG_Broad_Kidney_Pollak_WES,AnVIL_CMG_Broad_Orphan_Scott_WES,AnVIL_CMG_Broad_Blood_Fleming_WES,AnVIL_CCDG_Freeze2_VCFAggregation,AnVIL_CCDG_WashU_CVD_EOCAD_BioImage_WGS,AnVIL_CCDG_WASHU_PAGE,AnVIL_CCDG_Freeze2_VCFs,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_CCDG_Baylor_CVD_ARIC,AnVIL_CCDG_Broad_Deposit"
5,inconsistent_subject,"AnVIL_CMG_Broad_Muscle_Topf_WES,AnVIL_CMG_UWash_GRU,AnVIL_CMG_Broad_Blood_Gazda_WES,AnVIL_CCDG_WashU_CVD-NP-AI_Controls_VCControls_WGS"
6,inconsistent_entityName,"AnVIL_CCDG_NYGC_NP_Autism_SSC_WGS,AnVIL_CCDG_NYGC_NP_Autism_ACE2_DS-MDS_WGS,AnVIL_CCDG_WashU_CVD-NP-AI_Controls_VCControls_WGS"
7,missing_accession,"AnVIL_CCDG_WashU_CVD_EOCAD_BioMe_WGS,AnVIL_CCDG_Broad_CVD_AFib_Penn_WGS,AnVIL_CCDG_WashU_CVD_EOCAD_METSIM_WGS,ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_ARRAY,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,AnVIL_CCDG_WashU_CVD_EOCAD_Duke_WGS,AnVIL_CCDG_Broad_CVD_AFib_Duke_WGS"
8,missing_sequence,"AnVIL_CMG_Broad_Muscle_Topf_WES,AnVIL_CMG_Broad_Muscle_Bonnemann_WGS,AnVIL_CMG_Broad_Muscle_Bonnemann_WES"


In [8]:
# list consistent workspaces

df = pd.DataFrame([project['project_id'] for project in dashboard_data['projects'] if len(project['problems']) == 0])  
df.columns = ['workspace']
df = df.style.set_properties(**{'text-align': 'left'})
df 

Unnamed: 0,workspace
0,AnVIL_CMG_Broad_Muscle_KNC_WGS
1,ANVIL_CMG_Broad_Muscle_Laing_WES
2,AnVIL_CMG_Broad_Orphan_VCGS-White_WES
3,AnVIL_CMG_Broad_Muscle_Myoseq_WES
4,AnVIL_CMG_Broad_Heart_Ware_WES
5,AnVIL_CMG_Broad_Muscle_Beggs_WES
6,AnVIL_CMG_Broad_Orphan_Estonia-Ounap_WES
7,AnVIL_CMG_Broad_Eye_Pierce_WES
8,AnVIL_CMG_Broad_Orphan_Estonia-Ounap_WGS
9,AnVIL_CMG_Broad_Blood_Sankaran_WES


# "load" 

> Copy results to bucket

In [9]:
# copy json results to bucket
!gsutil cp /tmp/data_dashboard.json  $WORKSPACE_BUCKET

Copying file:///tmp/data_dashboard.json [Content-Type=application/json]...
/ [1 files][140.7 KiB/140.7 KiB]                                                
Operation completed over 1 objects/140.7 KiB.                                    


In [10]:
# copy tsv results to bucket
!gsutil cp /tmp/data_dashboard.tsv  $WORKSPACE_BUCKET

Copying file:///tmp/data_dashboard.tsv [Content-Type=text/tab-separated-values]...
/ [1 files][ 25.7 KiB/ 25.7 KiB]                                                
Operation completed over 1 objects/25.7 KiB.                                     


In [11]:
!gsutil ls  $WORKSPACE_BUCKET


gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/DocumentReference.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/Organization.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/Patient.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/Practitioner.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/ResearchStudy.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/ResearchSubject.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/Specimen.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/Task.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/data_dashboard.json
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/data_dashboard.tsv
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/export_2020-10-28T16_19_50.avro
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/export_2020-11-04T17_48_47.avro
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/export_2020-11-05T23_26_49.avro
gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2

# reconcile against gen3

In [12]:
# copy tsv results to bucket
!gsutil cp $WORKSPACE_BUCKET/export_2020-11-05T23_26_49.avro /tmp/export_2020-11-05T23_26_49.avro 

Copying gs://fc-secure-d8ae6fb6-76be-43a4-87a5-2ab255fc8d7d/export_2020-11-05T23_26_49.avro...
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

/ [1 files][226.4 MiB/226.4 MiB]                                                
Operation completed over 1 objects/226.4 MiB.                                    
