In [16]:
from datetime import datetime
import firecloud.api as FAPI
import json
import os
import pandas as pd
import pprint

print(FAPI.whoami())
pp = pprint.PrettyPrinter(indent=4)

BILLING_PROJECT_ID = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
CATALOG_WORKSPACE_NAMESPACE = "anvil-datastorage"

def getAttributes(workspace):
    return workspace["workspace"]["attributes"]

def getDataUseRestriction(workspace):
    return getAttributes(workspace).get("library:dataUseRestriction","Unspecified")

def getName(workspace):
    return workspace['workspace']['name']

def getIndication(workspace):
     return getAttributes(workspace).get("library:indication","Unspecified")
    
def getStudyDesign(workspace):
     return getAttributes(workspace).get("library:studyDesign","Unspecified")

def extractDataTypes(workspace):
    attributes = w["workspace"]["attributes"]
    dataTypes = attributes.get("library:datatype","Unspecified")
    if isinstance(dataTypes, dict):
        return  ",".join(dataTypes["items"])  # cant use dataType.items as items is a function on dict
    else:
        return dataTypes
    

!gsutil cp $WORKSPACE_BUCKET/AnVIL_Data_Ingestion_Attributes_210225-November-2-2021-1_17-PM-Dave_Using.csv  .
!gsutil cp $WORKSPACE_BUCKET/attributes_for_AnVIL_workspaces_update_210217.csv  .
!gsutil cp $WORKSPACE_BUCKET/AnVILCatalogWorkspacesInput-2022-03-08.csv  .



# ## Read in Dave Using list
# daveUsing = pd.read_csv('AnVIL_Data_Ingestion_Attributes_210225-November-2-2021-1_17-PM-Dave_Using.csv', keep_default_na=False)
# daveUsing = daveUsing[["name","library:dataUseRestriction", "library:datatype.items", "library:indication", "study_accession", "library:studyDesign"]]
# daveUsing.set_index("name", inplace=True)

## Use input list as "dave Using"
daveUsing = pd.read_csv('AnVILCatalogWorkspacesInput-2022-03-08.csv',keep_default_na=False)
daveUsing = daveUsing[["name","library:dataUseRestriction", "library:datatype", "library:indication", "phsId", "consortium","library:studyDesign"]]
daveUsing.set_index("name", inplace=True)

## Read in Tags Sheet 02-17
tagsheet = pd.read_csv('attributes_for_AnVIL_workspaces_update_210217.csv',keep_default_na=False)
tagsheet = tagsheet[["name","library:dataUseRestriction", "library:datatype.items", "library:indication", "library:studyDesign"]]
tagsheet.set_index("name", inplace=True)




## Get all workspaces we have access to
workspaces = FAPI.list_workspaces().json()

workspaceByName = {}
for w in workspaces:
    workspaceByName[getName(w)]=w



pet-117272931645288568532@terra-e36fcccd.iam.gserviceaccount.com
Copying gs://fc-cb5be780-171f-49d4-9116-b77fd2237d0b/AnVIL_Data_Ingestion_Attributes_210225-November-2-2021-1_17-PM-Dave_Using.csv...
/ [1 files][287.8 KiB/287.8 KiB]                                                
Operation completed over 1 objects/287.8 KiB.                                    
Copying gs://fc-cb5be780-171f-49d4-9116-b77fd2237d0b/attributes_for_AnVIL_workspaces_update_210217.csv...
/ [1 files][293.9 KiB/293.9 KiB]                                                
Operation completed over 1 objects/293.9 KiB.                                    
Copying gs://fc-cb5be780-171f-49d4-9116-b77fd2237d0b/AnVILCatalogWorkspacesInput-2022-03-08.csv...
/ [1 files][ 51.3 KiB/ 51.3 KiB]                                                
Operation completed over 1 objects/51.3 KiB.                                     


In [17]:
class Diff:
    def __init__(self, attribute, actual, expected):
        if pd.isna(expected):
            expected = ""
            
        if pd.isna(actual):
            actual = ""
        
        self.attribute = attribute
        self.actual = actual
        self.expected = expected
        
    def __str__(self):
        return "    "+self.attribute +": " + self.actual + " -> " + self.expected

def appendDiff(difffs, attribute, actual, expected):
    if actual != expected:
        d = Diff(attribute, actual, expected)
        diffs.append(d)
    
    

for name in daveUsing.index:
    if name in workspaceByName:
        
        diffs = []
        
        
        diffs.append( "    "+ "Tag - dbGaP:   -> " + daveUsing.at[name, "phsId"])
        diffs.append( "    "+ "Tag - Consortium:   -> " + daveUsing.at[name, "consortium"])
        
        appendDiff(diffs,
                   "DataUseRestriction",
                   getDataUseRestriction(workspaceByName[name] ),
                   daveUsing.at[name, "library:dataUseRestriction"])
        
        appendDiff(diffs,
                   "Indication",
                    getIndication(workspaceByName[name] ),
                    daveUsing.at[name, "library:indication"])
        
        appendDiff(diffs,
                   "DataTypes",
                    extractDataTypes(workspaceByName[name] ),
                    daveUsing.at[name, "library:datatype"])
        appendDiff(diffs,
                   "StudyDesign",
                    getStudyDesign(workspaceByName[name] ),
                    daveUsing.at[name, "library:studyDesign"])
        
        if len(diffs) > 0:
            print(name)
            for d in diffs:
                print(d)
            print("")



1000G-high-coverage-2019
    Tag - dbGaP:   -> None
    Tag - Consortium:   -> 1000G
    DataUseRestriction: Public -> NRES
    Indication: n/a -> NONE
    StudyDesign: n/a -> Parent-Offspring Trios

AnVIL_CCDG_WashU_CVD_EOCAD_BioImage_WGS
    Tag - dbGaP:   -> phs002325
    Tag - Consortium:   -> CCDG
    DataUseRestriction: DS-CVD  -> DS-CVD
    StudyDesign: Case/Control -> Case-Control

AnVIL_ccdg_asc_ndd_daly_talkowski_chung_asd_exome
    Tag - dbGaP:   -> phs000298
    Tag - Consortium:   -> CCDG
    DataUseRestriction: NA -> DS-ASD
    Indication: NA -> autism spectrum disorder
    DataTypes: Whole Genome -> Exome
    StudyDesign: NA -> Family/Twins/Trios

AnVIL_ccdg_asc_ndd_daly_talkowski_ac-boston_asd_exome
    Tag - dbGaP:   -> Registration Pending
    Tag - Consortium:   -> CCDG
    DataUseRestriction: NA -> DS-ASD
    Indication: NA -> autism spectrum disorder
    DataTypes: Whole Genome -> Exome
    StudyDesign: NA -> Family/Twins/Trios

AnVIL_ccdg_asc_ndd_daly_talkowski_la

    StudyDesign: NA -> Parent-Offspring Trios

AnVIL_CMH_GAFK_scRNA
    Tag - dbGaP:   -> phs002206
    Tag - Consortium:   -> CMH
    DataTypes: Whole Genome -> Unspecified
    StudyDesign:  -> Unspecified

AnVIL_CMH_GAFK_GS-long-read
    Tag - dbGaP:   -> phs002206
    Tag - Consortium:   -> CMH
    DataTypes: Whole Genome -> Unspecified
    StudyDesign:  -> Unspecified

AnVIL_CMH_GAFK_WGBS
    Tag - dbGaP:   -> phs002206
    Tag - Consortium:   -> CMH
    DataTypes: Whole Genome -> Unspecified
    StudyDesign:  -> Unspecified

AnVIL_CMH_GAFK_GS-linked-read
    Tag - dbGaP:   -> phs002206
    Tag - Consortium:   -> CMH
    DataTypes: Whole Genome -> Unspecified
    StudyDesign:  -> Unspecified

AnVIL_CMH_GAFK_SCATAC
    Tag - dbGaP:   -> phs002206
    Tag - Consortium:   -> CMH
    DataTypes: Whole Genome -> Unspecified
    StudyDesign:  -> Unspecified

AnVIL_NIMH_CIRM_FCDI_ConvergentNeuro_McCarroll_Eggan_GRU_Arrays
    Tag - dbGaP:   -> phs002032
    Tag - Consortium:   -> Convergen

In [13]:
for name in daveUsing.index:
    if name in workspaceByName:
        
        diffs = []
        
        appendDiff(diffs,
                   "DataUseRestriction",
                   tagsheet.at[name, "library:dataUseRestriction"],
                   daveUsing.at[name, "library:dataUseRestriction"])
        
        appendDiff(diffs,
                   "Indication",
                    tagsheet.at[name, "library:indication"],
                    daveUsing.at[name, "library:indication"])
        
        appendDiff(diffs,
                   "DataTypes",
                    tagsheet.at[name, "library:datatype.items"],
                    daveUsing.at[name, "library:datatype.items"])
        appendDiff(diffs,
                   "StudyDesign",
                    tagsheet.at[name, "library:studyDesign"],
                    daveUsing.at[name, "library:studyDesign"])
        
        if len(diffs) > 0:
            print(name)
            for d in diffs:
                print(d)
            print("")

1000G-high-coverage-2019
    DataUseRestriction: NRES -> open access
    Indication:  -> not applicable
    DataTypes: ['Whole Genome', 'VCF'] -> Whole Genome
    StudyDesign:  -> Parent-Offspring Trios

AnVIL_T2T
    DataUseRestriction: NRES -> open access
    Indication: Unspecified -> not applicable
    DataTypes: ['Whole Genome'] -> Whole Genome
    StudyDesign:  -> Parent-Offspring Trios

AnVIL_HPRC
    DataUseRestriction: NRES -> open access
    Indication: Unspecified -> not applicable
    DataTypes:  -> Whole Genome
    StudyDesign:  -> 

AnVIL_ccdg_asc_ndd_daly_talkowski_palotie_asd_exome
    DataUseRestriction:  -> DS-ASD
    DataTypes: ['Exome'] -> Exome
    StudyDesign:  -> Family/Twins/Trios

AnVIL_ccdg_asc_ndd_daly_talkowski_puura_asd_exome
    DataUseRestriction:  -> not applicable
    DataTypes: ['Exome'] -> Exome
    StudyDesign:  -> Family/Twins/Trios

AnVIL_CCDG_NYGC_NP_Alz_LOAD_WGS
    DataUseRestriction: GRU-IRB-PUB -> not applicable
    DataTypes:  -> Whole Genome

In [None]:
## Print out combined  workspace sheet