In [49]:
from datetime import datetime
import firecloud.api as FAPI
import json
import os
import pandas as pd
import pprint

print(FAPI.whoami())
pp = pprint.PrettyPrinter(indent=4)

BILLING_PROJECT_ID = os.environ['WORKSPACE_NAMESPACE']
WORKSPACE = os.environ['WORKSPACE_NAME']
WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
CATALOG_WORKSPACE_NAMESPACE = "anvil-datastorage"


!gsutil cp $WORKSPACE_BUCKET/AnVIL_Data_Ingestion_Attributes_210225-November-2-2021-1_17-PM-Dave_Using.csv  .
!gsutil cp $WORKSPACE_BUCKET/attributes_for_AnVIL_workspaces_update_210217.csv  .


## Read in Dave Using list
daveUsing = pd.read_csv('AnVIL_Data_Ingestion_Attributes_210225-November-2-2021-1_17-PM-Dave_Using.csv',keep_default_na=False)
daveUsing = daveUsing[["name","library:dataUseRestriction", "library:datatype.items", "library:indication", "study_accession", "library:studyDesign","x"]]
daveUsing.set_index("name", inplace=True)

## Read in Tags Sheet 02-17
tagSheet = pd.read_csv('attributes_for_AnVIL_workspaces_update_210217.csv',keep_default_na=False)
tagSheet = tagSheet[["name","library:dataUseRestriction", "library:datatype.items", "library:indication", "library:studyDesign", "tag:tags.items"]]
tagSheet.set_index("name", inplace=True)





pet-117272931645288568532@terra-e36fcccd.iam.gserviceaccount.com
Copying gs://fc-cb5be780-171f-49d4-9116-b77fd2237d0b/AnVIL_Data_Ingestion_Attributes_210225-November-2-2021-1_17-PM-Dave_Using.csv...
/ [1 files][287.8 KiB/287.8 KiB]                                                
Operation completed over 1 objects/287.8 KiB.                                    
Copying gs://fc-cb5be780-171f-49d4-9116-b77fd2237d0b/attributes_for_AnVIL_workspaces_update_210217.csv...
/ [1 files][293.9 KiB/293.9 KiB]                                                
Operation completed over 1 objects/293.9 KiB.                                    


In [66]:
"""
Extract only the fields we want from each file.
Write a single file that takes the Dave Using data for a workspace when it exists there
and falls back to the tag sheet data when it does not.
The tag sheet file is bigger, so we iterate over it first.
"""

def extractTagSheetTagItem(workspaceName, tagName):
    """
    Return the value of a "<tagName>:<value>" entry from a workspace's tag list.

    The "tag:tags.items" cell for workspaceName is read from the module-level
    tagSheet frame, passed through cleanUpDbGapTagSheetTag, and split on commas.
    Entries are examined in order after trimming surrounding whitespace.

    Returns the text after the colon of the first entry that starts with
    tagName. Returns "" when the workspace is not in the tag sheet, when no
    entry starts with tagName, or when the first such entry does not split
    into exactly two parts on ":".
    """
    if workspaceName not in tagSheet.index:
        return ""

    rawTags = cleanUpDbGapTagSheetTag(tagSheet.at[workspaceName, "tag:tags.items"])
    for entry in rawTags.split(","):
        entry = entry.strip()
        if not entry.startswith(tagName):
            continue
        ## Only the first entry with this prefix is considered; later ones are ignored.
        pieces = entry.split(":")
        return pieces[1] if len(pieces) == 2 else ""

    ## Not found, return unspecified
    return ""
    

def cleanUpDbGapTagSheetTag(tag):
    """
    Remove list punctuation from a tag-sheet cell.

    Returns "" when pd.isna(tag) is true; otherwise returns tag with every
    single quote, "[" and "]" character deleted. Commas and spaces are kept.
    """
    if pd.isna(tag):
        return ""

    ## Delete all three punctuation characters in a single pass.
    return tag.translate(str.maketrans("", "", "'[]"))


def setUnspecified(value):
    """
    Substitute the placeholder "TBD" for a missing value.

    A value counts as missing when pd.isna(value) is true or when it equals
    the empty string; anything else is returned unchanged.
    """
    isMissing = pd.isna(value) or value == ""
    return "TBD" if isMissing else value

def cleanUpDbGaPId(dbGaPId):
    """
    Normalise the placeholder spellings used for a dbGaP id.

    "--" becomes the string 'None', and the two known status phrases are
    re-capitalised. Any other value is returned unchanged; the match is exact
    and case-sensitive.
    """
    knownSpellings = {
        "--": 'None',
        "registration pending": "Registration Pending",
        'available through EGA': "Available through EGA",
    }
    return knownSpellings.get(dbGaPId, dbGaPId)

def cleanStudyDesign(studyDesign):
    """
    Map one study design label onto its catalog spelling.

    Known variants are rewritten ("Case/Control" -> "Case-Control",
    "Control"/"Controls" -> "Control Set", "Cross-sectional" ->
    "Cross-Sectional", "NA"/"N/A" -> "TBD"). Any other value is returned
    unchanged. Matching is exact: the input is not trimmed or case-folded.
    """
    catalogSpelling = {
        "Case/Control": 'Case-Control',
        "Case-Control": 'Case-Control',
        "Control": "Control Set",
        'Controls': "Control Set",
        'Cross-sectional': "Cross-Sectional",
        "NA": "TBD",
        "N/A": "TBD",
    }
    return catalogSpelling.get(studyDesign, studyDesign)
    
def cleanUpStudyDesigns(studyDesigns):
    """
    Apply cleanStudyDesign to every entry of a comma-separated list.

    The input string is split on "," and re-joined with "," after each piece
    has been cleaned.

    NOTE(review): pieces are not trimmed, so an entry that follows ", " keeps
    its leading space and is passed to cleanStudyDesign with it -- confirm
    whether that is intended.
    """
    return ','.join([cleanStudyDesign(piece) for piece in studyDesigns.split(',')])

def cleanDatatype(studyDesign):
    """
    Map one datatype label onto its catalog spelling.

    Despite its name, the studyDesign parameter carries a single datatype
    entry. The placeholders "AttributeValue", "NA" and "N/A" become "TBD";
    any other value is returned unchanged.
    """
    placeholders = ("AttributeValue", "NA", "N/A")
    return "TBD" if studyDesign in placeholders else studyDesign
    
def cleanUpDatatypes(dataTypes):
    """
    Apply cleanDatatype to every entry of a comma-separated list.

    The input string is split on "," and re-joined with "," after each piece
    has been cleaned.

    NOTE(review): pieces are not trimmed, so an entry that follows ", " keeps
    its leading space and is passed to cleanDatatype with it -- confirm
    whether that is intended.
    """
    return ','.join([cleanDatatype(piece) for piece in dataTypes.split(',')])

def cleanUpDataUseRestrictions(r):
    """
    Map a data use restriction label onto its catalog code.

    "open access" becomes "NRES", "NA"/"N/A" become "TBD" and
    "not applicable" becomes "Consortia Access Only". Any other value,
    including the empty string, is returned unchanged.
    """
    catalogCode = {
        "open access": "NRES",
        "NA": "TBD",
        "N/A": "TBD",
        "not applicable": "Consortia Access Only",
    }
    return catalogCode.get(r, r)
    
def cleanUpIndication(ind):
    """
    Replace the indication "not applicable" with the string "None".

    Every other value is returned unchanged.
    """
    return "None" if ind == "not applicable" else ind



catalogArray = []

for name in tagSheet.index:
    if name in daveUsing.index:
        wsAttributes = {
                'name': name,
                'consortium': daveUsing.at[name,"x"],
                'phsId': daveUsing.at[name,"study_accession"],
                'library:indication': daveUsing.at[name,"library:indication"],
                'library:studyDesign': daveUsing.at[name,"library:studyDesign"],
                'library:datatype': daveUsing.at[name,"library:datatype.items"],
                'library:dataUseRestriction': daveUsing.at[name,"library:dataUseRestriction"],
            }
    else:
         wsAttributes = {
                'name': name,
                'consortium': extractTagSheetTagItem(name,"consortium"),
                'phsId': extractTagSheetTagItem(name,"dbGaP"),
                'library:indication':tagSheet.at[name,"library:indication"],
                'library:studyDesign': tagSheet.at[name,"library:studyDesign"],
                'library:datatype': cleanUpDbGapTagSheetTag(tagSheet.at[name,"library:datatype.items"]),
                'library:dataUseRestriction':  tagSheet.at[name,"library:dataUseRestriction"],
            }
       
    wsAttributes['consortium'] = setUnspecified(wsAttributes['consortium'])
    wsAttributes['phsId'] = setUnspecified(cleanUpDbGaPId(wsAttributes['phsId'])) 
    wsAttributes["library:indication"] = setUnspecified(cleanUpIndication(wsAttributes["library:indication"]))
    wsAttributes['library:studyDesign'] = setUnspecified(cleanUpStudyDesigns(wsAttributes['library:studyDesign']))  
    wsAttributes["library:datatype"] = setUnspecified(cleanUpDatatypes(wsAttributes["library:datatype"]))
    wsAttributes["library:dataUseRestriction"] = cleanUpDataUseRestrictions(wsAttributes["library:dataUseRestriction"]) 
       
                                            
 
    catalogArray.append(wsAttributes)

    

    
fileName = 'AnVILCatalogWorkspacesInput-'+datetime.today().strftime('%Y-%m-%d')+".csv"

catalogDF = pd.DataFrame(catalogArray)
catalogDF.set_index("name", inplace=True)
catalogDF.to_csv(fileName)
!gsutil cp ./$fileName $WORKSPACE_BUCKET/

pp.pprint(catalogArray)    
    


Copying file://./AnVILCatalogWorkspacesInput-2022-03-08.csv [Content-Type=text/csv]...
/ [1 files][ 50.2 KiB/ 50.2 KiB]                                                
Operation completed over 1 objects/50.2 KiB.                                     
[   {   'consortium': '1000G',
        'library:dataUseRestriction': 'NRES',
        'library:datatype': 'Whole Genome',
        'library:indication': 'None',
        'library:studyDesign': 'Parent-Offspring Trios',
        'name': '1000G-high-coverage-2019',
        'phsId': 'None'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'DS-CVD',
        'library:datatype': 'Whole Genome',
        'library:indication': 'heart and blood vessel disease',
        'library:studyDesign': 'Case-Control',
        'name': 'AnVIL_CCDG_WashU_CVD_EOCAD_BioImage_WGS',
        'phsId': 'phs002325'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'DS-ASD',
        'library:datatype': 'Exome',
        'library:indicatio

        'library:datatype': 'Genotyping Array',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'TBD',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_KENKIL_GRU_GSA-MD',
        'phsId': 'phs001489'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'TBD',
        'library:datatype': 'Genotyping Array',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'Case-Control',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_GSRS_GSA-MD',
        'phsId': 'phs001489'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'TBD',
        'library:datatype': 'Exome',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'TBD',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_NPU_ADLT_WES',
        'phsId': 'phs001489'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'GRU-NPU',
        'library:datatype': 'Exome',
        'library:indication': 'epilepsy',
  

        'library:studyDesign': 'Case Set',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_JPNFKA_GRU_WES',
        'phsId': 'Registration Pending'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'DS-EP-ETIOLOGY-MDS',
        'library:datatype': 'Exome',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'Case Set',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_GBRUNL_EP_ETIOLOGY_MDS_WES',
        'phsId': 'phs001489'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'DS-EARET-MDS',
        'library:datatype': 'Exome',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'Case Set',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_GBRUCL_DS_EARET_MDS_WES',
        'phsId': 'phs001489'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'TBD',
        'library:datatype': 'Exome',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'TBD',
        'name': 'AnVIL_CCDG_Broad

    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'GRU',
        'library:datatype': 'Exome',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'Case-Control',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_USACHP_GRU_WES',
        'phsId': 'phs001489'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'TBD',
        'library:datatype': 'Genotyping Array',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'Case-Control',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_NZLUTO_EPIL_BC_ID_MDS_GSA-MD',
        'phsId': 'TBD'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'HMB-MDS',
        'library:datatype': 'Exome',
        'library:indication': 'epilepsy',
        'library:studyDesign': 'Case Set',
        'name': 'AnVIL_CCDG_Broad_NP_Epilepsy_HKGHKK_HMB_MDS_WES',
        'phsId': 'phs001489'},
    {   'consortium': 'CCDG',
        'library:dataUseRestriction': 'TBD',
        'library: