# Generated Terra Clean Up Scripts
```
Andrew E. Davidson
aedavids@ucsc.edu
```

ref: extraCellularRNA/terra/jupyterNotebooks/generateTCGAMatrixCreationScripts.ipynb

generates 3 types shell script
1. delete deprecated salmon quant and aux files from the TCGA workspaces buckets
2. check good salmon quant files results exits
    * the deleteDeprecatedSalmonResults.sh are very slow. The call gsutil rm for each url
    * before running delete check scripts are disjoint
    ```
     grep -v \# *_checkSalmonResults.sh | cut -d " " -f 4 > checkUrls.txt
     grep -v \# *deleteDeprecatedSalmonResults.sh | cut -d " " -f 3 > deleteUrls.txt
     cat checkUrls.txt deleteUrls.txt | sort | uniq -c
         
     if the files rows are not disjoint count will be > 1
    ```
    * use a sanity test after running delete
3.*_deleteDeprecatedSalmonResultsURLS.txt
    * much faster
    ```
    cat *_deleteDeprecatedSalmonResultsURLS.txt | gsutil -m rm -I
    ```

TODO: manually delete columns from data models

In [1]:


import datetime as dt
import pandas as pd
from pathlib import Path

now = dt.datetime.now()
dateStr = now.strftime("%d/%m/%Y %H:%M:%S")
print("run on ", dateStr)

run on  23/09/2022 14:26:42


In [2]:
# back ups of terra data models are stored in a separate repo
# so that branch merges do not loose data model version
rootDir = "../../../terraDataModels/test-aedavids-proj/TCGA"
listOfWorkSpacePath = rootDir + "/" + "listOfWorkSpaces.csv" 
print(listOfWorkSpacePath)
workspaceDF = pd.read_csv( listOfWorkSpacePath )
workSpaceNamesList = workspaceDF.loc[:, "wokspace"].to_list()

../../../terraDataModels/test-aedavids-proj/TCGA/listOfWorkSpaces.csv


## Find  deprecated salmon column 
Our salmon workflow bind results to columns "*quant*" and "*aux*".
salmonTarQuantWorkflow v 4 was configure to save results in a column name with '3' in its name. This was not done consistently across all 33 TCGA workspace

In [3]:
def readDataModel( rootDir, workspaceName, entityName ) :
    '''
    entity referers to one of the terra data model tsv files. for exzmple 'sample'
    '''
    dataModelTSV = rootDir + "/" + workspaceName + "/" + entityName + ".tsv"
    dataModelDF = pd.read_csv(dataModelTSV, delimiter='\t')
    return dataModelDF

In [4]:
def findColumnsToDelete(df):
    '''
    returns column names contain sub string 'aux_' and 'quant' that does not include '3'
    '''
    columns = df.columns
    auxColNames = [m for m in columns if 'aux_' in m]
    quantColNames = [m for m in columns if 'quantF' in m]

    candidateColumnNames = auxColNames + quantColNames
    deleteColumnNames = [m for m in candidateColumnNames if '3' not in m]
    
    return sorted( deleteColumnNames )

def testFindColumnsToDelete():
    workspaceName = "TCGA_CESC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab"
    df = readDataModel( rootDir, workspaceName, entityName = "sample" )
    ret = findColumnsToDelete(df)
    print(ret)
    
testFindColumnsToDelete()    

['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']


In [5]:
def findColumnsToCheck(df):
    '''
    returns column names contain sub string 'aux_' and 'quant' that contain
    '''
    columns = df.columns 
    auxColNames = [m for m in columns if 'aux_' in m]
    quantColNames = [m for m in columns if 'quantF' in m]
    
    candidateColumnNames = auxColNames + quantColNames
    saveColumnNames = [m for m in candidateColumnNames if '3' in m] 
    
    return sorted (saveColumnNames)

def testFindColumnsToCheck():
    workspaceName = "TCGA_CESC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab"
    df = readDataModel( rootDir, workspaceName, entityName = "sample" )
    ret = findColumnsToCheck(df)
    print(ret)
    
testFindColumnsToCheck()

['aux_infoPaired3', 'quantFilePaired3']


In [6]:
def findQuantFileToDeleteColumnName(workSpaceNamesList, skipWorkspaceList) :
    '''
    loads most of the tcga sample tsv files. See code for workspaces that are where skipped.
    they where skipped because we not all the expected samples ran.
    
    returns a dictionary.
        key is the workspaceName
        value is dataFrame that only contains the names of columns with files urls
        we want to delete
    '''
    dataDict = dict()

    for workspaceName in workSpaceNamesList:
        if workspaceName in skipWorkspaceList:
            continue

        print("\nworkspaceName:{}".format(workspaceName))
        df = readDataModel( rootDir, workspaceName, entityName = "sample" )
        deleteColumnNames = findColumnsToDelete(df)
        print("deleteColumnNames: {}".format(deleteColumnNames))
        dataDict[workspaceName] = df.loc[:, deleteColumnNames]
        
    return dataDict

skipWorkspaceList = []
dataDict = findQuantFileToDeleteColumnName(workSpaceNamesList, skipWorkspaceList)        



workspaceName:TCGA_ACC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_BLCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_BRCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_CESC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_CHOL_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_COAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_DLBC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_i

In [7]:
def findQuantFileToCheckColumnName(workSpaceNamesList, skipWorkspaceList) :
    '''
    loads most of the tcga sample tsv files. See code for workspaces that are where skipped.
    they where skipped because we not all the expected samples ran.
    
    returns a dictionary.
        key is the workspaceName
        value is dataFrame that only contains the names of columns with files urls
        we want to delete
    '''
    dataDict = dict()

    for workspaceName in workSpaceNamesList:
        if workspaceName in skipWorkspaceList:
            continue

        print("\nworkspaceName:{}".format(workspaceName))
        df = readDataModel( rootDir, workspaceName, entityName = "sample" )
        saveColumnNames = findColumnsToCheck(df)

        print("saveColumnNames: {}".format(saveColumnNames))
        dataDict[workspaceName] = df.loc[:, saveColumnNames]
        
    return dataDict

skipWorkspaceList = []
dataDict = findQuantFileToCheckColumnName(workSpaceNamesList, skipWorkspaceList)        



workspaceName:TCGA_ACC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_BLCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_BRCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_CESC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_CHOL_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_COAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_DLBC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: []

workspaceName:TCGA_ESCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_GBM_Contro

## Create script to delete deprecated Salmon results

In [8]:
def createDeleteCommand(df):
    retList = []

    for col in df.columns:
        # do not create delete command for empty rows
        urls = df.loc[:, col].dropna().to_list()
        for url in urls:
            if url != "":
                retList.append( "gsutil rm {}".format(url) )
        
    return retList

In [9]:
def testCreateDeleteCommand():
    df = dataDict['TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab']#.head()
    commands = createDeleteCommand(df)
    print("number of files to delete:{}".format(len(commands)))
    for i in range(0,5):
        print(commands[i])
        
testCreateDeleteCommand()

number of files to delete:1136
gsutil rm gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/9ecd2e2d-b904-4bf5-b912-a01cfa8dc7da/call-salmon_paired_reads/THCA-4C-A93U-TP.aux_info.tar.gz
gsutil rm gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/5feb8dda-f128-4d20-9b40-38d9099d79f2/call-salmon_paired_reads/THCA-BJ-A0YZ-TP.aux_info.tar.gz
gsutil rm gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/4b431849-6874-4e61-9714-11c6c2423436/call-salmon_paired_reads/attempt-2/THCA-BJ-A0Z0-TP.aux_info.tar.gz
gsutil rm gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/e880327a-3182-461b-b009-ec5a3ec3d31e/call-salmon_paired_reads/attempt-2/THCA-BJ-A0Z2-TP.aux_info.tar.gz
gsutil rm gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/ce2dc642-6d17-4f38-bdca-a34489a4a

In [10]:
def createCheckCommand(df):
    retList = []
    cmdFmt = "gsutil -q stat {}; status=$?; if [[ $status == 1 ]]; then echo {} does not exist; fi"

    for col in df.columns:
        # do not create delete command for empty rows
        urls = df.loc[:, col].dropna().to_list()
        for url in urls:
            if url != "":
                cmd = cmdFmt.format(url, url)
                retList.append( cmd )
        
    return retList

In [11]:
def testCreateCheckCommand():
    df = dataDict['TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab']#.head()
    commands = createCheckCommand(df)
    print("number of files to delete:{}".format(len(commands)))
    for i in range(0,5):
        print(commands[i])

testCreateCheckCommand()

number of files to delete:1136
gsutil -q stat gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/9ecd2e2d-b904-4bf5-b912-a01cfa8dc7da/call-salmon_paired_reads/THCA-4C-A93U-TP.aux_info.tar.gz; status=$?; if [[ $status == 1 ]]; then echo gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/9ecd2e2d-b904-4bf5-b912-a01cfa8dc7da/call-salmon_paired_reads/THCA-4C-A93U-TP.aux_info.tar.gz does not exist; fi
gsutil -q stat gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/5feb8dda-f128-4d20-9b40-38d9099d79f2/call-salmon_paired_reads/THCA-BJ-A0YZ-TP.aux_info.tar.gz; status=$?; if [[ $status == 1 ]]; then echo gs://fc-secure-4f53a6a8-3350-4aea-8302-e29dc74e7588/1084847d-3af6-4a23-9bd5-610bf18f9220/quantify/5feb8dda-f128-4d20-9b40-38d9099d79f2/call-salmon_paired_reads/THCA-BJ-A0YZ-TP.aux_info.tar.gz does not exist; fi
gsutil -q stat gs://fc-secure-4f53a6a8-3350

In [12]:
def writeDeleteScript(rootDir, workspaceName, df):
    '''
    generated a 'gsutil rm' command for each URL . This is a very slow scripts
    use cat listofURLS.txt | gsutil -m rm -I
    '''
    outDirPath = Path(rootDir + "/" + workspaceName + "/" +  "generateTerraCleanUpScripts.ipynb.out")
    print("create dir: {}".format(outDirPath))
    outDirPath.mkdir( parents=True, exist_ok=True )
    
    scriptPath = outDirPath.joinpath(workspaceName + "_deleteDeprecatedSalmonResults.sh")
    with open(scriptPath, 'w') as fp:
        #
        # add header information
        #
        now = dt.datetime.now()
        dateStr = now.strftime("%d/%m/%Y %H:%M:%S")        
        fp.write("# created by:generateTerraCleanUpScripts.ipynb on {}".format(dateStr))
        fp.write("\n# sample.tsv urls came from column names\n")
        
        columnNames = df.columns.to_list()
        print(columnNames)                
        for name in columnNames:
            fp.write("# {}\n".format(name))
            
        #
        # write commands
        #
       
        for name in columnNames:
            fp.write("#\n")
            fp.write("# urls from column:{} \n".format(name))
            fp.write("#\n")
            tmpDF = df.loc[:,[name]]
            cmdList = createDeleteCommand(tmpDF)
            for cmd in cmdList:
                fp.write("{}\n".format(cmd))   
    
    print("wrote file: {}".format(scriptPath))    
    

def testWriteDeleteScript():
    workspaceName = 'TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab'
    df = dataDict['TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab'].head()
    writeDeleteScript(rootDir, workspaceName, df)
    
testWriteDeleteScript()

create dir: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out
['aux_info3', 'quantFile3']
wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_deleteDeprecatedSalmonResults.sh


In [13]:
def writeCheckScript(rootDir, workspaceName, df):
    '''
    uses gsutil -q stat
    '''
    outDirPath = Path(rootDir + "/" + workspaceName + "/" +  "generateTerraCleanUpScripts.ipynb.out")
    print("create dir: {}".format(outDirPath))
    outDirPath.mkdir( parents=True, exist_ok=True )
    
    scriptPath = outDirPath.joinpath(workspaceName + "_checkSalmonResults.sh")
    with open(scriptPath, 'w') as fp:
        #
        # add header information
        #
        now = dt.datetime.now()
        dateStr = now.strftime("%d/%m/%Y %H:%M:%S")        
        fp.write("# created by:generateTerraCleanUpScripts.ipynb on {}".format(dateStr))
        fp.write("\n# sample.tsv urls came from column names\n")
        
        columnNames = df.columns.to_list()
        print(columnNames)                
        for name in columnNames:
            fp.write("# {}\n".format(name))
            
        #
        # write commands
        #
       
        for name in columnNames:
            fp.write("#\n")
            fp.write("# urls from column:{} \n".format(name))
            fp.write("#\n")
            tmpDF = df.loc[:,[name]]
            cmdList = createCheckCommand(tmpDF)
            for cmd in cmdList:
                fp.write("{}\n".format(cmd))   
    
    print("wrote file: {}".format(scriptPath))        

In [14]:
def testWriteDCheckScript():
    workspaceName = 'TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab'
    df = dataDict['TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab'].head()
    writeCheckScript(rootDir, workspaceName, df)
    
testWriteDCheckScript()

create dir: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out
['aux_info3', 'quantFile3']
wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_checkSalmonResults.sh


In [15]:
def writeDeleteFileList(rootDir, workspaceName, df):
    '''
    generated a 'gsutil rm' command for each URL . This is a very slow scripts
    use cat listofURLS.txt | gsutil -m rm -I
    '''
    outDirPath = Path(rootDir + "/" + workspaceName + "/" +  "generateTerraCleanUpScripts.ipynb.out")
    print("create dir: {}".format(outDirPath))
    outDirPath.mkdir( parents=True, exist_ok=True )
    
#     scriptPath = outDirPath.joinpath(workspaceName + "_deleteDeprecatedSalmonResultsURLS.txt")
#     with open(scriptPath, 'w') as fp:
        #
        # add header information
        #
#         now = dt.datetime.now()
#         dateStr = now.strftime("%d/%m/%Y %H:%M:%S")        
#         fp.write("# created by:generateTerraCleanUpScripts.ipynb on {}".format(dateStr))
#         fp.write("\n# sample.tsv urls came from column names\n")
        
#         columnNames = df.columns.to_list()
#         print(columnNames)                
#         for name in columnNames:
#             fp.write("# {}\n".format(name))
            
        #
        # write commands
        #
        
    columnNames = df.columns.to_list()
    for name in columnNames:
        scriptPath = outDirPath.joinpath(workspaceName + "_" + name + "_deleteDeprecatedSalmonResultsURLS.txt")
        with open(scriptPath, 'w') as fp:        
            urls = df.loc[:,name].dropna().to_list()
            for url in urls:
                if url != "":
                    fp.write("{}\n".format(url))   

            print("wrote file: {}".format(scriptPath))    
    

def testWriteDeleteFileList():
    workspaceName = 'TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab'
    df = dataDict['TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab'].head()
    writeDeleteFileList(rootDir, workspaceName, df)
    
testWriteDeleteFileList()

create dir: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out
wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_aux_info3_deleteDeprecatedSalmonResultsURLS.txt
wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_quantFile3_deleteDeprecatedSalmonResultsURLS.txt


## Run all

In [16]:
def runWriteDeleteAll(rootDir, workSpaceNamesList, skipWorkspaceList):
    dataDict = findQuantFileToDeleteColumnName(workSpaceNamesList, skipWorkspaceList)     
    for key,df in dataDict.items():
        print("\n\n******** {}".format(key))
        writeDeleteScript(rootDir, key, df)
    
skipWorkspaceList = []
runWriteDeleteAll(rootDir, workSpaceNamesList, skipWorkspaceList)


workspaceName:TCGA_ACC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_BLCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_BRCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_CESC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_CHOL_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_COAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_DLBC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_i

wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_PCPG_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_PCPG_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_deleteDeprecatedSalmonResults.sh


******** TCGA_PRAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
create dir: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_PRAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out
['aux_info', 'quantFile']
wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_PRAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_PRAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_deleteDeprecatedSalmonResults.sh


******** TCGA_READ_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
create dir: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_READ_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out
['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePair

In [17]:
def runWriteCheckAll(rootDir, workSpaceNamesList, skipWorkspaceList):
    dataDict = findQuantFileToCheckColumnName(workSpaceNamesList, skipWorkspaceList)     
    for key,df in dataDict.items():
        print("\n\n******** {}".format(key))
        writeCheckScript(rootDir, key, df)
    
skipWorkspaceList = []
runWriteCheckAll(rootDir, workSpaceNamesList, skipWorkspaceList)


workspaceName:TCGA_ACC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_BLCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_BRCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_CESC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_CHOL_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_COAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_DLBC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: []

workspaceName:TCGA_ESCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
saveColumnNames: ['aux_infoPaired3', 'quantFilePaired3']

workspaceName:TCGA_GBM_Contro

wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_THCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_checkSalmonResults.sh


******** TCGA_THYM_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
create dir: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THYM_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out
['aux_info3', 'quantFile3']
wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_THYM_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_THYM_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_checkSalmonResults.sh


******** TCGA_UCEC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
create dir: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_UCEC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out
['aux_info3', 'quantFile3']
wrote file: ../../../terraDataModels/test-aedavids

In [18]:
def runWriteDeleteFileListAll(rootDir, workSpaceNamesList, skipWorkspaceList):
    dataDict = findQuantFileToDeleteColumnName(workSpaceNamesList, skipWorkspaceList)     
    for key,df in dataDict.items():
        print("\n\n******** {}".format(key))
        writeDeleteFileList(rootDir, key, df)
    
skipWorkspaceList = []
runWriteDeleteFileListAll(rootDir, workSpaceNamesList, skipWorkspaceList)


workspaceName:TCGA_ACC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_BLCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_BRCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_CESC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_CHOL_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_COAD_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_info', 'aux_infoPaired', 'quantFile', 'quantFilePaired']

workspaceName:TCGA_DLBC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
deleteColumnNames: ['aux_i

wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_LGG_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_LGG_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_quantFile_deleteDeprecatedSalmonResultsURLS.txt
wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_LGG_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_LGG_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_quantFilePaired_deleteDeprecatedSalmonResultsURLS.txt


******** TCGA_LIHC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab
create dir: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_LIHC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out
wrote file: ../../../terraDataModels/test-aedavids-proj/TCGA/TCGA_LIHC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab/generateTerraCleanUpScripts.ipynb.out/TCGA_LIHC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_aux_info_deleteDeprecatedSalmonResultsURLS.txt
wrote file: ../.