# Create GTEx-TCGA DESeq Data Sets

```
Andrew Davidson
aedavids@ucsc.edu
```


In [1]:
# use display() to print an html version of a data frame
# useful if dataFrame output is not generated by last like of cell
from IPython.display import display

import os
from os.path import exists
import pandas as pd


WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
print(WORKSPACE_BUCKET)

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275


## Download , Read and Check data

In [2]:
tmp = "./tmp"
!mkdir -p $tmp

In [3]:
TCGAColDataURL = WORKSPACE_BUCKET + "/data/colData/trainingDataSets"
TCGADataURL    = WORKSPACE_BUCKET + "/data/matrices/groupByGeneId/trainingDataSets"

GTExColDataURL = WORKSPACE_BUCKET #+ "/GTEx*ColData.csv"
GTExDataURL    = WORKSPACE_BUCKET #+ "/GTEx*GroupByGenesCountMatrix.csv"

In [4]:
# uncomment to reset/rerun 
# pathToOldResults = WORKSPACE_BUCKET + "/data/GTEx_TCGA"
# ! gsutil ls $pathToOldResults
# ! gsutil rm -r $pathToOldResults

In [5]:
!gsutil ls $TCGAColDataURL

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-TestColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-TrainColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-ValidateColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-miscColData.csv


In [6]:
!gsutil ls $TCGADataURL

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/matrices/groupByGeneId/trainingDataSets/TCGA-TestGroupByGeneId.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/matrices/groupByGeneId/trainingDataSets/TCGA-TrainGroupByGeneId.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/matrices/groupByGeneId/trainingDataSets/TCGA-ValidateGroupByGeneId.csv


In [7]:
p = GTExDataURL + "/GTEx*GroupByGenesCountMatrix.csv"
!gsutil ls $p

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExTestGroupByGenesCountMatrix.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExTrainGroupByGenesCountMatrix.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExValidateGroupByGenesCountMatrix.csv


In [8]:
p = GTExColDataURL + "/GTEx*ColData.csv"
#print( p)
!gsutil ls $p

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExTestColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExTrainColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExValidateColData.csv


In [9]:
def downLoadColData(TCGAColDataURL, GTExColDataURL, cacheDir, trainingSetName="Test" ):
    tcga = "/TCGA-" + trainingSetName + "ColData.csv"
    if not exists(cacheDir + tcga):    
        srcURL = TCGAColDataURL + tcga
        print("srcURL:{} ".format(srcURL))
        !gsutil -m cp  $srcURL $cacheDir
    else:
        print("{} found in local cache".format(tcga))
    
    gtex = "/GTEx" + trainingSetName + "ColData.csv"
    if not exists(cacheDir + gtex):
        print("srcURL:{} ".format(srcURL))
        srcURL = GTExColDataURL + gtex
        !gsutil -m cp  $srcURL $cacheDir
    else:
        print("{} found in local cache".format(gtex))


# test
downLoadColData(TCGAColDataURL, GTExColDataURL, tmp, trainingSetName="Test" )

/TCGA-TestColData.csv found in local cache
/GTExTestColData.csv found in local cache


In [10]:
def downLoadGroubByData(TCGADataURL, GTExDataURL, cacheDir, trainingSetName="Test"):
    tcga = "/TCGA-" + trainingSetName + "GroupByGeneId.csv"
    if not exists(cacheDir + tcga):    
        srcURL = TCGADataURL + tcga
        !gsutil -m cp  $srcURL $cacheDir
    else:
        print("{} found in local cache".format(tcga))

    
    gtex = "/GTEx" + trainingSetName + "GroupByGenesCountMatrix.csv"
    if not exists(cacheDir + gtex):
        srcURL = GTExDataURL + gtex
        !gsutil -m cp  $srcURL $cacheDir
    else:
        print("{} found in local cache".format(gtex))
        
# test
downLoadGroubByData(TCGADataURL, GTExDataURL, tmp, trainingSetName="Test")

/TCGA-TestGroupByGeneId.csv found in local cache
/GTExTestGroupByGenesCountMatrix.csv found in local cache


In [11]:
def loadDF(localPath):
    retDF = None
    if not exists(localPath):
        print("ERROR {} not found".format(localPath))
    else:
        retDF = pd.read_csv(localPath)
    
    return retDF
        
def readData(cacheDir,  trainingSetName="Test"):
    GTexPath = cacheDir +  "/GTEx" + trainingSetName + "ColData.csv"
    GTExColDataDF = loadDF(GTexPath)
    
    TCGAPath = cacheDir +  "/TCGA-" + trainingSetName + "ColData.csv"
    TCGAColDataDF = loadDF(TCGAPath)
    
    GTEXGroupbyPath = cacheDir +  "/GTEx" + trainingSetName + "GroupByGenesCountMatrix.csv"
    GTExGroupByDF = loadDF(GTEXGroupbyPath)
    
    TCGAGroupbyPath = cacheDir + "/TCGA-" + trainingSetName + "GroupByGeneId.csv"
    TCGAGroupByDF = loadDF(TCGAGroupbyPath)
    
    return (GTExColDataDF, GTExGroupByDF, TCGAColDataDF, TCGAGroupByDF)

# test
GTExColDataDF, GTExGroupByDF, TCGAColDataDF, TCGAGroupByDF = readData(tmp, trainingSetName="Test")

In [12]:
def validateColDataGroupByDataSets(colDataDF, groupByDF):
    '''
    assert number or rows in colData == number of columns in groupBy
    assert sample names are in the same order
    '''
    print( "colDataDF.shape: {}".format(colDataDF.shape) )
    #print( display(colDataDF.iloc[0:3, 0:2]) )
    
    print( "groupByDF.shape: {}".format(groupByDF.shape) )
    
    nRows = colDataDF.shape[0]
    nCols = groupByDF.shape[1]
    
    assert nRows == nCols -1, "ERROR num colData rows must equal num groupByCols -1"
    
    # make sure the rows and cols are in the same order
    colDataSampleNamesSeries = colDataDF.loc[:, 'sample_id']
    # skip the 'geneId col'
    groupBySampleNameList = groupByDF.columns.to_list()[1:]
    assert (colDataSampleNamesSeries == groupBySampleNameList).all(), "ERROR colDataSamples are not in same order as groupBy"

# test
print("GTEx")
validateColDataGroupByDataSets(GTExColDataDF, GTExGroupByDF)
print("\n TCGA")
validateColDataGroupByDataSets(TCGAColDataDF, TCGAGroupByDF )

GTEx
colDataDF.shape: (3471, 6)
groupByDF.shape: (74777, 3472)

 TCGA
colDataDF.shape: (1800, 7)
groupByDF.shape: (74777, 1801)


## Create Combined Data Set

In [13]:
def combineColData(GTExDF, TCGADF):
    gDF = GTExDF.rename( columns = {'sex':'gender', 'tissue_id':'category'})
    gDF['dataSet'] = "GTEx"
    gDF = gDF.loc[:, ['sample_id', 'participant_id', 'category', 'gender', 'age', 'dataSet']]
    #display(gDF)
    
    tDF = TCGADF.rename( columns={'Gender':'gender', 'Cohort':'category', 'Age':'age'} )
    tDF['dataSet'] = "TCGA"
    tDF = tDF.loc[:, ['sample_id', 'participant_id', 'category', 'gender', 'age', 'dataSet']]

    #print()
    #display(tDF)
    
    byRows = 0
    retDF = pd.concat( [gDF, tDF], axis=byRows)
    
    return retDF
  
# test
GTEx_TCGA_colDataDF = combineColData(GTExDF=GTExColDataDF, TCGADF=TCGAColDataDF)
print("GTEx_TCGA_colDataDF.shape: {}".format(GTEx_TCGA_colDataDF.shape))
display( GTEx_TCGA_colDataDF.head() )
display( GTEx_TCGA_colDataDF.tail() )

GTEx_TCGA_colDataDF.shape: (5271, 6)


Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Female,66.0,GTEx
1,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Female,66.0,GTEx
2,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F,Artery_Coronary,Female,66.0,GTEx
3,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F,Adipose_Visceral_Omentum,Female,66.0,GTEx
4,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F,Vagina,Female,66.0,GTEx


Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
1795,UVM-VD-AA8N-TP,UVM-VD-AA8N,UVM,male,86.0,TCGA
1796,UVM-WC-A87W-TP,UVM-WC-A87W,UVM,female,57.0,TCGA
1797,UVM-WC-A87Y-TP,UVM-WC-A87Y,UVM,male,59.0,TCGA
1798,UVM-WC-AA9A-TP,UVM-WC-AA9A,UVM,female,70.0,TCGA
1799,UVM-YZ-A982-TP,UVM-YZ-A982,UVM,female,79.0,TCGA


In [14]:
def combineGroupBy(GTExDF, TCGADF):
    '''
    asserts the geneId's are in the same order
    '''
    gGroupbyGenes = GTExDF.loc[:, 'geneId']
    tGroupbyGenes = TCGADF.loc[:, 'geneId']

    assert (gGroupbyGenes == tGroupbyGenes).all(), "ERROR GTExDF and TCGADF geneId's do not match!"
    
    # remove the geneId col from TCGA, we do not want to have 2 copies of this column
    tcgaColList = TCGADF.columns.to_list()
    tcgaColList.remove('geneId')
    
    df = TCGADF.loc[:, tcgaColList]
    
    byColumn = 1    
    retDF = pd.concat( [GTExDF, df], axis=byColumn)
    
    return retDF
  
#test
GTEx_TCGA_groupbyDF = combineGroupBy(GTExDF=GTExGroupByDF.head(), TCGADF=TCGAGroupByDF.head())
print("GTEx_TCGA_groupbyDF.shape: {}".format(GTEx_TCGA_groupbyDF.shape))
display( GTEx_TCGA_groupbyDF.iloc[0:5, 0:3] )
GTEx_TCGA_groupbyDF.iloc[0:5:, -3:]

GTEx_TCGA_groupbyDF.shape: (5, 5272)


Unnamed: 0,geneId,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ
0,(A)n,9,1
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0
3,(AAAAAAG)n,0,0
4,(AAAAAAT)n,0,0


Unnamed: 0,UVM-WC-A87Y-TP,UVM-WC-AA9A-TP,UVM-YZ-A982-TP
0,3,3,3
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0


## Run All

In [15]:
%%time
def bucketURLExists(url):
    '''
    returns 0 if url exists
    '''
    exitCodeList = ! (gsutil -q stat $url; echo $?)
    exitCode = int(exitCodeList[0])
    return exitCode

def run(BUCKET, cacheDir):
    for trainingSetName in ['Test', 'Validate', 'Train']:
        print("\n********** trainingSetName: {}".format(trainingSetName))
        downLoadColData(TCGAColDataURL, GTExColDataURL, cacheDir, trainingSetName )
        downLoadGroubByData(TCGADataURL, GTExDataURL, cacheDir, trainingSetName)
        
        colDataPath = cacheDir + "/GTEx_TCGA_" + trainingSetName + "ColData.csv"
        colDataURL  = BUCKET   + "/GTEx_TCGA_" + trainingSetName + "ColData.csv"
        groupByPath = cacheDir + "/GTEx_TCGA_" + trainingSetName + "Groupby.csv"
        groupByURL  = BUCKET   + "/GTEx_TCGA_" + trainingSetName + "Groupby.csv"
        
        colDataExits = bucketURLExists(colDataURL)
        groupByExists = bucketURLExists(groupByURL)
        if not colDataExits and not groupByExists:
            print("skipping {} colData and groupby data exist".format(trainingSetName))
            continue
        
        GTExColDataDF, GTExGroupByDF, TCGAColDataDF, TCGAGroupByDF = readData(tmp, trainingSetName)
        print("GTEx")
        validateColDataGroupByDataSets(GTExColDataDF, GTExGroupByDF)
        print("\n TCGA")
        validateColDataGroupByDataSets(TCGAColDataDF, TCGAGroupByDF )   

        GTEx_TCGA_colDataDF = combineColData(GTExDF=GTExColDataDF, TCGADF=TCGAColDataDF)
        print("GTEx_TCGA_colDataDF.shape: {}".format(GTEx_TCGA_colDataDF.shape))
        display( GTEx_TCGA_colDataDF.head() )
        display( GTEx_TCGA_colDataDF.tail() )  

        GTEx_TCGA_groupbyDF = combineGroupBy(GTExDF=GTExGroupByDF, TCGADF=TCGAGroupByDF)
        print("GTEx_TCGA_groupbyDF.shape: {}".format(GTEx_TCGA_groupbyDF.shape))
        display( GTEx_TCGA_groupbyDF.iloc[0:5, 0:3] )
        GTEx_TCGA_groupbyDF.iloc[0:5:, -3:]  
        
        # quick check
        nCols = 1
        numCountCols = GTEx_TCGA_groupbyDF.shape[nCols] -1 # -1 to adjust for 'geneId' column
        
        nRows = 0
        numColDataRows = GTEx_TCGA_colDataDF.shape[nRows]
        
        errMsg = "ERROR numCountCols {} !=  numColDataRows -1: {}".format(numCountCols, numColDataRows)
        assert numCountCols == numColDataRows, errMsg

        # save to local disk
        print("saving: {}".format(colDataPath))
        GTEx_TCGA_colDataDF.to_csv(colDataPath, index=False)
        print("saving: {}".format(groupByPath))        
        GTEx_TCGA_groupbyDF.to_csv(groupByPath, index=False)

        # move to bucket 
        print("saving: {}".format(colDataURL))
        !gsutil -m cp $colDataPath $colDataURL
        print("saving: {}".format(groupByURL))        
        !gsutil -m cp $groupByPath $groupByURL

        
# clean up left over test memory 
GTExColDataDF = GTExGroupByDF = TCGAColDataDF = TCGAGroupByDF = None
GTEx_TCGA_colDataDF = None
GTEx_TCGA_groupbyDF = None
run( WORKSPACE_BUCKET + "/data/GTEx_TCGA", tmp )


********** trainingSetName: Test
/TCGA-TestColData.csv found in local cache
/GTExTestColData.csv found in local cache
/TCGA-TestGroupByGeneId.csv found in local cache
/GTExTestGroupByGenesCountMatrix.csv found in local cache
GTEx
colDataDF.shape: (3471, 6)
groupByDF.shape: (74777, 3472)

 TCGA
colDataDF.shape: (1800, 7)
groupByDF.shape: (74777, 1801)
GTEx_TCGA_colDataDF.shape: (5271, 6)


Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Female,66.0,GTEx
1,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Female,66.0,GTEx
2,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F,Artery_Coronary,Female,66.0,GTEx
3,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F,Adipose_Visceral_Omentum,Female,66.0,GTEx
4,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F,Vagina,Female,66.0,GTEx


Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
1795,UVM-VD-AA8N-TP,UVM-VD-AA8N,UVM,male,86.0,TCGA
1796,UVM-WC-A87W-TP,UVM-WC-A87W,UVM,female,57.0,TCGA
1797,UVM-WC-A87Y-TP,UVM-WC-A87Y,UVM,male,59.0,TCGA
1798,UVM-WC-AA9A-TP,UVM-WC-AA9A,UVM,female,70.0,TCGA
1799,UVM-YZ-A982-TP,UVM-YZ-A982,UVM,female,79.0,TCGA


GTEx_TCGA_groupbyDF.shape: (74777, 5272)


Unnamed: 0,geneId,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ
0,(A)n,9,1
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0
3,(AAAAAAG)n,0,0
4,(AAAAAAT)n,0,0


saving: ./tmp/GTEx_TCGA_TestColData.csv
saving: ./tmp/GTEx_TCGA_TestGroupby.csv
saving: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_TestColData.csv
Copying file://./tmp/GTEx_TCGA_TestColData.csv [Content-Type=text/csv]...
/ [1/1 files][327.2 KiB/327.2 KiB] 100% Done                                    
Operation completed over 1 objects/327.2 KiB.                                    
saving: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_TestGroupby.csv
Copying file://./tmp/GTEx_TCGA_TestGroupby.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means

Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
0,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F,Muscle_Skeletal,Female,66.0,GTEx
1,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Female,66.0,GTEx
2,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F,Uterus,Female,66.0,GTEx
3,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F,Breast_Mammary_Tissue,Female,66.0,GTEx
4,GTEX-1117F-2926-SM-5GZYI,GTEX-1117F,Skin_Not_Sun_Exposed_Suprapubic,Female,66.0,GTEx


Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
1795,UVM-VD-AA8P-TP,UVM-VD-AA8P,UVM,female,64.0,TCGA
1796,UVM-VD-AA8S-TP,UVM-VD-AA8S,UVM,male,40.0,TCGA
1797,UVM-WC-A880-TP,UVM-WC-A880,UVM,male,63.0,TCGA
1798,UVM-WC-A885-TP,UVM-WC-A885,UVM,male,60.0,TCGA
1799,UVM-WC-A88A-TP,UVM-WC-A88A,UVM,male,75.0,TCGA


GTEx_TCGA_groupbyDF.shape: (74777, 5272)


Unnamed: 0,geneId,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0726-SM-5GIEN
0,(A)n,0,3
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0
3,(AAAAAAG)n,0,0
4,(AAAAAAT)n,0,0


saving: ./tmp/GTEx_TCGA_ValidateColData.csv
saving: ./tmp/GTEx_TCGA_ValidateGroupby.csv
saving: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_ValidateColData.csv
Copying file://./tmp/GTEx_TCGA_ValidateColData.csv [Content-Type=text/csv]...
/ [1/1 files][327.1 KiB/327.1 KiB] 100% Done                                    
Operation completed over 1 objects/327.1 KiB.                                    
saving: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_ValidateGroupby.csv
Copying file://./tmp/GTEx_TCGA_ValidateGroupby.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composit

Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Female,66.0,GTEx
1,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Female,66.0,GTEx
2,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Female,66.0,GTEx
3,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F,Breast_Mammary_Tissue,Female,66.0,GTEx
4,GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,Brain_Cortex,Female,66.0,GTEx


Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
5395,UVM-WC-AA9E-TP,UVM-WC-AA9E,UVM,male,60.0,TCGA
5396,UVM-YZ-A980-TP,UVM-YZ-A980,UVM,male,75.0,TCGA
5397,UVM-YZ-A983-TP,UVM-YZ-A983,UVM,female,51.0,TCGA
5398,UVM-YZ-A984-TP,UVM-YZ-A984,UVM,female,50.0,TCGA
5399,UVM-YZ-A985-TP,UVM-YZ-A985,UVM,female,41.0,TCGA


GTEx_TCGA_groupbyDF.shape: (74777, 15812)


Unnamed: 0,geneId,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ
0,(A)n,9,1
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0
3,(AAAAAAG)n,0,0
4,(AAAAAAT)n,0,0


saving: ./tmp/GTEx_TCGA_TrainColData.csv
saving: ./tmp/GTEx_TCGA_TrainGroupby.csv
saving: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_TrainColData.csv
Copying file://./tmp/GTEx_TCGA_TrainColData.csv [Content-Type=text/csv]...
/ [1/1 files][970.7 KiB/970.7 KiB] 100% Done                                    
Operation completed over 1 objects/970.7 KiB.                                    
saving: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_TrainGroupby.csv
Copying file://./tmp/GTEx_TCGA_TrainGroupby.csv [Content-Type=text/csv]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which

In [16]:
# quick sanity check
!pwd
p = "/home/jupyter/uber/edit/tmp/GTEx_T*"
! wc -l $p
print()
url = WORKSPACE_BUCKET + "/data/GTEx_TCGA"
! gsutil ls -l $url

/home/jupyter/uber/edit
      5272 /home/jupyter/uber/edit/tmp/GTEx_TCGA_TestColData.csv
     74778 /home/jupyter/uber/edit/tmp/GTEx_TCGA_TestGroupby.csv
     15812 /home/jupyter/uber/edit/tmp/GTEx_TCGA_TrainColData.csv
     74778 /home/jupyter/uber/edit/tmp/GTEx_TCGA_TrainGroupby.csv
      5272 /home/jupyter/uber/edit/tmp/GTEx_TCGA_ValidateColData.csv
     74778 /home/jupyter/uber/edit/tmp/GTEx_TCGA_ValidateGroupby.csv
    250690 total

    335079  2022-07-12T02:37:08Z  gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_TestColData.csv
1014624246  2022-07-12T02:37:21Z  gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_TestGroupby.csv
    994010  2022-07-12T02:52:39Z  gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_TrainColData.csv
3045437677  2022-07-12T02:53:13Z  gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/GTEx_TCGA/GTEx_TCGA_TrainGroupby.csv
    334922  2022-07-12T02:39:37Z  gs://fc-e15b796f-1abe-4206-ab91-bd58374cc2