# Create GTEx-TCGA DESeq Data Sets

```
Andrew Davidson
aedavids@ucsc.edu
```

TODO:
- start with test. ie small fail fast
- download col data to local disk
- download matrix to disk
- load col data
- load groupby


In [1]:
# use display() to print an html version of a data frame
# useful if dataFrame output is not generated by last like of cell
from IPython.display import display

import os
from os.path import exists
import pandas as pd


WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
print(WORKSPACE_BUCKET)

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275


## Download , Read and Check data

In [2]:
tmp = "./tmp"
!mkdir -p $tmp

In [3]:
TCGAColDataURL = WORKSPACE_BUCKET + "/data/colData/trainingDataSets"
TCGADataURL    = WORKSPACE_BUCKET + "/data/matrices/groupByGeneId/trainingDataSets"

GTExColDataURL = WORKSPACE_BUCKET #+ "/GTEx*ColData.csv"
GTExDataURL    = WORKSPACE_BUCKET #+ "/GTEx*GroupByGenesCountMatrix.csv"

In [4]:
!gsutil ls $TCGAColDataURL

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-TestColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-TrainColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-ValidateColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-miscColData.csv


In [5]:
!gsutil ls $TCGADataURL

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/matrices/groupByGeneId/trainingDataSets/TCGA-TestGroupByGeneId.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/matrices/groupByGeneId/trainingDataSets/TCGA-TrainGroupByGeneId.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/matrices/groupByGeneId/trainingDataSets/TCGA-ValidateGroupByGeneId.csv


In [6]:
p = GTExDataURL + "/GTEx*GroupByGenesCountMatrix.csv"
!gsutil ls $p

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExTestGroupByGenesCountMatrix.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExTrainGroupByGenesCountMatrix.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExValidateGroupByGenesCountMatrix.csv


In [7]:
p = GTExColDataURL + "/GTEx*ColData.csv"
#print( p)
!gsutil ls $p

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExTestColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExTrainColData.csv
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/GTExValidateColData.csv


In [8]:
def downLoadColData(TCGAColDataURL, GTExColDataURL, cacheDir, trainingSetName="Test" ):
    tcga = "/TCGA-" + trainingSetName + "ColData.csv"
    if not exists(cacheDir + tcga):    
        srcURL = TCGAColDataURL + tcga
        print("srcURL:{} ".format(srcURL))
        !gsutil -m cp  $srcURL $cacheDir
    else:
        print("{} found in local cache".format(tcga))
    
    gtex = "/GTEx" + trainingSetName + "ColData.csv"
    if not exists(cacheDir + gtex):
        print("srcURL:{} ".format(srcURL))
        srcURL = GTExColDataURL + gtex
        !gsutil -m cp  $srcURL $cacheDir
    else:
        print("{} found in local cache".format(gtex))


downLoadColData(TCGAColDataURL, GTExColDataURL, tmp, trainingSetName="Test" )

/TCGA-TestColData.csv found in local cache
/GTExTestColData.csv found in local cache


In [9]:
def downLoadGroubByData(TCGADataURL, GTExDataURL, cacheDir, trainingSetName="Test"):
    tcga = "/TCGA-" + trainingSetName + "GroupByGeneId.csv"
    if not exists(cacheDir + tcga):    
        srcURL = TCGADataURL + tcga
        !gsutil -m cp  $srcURL $cacheDir
    else:
        print("{} found in local cache".format(tcga))

    
    gtex = "/GTEx" + trainingSetName + "GroupByGenesCountMatrix.csv"
    if not exists(cacheDir + gtex):
        srcURL = GTExDataURL + gtex
        !gsutil -m cp  $srcURL $cacheDir
    else:
        print("{} found in local cache".format(gtex))
        

downLoadGroubByData(TCGADataURL, GTExDataURL, tmp, trainingSetName="Test")

/TCGA-TestGroupByGeneId.csv found in local cache
/GTExTestGroupByGenesCountMatrix.csv found in local cache


In [10]:
def loadDF(localPath):
    retDF = None
    if not exists(localPath):
        print("ERROR {} not found".format(localPath))
    else:
        retDF = pd.read_csv(localPath)
    
    return retDF
        
def readData(cacheDir,  trainingSetName="Test"):
    GTexPath = cacheDir +  "/GTEx" + trainingSetName + "ColData.csv"
    GTExColDataDF = loadDF(GTexPath)
    
    TCGAPath = cacheDir +  "/TCGA-" + trainingSetName + "ColData.csv"
    TCGAColDataDF = loadDF(TCGAPath)
    
    GTEXGroupbyPath = cacheDir +  "/GTEx" + trainingSetName + "GroupByGenesCountMatrix.csv"
    GTExGroupByDF = loadDF(GTEXGroupbyPath)
    
    TCGAGroupbyPath = cacheDir + "/TCGA-" + trainingSetName + "GroupByGeneId.csv"
    TCGAGroupByDF = loadDF(TCGAGroupbyPath)
    
    return (GTExColDataDF, GTExGroupByDF, TCGAColDataDF, TCGAGroupByDF)

GTExColDataDF, GTExGroupByDF, TCGAColDataDF, TCGAGroupByDF = readData(tmp, trainingSetName="Test")

In [11]:
def validateColDataGroupByDataSets(colDataDF, groupByDF):
    '''
    assert number or rows in colData == number of columns in groupBy
    assert sample names are in the same order
    '''
    print( "colDataDF.shape: {}".format(colDataDF.shape) )
    #print( display(colDataDF.iloc[0:3, 0:2]) )
    
    print( "groupByDF.shape: {}".format(groupByDF.shape) )
    
    nRows = colDataDF.shape[0]
    nCols = groupByDF.shape[1]
    # DESeq requires cols and rows match. groupBy has an extra 'geneId' col data
    assert nRows == nCols -1, "ERROR num colData rows must equal num groupByCols -1"
    
    # make sure the rows and cols are in the same order
    colDataSampleNamesSeries = colDataDF.loc[:, 'sample_id']
    # skip the 'geneId col'
    groupBySampleNameList = groupByDF.columns.to_list()[1:]
    assert (colDataSampleNamesSeries == groupBySampleNameList).all(), "ERROR colDataSamples are not in same order as groupBy"

print("GTEx")
validateColDataGroupByDataSets(GTExColDataDF, GTExGroupByDF)
print("\n TCGA")
validateColDataGroupByDataSets(TCGAColDataDF, TCGAGroupByDF )

GTEx
colDataDF.shape: (3471, 6)
groupByDF.shape: (74777, 3472)

 TCGA
colDataDF.shape: (1800, 7)
groupByDF.shape: (74777, 1801)


## Create Combined Data Set

In [12]:
def combineColData(GTExDF, TCGADF):
    gDF = GTExDF.rename( columns = {'sex':'gender', 'tissue_id':'category'})
    gDF['dataSet'] = "GTEx"
    gDF = gDF.loc[:, ['sample_id', 'participant_id', 'category', 'gender', 'age', 'dataSet']]
    #display(gDF)
    
    tDF = TCGADF.rename( columns={'Gender':'gender', 'Cohort':'category', 'Age':'age'} )
    tDF['dataSet'] = "TCGA"
    tDF = tDF.loc[:, ['sample_id', 'participant_id', 'category', 'gender', 'age', 'dataSet']]

    #print()
    #display(tDF)
    
    byRows = 0
    retDF = pd.concat( [gDF, tDF], axis=byRows)
    
    return retDF
    
GTEx_TCGA_colDataDF = combineColData(GTExDF=GTExColDataDF, TCGADF=TCGAColDataDF)
print("GTEx_TCGA_colDataDF.shape: {}".format(GTEx_TCGA_colDataDF.shape))
display( GTEx_TCGA_colDataDF.head() )
display( GTEx_TCGA_colDataDF.tail() )

GTEx_TCGA_colDataDF.shape: (5271, 6)


Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Female,66.0,GTEx
1,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Female,66.0,GTEx
2,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F,Artery_Coronary,Female,66.0,GTEx
3,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F,Adipose_Visceral_Omentum,Female,66.0,GTEx
4,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F,Vagina,Female,66.0,GTEx


Unnamed: 0,sample_id,participant_id,category,gender,age,dataSet
1795,UVM-VD-AA8N-TP,UVM-VD-AA8N,UVM,male,86.0,TCGA
1796,UVM-WC-A87W-TP,UVM-WC-A87W,UVM,female,57.0,TCGA
1797,UVM-WC-A87Y-TP,UVM-WC-A87Y,UVM,male,59.0,TCGA
1798,UVM-WC-AA9A-TP,UVM-WC-AA9A,UVM,female,70.0,TCGA
1799,UVM-YZ-A982-TP,UVM-YZ-A982,UVM,female,79.0,TCGA


In [13]:
def combineGroupBy(GTExDF, TCGADF):
    '''
    asserts the geneId's are in the same order
    '''
    gGroupbyGenes = GTExDF.loc[:, 'geneId']
    tGroupbyGenes = TCGADF.loc[:, 'geneId']

    assert (gGroupbyGenes == tGroupbyGenes).all(), "ERROR GTExDF and TCGADF geneId's do not match!"
    byColumn = 1
    retDF = pd.concat( [GTExDF, TCGADF], axis=byColumn)
    
    return retDF
    
GTEx_TCGA_groupbyDF = combineGroupBy(GTExDF=GTExGroupByDF.head(), TCGADF=TCGAGroupByDF.head())
print("GTEx_TCGA_groupbyDF.shape: {}".format(GTEx_TCGA_groupbyDF.shape))
display( GTEx_TCGA_groupbyDF.iloc[0:5, 0:3] )
GTEx_TCGA_groupbyDF.iloc[0:5:, -3:]

GTEx_TCGA_groupbyDF.shape: (5, 5273)


Unnamed: 0,geneId,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ
0,(A)n,9,1
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0
3,(AAAAAAG)n,0,0
4,(AAAAAAT)n,0,0


Unnamed: 0,UVM-WC-A87Y-TP,UVM-WC-AA9A-TP,UVM-YZ-A982-TP
0,3,3,3
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
