# Create TCGA train, validate, and test data sets
```
Andrew E. Davidson
aedavids@ucsc.edu
```

## Use single sex types to validate training 
- BRCA (female= 1052 , male = 11) Breast invasive carcinom
- OV   (female only) Ovarian serous cystadenocarcinoma
- UCS  (female only) Uterine Carcinosarcoma 
- PRAD (male only)   Prostate adenocarcinoma
- TGCT (male only)   Testicular Germ Cell Tumors 

ref: [sklearn.model_selection.StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html#sklearn.model_selection.StratifiedShuffleSplit)

## AEDWIP
- sort by sample id
- focus on groupByGenes
- later create NumReads data sets
- TCGA col data does not have tissue id. 
    * tmp work around, copy Cohort colum as tissue_id
    * maybe there is additional meta data on TCGA sigh
    * follow up with Toil compondum. what did they do?
```
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/matrices/NumReads/
gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/matrices/groupByGeneId/
```

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


WORKSPACE_BUCKET = os.environ['WORKSPACE_BUCKET']
print(WORKSPACE_BUCKET)

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275


In [2]:
#UBER_TCGA_groupByGeneMatrixURL = "$WORKSPACE_BUCKET/"
UBER_TCGA_groupByGeneMatrixURL = WORKSPACE_BUCKET + "AEDWIP "

# TCGA_colDataDirURL       = "$WORKSPACE_BUCKET/data/colData"
# debug_TCGA_colDataDirURL = "$WORKSPACE_BUCKET/data/colDataDebug"

TCGA_colDataDirURL       = WORKSPACE_BUCKET + "/data/colData"
debug_TCGA_colDataDirURL = WORKSPACE_BUCKET + "/data/colDataDebug"

UBER_TCGA_trainingSetDirURL         = TCGA_colDataDirURL + "/trainingDataSets"
# UBER_TCGA_colDataMatrixURL       = TCGA_colDataDirURL + "/trainingDataSets/uberColData.csv"
UBER_TCGA_colDataMatrixURL       = UBER_TCGA_trainingSetDirURL + "/uberColData.csv"

debug_UBER_TCGA_trainingSetDirURL   = debug_TCGA_colDataDirURL + "/trainingDataSets"
# debug_UBER_TCGA_colDataMatrixURL = debug_TCGA_colDataDirURL + "/trainingDataSets/uberColDataDebug.csv"
debug_UBER_TCGA_colDataMatrixURL = debug_TCGA_colDataDirURL + "/trainingDataSets/uberColDataDebug.csv"

localColDataTmp =  "data/colData/"

#
# configure for debug or production in one place
#
# SAVE_MATRIX      = False # set to False for debug, True for production
# colDataDirURL    = debug_TCGA_colDataDirURL
# colDataMatrixURL = debug_UBER_TCGA_colDataMatrixURL
# colDataDirURL    = debug_TCGA_colDataDirURL
# outputBucketURL  = debug_UBER_TCGA_trainingSetDirURL

SAVE_MATRIX      = True # set to False for debug, True for production
colDataDirURL    = TCGA_colDataDirURL
colDataMatrixURL = UBER_TCGA_colDataMatrixURL
colDataDirURL    = TCGA_colDataDirURL
outputBucketURL  = UBER_TCGA_trainingSetDirURL

In [3]:
def cleanUpUbers():
    #gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colDataDebug/TCGA-ValidateColData.csv
    files = ["TCGA-TrainColData.csv", "TCGA-ValidateColData.csv", "TCGA-TestColData.csv"]
    for f in files:
        print()
        d = UBER_TCGA_trainingSetDirURL + "/" + f
        e = debug_UBER_TCGA_trainingSetDirURL + "/" + f
        ! gsutil rm $d 
        ! gsutil rm $e
         
    print()
    ! gsutil rm $debug_UBER_TCGA_colDataMatrixURL
    ! gsutil rm $UBER_TCGA_colDataMatrixURL
    
    print()
    ! gsutil rm -r $UBER_TCGA_trainingSetDirURL
    ! gsutil rm -f $debug_UBER_TCGA_trainingSetDirURL
    
#cleanUpUbers()


CommandException: No URLs matched: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-TrainColData.csv
CommandException: No URLs matched: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colDataDebug/trainingDataSets/TCGA-TrainColData.csv

CommandException: No URLs matched: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-ValidateColData.csv
CommandException: No URLs matched: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colDataDebug/trainingDataSets/TCGA-ValidateColData.csv

CommandException: No URLs matched: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/TCGA-TestColData.csv
CommandException: No URLs matched: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colDataDebug/trainingDataSets/TCGA-TestColData.csv

CommandException: No URLs matched: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colDataDebug/trainingDataSets/uberColDataDebug.csv
Removing gs://fc-e15b796f-1abe-4206-ab91-bd58374c

In [4]:
# create test and debug data
def createTestData():
    debugColDataList = [
        "$TCGA_colDataDirURL/TCGA_ACC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv",
        "$TCGA_colDataDirURL/TCGA_BLCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv"
    ]
    
    ! rm -rf debug_TCGA_colDataDirURL
    
    ! gsutil -m cp \
        $TCGA_colDataDirURL/TCGA_ACC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv \
        $TCGA_colDataDirURL/TCGA_BLCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv \
        $debug_TCGA_colDataDirURL
    
if (colDataDirURL == debug_TCGA_colDataDirURL):
    createTestData() 

# Combine all the TCGA data set

In [5]:
%%time
def loadCSV(matrixURL, dataDirURL, localTmpDir):
    '''
    localTmpDir
        a path to a directory loadCSV() is free to create, or delete
        
    returns pandas data frame
    
    if matrix URL does not exits 
        create a pandas data frame from file under dataDirURL 
        saves the dataframe to matrixURL
    '''
    
    exitCodeList = ! (gsutil -q stat $matrixURL; echo $?)
    exitCode = int(exitCodeList[0])
    retDF = None
    if exitCode == 0:
        # matrixURL exits
        print("reading:{}".format(matrixURL))
        ! 'rm' -rf $localTmpDir
        ! mkdir -p $localTmpDir
        ! gsutil -m cp "$matrixURL" $localTmpDir 
        file = !ls $localTmpDir
        p = localTmpDir + "/" + file[0]
        retDF = pd.read_csv(p)
       
    else:
        print("create matrix: {}".format(matrixURL))
        ! 'rm' -rf $localTmpDir
        ! mkdir -p $localTmpDir
        ! gsutil -m cp "$dataDirURL/*" $localTmpDir     
        fileList = ! ls $localTmpDir
        
        # read files and "union" them
        retDF = pd.DataFrame()
        print()
        for f in fileList:
            p = localTmpDir + "/" + f
            print("p:{}".format(p))
            tmpDF = pd.read_csv(p) #index_col='entity:sample_id'
            retDF = pd.concat([retDF, tmpDF])
            #print(retDF.columns)
            
        print("\n!!!! do not hard code csv\n")
        p = localTmpDir + "/" "matrix.csv"
        retDF.to_csv(p, index=False)
        
        # read with out index column
        retDF = pd.read_csv(p)

        ! gsutil -m cp $p $matrixURL
        
    # clean up
    ! 'rm' -rf $localTmpDir
            
    return retDF
            

colDataDF = loadCSV( colDataMatrixURL, colDataDirURL, localColDataTmp )
colDataDF.head()

create matrix: gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/uberColData.csv
Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA_BLCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv...
Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA_ACC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv...
Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA_BRCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv...
Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA_CESC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv...
Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA_ESCA_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv...
Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA_DLBC_ControlledAccess_V1-0_DATA_edu_ucsc_kim_lab_colData.csv...
Copying gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA_GBM_Controlled

Unnamed: 0,entity:sample_id,entity:participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
0,ACC-OR-A5J1-TP,ACC-OR-A5J1,TCGA-OR-A5J1-01,ACC,58.0,male,TP
1,ACC-OR-A5J2-TP,ACC-OR-A5J2,TCGA-OR-A5J2-01,ACC,44.0,female,TP
2,ACC-OR-A5J3-TP,ACC-OR-A5J3,TCGA-OR-A5J3-01,ACC,23.0,female,TP
3,ACC-OR-A5J5-TP,ACC-OR-A5J5,TCGA-OR-A5J5-01,ACC,30.0,male,TP
4,ACC-OR-A5J6-TP,ACC-OR-A5J6,TCGA-OR-A5J6-01,ACC,29.0,female,TP


In [6]:
! gsutil ls -l $colDataMatrixURL
print("colDataDF.shape:{}".format(colDataDF.shape))

    657184  2022-06-15T00:34:22Z  gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/uberColData.csv
TOTAL: 1 objects, 657184 bytes (641.78 KiB)
colDataDF.shape:(10347, 7)


## Clean up
ref: [TCGA sample-type-cods](https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes)
Select Primary Solid Tumor sample. i.e. sample_type = 'tp'. Save all other samples in miscDF

In [7]:
# rename columns
colDataDF = colDataDF.rename( columns={'entity:sample_id'     : 'sample_id', 
                                     'entity:participant_id': 'participant_id'} )

colDataDF.head()


Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
0,ACC-OR-A5J1-TP,ACC-OR-A5J1,TCGA-OR-A5J1-01,ACC,58.0,male,TP
1,ACC-OR-A5J2-TP,ACC-OR-A5J2,TCGA-OR-A5J2-01,ACC,44.0,female,TP
2,ACC-OR-A5J3-TP,ACC-OR-A5J3,TCGA-OR-A5J3-01,ACC,23.0,female,TP
3,ACC-OR-A5J5-TP,ACC-OR-A5J5,TCGA-OR-A5J5-01,ACC,30.0,male,TP
4,ACC-OR-A5J6-TP,ACC-OR-A5J6,TCGA-OR-A5J6-01,ACC,29.0,female,TP


In [8]:
print(colDataDF.shape)
sampleTypeTP_rows = colDataDF['sample_type'] == 'TP'
sampleTypeNotTP_rows = colDataDF['sample_type'] != 'TP'
colDataTPDF = colDataDF.loc[sampleTypeTP_rows, :].copy()
print(colDataTPDF.shape)

(10347, 7)
(9004, 7)


In [9]:
miscDF = colDataDF.loc[sampleTypeNotTP_rows, :].copy()
colDataDF = colDataTPDF
colDataTPDF = None
print(miscDF.shape)
print(colDataDF.shape)

(1343, 7)
(9004, 7)


## Split data into train, validate, and test data sets
### convert  'Cohort', 'Gender' and 'sample_type' to  factors

In [10]:
colDataDF.loc[:, "Cohort"]      = colDataDF.loc[:, "Cohort"].astype('category')
colDataDF.loc[:, "Gender"]      = colDataDF.loc[:, "Gender"].astype('category')
colDataDF.dtypes

sample_id           object
participant_id      object
tcga_sample_id      object
Cohort            category
Age                float64
Gender            category
sample_type         object
dtype: object

### Create a label we can use to balance data sets by  Cohort, Gender, and sample_type

In [11]:
a = colDataDF["Cohort"].astype(str).values
b = colDataDF["Gender"].astype(str).values

colDataDF.loc[:, "balanceType"] = ( a + "_" + b )
colDataDF.loc[:, "balanceType"] = colDataDF.loc[:, "balanceType"].astype('category')

# print("colDataDF['label'].categories : {}\n".format(colDataDF["label"].categories ))
print(colDataDF['balanceType'].dtypes)
print(colDataDF['balanceType'].unique())

colDataDF.head()

category
['ACC_male', 'ACC_female', 'BLCA_male', 'BLCA_female', 'BRCA_female', ..., 'THYM_female', 'UCEC_female', 'UCS_female', 'UVM_female', 'UVM_male']
Length: 62
Categories (62, object): ['ACC_female', 'ACC_male', 'BLCA_female', 'BLCA_male', ..., 'UCEC_female', 'UCS_female', 'UVM_female', 'UVM_male']


Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type,balanceType
0,ACC-OR-A5J1-TP,ACC-OR-A5J1,TCGA-OR-A5J1-01,ACC,58.0,male,TP,ACC_male
1,ACC-OR-A5J2-TP,ACC-OR-A5J2,TCGA-OR-A5J2-01,ACC,44.0,female,TP,ACC_female
2,ACC-OR-A5J3-TP,ACC-OR-A5J3,TCGA-OR-A5J3-01,ACC,23.0,female,TP,ACC_female
3,ACC-OR-A5J5-TP,ACC-OR-A5J5,TCGA-OR-A5J5-01,ACC,30.0,male,TP,ACC_male
4,ACC-OR-A5J6-TP,ACC-OR-A5J6,TCGA-OR-A5J6-01,ACC,29.0,female,TP,ACC_female


In [12]:
key = 'display.max_rows'
default = pd.get_option(key)
pd.set_option(key, None) # None
balanceTypeCountsSeries = colDataDF.groupby( ["balanceType"] )['balanceType'].agg('count')
balanceTypeCountsSeries

balanceType
ACC_female       48
ACC_male         31
BLCA_female     107
BLCA_male       301
BRCA_female    1052
BRCA_male        11
CESC_female     304
CHOL_female      20
CHOL_male        16
COAD_female     117
COAD_male       147
COAD_nan          2
DLBC_female      26
DLBC_male        22
ESCA_female      26
ESCA_male       158
GBM_female       54
GBM_male         99
GBM_nan           1
HNSC_female     136
HNSC_male       384
KICH_female      27
KICH_male        39
KIRC_female     188
KIRC_male       345
KIRP_female      76
KIRP_male       214
LGG_female      230
LGG_male        285
LGG_nan           1
LIHC_female     121
LIHC_male       250
LUAD_female     277
LUAD_male       238
LUSC_female     130
LUSC_male       371
MESO_female      16
MESO_male        71
OV_female       294
PAAD_female      80
PAAD_male        98
PCPG_female     101
PCPG_male        78
PRAD_male       497
READ_female      42
READ_male        52
SARC_female     141
SARC_male       117
SKCM_female      42
SKCM_mal

### remove or fix samples that are missing gender info
move to miscDF

- COAD  = Colon adenocarcinoma 
- GBM  = Glioblastoma multiforme
- LGG  = Brain Lower Grade Glioma 
- TGCT = Testicular Germ Cell Tumors

In [13]:
balanceTypeCountsSeries [ balanceTypeCountsSeries.index.str.contains('nan') ] #== "*nan"#.str.match('*nan')

balanceType
COAD_nan     2
GBM_nan      1
LGG_nan      1
TGCT_nan    16
Name: balanceType, dtype: int64

In [14]:
fixRows = colDataDF["balanceType"]  == 'TGCT_nan'
colDataDF.loc[fixRows, "balanceType"] = 'TGCT_male'
colDataDF.loc[fixRows, 'Gender'] = 'male'

balanceTypeCountsSeries = colDataDF.groupby( ["balanceType"] )['balanceType'].agg('count')
balanceTypeCountsSeries [ balanceTypeCountsSeries.index.str.contains('nan') ] 

balanceType
COAD_nan    2
GBM_nan     1
LGG_nan     1
TGCT_nan    0
Name: balanceType, dtype: int64

In [15]:
selectRowsMissingGender = colDataDF["balanceType"].isin(['COAD_nan', 'GBM_nan', 'LGG_nan'])
print(sum(selectRowsMissingGender))
print(miscDF.shape)

missingGenderDF = colDataDF.loc[selectRowsMissingGender, :]

tmpDF = pd.concat([miscDF, missingGenderDF])
miscDF = tmpDF
print(miscDF.shape)

4
(1343, 7)
(1347, 8)


In [16]:
print(colDataDF.shape)
selectRowsNotMissingGender = ~colDataDF["balanceType"].isin(['COAD_nan', 'GBM_nan', 'LGG_nan'])
print(sum(selectRowsNotMissingGender))
tmpDF = colDataDF.loc[selectRowsNotMissingGender, :]
colDataDF = tmpDF
print(colDataDF.shape)

(9004, 8)
9000
(9000, 8)


### remove classes that only have a few examples or are not balance
Add these samples to miscDF

In [17]:
# key = 'display.max_rows'
# default = pd.get_option(key)
# pd.set_option(key, None) # None
# balanceTypeCountsSeries = colDataDF.groupby( ["balanceType"] )['balanceType'].agg('count')
# balanceTypeCountsSeries

In [18]:
balanceTypeCountsSeries = colDataDF.groupby( ["balanceType"] )['balanceType'].agg('count')
print(balanceTypeCountsSeries.describe())
print()
bq = 0.05 # 0.1 
bottomQuantile = balanceTypeCountsSeries.quantile(q=bq)
print("bottomQuantile {}% = {}".format(int(100 * bq), bottomQuantile))

count      62.000000
mean      145.161290
std       164.876625
min         0.000000
25%        42.000000
50%       100.000000
75%       207.500000
max      1052.000000
Name: balanceType, dtype: float64

bottomQuantile 5% = 0.5500000000000029


In [19]:
selectRowsBellow = balanceTypeCountsSeries < bottomQuantile
print("balance types bellow bottomQuantile {}% = {}\n".format(int(100 * bq), bottomQuantile))
print(balanceTypeCountsSeries[ selectRowsBellow ])

balance types bellow bottomQuantile 5% = 0.5500000000000029

balanceType
COAD_nan    0
GBM_nan     0
LGG_nan     0
TGCT_nan    0
Name: balanceType, dtype: int64


In [20]:
# remove class that do not have enough samples
selectRowsBellow = balanceTypeCountsSeries < bottomQuantile
print("balance types bellow bottomQuantile {}% = {}\n".format(int(100 * bq), bottomQuantile))
print(balanceTypeCountsSeries[ selectRowsBellow ])

balance types bellow bottomQuantile 5% = 0.5500000000000029

balanceType
COAD_nan    0
GBM_nan     0
LGG_nan     0
TGCT_nan    0
Name: balanceType, dtype: int64


In [21]:
selectRowsAbove =  [not x for x in selectRowsBellow]
balanceTypesAboveThreshold = balanceTypeCountsSeries[ selectRowsAbove ]
balanceTypesAboveThreshold.sort_values()

balanceType
BRCA_male        11
MESO_female      16
CHOL_male        16
CHOL_female      20
DLBC_male        22
DLBC_female      26
ESCA_female      26
KICH_female      27
ACC_male         31
UVM_female       35
KICH_male        39
SKCM_female      42
READ_female      42
UVM_male         45
ACC_female       48
READ_male        52
GBM_female       54
UCS_female       57
THYM_female      57
SKCM_male        61
THYM_male        63
MESO_male        71
KIRP_female      76
PCPG_male        78
PAAD_female      80
PAAD_male        98
GBM_male         99
PCPG_female     101
BLCA_female     107
COAD_female     117
SARC_male       117
LIHC_female     121
STAD_female     130
LUSC_female     130
THCA_male       135
HNSC_female     136
SARC_female     141
COAD_male       147
TGCT_male       150
ESCA_male       158
UCEC_female     176
KIRC_female     188
KIRP_male       214
LGG_female      230
LUAD_male       238
STAD_male       246
LIHC_male       250
LUAD_female     277
LGG_male        285
OV_femal

In [22]:
selectAboveThresholdRows = colDataDF["balanceType"].isin( balanceTypesAboveThreshold.index )
selectBelowThrsholdRows =  ~colDataDF["balanceType"].isin( balanceTypesAboveThreshold.index )
print("number of samples bellow the threshold: {}".format(sum(selectBelowThrsholdRows)) )

tmpDF = pd.concat([miscDF, colDataDF.loc[selectBelowThrsholdRows]])
miscDF = tmpDF
print("miscDF.shape: {}".format(miscDF.shape))

number of samples bellow the threshold: 0
miscDF.shape: (1347, 8)


### split out a 60 % of the samples into a training set

StratifiedShuffleSplit does not work (at least for small data sets). StratifiedShuffleSplit Splits may not be balanced. It is was designed for cross validation. Balancing is not important as each sample will eventuall wind up in the training set

In [23]:
X = colDataDF.loc[ selectAboveThresholdRows  ].to_numpy()
y = colDataDF.loc[ selectAboveThresholdRows, ['balanceType'] ].to_numpy()
print("colDataDF.shape:{}".format(colDataDF.shape))
print("X.shape:{}".format(X.shape))
print("y.shape:{}".format(y.shape))
X_train, X_not_train, y_train, y_not_train = train_test_split(X, y,
                                                stratify=y, 
                                                test_size=0.4)

print()
print("X_train.shape:{}".format(X_train.shape))
print("y_train.shape:{}".format(y_train.shape))
print()
print("X_not_train.shape:{}".format(X_not_train.shape))
print("y_not_train.shape:{}".format(y_not_train.shape))

print()
print(X_train[0:5,:])
print()
print(y_train[0:5,:])

colDataDF.shape:(9000, 8)
X.shape:(9000, 8)
y.shape:(9000, 1)

X_train.shape:(5400, 8)
y_train.shape:(5400, 1)

X_not_train.shape:(3600, 8)
y_not_train.shape:(3600, 1)

[['SARC-K1-A6RV-TP' 'SARC-K1-A6RV' 'TCGA-K1-A6RV-01' 'SARC' 67.0 'male'
  'TP' 'SARC_male']
 ['HNSC-CR-7365-TP' 'HNSC-CR-7365' 'TCGA-CR-7365-01' 'HNSC' 60.0 'male'
  'TP' 'HNSC_male']
 ['PRAD-XK-AAJA-TP' 'PRAD-XK-AAJA' 'TCGA-XK-AAJA-01' 'PRAD' 62.0 'male'
  'TP' 'PRAD_male']
 ['PRAD-KK-A7B1-TP' 'PRAD-KK-A7B1' 'TCGA-KK-A7B1-01' 'PRAD' 65.0 'male'
  'TP' 'PRAD_male']
 ['BRCA-A8-A08O-TP' 'BRCA-A8-A08O' 'TCGA-A8-A08O-01' 'BRCA' 45.0 'female'
  'TP' 'BRCA_female']]

[['SARC_male']
 ['HNSC_male']
 ['PRAD_male']
 ['PRAD_male']
 ['BRCA_female']]


In [24]:
# split the validate and test sets out
X_validate, X_test, y_validate, y_test = train_test_split(X_not_train, y_not_train,
                                                stratify=y_not_train, 
                                                test_size=0.5)

### rough test to see if data sets are balanced
create data frame where columns are probability distributions

In [25]:
def createDataFrameFromNumpyArray(X, y, XColnames, yColName):
    '''
    returns (XDF, yDF) pandas data frames. 
    '''
    XDF = pd.DataFrame(X, columns = XColnames)
    yDF = pd.DataFrame(y, columns = yColName)

    return (XDF, yDF)

In [26]:
def createTypeProbDistributions():
    dataSetDict = {
        "Train"   : (X_train, y_train),
        "Validate": (X_validate, y_validate),
        "Test"    : (X_test, y_test)
    }

    probDF = pd.DataFrame()
    for key in dataSetDict.keys():
        X,y = dataSetDict[key]
        
        yColName = ["balanceType"]
        XDF, yDF = createDataFrameFromNumpyArray(X, y, colDataDF.columns, yColName)
        
        balanceTypeCountsSeries = yDF.groupby( ["balanceType"] )['balanceType'].agg('count')
        total = balanceTypeCountsSeries.sum()
        probSeries = balanceTypeCountsSeries / total
        
        probDF[key] = probSeries
        
        
    return probDF

    
propDF = createTypeProbDistributions()
propDF

Unnamed: 0_level_0,Train,Validate,Test
balanceType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACC_female,0.00537,0.005,0.005556
ACC_male,0.003519,0.003333,0.003333
BLCA_female,0.011852,0.011667,0.012222
BLCA_male,0.033333,0.033333,0.033889
BRCA_female,0.116852,0.117222,0.116667
BRCA_male,0.001296,0.001111,0.001111
CESC_female,0.033704,0.033889,0.033889
CHOL_female,0.002222,0.002222,0.002222
CHOL_male,0.001852,0.001667,0.001667
COAD_female,0.012963,0.012778,0.013333


# Save

In [27]:
def saveDF(XDF, colDataDirURL, localTmpDir, prefix, key, suffix):
        # get rid of the batchType column. we created it to make spliting the data easier
        XDF = XDF.drop(["balanceType"], axis=1)
        
        # sort by sample_id. DESeq requires colData row sample_ids be in same order as count matrix columns sample_ids
        XDF = XDF.sort_values(by="sample_id")
        
        p = localTmpDir + "/" + prefix + "-" + key + suffix
        XDF.to_csv(p, index=False)
        
        print("\ncopying {} to {}".format(p, colDataDirURL))
        ! gsutil -m cp $p $colDataDirURL    
    
def saveDataSets(colDataDirURL, colDataDF, dataDict, localTmpDir, prefix, suffix):
    ! rm -rf $localTmpDir
    ! mkdir -p $localTmpDir
    
    for key in dataDict.keys():
        X,y = dataDict[key]
        
        yColName = ["balanceType"]
        XDF, yDF = createDataFrameFromNumpyArray(X, y, colDataDF.columns, yColName)
        
        saveDF(XDF, colDataDirURL, localTmpDir, prefix, key, suffix)

In [28]:
if (SAVE_MATRIX):
    dataSetDict = {
        "Train"   : (X_train, y_train),
        "Validate": (X_validate, y_validate),
        "Test"    : (X_test, y_test)
    }
    saveDataSets( UBER_TCGA_trainingSetDirURL, colDataDF, dataSetDict, localColDataTmp, 'TCGA', 'ColData.csv' )  
    saveDF(miscDF, UBER_TCGA_trainingSetDirURL, localColDataTmp, 'TCGA', 'misc', 'ColData.csv')
    
    # clean up 
#     ! rm -rf $localTmpDir
    
#     ! gsutil rm $UBER_TCGA_colDataMatrixURL
else:
    print("DEBUG did not save")


copying data/colData//TCGA-TrainColData.csv to gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets
Copying file://data/colData//TCGA-TrainColData.csv [Content-Type=text/csv]...
/ [1/1 files][334.9 KiB/334.9 KiB] 100% Done                                    
Operation completed over 1 objects/334.9 KiB.                                    

copying data/colData//TCGA-ValidateColData.csv to gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets
Copying file://data/colData//TCGA-ValidateColData.csv [Content-Type=text/csv]...
/ [1/1 files][111.7 KiB/111.7 KiB] 100% Done                                    
Operation completed over 1 objects/111.7 KiB.                                    

copying data/colData//TCGA-TestColData.csv to gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets
Copying file://data/colData//TCGA-TestColData.csv [Content-Type=text/csv]...
/ [1/1 files][111.6 KiB/111.6 KiB] 100% Done                       

In [29]:
# !gsutil rm gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA-TrainColData.csv
# !gsutil rm gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA-ValidateColData.csv
# !gsutil rm gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA-TestColData.csv
# !gsutil rm gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/TCGA-miscColData.csv

In [30]:
! echo $UBER_TCGA_colDataMatrixURL

gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/uberColData.csv


In [32]:
! rm -rf $localTmpDir
    
! gsutil rm $UBER_TCGA_colDataMatrixURL

Removing gs://fc-e15b796f-1abe-4206-ab91-bd58374cc275/data/colData/trainingDataSets/uberColData.csv...
/ [1 objects]                                                                   
Operation completed over 1 objects.                                              
