# Create DESeq Data

- aedavids@ucsc.edu
- 11/17/21



## output
All data is sorted by sample_id. DESeq assume that the columns of the count matrix are in the same order
1. GTExTrainQuantFiles.csv, GTExValidateQuantFiles.csv, GTExTestQuantFiles.csv
    - process these files using apache spark to create DESeq count matrix and scaling factors data files
    - random,  balanced by tissue_id and sex,  split 60/20/20 of sample
    - two columns 'sampleName,source'
    - source is gcp bucket URL
    - sample row
        * GTEX-1117F-0226-SM-5GZZ7,gs://anvil_gtex_v8_hg38_edu_ucsc_kim_lab_spark/quant/GTEX-1117F-0226-SM-5GZZ7.quant.sf.gz
        * <span style="color:red">This is not the terra workspace URL!</span>
        * we can not start a dataproc spark cluster in our terra GCP (google cloud platform) project
        * our work around was to create a native GCP project and copy the data
        * <span style="color:red">make sure you set the BUCKET ID</span>

2. GTExTrainColData.csv, GTExValidateColData.csv and GTExTestColData.csv
    - these file contain sample meta data required to create model designs/formula in DESeq
    - the column names are 
        * 'sample_id', 'participant_id', 'tissue_id', 'tissue_site_detail', 'sex', 'age'
        
3. GTEx-trainQuantFiles.batch0.csv, GTEx-trainQuantFiles.batch1.csv, ...
    - GTEx-trainQuantFiles.csv has ove 10,000 samples
    - we can not join them using spark for unknow reason. It just does not seem to make progress
    - split into smaller data sets
    - we should able to use spark union to creat a single count table


In [1]:
import numpy as np
import pandas as pd
import pathlib as pl
#from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# the participant and sample table data was download from 
# https://app.terra.bio/#workspaces/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/data
rootDir = pl.Path( "../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab" )
outputDir = rootDir.joinpath(  "createDESeqData.output/spark" )
outputDir.mkdir(parents=True, exist_ok=True)
outfilePrefix = "GTEx"

#BUCKET_URL = "gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark/quant/"
BUCKET_URL = "gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark/GTEx/quant/"

In [2]:
# load the terra workspace sample table data
entityName = "sample"
sampleTSV = rootDir.joinpath( entityName + ".tsv" )
sampleDF = pd.read_csv(sampleTSV, delimiter='\t')
sampleDF.head()

Unnamed: 0,entity:sample_id,aux_info,bam_file,bam_index,firstEndFastq,participant,quantFile,secondEndFastq,tissue_id,tissue_site_detail,unpairedFastq
0,GTEX-1117F-0226-SM-5GZZ7,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Adipose_Subcutaneous,Adipose - Subcutaneous,
1,GTEX-1117F-0426-SM-5EGHI,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Muscle_Skeletal,Muscle - Skeletal,
2,GTEX-1117F-0526-SM-5EGHJ,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Artery_Tibial,Artery - Tibial,
3,GTEX-1117F-0626-SM-5N9CS,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Artery_Coronary,Artery - Coronary,
4,GTEX-1117F-0726-SM-5GIEN,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Heart_Atrial_Appendage,Heart - Atrial Appendage,


In [3]:
# load the terra workspace participant table data  
entityName = "participant"
participantTSV = rootDir.joinpath( entityName + ".tsv" )
participantDF = pd.read_csv(participantTSV, delimiter='\t')
participantDF.head()

Unnamed: 0,entity:participant_id,age,ase_chrX_raw_counts,ase_counts,ase_wasp_chrX_raw_counts,ase_wasp_counts,has_genotype,has_rnaseq,sex,wes_bam_file,wes_bam_index,wgs_cram_file,wgs_cram_index
0,GTEX-1117F,66,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,True,True,Female,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...
1,GTEX-111CU,57,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,True,True,Male,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...
2,GTEX-111FC,61,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,True,True,Male,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...
3,GTEX-111VG,63,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,True,True,Male,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...
4,GTEX-111YS,62,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,True,True,Male,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...


In [4]:
print("sample col names:\n{}".format( sampleDF.columns ))
print("\nparticipant col names:\n{}".format( participantDF.columns ))

sample col names:
Index(['entity:sample_id', 'aux_info', 'bam_file', 'bam_index',
       'firstEndFastq', 'participant', 'quantFile', 'secondEndFastq',
       'tissue_id', 'tissue_site_detail', 'unpairedFastq'],
      dtype='object')

participant col names:
Index(['entity:participant_id', 'age', 'ase_chrX_raw_counts', 'ase_counts',
       'ase_wasp_chrX_raw_counts', 'ase_wasp_counts', 'has_genotype',
       'has_rnaseq', 'sex', 'wes_bam_file', 'wes_bam_index', 'wgs_cram_file',
       'wgs_cram_index'],
      dtype='object')


In [5]:
# select columns of interest and rename 
colDataSampleDF = sampleDF[ ['entity:sample_id', 'participant','tissue_id', 'tissue_site_detail','quantFile'] ]
newColNames = {
    'entity:sample_id':'sample_id', 
         'participant':'participant_id',
           'tissue_id':'tissue_id',
  'tissue_site_detail':'tissue_site_detail',
           'quantFile':'quantFile'
}
colDataSampleDF = colDataSampleDF.rename(columns=newColNames)
colDataSampleDF.head()

Unnamed: 0,sample_id,participant_id,tissue_id,tissue_site_detail,quantFile
0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Adipose - Subcutaneous,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...
1,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F,Muscle_Skeletal,Muscle - Skeletal,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...
2,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Artery - Tibial,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...
3,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F,Artery_Coronary,Artery - Coronary,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...
4,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Heart - Atrial Appendage,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...


## <span style="color:red">Change quant.sf file URL</span>
should be something like gs://anvil_gtex_v8_hg38_edu_ucsc_kim_lab_spark/quant/GTEX-1117F-0226-SM-5GZZ7.quant.sf.gz

In [6]:
colDataSampleDF["quantFile"] =  BUCKET_URL + colDataSampleDF.loc[:, ["sample_id"]] +".quant.sf.gz"

In [7]:
# select columns of interest and rename
colDatParticipantDF = participantDF[ ['entity:participant_id', 'age', 'sex'] ]
newColNames = {
    'entity:participant_id':'participant_id', 
                      'age':'age', 
                      'sex':'sex'
}
colDatParticipantDF = colDatParticipantDF.rename(columns=newColNames)
colDatParticipantDF.head()

Unnamed: 0,participant_id,age,sex
0,GTEX-1117F,66,Female
1,GTEX-111CU,57,Male
2,GTEX-111FC,61,Male
3,GTEX-111VG,63,Male
4,GTEX-111YS,62,Male


In [8]:
# combine teh sample and participant tables
colDataDF = colDataSampleDF.set_index('participant_id')\
            .join( colDatParticipantDF.set_index('participant_id'), 
                  how='inner', on='participant_id')


colDataDF['participant_id'] = colDataDF.index

# reorder the columns
newOrder = [ 'sample_id', 'participant_id', 'tissue_id', 'tissue_site_detail', 'sex', 'age', 'quantFile']
colDataDF = colDataDF[ newOrder ]

print(colDataDF.head())


                               sample_id participant_id  \
participant_id                                            
GTEX-1117F      GTEX-1117F-0226-SM-5GZZ7     GTEX-1117F   
GTEX-1117F      GTEX-1117F-0426-SM-5EGHI     GTEX-1117F   
GTEX-1117F      GTEX-1117F-0526-SM-5EGHJ     GTEX-1117F   
GTEX-1117F      GTEX-1117F-0626-SM-5N9CS     GTEX-1117F   
GTEX-1117F      GTEX-1117F-0726-SM-5GIEN     GTEX-1117F   

                             tissue_id        tissue_site_detail     sex  age  \
participant_id                                                                  
GTEX-1117F        Adipose_Subcutaneous    Adipose - Subcutaneous  Female   66   
GTEX-1117F             Muscle_Skeletal         Muscle - Skeletal  Female   66   
GTEX-1117F               Artery_Tibial           Artery - Tibial  Female   66   
GTEX-1117F             Artery_Coronary         Artery - Coronary  Female   66   
GTEX-1117F      Heart_Atrial_Appendage  Heart - Atrial Appendage  Female   66   

                  

# split data set into training, validate, and test sets
[sklearn.model_selection.StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html#sklearn.model_selection.StratifiedShuffleSplit)

In [9]:
# convert to factors
print(colDataDF.columns)
colDataDF["tissue_site_detail"] = colDataDF["tissue_site_detail"].astype('category')
colDataDF["sex"]                = colDataDF["sex"].astype('category')

colDataDF.dtypes

Index(['sample_id', 'participant_id', 'tissue_id', 'tissue_site_detail', 'sex',
       'age', 'quantFile'],
      dtype='object')


sample_id               object
participant_id          object
tissue_id               object
tissue_site_detail    category
sex                   category
age                      int64
quantFile               object
dtype: object

In [10]:
# create a label variable we can use to balance the splits by tissue_id and sex
a = colDataDF["tissue_id"].astype(str).values
b = colDataDF["sex"].astype(str).values
colDataDF["label"] = ( a + "_" + b)
colDataDF.head()

Unnamed: 0_level_0,sample_id,participant_id,tissue_id,tissue_site_detail,sex,age,quantFile,label
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GTEX-1117F,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Adipose - Subcutaneous,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Adipose_Subcutaneous_Female
GTEX-1117F,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F,Muscle_Skeletal,Muscle - Skeletal,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Muscle_Skeletal_Female
GTEX-1117F,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Artery - Tibial,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Artery_Tibial_Female
GTEX-1117F,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F,Artery_Coronary,Artery - Coronary,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Artery_Coronary_Female
GTEX-1117F,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Heart - Atrial Appendage,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Heart_Atrial_Appendage_Female


In [11]:
colDataDF.groupby( ["label"] )['label'].agg('count')

label
Adipose_Subcutaneous_Female        218
Adipose_Subcutaneous_Male          445
Adipose_Visceral_Omentum_Female    170
Adipose_Visceral_Omentum_Male      371
Adrenal_Gland_Female               101
                                  ... 
Thyroid_Male                       434
Uterus_Female                      142
Vagina_Female                      156
Whole_Blood_Female                 254
Whole_Blood_Male                   501
Name: label, Length: 100, dtype: int64

## remove classes that only have a few examples or are not balance

In [12]:
key = 'display.max_rows'
default = pd.get_option(key)
pd.set_option(key, None) # None
labelCountsSeries = colDataDF.groupby( ["label"] )['label'].agg('count')

In [13]:
print(labelCountsSeries.describe())
print()
bq = 0.05
bottomQuantile = labelCountsSeries.quantile(q=bq)
print("bottomQuantile {} = {}".format(bq, bottomQuantile))

count    100.000000
mean     173.820000
std      123.786912
min        1.000000
25%       72.250000
50%      153.500000
75%      228.750000
max      543.000000
Name: label, dtype: float64

bottomQuantile 0.05 = 9.95


In [14]:
selectRowsBellow = labelCountsSeries < bottomQuantile
print("\nbellow bottomQuantile {} = {}".format(bq, bottomQuantile))
print(labelCountsSeries[ selectRowsBellow ])


bellow bottomQuantile 0.05 = 9.95
label
Bladder_Female              7
Cervix_Ectocervix_Female    9
Fallopian_Tube_Female       9
Kidney_Medulla_Female       1
Kidney_Medulla_Male         3
Name: label, dtype: int64


In [15]:
selectRowsAbove =  [not x for x in selectRowsBellow]
labelsAboveThreshold = labelCountsSeries[ selectRowsAbove ]
labelsAboveThreshold.sort_values()

label
Cervix_Endocervix_Female                         10
Bladder_Male                                     14
Kidney_Cortex_Female                             19
Brain_Substantia_nigra_Female                    38
Brain_Amygdala_Female                            45
Minor_Salivary_Gland_Female                      47
Brain_Anterior_cingulate_cortex_BA24_Female      48
Brain_Putamen_basal_ganglia_Female               49
Brain_Hippocampus_Female                         54
Brain_Hypothalamus_Female                        55
Brain_Frontal_Cortex_BA9_Female                  56
Brain_Spinal_cord_cervical_c-1_Female            57
Brain_Cerebellar_Hemisphere_Female               58
Cells_EBV-transformed_lymphocytes_Female         62
Brain_Caudate_basal_ganglia_Female               63
Brain_Nucleus_accumbens_basal_ganglia_Female     64
Liver_Female                                     65
Kidney_Cortex_Male                               66
Small_Intestine_Terminal_Ileum_Female            67
Brain_

In [16]:
labelsAboveThreshold.index 
selectRows = colDataDF["label"].head().isin( labelsAboveThreshold.index )
print(selectRows)
colDataDF.head().loc[ selectRows  ]

participant_id
GTEX-1117F    True
GTEX-1117F    True
GTEX-1117F    True
GTEX-1117F    True
GTEX-1117F    True
Name: label, dtype: bool


Unnamed: 0_level_0,sample_id,participant_id,tissue_id,tissue_site_detail,sex,age,quantFile,label
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GTEX-1117F,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Adipose - Subcutaneous,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Adipose_Subcutaneous_Female
GTEX-1117F,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F,Muscle_Skeletal,Muscle - Skeletal,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Muscle_Skeletal_Female
GTEX-1117F,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Artery - Tibial,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Artery_Tibial_Female
GTEX-1117F,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F,Artery_Coronary,Artery - Coronary,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Artery_Coronary_Female
GTEX-1117F,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Heart - Atrial Appendage,Female,66,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Heart_Atrial_Appendage_Female


## split out a 60 % of the samples into a training set

In [17]:
theMeaningOfLife = 42
# n_splits = 1, we are not doing cross validation
split = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=theMeaningOfLife)
selectRows = colDataDF["label"].isin( labelsAboveThreshold.index )
X = colDataDF.loc[ selectRows  ].to_numpy()
y = colDataDF.loc[ selectRows, ['label'] ].to_numpy()
print("colDataDF.shape:{}".format(colDataDF.shape))
print("X.shape:{}".format(X.shape))
print("y.shape:{}".format(y.shape))


for train_index, not_train_index in split.split(X, y):
    X_train = X[train_index]
    X_not_train = X[not_train_index]
    
    y_train = y[train_index]
    y_not_train =y[not_train_index]

colDataDF.shape:(17382, 8)
X.shape:(17353, 8)
y.shape:(17353, 1)


In [18]:
print("AEDWIP X_train.shape:{}".format(X_train.shape))
print("AEDWIP y_train.shape:{}".format(y_train.shape))
print()
print("AEDWIP X_not_train.shape:{}".format(X_not_train.shape))
print("AEDWIP y_not_train.shape:{}".format(y_not_train.shape))

print( y_train[0:5] )


trainColDataDF = pd.DataFrame( X_train, columns=colDataDF.columns)
trainColDataDF.head()

AEDWIP X_train.shape:(10411, 8)
AEDWIP y_train.shape:(10411, 1)

AEDWIP X_not_train.shape:(6942, 8)
AEDWIP y_not_train.shape:(6942, 1)
[['Skin_Not_Sun_Exposed_Suprapubic_Male']
 ['Adipose_Visceral_Omentum_Male']
 ['Liver_Male']
 ['Heart_Atrial_Appendage_Male']
 ['Skin_Sun_Exposed_Lower_leg_Male']]


Unnamed: 0,sample_id,participant_id,tissue_id,tissue_site_detail,sex,age,quantFile,label
0,GTEX-11NSD-0626-SM-5A5LU,GTEX-11NSD,Skin_Not_Sun_Exposed_Suprapubic,Skin - Not Sun Exposed (Suprapubic),Male,27,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Skin_Not_Sun_Exposed_Suprapubic_Male
1,GTEX-1N2DW-0726-SM-DTX97,GTEX-1N2DW,Adipose_Visceral_Omentum,Adipose - Visceral (Omentum),Male,58,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Adipose_Visceral_Omentum_Male
2,GTEX-1PIIG-1326-SM-EXOIR,GTEX-1PIIG,Liver,Liver,Male,34,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Liver_Male
3,GTEX-11DXZ-0326-SM-5EGH1,GTEX-11DXZ,Heart_Atrial_Appendage,Heart - Atrial Appendage,Male,56,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Heart_Atrial_Appendage_Male
4,GTEX-1QMI2-0226-SM-EVYBX,GTEX-1QMI2,Skin_Sun_Exposed_Lower_leg,Skin - Sun Exposed (Lower leg),Male,67,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Skin_Sun_Exposed_Lower_leg_Male


## split the not_training files into validation and test sets

In [19]:
# n_splits = 1, we are not doing cross validation
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
for validate_index, test_index in split.split(X_not_train, y_not_train):
    X_validate = X[validate_index]
    X_test = X[test_index]
    
    y_validate = y[validate_index]
    y_test =y[test_index]
    

In [20]:
validateColDataDF = pd.DataFrame( X_validate, columns=colDataDF.columns)
print(validateColDataDF.shape)
validateColDataDF.head()

(3471, 8)


Unnamed: 0,sample_id,participant_id,tissue_id,tissue_site_detail,sex,age,quantFile,label
0,GTEX-13OVL-1326-SM-5IJCZ,GTEX-13OVL,Artery_Tibial,Artery - Tibial,Male,50,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Artery_Tibial_Male
1,GTEX-1477Z-0011-R10b-SM-5S2RF,GTEX-1477Z,Brain_Frontal_Cortex_BA9,Brain - Frontal Cortex (BA9),Male,65,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Brain_Frontal_Cortex_BA9_Male
2,GTEX-139UW-0326-SM-5J1ML,GTEX-139UW,Skin_Not_Sun_Exposed_Suprapubic,Skin - Not Sun Exposed (Suprapubic),Male,62,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Skin_Not_Sun_Exposed_Suprapubic_Male
3,GTEX-13SLW-0626-SM-5Q5ER,GTEX-13SLW,Skin_Not_Sun_Exposed_Suprapubic,Skin - Not Sun Exposed (Suprapubic),Male,70,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Skin_Not_Sun_Exposed_Suprapubic_Male
4,GTEX-11P7K-0008-SM-5S2O5,GTEX-11P7K,Cells_Cultured_fibroblasts,Cells - Cultured fibroblasts,Male,37,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Cells_Cultured_fibroblasts_Male


In [21]:
testColDataDF = pd.DataFrame( X_test, columns=colDataDF.columns)
print(testColDataDF.shape)
testColDataDF.head()

(3471, 8)


Unnamed: 0,sample_id,participant_id,tissue_id,tissue_site_detail,sex,age,quantFile,label
0,GTEX-12WSF-0011-R7b-SM-5HL99,GTEX-12WSF,Brain_Putamen_basal_ganglia,Brain - Putamen (basal ganglia),Male,70,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Brain_Putamen_basal_ganglia_Male
1,GTEX-139TU-1126-SM-5J1NY,GTEX-139TU,Artery_Aorta,Artery - Aorta,Male,64,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Artery_Aorta_Male
2,GTEX-132Q8-0011-R11b-SM-5DUW9,GTEX-132Q8,Brain_Cerebellar_Hemisphere,Brain - Cerebellar Hemisphere,Male,68,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Brain_Cerebellar_Hemisphere_Male
3,GTEX-11P82-1326-SM-5HL62,GTEX-11P82,Breast_Mammary_Tissue,Breast - Mammary Tissue,Male,21,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Breast_Mammary_Tissue_Male
4,GTEX-17F9Y-1426-SM-7IGOO,GTEX-17F9Y,Artery_Coronary,Artery - Coronary,Female,70,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...,Artery_Coronary_Female


# save  files

In [22]:
dfDict = {
    "train"    : trainColDataDF,
    "validate" : validateColDataDF,
    "test"     : testColDataDF,

}

for key in dfDict:
    colDataDF = dfDict[ key ]
    
    # create quantFile csv file
    quantFileDF = colDataDF[ ['sample_id','quantFile'] ]
    
    newColNames = {
    'sample_id':'sampleName', 
           'quantFile':'source'
    }
    quantFileDF = quantFileDF.rename( columns=newColNames )
    f = outputDir.joinpath( outfilePrefix + "-" + key + "QuantFiles.csv" )
    print("\nwriting file:{}".format(f))
    quantFileDF.sort_values(by=['sampleName']).to_csv(f, index=False)
    
    # create colData csv file
    # drop columns by Name
    df = colDataDF.drop(  ['quantFile', 'label'] , axis=1 )
    f = outputDir.joinpath( outfilePrefix + "-" + key + "ColData.csv" )
    print("writing file:{}".format(f))    
    df.sort_values(by=['sample_id']).to_csv(f, index=False)



writing file:../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/createDESeqData.output/spark/GTEx-trainQuantFiles.csv
writing file:../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/createDESeqData.output/spark/GTEx-trainColData.csv

writing file:../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/createDESeqData.output/spark/GTEx-validateQuantFiles.csv
writing file:../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/createDESeqData.output/spark/GTEx-validateColData.csv

writing file:../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/createDESeqData.output/spark/GTEx-testQuantFiles.csv
writing file:../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/createDESeqData.output/spark/GTEx-testColData.csv


# split GTEx-trainQuantFiles.csv into multiple batches

In [23]:
key = "train"
colDataDF = dfDict[ key ]
quantFileDF = colDataDF[ ['sample_id','quantFile'] ]
numRows, numColumns = quantFileDF.shape
print("numRows:{} numColumns:{}".format(numRows, numColumns))

batchDict = dict()
numberOfBatches = 10
numRowsInBatch = numRows // numberOfBatches
numRemaining = numRows % numberOfBatches
print("numRowsInBatch:{} numRemaining:{}".format(numRowsInBatch, numRemaining))

print()
startingRowIdx = 0
for i in range(numberOfBatches):
    lastRowIdx = startingRowIdx + numRowsInBatch
    # select rows where startingRowIdx >= idx < lastRowIdx
    print("\nstartingRowIdx:{} lastRowIdx:{}".format(startingRowIdx, lastRowIdx))
    batchQuantFileDF = quantFileDF.iloc[startingRowIdx:lastRowIdx, :]
    batchDict[i] = batchQuantFileDF
    startingRowIdx = lastRowIdx #+ 1
    print("i:{} num batch rows:{} ".format(i, batchQuantFileDF.shape[0]))

if numRemaining > 0:
    i += 1
    startingRowIdx = lastRowIdx #+ 1
    lastRowIdx = lastRowIdx - numRowsInBatch + numRemaining
    print("\nxxx startingRowIdx:{} ".format(startingRowIdx))    
    batchQuantFileDF = quantFileDF.iloc[startingRowIdx:, :]
    batchDict[i] = batchQuantFileDF
    print("i:{} num batch rows:{} ".format(i, batchQuantFileDF.shape[0]))
    
    
for key,df in batchDict.items():
    newColNames = {
    'sample_id':'sampleName', 
           'quantFile':'source'
    }
    df = df.rename( columns=newColNames )
    f = outputDir.joinpath( outfilePrefix + "-batch-" + str(key) + "-QuantFiles.csv" )
    print("\nwriting file:{}".format(f))
    df.sort_values(by=['sampleName']).to_csv(f, index=False)
    

numRows:10411 numColumns:2
numRowsInBatch:1041 numRemaining:1


startingRowIdx:0 lastRowIdx:1041
i:0 num batch rows:1041 

startingRowIdx:1041 lastRowIdx:2082
i:1 num batch rows:1041 

startingRowIdx:2082 lastRowIdx:3123
i:2 num batch rows:1041 

startingRowIdx:3123 lastRowIdx:4164
i:3 num batch rows:1041 

startingRowIdx:4164 lastRowIdx:5205
i:4 num batch rows:1041 

startingRowIdx:5205 lastRowIdx:6246
i:5 num batch rows:1041 

startingRowIdx:6246 lastRowIdx:7287
i:6 num batch rows:1041 

startingRowIdx:7287 lastRowIdx:8328
i:7 num batch rows:1041 

startingRowIdx:8328 lastRowIdx:9369
i:8 num batch rows:1041 

startingRowIdx:9369 lastRowIdx:10410
i:9 num batch rows:1041 

xxx startingRowIdx:10410 
i:10 num batch rows:1 

writing file:../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/createDESeqData.output/spark/GTEx-batch-0-QuantFiles.csv

writing file:../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/createDESeqData.output/s

In [24]:
testDF = batchDict[0].head().copy(deep=True)
print(testDF)
# testDF["aedwip"] =  testDF.loc[:, ["sample_id"]] + "-AEDWIP-"
# testDF["aedwip"] =  "aedwip-" + testDF.loc[:, ["sample_id"]]
testDF["quantFile"] =  "aedwip-" + testDF.loc[:, ["sample_id"]]
print("")
print(testDF)
batchDict[0].head()

                  sample_id                                          quantFile
0  GTEX-11NSD-0626-SM-5A5LU  gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
1  GTEX-1N2DW-0726-SM-DTX97  gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
2  GTEX-1PIIG-1326-SM-EXOIR  gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
3  GTEX-11DXZ-0326-SM-5EGH1  gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
4  GTEX-1QMI2-0226-SM-EVYBX  gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...

                  sample_id                        quantFile
0  GTEX-11NSD-0626-SM-5A5LU  aedwip-GTEX-11NSD-0626-SM-5A5LU
1  GTEX-1N2DW-0726-SM-DTX97  aedwip-GTEX-1N2DW-0726-SM-DTX97
2  GTEX-1PIIG-1326-SM-EXOIR  aedwip-GTEX-1PIIG-1326-SM-EXOIR
3  GTEX-11DXZ-0326-SM-5EGH1  aedwip-GTEX-11DXZ-0326-SM-5EGH1
4  GTEX-1QMI2-0226-SM-EVYBX  aedwip-GTEX-1QMI2-0226-SM-EVYBX


Unnamed: 0,sample_id,quantFile
0,GTEX-11NSD-0626-SM-5A5LU,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
1,GTEX-1N2DW-0726-SM-DTX97,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
2,GTEX-1PIIG-1326-SM-EXOIR,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
3,GTEX-11DXZ-0326-SM-5EGH1,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
4,GTEX-1QMI2-0226-SM-EVYBX,gs://anvil-gtex-v8-hg38-edu-ucsc-kim-lab-spark...
