# Create Data set for PAAD vs Pancreas 1vsAll Analysis

```
Andrew E. Davidson
aedavids@ucsc.edu
3/2/23
```

- 107 PAAD samples
- 197 Pancreas samples

output
- /private/groups/kimlab/natureBioMedEng/PAAD.vs.pancrease/pancreasePAADGroupByGenseCountMatrixData.csv
- /private/groups/kimlab/natureBioMedEng/PAAD.vs.pancrease/pancreasePAADColData.csv

In [1]:
import pandas as pd
import pathlib as pl

# use display() to print an html version of a data frame
# useful if the dataFrame output is not generated by the last line of the cell
from IPython.display import display

# Every file produced by this notebook is written beneath this directory.
outputDirPl = pl.Path("/private/groups/kimlab/natureBioMedEng").joinpath("PAAD.vs.pancrease")
# Create missing parent directories too; re-running the cell on an existing
# directory is not an error.
outputDirPl.mkdir(exist_ok=True, parents=True)

In [2]:
# Locations of the input files (nothing is read in this cell).

# GTEx: sample metadata (colData) and gene-level count matrix
GTExDataRootPl = pl.Path('/private/groups/kimlab/GTEx')
GTExColDataPl = GTExDataRootPl / "GTExTrainColData.csv"
GTExGroupByGenseCountMatrixDataPl = GTExDataRootPl / "GTExTrainGroupByGenesCountMatrix.csv"

# TCGA: sample metadata (colData)
TCGA_ColDataTrainingDatasetRootPl = pl.Path('/private/groups/kimlab/TCGA/colData/trainingDataSets/')
TCGA_ColDataPl = TCGA_ColDataTrainingDatasetRootPl / "TCGA-TrainColData.csv"

# TCGA: gene-level count matrix
TCGA_DataTrainingDatasetRootPl = pl.Path('/private/groups/kimlab/TCGA/trainingDataSets/')
TCGA_GroupByGenseCountMatrixDataPl = TCGA_DataTrainingDatasetRootPl / "TCGA-TrainGroupByGeneId.csv"

# Select colData for TCGA PAAD

In [3]:
# Read the TCGA sample metadata (colData) and preview the first five rows.
tcgaColDataDF = pd.read_csv(filepath_or_buffer=TCGA_ColDataPl)
tcgaColDataDF.head(n=5)

Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
0,ACC-OR-A5J1-TP,ACC-OR-A5J1,TCGA-OR-A5J1-01,ACC,58.0,male,TP
1,ACC-OR-A5J2-TP,ACC-OR-A5J2,TCGA-OR-A5J2-01,ACC,44.0,female,TP
2,ACC-OR-A5J3-TP,ACC-OR-A5J3,TCGA-OR-A5J3-01,ACC,23.0,female,TP
3,ACC-OR-A5J6-TP,ACC-OR-A5J6,TCGA-OR-A5J6-01,ACC,29.0,female,TP
4,ACC-OR-A5J7-TP,ACC-OR-A5J7,TCGA-OR-A5J7-01,ACC,30.0,female,TP


In [4]:
# List the distinct cohort labels so we can confirm how PAAD is spelled.
tcgaColDataDF['Cohort'].unique()

array(['ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA',
       'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LGG', 'LIHC', 'LUAD',
       'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC',
       'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM'],
      dtype=object)

In [5]:
# Boolean mask that is True for the rows whose cohort is PAAD.
selectRows = tcgaColDataDF['Cohort'] == 'PAAD'
PAADColDataDF = tcgaColDataDF.loc[selectRows, :]

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None".
PAADColDataDF.info()
PAADColDataDF.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107 entries, 3738 to 3844
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sample_id       107 non-null    object 
 1   participant_id  107 non-null    object 
 2   tcga_sample_id  107 non-null    object 
 3   Cohort          107 non-null    object 
 4   Age             107 non-null    float64
 5   Gender          107 non-null    object 
 6   sample_type     107 non-null    object 
dtypes: float64(1), object(6)
memory usage: 6.7+ KB
None


Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
3738,PAAD-2J-AAB4-TP,PAAD-2J-AAB4,TCGA-2J-AAB4-01,PAAD,48.0,male,TP
3739,PAAD-2J-AAB6-TP,PAAD-2J-AAB6,TCGA-2J-AAB6-01,PAAD,75.0,male,TP
3740,PAAD-2J-AAB9-TP,PAAD-2J-AAB9,TCGA-2J-AAB9-01,PAAD,70.0,female,TP
3741,PAAD-2J-AABA-TP,PAAD-2J-AABA,TCGA-2J-AABA-01,PAAD,55.0,male,TP
3742,PAAD-2J-AABE-TP,PAAD-2J-AABE,TCGA-2J-AABE-01,PAAD,73.0,male,TP


# Select colData for Pancreas

In [6]:
# Read the GTEx sample metadata (colData) and preview the first five rows.
gtexColDataDF = pd.read_csv(filepath_or_buffer=GTExColDataPl)
gtexColDataDF.head(n=5)

Unnamed: 0,sample_id,participant_id,tissue_id,tissue_site_detail,sex,age
0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Adipose - Subcutaneous,Female,66
1,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Artery - Tibial,Female,66
2,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Heart - Atrial Appendage,Female,66
3,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F,Breast_Mammary_Tissue,Breast - Mammary Tissue,Female,66
4,GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,Brain_Cortex,Brain - Cortex,Female,66


In [7]:
# List the distinct tissue ids so we can confirm how Pancreas is spelled.
gtexColDataDF['tissue_id'].unique()

array(['Adipose_Subcutaneous', 'Artery_Tibial', 'Heart_Atrial_Appendage',
       'Breast_Mammary_Tissue', 'Brain_Cortex', 'Lung', 'Spleen',
       'Pancreas', 'Esophagus_Muscularis',
       'Esophagus_Gastroesophageal_Junction',
       'Skin_Not_Sun_Exposed_Suprapubic',
       'Small_Intestine_Terminal_Ileum', 'Colon_Transverse', 'Testis',
       'Nerve_Tibial', 'Skin_Sun_Exposed_Lower_leg', 'Muscle_Skeletal',
       'Heart_Left_Ventricle', 'Prostate', 'Minor_Salivary_Gland',
       'Brain_Cerebellum', 'Thyroid', 'Cells_Cultured_fibroblasts',
       'Adrenal_Gland', 'Adipose_Visceral_Omentum',
       'Cells_EBV-transformed_lymphocytes', 'Whole_Blood', 'Artery_Aorta',
       'Esophagus_Mucosa', 'Stomach', 'Vagina', 'Uterus', 'Pituitary',
       'Colon_Sigmoid', 'Brain_Frontal_Cortex_BA9',
       'Brain_Nucleus_accumbens_basal_ganglia',
       'Brain_Putamen_basal_ganglia', 'Brain_Hypothalamus',
       'Brain_Hippocampus', 'Artery_Coronary',
       'Brain_Cerebellar_Hemisphere', 'Liver',

In [8]:
# Boolean mask that is True for the rows whose tissue is Pancreas.
selectRows = gtexColDataDF['tissue_id'] == 'Pancreas'
PancreasColDataDF = gtexColDataDF.loc[selectRows, :]

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None".
PancreasColDataDF.info()
PancreasColDataDF.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197 entries, 7 to 10406
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   sample_id           197 non-null    object
 1   participant_id      197 non-null    object
 2   tissue_id           197 non-null    object
 3   tissue_site_detail  197 non-null    object
 4   sex                 197 non-null    object
 5   age                 197 non-null    int64 
dtypes: int64(1), object(5)
memory usage: 10.8+ KB
None


Unnamed: 0,sample_id,participant_id,tissue_id,tissue_site_detail,sex,age
7,GTEX-111CU-0526-SM-5EGHK,GTEX-111CU,Pancreas,Pancreas,Male,57
40,GTEX-111YS-1226-SM-5EGGJ,GTEX-111YS,Pancreas,Pancreas,Male,62
68,GTEX-1128S-0826-SM-5GZZI,GTEX-1128S,Pancreas,Pancreas,Female,66
149,GTEX-11DXX-0926-SM-5H112,GTEX-11DXX,Pancreas,Pancreas,Female,66
311,GTEX-11GSP-0426-SM-5A5KX,GTEX-11GSP,Pancreas,Pancreas,Female,66


# Select the PAAD samples

In [9]:
# Read the complete TCGA count matrix: a 'geneId' column followed by one
# column of counts per sample.
TCGAGroupByGenseCountMatrixDF = pd.read_csv(TCGA_GroupByGenseCountMatrixDataPl)
# info() prints its own report and returns None; do not wrap it in print().
TCGAGroupByGenseCountMatrixDF.info()
# Preview only a small corner; the frame is far too wide to display whole.
TCGAGroupByGenseCountMatrixDF.iloc[0:3, 0:3]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 5401 entries, geneId to UVM-YZ-A985-TP
dtypes: int64(5400), object(1)
memory usage: 3.0+ GB
None


Unnamed: 0,geneId,ACC-OR-A5J1-TP,ACC-OR-A5J2-TP
0,(A)n,7,27
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


In [10]:
# Show the (abbreviated) column index: 'geneId' followed by the sample ids.
tcgaCountMatrixColumns = TCGAGroupByGenseCountMatrixDF.columns
print(tcgaCountMatrixColumns)

Index(['geneId', 'ACC-OR-A5J1-TP', 'ACC-OR-A5J2-TP', 'ACC-OR-A5J3-TP',
       'ACC-OR-A5J6-TP', 'ACC-OR-A5J7-TP', 'ACC-OR-A5J8-TP', 'ACC-OR-A5JB-TP',
       'ACC-OR-A5JF-TP', 'ACC-OR-A5JI-TP',
       ...
       'UVM-WC-A881-TP', 'UVM-WC-A882-TP', 'UVM-WC-A883-TP', 'UVM-WC-A884-TP',
       'UVM-WC-A888-TP', 'UVM-WC-AA9E-TP', 'UVM-YZ-A980-TP', 'UVM-YZ-A983-TP',
       'UVM-YZ-A984-TP', 'UVM-YZ-A985-TP'],
      dtype='object', length=5401)


In [11]:
# Columns to keep: the gene identifier plus one column per PAAD sample.
PAADSampleIdList = ['geneId'] + PAADColDataDF.loc[:, 'sample_id'].values.tolist()
print(f'len(PAADSampleIdList) : {len(PAADSampleIdList)}' )
print(f'PAADSampleIdList[0:3] : {PAADSampleIdList[0:3]}')

# Samples are columns in the count matrix, so select by column label.
# .loc raises a KeyError if any requested sample id is missing.
PAADSamplesDF = TCGAGroupByGenseCountMatrixDF.loc[:, PAADSampleIdList]

# info() prints its own report and returns None; embedding the call in an
# f-string printed the report first and then "... : None".
print('PAADSamplesDF.info() :')
PAADSamplesDF.info()

PAADSamplesDF.iloc[0:3, 0:3]

len(PAADSampleIdList) : 108
PAADSampleIdList[0:3] : ['geneId', 'PAAD-2J-AAB4-TP', 'PAAD-2J-AAB6-TP']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 108 entries, geneId to PAAD-XN-A8T5-TP
dtypes: int64(107), object(1)
memory usage: 61.6+ MB
PAADSamplesDF.info() : None


Unnamed: 0,geneId,PAAD-2J-AAB4-TP,PAAD-2J-AAB6-TP
0,(A)n,6,42
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


# Select the Pancreas samples

In [12]:
# Read the complete GTEx count matrix: a 'geneId' column followed by one
# column of counts per sample.
GTExGroupByGenseCountMatrixDF = pd.read_csv(GTExGroupByGenseCountMatrixDataPl)
# info() prints its own report and returns None; do not wrap it in print().
GTExGroupByGenseCountMatrixDF.info()
# Preview only a small corner; the frame is far too wide to display whole.
GTExGroupByGenseCountMatrixDF.iloc[0:3, 0:3]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 10412 entries, geneId to GTEX-ZZPU-2726-SM-5NQ8O
dtypes: int64(10411), object(1)
memory usage: 5.8+ GB
None


Unnamed: 0,geneId,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ
0,(A)n,9,1
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


In [13]:
# Columns to keep: the gene identifier plus one column per Pancreas sample.
pancreasSampleIdList = ['geneId'] + PancreasColDataDF.loc[:, 'sample_id'].values.tolist()
print(f'len(pancreasSampleIdList) : {len(pancreasSampleIdList)}' )
print(f'pancreasSampleIdList[0:3] : {pancreasSampleIdList[0:3]}')

# Samples are columns in the count matrix, so select by column label.
# .loc raises a KeyError if any requested sample id is missing.
pancreaseSamplesDF = GTExGroupByGenseCountMatrixDF.loc[:, pancreasSampleIdList]

# info() prints its own report and returns None; embedding the call in an
# f-string printed the report first and then "... : None".
print('pancreaseSamplesDF.info() :')
pancreaseSamplesDF.info()

pancreaseSamplesDF.iloc[0:3, 0:3]

len(pancreasSampleIdList) : 198
pancreasSampleIdList[0:3] : ['geneId', 'GTEX-111CU-0526-SM-5EGHK', 'GTEX-111YS-1226-SM-5EGGJ']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 198 entries, geneId to GTEX-ZZPU-0726-SM-5N9C8
dtypes: int64(197), object(1)
memory usage: 113.0+ MB
pancreaseSamplesDF.info() : None


Unnamed: 0,geneId,GTEX-111CU-0526-SM-5EGHK,GTEX-111YS-1226-SM-5EGGJ
0,(A)n,1,1
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


# Combine PAAD and Pancreas samples into a single dataframe

In [14]:
# Check that the geneIds are in the same order in both matrices. The frames are
# pasted together side by side below, so row i must describe the same gene in each.
# Series.equals compares length and values in one step, so the check no longer
# depends on a hard-coded gene count and fails cleanly on a length mismatch.
pancreasGeneIds = pancreaseSamplesDF.loc[:, "geneId"]
paadGeneIds = PAADSamplesDF.loc[:, "geneId"]
assert pancreasGeneIds.equals(paadGeneIds), "ERROR gene ids are not aligned"

In [15]:
# we do not want to have two copies of the 'geneId' col
# 'geneId' is the first column of PAADSamplesDF, so keep everything after it.
paadSamplesDF = PAADSamplesDF.iloc[:, 1:]
# info() prints its own report and returns None; do not wrap it in print().
paadSamplesDF.info()
print(paadSamplesDF.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 107 entries, PAAD-2J-AAB4-TP to PAAD-XN-A8T5-TP
dtypes: int64(107)
memory usage: 61.0 MB
None
Index(['PAAD-2J-AAB4-TP', 'PAAD-2J-AAB6-TP', 'PAAD-2J-AAB9-TP',
       'PAAD-2J-AABA-TP', 'PAAD-2J-AABE-TP', 'PAAD-2J-AABF-TP',
       'PAAD-2J-AABH-TP', 'PAAD-2J-AABP-TP', 'PAAD-2J-AABR-TP',
       'PAAD-2J-AABT-TP',
       ...
       'PAAD-US-A774-TP', 'PAAD-US-A776-TP', 'PAAD-US-A779-TP',
       'PAAD-US-A77E-TP', 'PAAD-US-A77G-TP', 'PAAD-XD-AAUG-TP',
       'PAAD-XD-AAUH-TP', 'PAAD-XD-AAUI-TP', 'PAAD-XD-AAUL-TP',
       'PAAD-XN-A8T5-TP'],
      dtype='object', length=107)


In [16]:
byCols = 1 # ie unix paste
# Place the PAAD sample columns to the right of the pancreas columns; the
# pancreas frame contributes the single 'geneId' column.
samplesDF = pd.concat([pancreaseSamplesDF, paadSamplesDF], axis=byCols)
# info() prints its own report and returns None; do not wrap it in print().
samplesDF.info()
print(samplesDF.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 305 entries, geneId to PAAD-XN-A8T5-TP
dtypes: int64(304), object(1)
memory usage: 174.0+ MB
None
Index(['geneId', 'GTEX-111CU-0526-SM-5EGHK', 'GTEX-111YS-1226-SM-5EGGJ',
       'GTEX-1128S-0826-SM-5GZZI', 'GTEX-11DXX-0926-SM-5H112',
       'GTEX-11GSP-0426-SM-5A5KX', 'GTEX-11LCK-0226-SM-5A5M6',
       'GTEX-11NSD-0526-SM-5A5LT', 'GTEX-11ONC-0526-SM-5BC57',
       'GTEX-11XUK-0626-SM-5N9ES',
       ...
       'PAAD-US-A774-TP', 'PAAD-US-A776-TP', 'PAAD-US-A779-TP',
       'PAAD-US-A77E-TP', 'PAAD-US-A77G-TP', 'PAAD-XD-AAUG-TP',
       'PAAD-XD-AAUH-TP', 'PAAD-XD-AAUI-TP', 'PAAD-XD-AAUL-TP',
       'PAAD-XN-A8T5-TP'],
      dtype='object', length=305)


In [17]:
# Expect 'geneId' + 197 pancreas samples + 107 PAAD samples = 305 columns.
assert len(samplesDF.columns) == 305, "ERROR number of columns is wrong"

# Save samples

In [18]:
# we need to make sure the sample columns and colData are in sort order
# Sort every column except the leading 'geneId' by sample id.
sampleSortOrderList = sorted( samplesDF.columns[1:].values.tolist() )
print(type(sampleSortOrderList))
print(f"len(sampleSortOrderList) : {len(sampleSortOrderList)}")
print(f"sampleSortOrderList[0:3] : {sampleSortOrderList[0:3]}")

# Keep 'geneId' first, followed by the samples in sorted order.
colOrder = ['geneId'] + sampleSortOrderList
saveSamplesDF = samplesDF.loc[:, colOrder]

# info() prints its own report and returns None; embedding the call in an
# f-string printed the report first and then "... : None".
print('saveSamplesDF.info() :')
saveSamplesDF.info()
saveSamplesDF.iloc[0:3, 0:3]

<class 'list'>
len(sampleSortOrderList) : 304
sampleSortOrderList[0:3] : ['GTEX-111CU-0526-SM-5EGHK', 'GTEX-111YS-1226-SM-5EGGJ', 'GTEX-1128S-0826-SM-5GZZI']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 305 entries, geneId to PAAD-XN-A8T5-TP
dtypes: int64(304), object(1)
memory usage: 174.0+ MB
saveSamplesDF.info() : None


Unnamed: 0,geneId,GTEX-111CU-0526-SM-5EGHK,GTEX-111YS-1226-SM-5EGGJ
0,(A)n,1,1
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


In [19]:
# Write the combined count matrix; the row index is omitted so that 'geneId'
# is the first column of the CSV.
sampleFilePathPl = outputDirPl / "pancreasePAADGroupByGenseCountMatrixData.csv"
saveSamplesDF.to_csv(path_or_buf=sampleFilePathPl, index=False)

In [20]:
# sanity check: compare the in-memory column count with what was written to disk
numCols = len(saveSamplesDF.columns)
print(numCols)
# '!' runs a shell command; IPython substitutes $sampleFilePathPl with the Python value.
# Show the first three comma-separated fields of the first lines of the file.
! head $sampleFilePathPl | cut -d , -f 1,2,3
print()
# Show the last fields; asking for field 306 as well confirms the file stops at 305.
! head $sampleFilePathPl | cut -d , -f 303,304,305,306 # there is no 306 col

305
geneId,GTEX-111CU-0526-SM-5EGHK,GTEX-111YS-1226-SM-5EGGJ
(A)n,1,1
(AAA)n,0,0
(AAAAAAC)n,0,0
(AAAAAAG)n,0,0
(AAAAAAT)n,0,0
(AAAAAC)n,0,0
(AAAAACA)n,0,0
(AAAAACC)n,0,0
(AAAAACT)n,0,0

PAAD-XD-AAUI-TP,PAAD-XD-AAUL-TP,PAAD-XN-A8T5-TP
32,22,11
0,0,0
0,0,0
0,0,0
0,0,0
0,0,1
0,0,0
0,0,0
0,0,0


# Save ColData

In [21]:
# Prepare both colData frames so that they share the same column names.
print(PancreasColDataDF.columns)
print(PAADColDataDF.columns)

# GTEx columns we keep, and the TCGA-style name each one is mapped to.
gtexKeepCols = ['sample_id', 'participant_id', 'tissue_id', 'sex', 'age']
gtexToTcgaNames = {'tissue_id' : 'Cohort', 'sex' : 'Gender', 'age' : 'Age'}
tmpPancreasColDataDF = PancreasColDataDF.loc[:, gtexKeepCols]
savePancreasColDataDF = tmpPancreasColDataDF.rename(columns=gtexToTcgaNames)

# TCGA already uses the target names; just keep the matching subset of columns.
tcgaKeepCols = ['sample_id', 'participant_id', 'Cohort', 'Age', 'Gender']
savePAADColDataDF = PAADColDataDF.loc[:, tcgaKeepCols]

Index(['sample_id', 'participant_id', 'tissue_id', 'tissue_site_detail', 'sex',
       'age'],
      dtype='object')
Index(['sample_id', 'participant_id', 'tcga_sample_id', 'Cohort', 'Age',
       'Gender', 'sample_type'],
      dtype='object')


### bug fix
ref : https://bioconductor.org/packages/3.7/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#model-matrix-not-full-rank

When we run DESeq2 with the model `~ + Gender + Cohort`, we get the following error:
```
converting counts to integer mode
Error in checkFullRank(modelMatrix) : 
  the model matrix is not full rank, so the model cannot be fit as specified.
  One or more variables or interaction terms in the design formula are linear
  combinations of the others and must be removed.
 ```
 
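 This happens because TCGA spells Gender in lower case ('female', 'male') while GTEx capitalizes it
 ('Female', 'Male'), as the counts below show. Every Gender level therefore occurs in exactly one Cohort,
 so Gender and Cohort carry the same information. We solve this problem by converting 'female' to
 'Female' and 'male' to 'Male'.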

 ```
 extraCellularRNA) cut -d , -f 3,4 pancreasePAADColData.csv | sort | uniq -c
     48 PAAD,female
     59 PAAD,male
     73 Pancreas,Female
    124 Pancreas,Male
      1 tissue_id,Gender
```

In [22]:
# TCGA spells Gender in lower case ('female', 'male'); capitalize it so the
# labels match GTEx ('Female', 'Male'). assign() builds a new frame instead of
# writing into a frame derived from a slice, which keeps the cell free of
# chained-assignment warnings and safe to re-run.
savePAADColDataDF = savePAADColDataDF.assign(Gender=savePAADColDataDF['Gender'].str.capitalize())


In [23]:
# info() prints its own report and returns None; embedding the call in an
# f-string printed the report first and then "... : None". Print the label,
# then let info() write the report.
print("savePancreasColDataDF.info :")
savePancreasColDataDF.info()
print()

print("savePAADColDataDF.info :")
savePAADColDataDF.info()
print()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 197 entries, 7 to 10406
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   sample_id       197 non-null    object
 1   participant_id  197 non-null    object
 2   Cohort          197 non-null    object
 3   Gender          197 non-null    object
 4   Age             197 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 9.2+ KB
savePancreasColDataDF.info : None 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107 entries, 3738 to 3844
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sample_id       107 non-null    object 
 1   participant_id  107 non-null    object 
 2   Cohort          107 non-null    object 
 3   Age             107 non-null    float64
 4   Gender          107 non-null    object 
dtypes: float64(1), object(4)
memory usage: 5.0+ KB
savePAADColDataDF.info : 

In [24]:
byRows = 0 # unix cat
# Stack the pancreas rows on top of the PAAD rows; columns are matched by name,
# so the differing column order of the two frames does not matter.
colDataDF = pd.concat([savePancreasColDataDF, savePAADColDataDF], axis=byRows)
# info() prints its own report and returns None; do not wrap it in print().
colDataDF.info()

# 197 pancreas samples + 107 PAAD samples
assert colDataDF.shape[0] == 304, "ERROR number of samples should be 304"

<class 'pandas.core.frame.DataFrame'>
Int64Index: 304 entries, 7 to 3844
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sample_id       304 non-null    object 
 1   participant_id  304 non-null    object 
 2   Cohort          304 non-null    object 
 3   Gender          304 non-null    object 
 4   Age             304 non-null    float64
dtypes: float64(1), object(4)
memory usage: 14.2+ KB
None


In [25]:
# we need to make sure the sample columns and colData are in sort order
display(colDataDF.head())
saveColDataDF = colDataDF.sort_values(by=['sample_id'])
display(saveColDataDF.head())
display(saveColDataDF.tail())

# Verify the requirement instead of assuming it: row i of the colData must
# describe sample column i of the count matrix ('geneId' is skipped).
assert saveColDataDF['sample_id'].tolist() == saveSamplesDF.columns[1:].tolist(), \
    "ERROR colData rows are not in the same order as the count matrix sample columns"

Unnamed: 0,sample_id,participant_id,Cohort,Gender,Age
7,GTEX-111CU-0526-SM-5EGHK,GTEX-111CU,Pancreas,Male,57.0
40,GTEX-111YS-1226-SM-5EGGJ,GTEX-111YS,Pancreas,Male,62.0
68,GTEX-1128S-0826-SM-5GZZI,GTEX-1128S,Pancreas,Female,66.0
149,GTEX-11DXX-0926-SM-5H112,GTEX-11DXX,Pancreas,Female,66.0
311,GTEX-11GSP-0426-SM-5A5KX,GTEX-11GSP,Pancreas,Female,66.0


Unnamed: 0,sample_id,participant_id,Cohort,Gender,Age
7,GTEX-111CU-0526-SM-5EGHK,GTEX-111CU,Pancreas,Male,57.0
40,GTEX-111YS-1226-SM-5EGGJ,GTEX-111YS,Pancreas,Male,62.0
68,GTEX-1128S-0826-SM-5GZZI,GTEX-1128S,Pancreas,Female,66.0
149,GTEX-11DXX-0926-SM-5H112,GTEX-11DXX,Pancreas,Female,66.0
311,GTEX-11GSP-0426-SM-5A5KX,GTEX-11GSP,Pancreas,Female,66.0


Unnamed: 0,sample_id,participant_id,Cohort,Gender,Age
3840,PAAD-XD-AAUG-TP,PAAD-XD-AAUG,PAAD,Female,66.0
3841,PAAD-XD-AAUH-TP,PAAD-XD-AAUH,PAAD,Female,57.0
3842,PAAD-XD-AAUI-TP,PAAD-XD-AAUI,PAAD,Female,50.0
3843,PAAD-XD-AAUL-TP,PAAD-XD-AAUL,PAAD,Male,56.0
3844,PAAD-XN-A8T5-TP,PAAD-XN-A8T5,PAAD,Female,53.0


In [26]:
# Write the combined colData; the row index is omitted so that 'sample_id'
# is the first column of the CSV.
colDataFilePathPl = outputDirPl / "pancreasePAADColData.csv"
saveColDataDF.to_csv(path_or_buf=colDataFilePathPl, index=False)

In [27]:
# sanity check: compare the in-memory column count with what was written to disk
numCols = len(saveColDataDF.columns)
print(numCols)
# '!' runs a shell command; IPython substitutes $colDataFilePathPl with the Python value.
# Show the header and the first rows of the saved colData file.
! head $colDataFilePathPl 

5
sample_id,participant_id,Cohort,Gender,Age
GTEX-111CU-0526-SM-5EGHK,GTEX-111CU,Pancreas,Male,57.0
GTEX-111YS-1226-SM-5EGGJ,GTEX-111YS,Pancreas,Male,62.0
GTEX-1128S-0826-SM-5GZZI,GTEX-1128S,Pancreas,Female,66.0
GTEX-11DXX-0926-SM-5H112,GTEX-11DXX,Pancreas,Female,66.0
GTEX-11GSP-0426-SM-5A5KX,GTEX-11GSP,Pancreas,Female,66.0
GTEX-11LCK-0226-SM-5A5M6,GTEX-11LCK,Pancreas,Male,38.0
GTEX-11NSD-0526-SM-5A5LT,GTEX-11NSD,Pancreas,Male,27.0
GTEX-11ONC-0526-SM-5BC57,GTEX-11ONC,Pancreas,Male,69.0
GTEX-11XUK-0626-SM-5N9ES,GTEX-11XUK,Pancreas,Female,48.0


In [28]:
# sanity check
# make sure we have 4 uniq counts, else DESeq will report design is a linear combination error
cohortGenderCountsDF = saveColDataDF.groupby(by=['Cohort', 'Gender']).count()

# Enforce the invariant instead of relying on someone reading the table:
# 2 cohorts x 2 gender labels must give exactly 4 groups.
assert len(cohortGenderCountsDF) == 4, \
    f"ERROR expected 4 (Cohort, Gender) groups, found {len(cohortGenderCountsDF)}"

cohortGenderCountsDF

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id,participant_id,Age
Cohort,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PAAD,Female,48,48,48
PAAD,Male,59,59,59
Pancreas,Female,73,73,73
Pancreas,Male,124,124,124
