# Create Data set for LUAD vs LUSC 1vsAll Analysis

```
Andrew E. Davidson
aedavids@ucsc.edu
1/1/24
```

- 309 LUAD samples
- 301 LUSC samples

output  
- /private/groups/kimlab/aedavids/deconvolution/LUAD.vs.LUSC/LUSC_LUAD_GroupByGenseCountMatrixData.csv
- /private/groups/kimlab/aedavids/deconvolution/LUAD.vs.LUSC/LUSC_LUAD_ColData.csv

In [1]:
import pandas as pd
import pathlib as pl

# use display() to print an html version of a data frame
# useful if dataFrame output is not generated by the last line of the cell
from IPython.display import display

# Every CSV file produced by this notebook is written below this directory.
outputDirPl = pl.Path("/private/groups/kimlab/aedavids/deconvolution/LUAD.vs.LUSC")
# create the directory together with any missing parents; an already
# existing directory is not an error
outputDirPl.mkdir(exist_ok=True, parents=True)

In [2]:
# Locations of the training data sets.
# NOTE(review): the GTEx paths are not read by any cell visible in this notebook.
GTExDataRootPl = pl.Path('/private/groups/kimlab/GTEx')
GTExColDataPl = GTExDataRootPl / "GTExTrainColData.csv"
GTExGroupByGenseCountMatrixDataPl = GTExDataRootPl / "GTExTrainGroupByGenesCountMatrix.csv"

# TCGA sample meta data
TCGA_ColDataTrainingDatasetRootPl = pl.Path('/private/groups/kimlab/TCGA/colData/trainingDataSets/')
TCGA_ColDataPl = TCGA_ColDataTrainingDatasetRootPl / "TCGA-TrainColData.csv"

# TCGA count matrix
TCGA_DataTrainingDatasetRootPl = pl.Path('/private/groups/kimlab/TCGA/trainingDataSets/')
TCGA_GroupByGenseCountMatrixDataPl = TCGA_DataTrainingDatasetRootPl / "TCGA-TrainGroupByGeneId.csv"

# Select colData for TCGA LUAD

In [3]:
# Read the TCGA sample meta data; the first rows are shown as the cell output.
tcgaColDataDF = pd.read_csv(filepath_or_buffer=TCGA_ColDataPl)
tcgaColDataDF.head(n=5)

Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
0,ACC-OR-A5J1-TP,ACC-OR-A5J1,TCGA-OR-A5J1-01,ACC,58.0,male,TP
1,ACC-OR-A5J2-TP,ACC-OR-A5J2,TCGA-OR-A5J2-01,ACC,44.0,female,TP
2,ACC-OR-A5J3-TP,ACC-OR-A5J3,TCGA-OR-A5J3-01,ACC,23.0,female,TP
3,ACC-OR-A5J6-TP,ACC-OR-A5J6,TCGA-OR-A5J6-01,ACC,29.0,female,TP
4,ACC-OR-A5J7-TP,ACC-OR-A5J7,TCGA-OR-A5J7-01,ACC,30.0,female,TP


In [4]:
# list every distinct cohort label found in the TCGA meta data
tcgaColDataDF['Cohort'].unique()

array(['ACC', 'BLCA', 'BRCA', 'CESC', 'CHOL', 'COAD', 'DLBC', 'ESCA',
       'GBM', 'HNSC', 'KICH', 'KIRC', 'KIRP', 'LGG', 'LIHC', 'LUAD',
       'LUSC', 'MESO', 'OV', 'PAAD', 'PCPG', 'PRAD', 'READ', 'SARC',
       'SKCM', 'STAD', 'TGCT', 'THCA', 'THYM', 'UCEC', 'UCS', 'UVM'],
      dtype=object)

In [5]:
# Keep only the meta data rows whose Cohort is LUAD.
selectRows = tcgaColDataDF['Cohort'] == 'LUAD'
LUADColDataDF = tcgaColDataDF.loc[selectRows, :]

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output
LUADColDataDF.info()
LUADColDataDF.head()

<class 'pandas.core.frame.DataFrame'>
Index: 309 entries, 2899 to 3207
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sample_id       309 non-null    object 
 1   participant_id  309 non-null    object 
 2   tcga_sample_id  309 non-null    object 
 3   Cohort          309 non-null    object 
 4   Age             309 non-null    float64
 5   Gender          309 non-null    object 
 6   sample_type     309 non-null    object 
dtypes: float64(1), object(6)
memory usage: 19.3+ KB
None


Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
2899,LUAD-05-4244-TP,LUAD-05-4244,TCGA-05-4244-01,LUAD,70.0,male,TP
2900,LUAD-05-4249-TP,LUAD-05-4249,TCGA-05-4249-01,LUAD,67.0,male,TP
2901,LUAD-05-4250-TP,LUAD-05-4250,TCGA-05-4250-01,LUAD,79.0,female,TP
2902,LUAD-05-4382-TP,LUAD-05-4382,TCGA-05-4382-01,LUAD,68.0,male,TP
2903,LUAD-05-4384-TP,LUAD-05-4384,TCGA-05-4384-01,LUAD,66.0,male,TP


# Select colData for LUSC

In [6]:
# Keep only the meta data rows whose Cohort is LUSC.
selectRows = tcgaColDataDF['Cohort'] == 'LUSC'
LUSCColDataDF = tcgaColDataDF.loc[selectRows, :]

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output
LUSCColDataDF.info()
LUSCColDataDF.head()

<class 'pandas.core.frame.DataFrame'>
Index: 301 entries, 3208 to 3508
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sample_id       301 non-null    object 
 1   participant_id  301 non-null    object 
 2   tcga_sample_id  301 non-null    object 
 3   Cohort          301 non-null    object 
 4   Age             301 non-null    float64
 5   Gender          301 non-null    object 
 6   sample_type     301 non-null    object 
dtypes: float64(1), object(6)
memory usage: 18.8+ KB
None


Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
3208,LUSC-18-3406-TP,LUSC-18-3406,TCGA-18-3406-01,LUSC,67.0,male,TP
3209,LUSC-18-3407-TP,LUSC-18-3407,TCGA-18-3407-01,LUSC,72.0,male,TP
3210,LUSC-18-3408-TP,LUSC-18-3408,TCGA-18-3408-01,LUSC,77.0,female,TP
3211,LUSC-18-3409-TP,LUSC-18-3409,TCGA-18-3409-01,LUSC,74.0,male,TP
3212,LUSC-18-3412-TP,LUSC-18-3412,TCGA-18-3412-01,LUSC,52.0,male,TP


# Select the LUAD samples

In [7]:
# Load the TCGA count matrix: a 'geneId' column followed by one column per sample.
TCGAGroupByGenseCountMatrixDF = pd.read_csv(TCGA_GroupByGenseCountMatrixDataPl)
# info() prints directly and returns None; print(info()) would emit an extra "None"
TCGAGroupByGenseCountMatrixDF.info()
# peek at the top left corner only, the full matrix is far too large to display
TCGAGroupByGenseCountMatrixDF.iloc[0:3, 0:3]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 5401 entries, geneId to UVM-YZ-A985-TP
dtypes: int64(5400), object(1)
memory usage: 3.0+ GB
None


Unnamed: 0,geneId,ACC-OR-A5J1-TP,ACC-OR-A5J2-TP
0,(A)n,7,27
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


In [8]:
# Show the column labels of the count matrix: 'geneId' followed by the sample ids.
print(TCGAGroupByGenseCountMatrixDF.columns)

Index(['geneId', 'ACC-OR-A5J1-TP', 'ACC-OR-A5J2-TP', 'ACC-OR-A5J3-TP',
       'ACC-OR-A5J6-TP', 'ACC-OR-A5J7-TP', 'ACC-OR-A5J8-TP', 'ACC-OR-A5JB-TP',
       'ACC-OR-A5JF-TP', 'ACC-OR-A5JI-TP',
       ...
       'UVM-WC-A881-TP', 'UVM-WC-A882-TP', 'UVM-WC-A883-TP', 'UVM-WC-A884-TP',
       'UVM-WC-A888-TP', 'UVM-WC-AA9E-TP', 'UVM-YZ-A980-TP', 'UVM-YZ-A983-TP',
       'UVM-YZ-A984-TP', 'UVM-YZ-A985-TP'],
      dtype='object', length=5401)


In [9]:
# Columns to keep from the count matrix: 'geneId' plus every LUAD sample id.
LUADSampleIdList = ['geneId'] + LUADColDataDF.loc[:, 'sample_id'].values.tolist()
print(f'len(LUADSampleIdList) : {len(LUADSampleIdList)}' )
print(f'LUADSampleIdList[0:3] : {LUADSampleIdList[0:3]}')

# samples are columns of the count matrix, so we select by column label
LUADSamplesDF = TCGAGroupByGenseCountMatrixDF.loc[:, LUADSampleIdList]

# info() prints directly and returns None; embedding the call in an f-string
# would print "LUADSamplesDF.info() : None"
print('LUADSamplesDF.info() :')
LUADSamplesDF.info()

LUADSamplesDF.iloc[0:3, 0:3]

len(LUADSampleIdList) : 310
LUADSampleIdList[0:3] : ['geneId', 'LUAD-05-4244-TP', 'LUAD-05-4249-TP']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 310 entries, geneId to LUAD-NJ-A55R-TP
dtypes: int64(309), object(1)
memory usage: 176.9+ MB
LUADSamplesDF.info() : None


Unnamed: 0,geneId,LUAD-05-4244-TP,LUAD-05-4249-TP
0,(A)n,11,2
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


# Select the LUSC samples

In [10]:
# Columns to keep from the count matrix: 'geneId' plus every LUSC sample id.
LUSCSampleIdList = ['geneId'] + LUSCColDataDF.loc[:, 'sample_id'].values.tolist()
print(f'len(LUSCSampleIdList) : {len(LUSCSampleIdList)}' )
print(f'LUSCSampleIdList[0:3] : {LUSCSampleIdList[0:3]}')

# samples are columns of the count matrix, so we select by column label
LUSCSamplesDF = TCGAGroupByGenseCountMatrixDF.loc[:, LUSCSampleIdList]

# info() prints directly and returns None; embedding the call in an f-string
# would print "LUSCSamplesDF.info() : None"
print('LUSCSamplesDF.info() :')
LUSCSamplesDF.info()

LUSCSamplesDF.iloc[0:3, 0:3]

len(LUSCSampleIdList) : 302
LUSCSampleIdList[0:3] : ['geneId', 'LUSC-18-3406-TP', 'LUSC-18-3407-TP']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 302 entries, geneId to LUSC-XC-AA0X-TP
dtypes: int64(301), object(1)
memory usage: 172.3+ MB
LUSCSamplesDF.info() : None


Unnamed: 0,geneId,LUSC-18-3406-TP,LUSC-18-3407-TP
0,(A)n,61,21
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


# Combine LUAD and LUSC samples into a single dataframe

In [11]:
# check the geneIds are in the same order
# element-wise comparison of the two geneId columns
ret = LUADSamplesDF.loc[:, "geneId"] == LUSCSamplesDF.loc[:, "geneId"]
# every row must match. Using all() instead of comparing sum(ret) with a hard
# coded row count keeps the check valid if the number of genes changes
assert ret.all(), "ERROR gene ids are not aligned"

In [12]:
# we do not want to have two copies of the 'geneId' col
# 'geneId' is the first column of LUADSamplesDF, keep everything after it
luadSamplesDF = LUADSamplesDF.iloc[:, 1:]
# info() prints directly and returns None; print(info()) would emit an extra "None"
luadSamplesDF.info()
print(luadSamplesDF.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 309 entries, LUAD-05-4244-TP to LUAD-NJ-A55R-TP
dtypes: int64(309)
memory usage: 176.3 MB
None
Index(['LUAD-05-4244-TP', 'LUAD-05-4249-TP', 'LUAD-05-4250-TP',
       'LUAD-05-4382-TP', 'LUAD-05-4384-TP', 'LUAD-05-4389-TP',
       'LUAD-05-4395-TP', 'LUAD-05-4397-TP', 'LUAD-05-4398-TP',
       'LUAD-05-4402-TP',
       ...
       'LUAD-MP-A4TC-TP', 'LUAD-MP-A4TD-TP', 'LUAD-MP-A4TE-TP',
       'LUAD-MP-A4TF-TP', 'LUAD-MP-A4TH-TP', 'LUAD-MP-A4TJ-TP',
       'LUAD-MP-A5C7-TP', 'LUAD-NJ-A4YP-TP', 'LUAD-NJ-A4YQ-TP',
       'LUAD-NJ-A55R-TP'],
      dtype='object', length=309)


In [13]:
# Place the LUSC columns (including 'geneId') and the LUAD sample columns side by side.
byCols = 1 # ie unix paste
samplesDF = pd.concat([LUSCSamplesDF, luadSamplesDF], axis=byCols)
# info() prints directly and returns None; print(info()) would emit an extra "None"
samplesDF.info()
print(samplesDF.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 611 entries, geneId to LUAD-NJ-A55R-TP
dtypes: int64(610), object(1)
memory usage: 348.6+ MB
None
Index(['geneId', 'LUSC-18-3406-TP', 'LUSC-18-3407-TP', 'LUSC-18-3408-TP',
       'LUSC-18-3409-TP', 'LUSC-18-3412-TP', 'LUSC-18-3414-TP',
       'LUSC-18-3416-TP', 'LUSC-18-3417-TP', 'LUSC-18-3419-TP',
       ...
       'LUAD-MP-A4TC-TP', 'LUAD-MP-A4TD-TP', 'LUAD-MP-A4TE-TP',
       'LUAD-MP-A4TF-TP', 'LUAD-MP-A4TH-TP', 'LUAD-MP-A4TJ-TP',
       'LUAD-MP-A5C7-TP', 'LUAD-NJ-A4YP-TP', 'LUAD-NJ-A4YQ-TP',
       'LUAD-NJ-A55R-TP'],
      dtype='object', length=611)


In [14]:
# 1 'geneId' column + 610 sample columns
assert len(samplesDF.columns) == 611, "ERROR number of columns is wrong"

# Save samples

In [15]:
# we need to make sure the sample columns and colData are in sort order
# sort the sample ids only, 'geneId' has to stay the first column
sampleSortOrderList = sorted( samplesDF.columns[1:].values.tolist() )
print(type(sampleSortOrderList))
print(f"len(sampleSortOrderList) : {len(sampleSortOrderList)}")
print(f"sampleSortOrderList[0:3] : {sampleSortOrderList[0:3]}")

colOrder = ['geneId'] + sampleSortOrderList
saveSamplesDF = samplesDF.loc[:, colOrder]

# info() prints directly and returns None; embedding the call in an f-string
# would print "saveSamplesDF.info() : None"
print('saveSamplesDF.info() :')
saveSamplesDF.info()
saveSamplesDF.iloc[0:3, 0:3]

<class 'list'>
len(sampleSortOrderList) : 610
sampleSortOrderList[0:3] : ['LUAD-05-4244-TP', 'LUAD-05-4249-TP', 'LUAD-05-4250-TP']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74777 entries, 0 to 74776
Columns: 611 entries, geneId to LUSC-XC-AA0X-TP
dtypes: int64(610), object(1)
memory usage: 348.6+ MB
saveSamplesDF.info() : None


Unnamed: 0,geneId,LUAD-05-4244-TP,LUAD-05-4249-TP
0,(A)n,11,2
1,(AAA)n,0,0
2,(AAAAAAC)n,0,0


In [16]:
# Write the combined count matrix as CSV; index=False leaves the row index out of the file.
sampleFilePathPl = outputDirPl / "LUSC_LUAD_GroupByGenseCountMatrixData.csv"
saveSamplesDF.to_csv(path_or_buf=sampleFilePathPl, index=False)
print(f"saving samples file:\n{sampleFilePathPl}")

saving samples file:
/private/groups/kimlab/aedavids/deconvolution/LUAD.vs.LUSC/LUSC_LUAD_GroupByGenseCountMatrixData.csv


In [17]:
# sanity check
numCols = len(saveSamplesDF.columns)
print(numCols)
! head $sampleFilePathPl | cut -d , -f 1,2,3
print()
! head $sampleFilePathPl | cut -d , -f 608,609,610 # there is no 611 col

611
geneId,LUAD-05-4244-TP,LUAD-05-4249-TP
(A)n,11,2
(AAA)n,0,0
(AAAAAAC)n,0,0
(AAAAAAG)n,0,0
(AAAAAAT)n,0,0
(AAAAAC)n,0,0
(AAAAACA)n,0,0
(AAAAACC)n,0,0
(AAAAACT)n,0,0

LUSC-O2-A52S-TP,LUSC-O2-A52W-TP,LUSC-O2-A5IB-TP
37,38,15
0,0,0
0,0,0
0,0,0
0,0,0
1,0,0
0,0,0
0,0,0
0,0,0


# Save ColData

### bug fix
ref : https://bioconductor.org/packages/3.7/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#model-matrix-not-full-rank

When we run DESeq2 with the model '~ + Gender + Cohort', we get the following error:
```
converting counts to integer mode
Error in checkFullRank(modelMatrix) : 
  the model matrix is not full rank, so the model cannot be fit as specified.
  One or more variables or interaction terms in the design formula are linear
  combinations of the others and must be removed.
 ```
 
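 This is because Gender and Cohort contain exactly the same information. In the example below the PAAD rows use 'female'/'male' while the Pancreas rows use 'Female'/'Male', so every Gender label occurs in only one Cohort and Gender is a linear combination of Cohort. We solve this problem
 by giving both cohorts the same Gender labels, i.e. by converting 'female' to 'Female' and 'male' to 'Male'.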

 ```
 extraCellularRNA) cut -d , -f 3,4 pancreasePAADColData.csv | sort | uniq -c
     48 PAAD,female
     59 PAAD,male
     73 Pancreas,Female
    124 Pancreas,Male
      1 tissue_id,Gender
```

In [18]:
# Disabled Gender capitalization fix (see the "bug fix" note above this cell).
# NOTE(review): saveLUADColDataDF is not defined in any visible cell. LUAD and LUSC
# both use lowercase 'female'/'male' (see the Cohort/Gender group-by at the end of
# the notebook), so this looks unnecessary here -- confirm before removing.
# saveLUADColDataDF['Gender'] = saveLUADColDataDF['Gender'].str.capitalize()

In [19]:
# Disabled debug output.
# NOTE(review): savePancreasColDataDF and savePAADColDataDF are not defined in any
# visible cell; presumably left over from a PAAD vs Pancreas version of this
# notebook -- confirm before removing.
# print(f"savePancreasColDataDF.info : {savePancreasColDataDF.info()} \n")
# print(f"savePAADColDataDF.info : {savePAADColDataDF.info()} \n")

In [20]:
# Stack the LUAD meta data rows on top of the LUSC meta data rows.
byRows = 0 # unix cat
colDataDF = pd.concat([LUADColDataDF, LUSCColDataDF], axis=byRows)
# info() prints directly and returns None; print(info()) would emit an extra "None"
colDataDF.info()

assert colDataDF.shape[0] == 610, "ERROR number of samples should be 610"

<class 'pandas.core.frame.DataFrame'>
Index: 610 entries, 2899 to 3508
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sample_id       610 non-null    object 
 1   participant_id  610 non-null    object 
 2   tcga_sample_id  610 non-null    object 
 3   Cohort          610 non-null    object 
 4   Age             610 non-null    float64
 5   Gender          610 non-null    object 
 6   sample_type     610 non-null    object 
dtypes: float64(1), object(6)
memory usage: 38.1+ KB
None


In [21]:
# we need to make sure the sample columns and colData are in sort order
display(colDataDF.head())
saveColDataDF = colDataDF.sort_values(by=['sample_id'])

# Enforce the requirement instead of only stating it: row i of the colData must
# describe sample column i of the saved count matrix ('geneId' is column 0).
assert saveColDataDF['sample_id'].tolist() == saveSamplesDF.columns[1:].tolist(), \
    "ERROR colData rows and count matrix sample columns are not in the same order"

display(saveColDataDF.head())
display(saveColDataDF.tail())

Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
2899,LUAD-05-4244-TP,LUAD-05-4244,TCGA-05-4244-01,LUAD,70.0,male,TP
2900,LUAD-05-4249-TP,LUAD-05-4249,TCGA-05-4249-01,LUAD,67.0,male,TP
2901,LUAD-05-4250-TP,LUAD-05-4250,TCGA-05-4250-01,LUAD,79.0,female,TP
2902,LUAD-05-4382-TP,LUAD-05-4382,TCGA-05-4382-01,LUAD,68.0,male,TP
2903,LUAD-05-4384-TP,LUAD-05-4384,TCGA-05-4384-01,LUAD,66.0,male,TP


Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
2899,LUAD-05-4244-TP,LUAD-05-4244,TCGA-05-4244-01,LUAD,70.0,male,TP
2900,LUAD-05-4249-TP,LUAD-05-4249,TCGA-05-4249-01,LUAD,67.0,male,TP
2901,LUAD-05-4250-TP,LUAD-05-4250,TCGA-05-4250-01,LUAD,79.0,female,TP
2902,LUAD-05-4382-TP,LUAD-05-4382,TCGA-05-4382-01,LUAD,68.0,male,TP
2903,LUAD-05-4384-TP,LUAD-05-4384,TCGA-05-4384-01,LUAD,66.0,male,TP


Unnamed: 0,sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
3504,LUSC-O2-A52Q-TP,LUSC-O2-A52Q,TCGA-O2-A52Q-01,LUSC,44.0,female,TP
3505,LUSC-O2-A52S-TP,LUSC-O2-A52S,TCGA-O2-A52S-01,LUSC,57.0,female,TP
3506,LUSC-O2-A52W-TP,LUSC-O2-A52W,TCGA-O2-A52W-01,LUSC,63.0,male,TP
3507,LUSC-O2-A5IB-TP,LUSC-O2-A5IB,TCGA-O2-A5IB-01,LUSC,71.0,female,TP
3508,LUSC-XC-AA0X-TP,LUSC-XC-AA0X,TCGA-XC-AA0X-01,LUSC,77.0,female,TP


In [22]:
# Write the sorted sample meta data as CSV; index=False leaves the row index out of the file.
colDataFilePathPl = outputDirPl / "LUSC_LUAD_ColData.csv"
saveColDataDF.to_csv(path_or_buf=colDataFilePathPl, index=False)
print(f'saving col data files\n{colDataFilePathPl}')

saving col data files
/private/groups/kimlab/aedavids/deconvolution/LUAD.vs.LUSC/LUSC_LUAD_ColData.csv


In [23]:
# sanity check 
numCols = len(saveColDataDF.columns)
print(numCols)
! head $colDataFilePathPl 

7
sample_id,participant_id,tcga_sample_id,Cohort,Age,Gender,sample_type
LUAD-05-4244-TP,LUAD-05-4244,TCGA-05-4244-01,LUAD,70.0,male,TP
LUAD-05-4249-TP,LUAD-05-4249,TCGA-05-4249-01,LUAD,67.0,male,TP
LUAD-05-4250-TP,LUAD-05-4250,TCGA-05-4250-01,LUAD,79.0,female,TP
LUAD-05-4382-TP,LUAD-05-4382,TCGA-05-4382-01,LUAD,68.0,male,TP
LUAD-05-4384-TP,LUAD-05-4384,TCGA-05-4384-01,LUAD,66.0,male,TP
LUAD-05-4389-TP,LUAD-05-4389,TCGA-05-4389-01,LUAD,70.0,male,TP
LUAD-05-4395-TP,LUAD-05-4395,TCGA-05-4395-01,LUAD,76.0,male,TP
LUAD-05-4397-TP,LUAD-05-4397,TCGA-05-4397-01,LUAD,65.0,male,TP
LUAD-05-4398-TP,LUAD-05-4398,TCGA-05-4398-01,LUAD,47.0,female,TP


In [24]:
# sanity check
# DESeq2 reports the "model matrix is not full rank" error when a Gender label
# occurs in only one Cohort (e.g. 'male' in one cohort and 'Male' in the other).
# Counting the (Cohort, Gender) combinations is not enough: the broken
# PAAD/Pancreas example in the "bug fix" note also has 4 of them. So we check
# that every Gender label is present in every Cohort.
numCohorts = saveColDataDF['Cohort'].nunique()
cohortsPerGender = saveColDataDF.groupby(by=['Gender'])['Cohort'].nunique()
assert (cohortsPerGender == numCohorts).all(), \
    "ERROR every Gender label must occur in every Cohort, else the DESeq2 design is not full rank"

# number of samples for each (Cohort, Gender) combination
saveColDataDF.groupby(by=['Cohort', 'Gender']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,sample_id,participant_id,tcga_sample_id,Age,sample_type
Cohort,Gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LUAD,female,166,166,166,166,166
LUAD,male,143,143,143,143,143
LUSC,female,78,78,78,78,78
LUSC,male,223,223,223,223,223
