# Lump training sets
Andrew E. Davidson  
aedavids@ucsc.edu 
2/22/24

Copyright (c) 2020-2023, Regents of the University of California All rights reserved. https://polyformproject.org/licenses/noncommercial/1.0.0

Combine the train/validate/test data sets into a single table

In [1]:
import ipynbname
from IPython.display import display
import pandas as pd
# display all rows
#pd.set_option('display.max_rows', None)

# https://joelmccune.com/pandas-dataframe-to-markdown/
#from pandas.io.clipboards import to_clipboard

#import pathlib as pl
import pprint as pp
import numpy as np
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Directory that holds the per-split ColData / Groupby CSV files read by the
# cells below; the combined Groupby table is written back to the same place.
dataRoot = "/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets"

def loadCSV(path : str) -> pd.DataFrame:
    """
    Read a CSV file into a data frame, echoing progress to stdout.

    arguments:
        path : location of the CSV file; passed unchanged to pd.read_csv()

    returns:
        the data frame produced by pd.read_csv() with default options

    side effects:
        prints the path before reading and the (rows, cols) shape after
    """
    print(path)
    loadedDF = pd.read_csv(path)
    print(loadedDF.shape)

    return loadedDF

## merge col data

In [3]:
%%time
# Locations of the three colData splits
trainColDataPath    = f'{dataRoot}/GTEx_TCGA_TrainColData.csv'
validateColDataPath = f'{dataRoot}/GTEx_TCGA_ValidateColData.csv'
testColDataPath     = f'{dataRoot}/GTEx_TCGA_TestColData.csv'

# Load in train, validate, test order; loadCSV() prints each path and shape
trainColDataDF    = loadCSV( trainColDataPath )
validateColDataDF = loadCSV( validateColDataPath )
testColDataDF     = loadCSV( testColDataPath )

/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_TrainColData.csv
(15801, 6)
/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_ValidateColData.csv
(5268, 6)
/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_TestColData.csv
(5268, 6)
CPU times: user 33 ms, sys: 8.93 ms, total: 42 ms
Wall time: 46.9 ms


In [4]:
# Stacking the splits row-wise should keep every row from every split ...
expectedNumberOfRows = sum( len(df) for df in (trainColDataDF, validateColDataDF, testColDataDF) )
# ... and leave the number of columns the same as in the training split
expectedNumberOfCols = len( trainColDataDF.columns )

In [5]:
# Stack the three colData splits row-wise into a single table.
# ignore_index=True: each split was read by pd.read_csv() with its own default
# 0..n-1 index, so a plain concat would leave repeated index labels in the
# result (label-based lookups would then return several rows). Renumber instead.
colDataDF = pd.concat( [trainColDataDF, validateColDataDF, testColDataDF], ignore_index=True )

print(f'colDataDF.shape : {colDataDF.shape}' )
# Every input row must survive, and no extra columns may appear (extra columns
# would show up if the splits did not share the same column names).
assert colDataDF.shape[0] == expectedNumberOfRows, "ERROR number of rows does not match expected"
assert colDataDF.shape[1] == expectedNumberOfCols, "ERROR number of columns does not match expected"

colDataDF.shape : (26337, 6)


## merge group by count data

In [6]:
%%time
trainGroupbyPath = f'{dataRoot}/GTEx_TCGA_TrainGroupby.csv'
trainGroupbyDF   = loadCSV( trainGroupbyPath )

validateGroupbyPath = f'{dataRoot}/GTEx_TCGA_ValidateGroupby.csv'
validateGroupbyDF   = loadCSV( validateGroupbyPath )

testGroupbyPath = f'{dataRoot}/GTEx_TCGA_TestGroupby.csv'
testGroupbyDF   = loadCSV( testGroupbyPath )

/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_TrainGroupby.csv
(74777, 15802)
/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_ValidateGroupby.csv
(74777, 5269)
/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_TestGroupby.csv
(74777, 5269)
CPU times: user 4min 51s, sys: 21.5 s, total: 5min 13s
Wall time: 5min 19s


In [7]:
# The splits are pasted side by side, so the row count is that of any one split
expectedNumberOfRows = len( trainGroupbyDF )

# Every groupby data frame has a geneId col; the final result should only have
# one, so the two extra copies are subtracted from the summed column counts.
totalNumberOfCols = sum( df.shape[1] for df in (trainGroupbyDF, validateGroupbyDF, testGroupbyDF) )
expectedNumberOfCols = totalNumberOfCols - 2

In [8]:
# sanity check make sure the gene ids are in the same order
trainGeneDF = trainGroupbyDF .loc[:, ['geneId']]
validateGeneDF = validateGroupbyDF.loc[:, ['geneId']]
testGeneDF = testGroupbyDF.loc[:, ['geneId']]

pd.testing.assert_frame_equal(trainGeneDF, validateGeneDF)
pd.testing.assert_frame_equal(trainGeneDF, testGeneDF)

In [1]:
aedwip

Turns out we are missing some samples.

The intersection sizes should all be zero:
len( TrainColData.intersection(ValidateColData ) ) 2090
len( TrainColData.intersection(TestColData ) ) 2064
len( ValidateColData.intersection(TestColData ) ) 0

see 
- tree/terra/GTExTCGA_Release/jupyterNotebooks/fixColData.ipynb
- terra/GTExTCGA_Release/jupyterNotebooks/checkColDataForDuplicateSamples.ipynb

SyntaxError: invalid syntax (1890914453.py, line 3)

In [9]:
%%time
# all groupby data frames have a geneId col. final results should only have 1
valSampleCols = ~validateGroupbyDF.columns.isin( ['geneId'] )
testSampleCols = ~testGroupbyDF.columns.isin( ['geneId'] )
dataFrames = [ trainGroupbyDF, validateGroupbyDF.loc[:, valSampleCols] , testGroupbyDF.loc[:,testSampleCols] ]

# concat like unix paste , i.e. make the data frame wider not longer
byCols = 1
groupbyDF = pd.concat( dataFrames, axis=byCols ) 

CPU times: user 3.84 s, sys: 7.15 s, total: 11 s
Wall time: 11 s


In [10]:
# Confirm the pasted table has one row per gene and one geneId column plus
# every sample column from the three splits.
numberOfRows, numberOfCols = groupbyDF.shape
print( f'groupbyDF.shape : {groupbyDF.shape}' )
assert numberOfRows == expectedNumberOfRows, "ERROR number of rows does not match expected"
assert numberOfCols == expectedNumberOfCols, "ERROR number of columns does not match expected"

groupbyDF.shape : (74777, 26338)


## Save

In [14]:
%%time
# Writing the combined colData table is currently disabled:
# colDataPath = f'{dataRoot}/GTEx_TCGA_ColData.csv'
# print(f'{colDataPath}')
# colDataDF.to_csv( colDataPath, index=False )

# Write the combined groupby table next to the per-split input files.
groupbyPath = f'{dataRoot}/GTEx_TCGA_Groupby.csv'
print(f'\n{groupbyPath}')
# index=False: do not write the data frame's row index as an extra column
groupbyDF.to_csv( path_or_buf=groupbyPath, index=False )


/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_Groupby.csv
CPU times: user 23min 24s, sys: 4.67 s, total: 23min 29s
Wall time: 23min 32s


In [12]:
# Report where the combined tables live.
# colDataPath is only assigned inside commented-out code in the save cell, so
# on a fresh kernel it would be undefined here and this cell would raise
# NameError. Build it explicitly so the cell survives Restart & Run All.
colDataPath = f'{dataRoot}/GTEx_TCGA_ColData.csv'
print(f'{colDataPath}')
print(f'\n{groupbyPath}')

/private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets/GTEx_TCGA_ColData.csv


NameError: name 'groupbyPath' is not defined