# Fix non-disjoint data sets

Both our GTEx_TCGA validation and test data sets contain samples that are also included in the training data set.

ref extracellular/terra/GTExTCGA_Release/jupyterNotebooks/checkColDataForDuplicateSamples.ipynb

**summary**
- we have 22,183 unique TCGA + GTEx samples
- GTEx
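    * we have 13,199 unique GTEx samples; we expected 17,383
    * we are missing 4,154 GTEx samples
        + there should be 17,383 GTEx samples : wc -l colData.csv = 17383 (note: wc -l also counts the header line, if the file has one)
        + the three GTEx count matrices hold 17,353 sample ids in total, of which only 13,199 are unique (17,353 - 13,199 = 4,154)
        + The number of non-disjoint samples between the training set and the validation and test sets is 4154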
- TCGA
    ```
    /Users/andrewdavidson/googleUCSC/kimLab/terraDataModels/test-aedavids-proj/TCGA
    find . -name "*edu_ucsc_kim_lab_colData.csv"
    ```
    * 33 cancer types
    * 10347 samples
    * we have 8984 in train/val/test set
    * <span style="color:red;background-color:yellow">do the validation and test sets have GTEx samples?</span>

**We do not have a problem** We never used the validation or test sets  

## <span style="color:red;background-color:yellow">short term fix</span>
**A**
- all the sample ids in the training count data set are unique
- just use training data for now

**B**
- create a new data set from unique samples
- saved to : /scratch/aedavids/tmp/GTEx_TCGA_NormalizedGroupby.hdf5 (74777, 22183)
- saved to : /scratch/aedavids/tmp/GTEx_TCGA_ColData.hdf5 (22183, 5)
- cp all sample id data sets to GTEx_TCGA/groupbyGeneTrainingSets
- mv copies non disjoint data sets to /private/groups/kimlab/GTEx_TCGA/groupbyGeneTrainingSets
  ```
  $ ls notDisjoint/
    GTEx_TCGA_ColData.csv                      GTEx_TCGA_NormalizedGroupby.csv
    GTEx_TCGA_TestGroupby.csv
    GTEx_TCGA_Groupby.csv                      GTEx_TCGA_NormalizedGroupby.hdf5
    GTEx_TCGA_ValidateColData.csv
    GTEx_TCGA_GroupbyEstimatedSizeFactors.csv  GTEx_TCGA_TestColData.csv
    GTEx_TCGA_ValidateGroupby.csv
  ```

In [1]:
from IPython.display import display
import numpy as np
import pandas as pd
dataRoot = "/scratch/aedavids/tmp"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
%%time
def loadColData(dataRoot):
    '''
    Read the test, train and validate colData csv files found under dataRoot.

    arguments:
        dataRoot : directory holding the GTEx_TCGA_*ColData.csv files

    returns:
        dictionary. The key is the third "_" separated token of the file name
        with the extension removed, e.g. "TrainColData". The value is the data
        frame read from that file, indexed by its "sample_id" column.

    The key, path and shape of each data frame are printed as it is loaded.
    '''
    # the combined "GTEx_TCGA_ColData.csv" file is not loaded by this function
    csvFiles = (
        "GTEx_TCGA_TestColData.csv",
        "GTEx_TCGA_TrainColData.csv",
        "GTEx_TCGA_ValidateColData.csv",
    )
    loaded = {}
    for fileName in csvFiles:
        # "GTEx_TCGA_TestColData.csv" -> "TestColData"
        key = fileName.partition(".")[0].split("_")[2]
        path = f'{dataRoot}/{fileName}'
        colData = pd.read_csv(path, index_col="sample_id")
        print(f'{key} {path} {colData.shape}')
        loaded[key] = colData

    return loaded

colDataDict = loadColData(dataRoot)
print(f'colDataDict.keys() : {colDataDict.keys()}')
display(colDataDict[ 'TrainColData'].head())
display(colDataDict[ 'TrainColData'].tail())
print()

TestColData /scratch/aedavids/tmp/GTEx_TCGA_TestColData.csv (5268, 5)
TrainColData /scratch/aedavids/tmp/GTEx_TCGA_TrainColData.csv (15801, 5)
ValidateColData /scratch/aedavids/tmp/GTEx_TCGA_ValidateColData.csv (5268, 5)
colDataDict.keys() : dict_keys(['TestColData', 'TrainColData', 'ValidateColData'])


Unnamed: 0_level_0,participant_id,category,gender,age,dataSet
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Female,66.0,GTEx
GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Female,66.0,GTEx
GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Female,66.0,GTEx
GTEX-1117F-2826-SM-5GZXL,GTEX-1117F,Breast_Mammary_Tissue,Female,66.0,GTEx
GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,Brain_Cortex,Female,66.0,GTEx


Unnamed: 0_level_0,participant_id,category,gender,age,dataSet
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
UVM-WC-AA9E-TP,UVM-WC-AA9E,UVM,male,60.0,TCGA
UVM-YZ-A980-TP,UVM-YZ-A980,UVM,male,75.0,TCGA
UVM-YZ-A983-TP,UVM-YZ-A983,UVM,female,51.0,TCGA
UVM-YZ-A984-TP,UVM-YZ-A984,UVM,female,50.0,TCGA
UVM-YZ-A985-TP,UVM-YZ-A985,UVM,female,41.0,TCGA



CPU times: user 81.3 ms, sys: 9.99 ms, total: 91.3 ms
Wall time: 146 ms


In [3]:
%%time
def saveAsHDF(dict ):
    '''
    Write every data frame in a dictionary to its own hdf5 file.

    arguments:
        dict : dictionary. key is a name, value is an object with a to_hdf()
               method, for example a pandas data frame

    Each value is saved to "<dataRoot>/<key>.hdf5", using the dictionary key as
    the hdf5 key. dataRoot is read from the notebook's global name space.
    The path of each file written is printed.
    '''
    for name, frame in dict.items():
        outPath = f'{dataRoot}/{name}.hdf5'
        frame.to_hdf(outPath, key=name)
        print(f'saved to : {outPath}')

# Wall time: 1min 21s
# saveAsHDF(countDict)        

CPU times: user 11 µs, sys: 0 ns, total: 11 µs
Wall time: 20.3 µs


In [4]:
%%time
def loadCountData(dataRoot):
    '''
    Load the test, train and validate group-by count files stored as hdf5.

    timing notes recorded when this cell was written
        csv Wall time: 7min 33s
        hdf5 Wall time: 41.3 s

    arguments:
        dataRoot : directory holding the GTEx_TCGA_*Groupby.hdf5 files

    returns:
        dictionary. The key is the third "_" separated token of the file name
        with the extension removed, e.g. "TrainGroupby". The value is whatever
        pd.read_hdf() returns for that file.

    The key, path and shape of each data frame are printed as it is loaded.
    '''
    # the csv versions ("GTEx_TCGA_TestGroupby.csv", ...) and the combined
    # "GTEx_TCGA_Groupby.csv" file are not read here; see the timing notes above
    hdf5Files = (
        "GTEx_TCGA_TestGroupby.hdf5",
        "GTEx_TCGA_TrainGroupby.hdf5",
        "GTEx_TCGA_ValidateGroupby.hdf5",
    )
    loaded = {}
    for fileName in hdf5Files:
        # "GTEx_TCGA_TestGroupby.hdf5" -> "TestGroupby"
        key = fileName.partition(".")[0].split("_")[2]
        path = f'{dataRoot}/{fileName}'
        counts = pd.read_hdf(path)
        print(f'{key} {path} {counts.shape}')
        loaded[key] = counts

    return loaded

countDict = loadCountData(dataRoot)
print(f'countDict.keys() : {countDict.keys()}')
display(countDict[ 'TrainGroupby'].head().iloc[0:3, 0:3])
print()

TestGroupby /scratch/aedavids/tmp/GTEx_TCGA_TestGroupby.hdf5 (74777, 5268)
TrainGroupby /scratch/aedavids/tmp/GTEx_TCGA_TrainGroupby.hdf5 (74777, 15801)
ValidateGroupby /scratch/aedavids/tmp/GTEx_TCGA_ValidateGroupby.hdf5 (74777, 5268)
countDict.keys() : dict_keys(['TestGroupby', 'TrainGroupby', 'ValidateGroupby'])


Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0726-SM-5GIEN
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
(A)n,9,1,3
(AAA)n,0,0,0
(AAAAAAC)n,0,0,0



CPU times: user 12.5 s, sys: 15.8 s, total: 28.3 s
Wall time: 44.1 s


In [5]:
def createColDataSampleIdDict() :
    '''
    Collect the sample ids, i.e. the row index, of every colData data frame.

    Reads the global dictionary colDataDict.

    returns:
        dictionary with the same keys as colDataDict. Each value is the index
        of the corresponding data frame.

    For each key, the number of ids and the shape of the unique ids are printed.
    '''
    sampleIdsByKey = {}
    for key, colData in colDataDict.items():
        ids = colData.index
        print(f'{key :<15} num sample ids : {len(ids)} num unique : {ids.unique().shape}' )
        sampleIdsByKey[key] = ids

    return sampleIdsByKey
        
print(f'the len should equal the number unique ids')
colDataSampleIdsDict = createColDataSampleIdDict()

the len should equal the number unique ids
TestColData     num sample ids : 5268 num unique : (5268,)
TrainColData    num sample ids : 15801 num unique : (15801,)
ValidateColData num sample ids : 5268 num unique : (5268,)


In [6]:
def createCountSampleIdDict() :
    '''
    Collect the sample ids, i.e. the column labels, of every count data frame.

    Reads the global dictionary countDict.

    returns:
        dictionary with the same keys as countDict. Each value is the columns
        index of the corresponding data frame.

    For each key, the number of ids and the shape of the unique ids are printed.
    '''
    sampleIdsByKey = {}
    for key, counts in countDict.items():
        ids = counts.columns
        print(f'{key :<15} num sample ids : {len(ids)} num unique : {ids.unique().shape}' )
        sampleIdsByKey[key] = ids

    return sampleIdsByKey

print(f'the len should equal the number unique ids')
countSampleIdsDict = createCountSampleIdDict()

the len should equal the number unique ids
TestGroupby     num sample ids : 5268 num unique : (5268,)
TrainGroupby    num sample ids : 15801 num unique : (15801,)
ValidateGroupby num sample ids : 5268 num unique : (5268,)


# Check for duplicates

In [7]:
def checkforColDataDuplicates():
    '''
    Within each colData data frame, count the rows that repeat an earlier row.

    Reads the global dictionary colDataDict and prints one line per key.

    Rows are compared on their column values only; DataFrame.duplicated() does
    not take the index, i.e. the sample id, into account.
    '''
    for key, colData in colDataDict.items():
        numDuplicates = int( colData.duplicated().sum() )
        print(f'{key :<15} num duplicates : {numDuplicates}')

checkforColDataDuplicates()

TestColData     num duplicates : 0
TrainColData    num duplicates : 0
ValidateColData num duplicates : 0


In [8]:
# check if colData & count sets share sample ids
def createSampleIdSetDict( sampleIdsDict ):
    '''
    Copy a dictionary of sample id collections and print the size of each one.

    arguments:
        sampleIdsDict : dictionary. key is a data set name, value is a
                        collection of sample ids that supports len()

    returns:
        a new dictionary with the same keys and the same value objects.
        The values are stored exactly as passed in; they are not converted
        to python set objects.
    '''
    copied = {}
    for key, ids in sampleIdsDict.items():
        print(f'{key:<15} len(set) : {len(ids)}')
        copied[key] = ids

    return copied

colDataSetDict= createSampleIdSetDict(colDataSampleIdsDict)
print()
countSetDict= createSampleIdSetDict(countSampleIdsDict)

TestColData     len(set) : 5268
TrainColData    len(set) : 15801
ValidateColData len(set) : 5268

TestGroupby     len(set) : 5268
TrainGroupby    len(set) : 15801
ValidateGroupby len(set) : 5268


In [9]:
def checkForSharedSamples(key1, key2, sampleIdDict):
    '''
    Find the sample ids two data sets have in common.

    arguments:
        key1, key2   : keys into sampleIdDict
        sampleIdDict : dictionary. Each value must provide an
                       intersection() method, e.g. a set or a pandas Index

    returns:
        the result of sampleIdDict[key1].intersection( sampleIdDict[key2] )

    The size of the intersection is printed.
    '''
    shared = sampleIdDict[key1].intersection( sampleIdDict[key2] )
    print(f'len( {key1}.intersection({key2} ) ) { len(shared) }' )

    return shared

In [10]:
print(f'interesection size should be zero')
colDataTrainValIntersection  = checkForSharedSamples('TrainColData',    'ValidateColData', colDataSetDict)
colDataTrainTestIntersection = checkForSharedSamples('TrainColData',    'TestColData',     colDataSetDict)
colDataValTestIntersection   = checkForSharedSamples('ValidateColData', 'TestColData',     colDataSetDict)

interesection size should be zero
len( TrainColData.intersection(ValidateColData ) ) 2090
len( TrainColData.intersection(TestColData ) ) 2064
len( ValidateColData.intersection(TestColData ) ) 0


In [11]:
print(f'interesection size should be zero')
countTrainValIntersection  = checkForSharedSamples('TrainGroupby',    'ValidateGroupby', countSetDict)
countTrainTestIntersection = checkForSharedSamples('TrainGroupby',    'TestGroupby',     countSetDict)
countValTestIntersection   = checkForSharedSamples('ValidateGroupby', 'TestGroupby',     countSetDict)

interesection size should be zero
len( TrainGroupby.intersection(ValidateGroupby ) ) 2090
len( TrainGroupby.intersection(TestGroupby ) ) 2064
len( ValidateGroupby.intersection(TestGroupby ) ) 0


## Are the samples in the intersection the same?

In [12]:
print('we expected sum to be zero if the intersections are the same')
# Index.symmetric_difference() returns the ids that are in only one of the two
# intersections. The element wise `!=` used previously raises ValueError when
# the two indexes differ in length, i.e. exactly when the intersections are not
# the same, and it also depends on the order of the ids.
print( len(colDataTrainValIntersection.symmetric_difference(countTrainValIntersection)) )
print( len(colDataTrainTestIntersection.symmetric_difference(countTrainTestIntersection)) )
print( len(colDataValTestIntersection.symmetric_difference(countValTestIntersection)) )

we expected sum to be zero if the intersections are the same
0
0
0


# How to fix?

In [13]:
print(f'some how we messed up GTEx!!!!\n')

print(f'number of samples in colDataTrainValIntersection : {colDataTrainValIntersection.shape}')
print(f'ValidateColData')
display( colDataDict['ValidateColData'].loc[colDataTrainValIntersection, :].groupby("dataSet").count() )

print()
print(f'number of samples in colDataTrainTestIntersection : {colDataTrainTestIntersection.shape}')
print(f'TestColData')
colDataDict['TestColData'].loc[colDataTrainTestIntersection, :].groupby("dataSet").count()


some how we messed up GTEx!!!!

number of samples in colDataTrainValIntersection : (2090,)
ValidateColData


Unnamed: 0_level_0,participant_id,category,gender,age
dataSet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GTEx,2090,2090,2090,2090



number of samples in colDataTrainTestIntersection : (2064,)
TestColData


Unnamed: 0_level_0,participant_id,category,gender,age
dataSet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GTEx,2064,2064,2064,2064


## <span style="color:red;background-color:yellow">short term fix</span>
- all the sample ids in the training count data set are unique
- if all the count data has colData just use training data for now

In [14]:
print('we expected sum to be zero if the sample ids are the same')
print(f"{sum( colDataSampleIdsDict['TrainColData'] != countSampleIdsDict['TrainGroupby'] )}")

we expected sum to be zero if the sample ids are the same
0


In [15]:
# how many samples in total do we have?
def howManySamplesInTotal(setDict):
    '''
    Report how many sample ids there are across all data sets, and how many
    of them are unique.

    arguments:
        setDict : dictionary. key is a data set name, value is a collection of
                  sample ids that provides len() and unique(), e.g. a pandas
                  Index

    Prints a running total and running unique count for each key, followed by
    a summary line. Nothing is returned.
    '''
    seen = set()
    runningTotal = 0
    for key in setDict:
        ids = setDict[key]
        seen |= set(ids)
        runningTotal += len(ids)
        print(f'{key :<15} len(sampleIdSet) : {len(ids) :>6} s.unique().shape { ids.unique().shape } total : {runningTotal} len(retSet) : {len(seen)}')

    numUnique = len( np.unique( list(seen) ) )
    print(f'\ntotal num sample ids: {runningTotal} num unique { numUnique } ')

howManySamplesInTotal(countSetDict)

print()
print(f'num missing GTEx samples {26337 - 22183}')
print(f'num overlapping ie train/val and train/test sample ids are not disjoin GTEx samples {2090 + 2064}')

TestGroupby     len(sampleIdSet) :   5268 s.unique().shape (5268,) total : 5268 len(retSet) : 5268
TrainGroupby    len(sampleIdSet) :  15801 s.unique().shape (15801,) total : 21069 len(retSet) : 19005
ValidateGroupby len(sampleIdSet) :   5268 s.unique().shape (5268,) total : 26337 len(retSet) : 22183

total num sample ids: 26337 num unique 22183 

num missing GTEx samples 4154
num overlapping ie train/val and train/test sample ids are not disjoin GTEx samples 4154


# How many GTEx samples do we actually have? Check GTEx/

In [16]:
! wc -l /private/groups/kimlab/GTEx/colData.csv

17383 /private/groups/kimlab/GTEx/colData.csv


In [17]:
%%time
GTExRootDir = "/private/groups/kimlab/GTEx"


def loadGTExCountData(dataRoot):
    '''
    Load the GTEx test, validate and train group-by count files stored as hdf5.

    timing notes recorded when this cell was written
        csv Wall time: 5min 18s
        hdf5 Wall time: 19.9 s

    arguments:
        dataRoot : directory holding the GTEx*GroupByGenesCountMatrix.hdf5 files

    returns:
        dictionary. The key is the file name with the trailing
        "GenesCountMatrix.hdf5" removed, e.g. "GTExTrainGroupBy". The value is
        whatever pd.read_hdf() returns for that file.

    The key, path and shape of each data frame are printed as it is loaded.
    '''
    # csv versions of these files were read originally; see timing notes above
    trailer = 'GenesCountMatrix.hdf5'
    hdf5Files = tuple( f'GTEx{split}GroupBy{trailer}' for split in ("Test", "Validate", "Train") )
    loaded = {}
    for fileName in hdf5Files:
        key = fileName[ : len(fileName) - len(trailer) ]
        path = f'{dataRoot}/{fileName}'
        counts = pd.read_hdf(path)
        print(f'{key} {path} {counts.shape}')
        loaded[key] = counts

    return loaded

# NOTE(review): GTExRootDir is defined earlier in this cell but is not used;
# the GTEx files are loaded from dataRoot. Confirm that is what is intended.
GTExCountDict = loadGTExCountData(dataRoot)

print(f'GTExCountDict.keys() : {GTExCountDict.keys()}')
# show only a 3 x 3 corner of the test count matrix
display(GTExCountDict[ 'GTExTestGroupBy'].head().iloc[0:3, 0:3])
print()

GTExTestGroupBy /scratch/aedavids/tmp/GTExTestGroupByGenesCountMatrix.hdf5 (74777, 3471)
GTExValidateGroupBy /scratch/aedavids/tmp/GTExValidateGroupByGenesCountMatrix.hdf5 (74777, 3471)
GTExTrainGroupBy /scratch/aedavids/tmp/GTExTrainGroupByGenesCountMatrix.hdf5 (74777, 10411)
GTExCountDict.keys() : dict_keys(['GTExTestGroupBy', 'GTExValidateGroupBy', 'GTExTrainGroupBy'])


Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
(A)n,9,1,3
(AAA)n,0,0,0
(AAAAAAC)n,0,0,0



CPU times: user 9.3 s, sys: 12.8 s, total: 22.1 s
Wall time: 34.5 s


In [18]:
#saveAsHDF(GTExCountDict) 

In [19]:
def howManyGTExSamplesInTotal(countDict):
    '''
    Report how many sample ids, i.e. column labels, there are across all the
    count data frames, and how many of them are unique.

    arguments:
        countDict : dictionary. key is a data set name, value is a data frame
                    whose columns are sample ids

    Prints a running total and running unique count for each key, followed by
    a summary line. Nothing is returned.
    '''
    seen = set()
    runningTotal = 0
    for key in countDict:
        ids = countDict[key].columns
        seen |= set(ids)
        runningTotal += len(ids)
        print(f'{key :<19} len(sampleIdSet) : {len(ids) :>6} s.unique().shape { ids.unique().shape } total : {runningTotal} len(retSet) : {len(seen)}')

    numUnique = len( np.unique( list(seen) ) )
    print(f'\ntotal num sample ids: {runningTotal} num unique { numUnique } ')

howManyGTExSamplesInTotal(GTExCountDict)

GTExTestGroupBy     len(sampleIdSet) :   3471 s.unique().shape (3471,) total : 3471 len(retSet) : 3471
GTExValidateGroupBy len(sampleIdSet) :   3471 s.unique().shape (3471,) total : 6942 len(retSet) : 6942
GTExTrainGroupBy    len(sampleIdSet) :  10411 s.unique().shape (10411,) total : 17353 len(retSet) : 13199

total num sample ids: 17353 num unique 13199 


# How many TCGA samples do we have?

In [20]:
def howManyTCGASamples():
    '''
    Count the TCGA samples in every colData data frame.

    Reads the global dictionary colDataDict. A row is a TCGA sample when its
    "dataSet" column equals 'TCGA'.

    Prints, for each key, the running total, the number of TCGA sample ids and
    the number of unique TCGA sample ids, followed by a summary line with the
    overall total and the number of unique ids across all keys.
    Nothing is returned.
    '''
    uniqueIds = set()
    runningTotal = 0
    for key, colData in colDataDict.items():
        isTCGA = colData["dataSet"] == 'TCGA'
        tcgaIds = colData[isTCGA].index
        idSet = set(tcgaIds)
        uniqueIds |= idSet
        runningTotal += len(tcgaIds)

        print(f'{key :<15} total: {runningTotal} len(samples) : {len(tcgaIds)} num unique : {len(idSet)} ' )

    print(f'total : {runningTotal} num unique : {len(uniqueIds)}')
        
howManyTCGASamples()       

TestColData     total: 1797 len(samples) : 1797 num unique : 1797 
TrainColData    total: 7187 len(samples) : 5390 num unique : 5390 
ValidateColData total: 8984 len(samples) : 1797 num unique : 1797 
total : 8984 num unique : 8984


# Gather all the unique samples into a single data set

In [21]:
%%time
normalizedGroupbyHDF5Path = f'{dataRoot}/GTEx_TCGA_NormalizedGroupby.hdf5'
normalizedGroupbyHDF5_DF = pd.read_hdf(normalizedGroupbyHDF5Path)
normalizedGroupbyHDF5_DF =  normalizedGroupbyHDF5_DF.set_index( 'geneId' )
print( f'normalizedGroupbyHDF5_DF.shape : {normalizedGroupbyHDF5_DF.shape}' )

normalizedGroupbyHDF5_DF.shape : (74777, 26337)
CPU times: user 17.4 s, sys: 56.8 s, total: 1min 14s
Wall time: 1min 22s


In [26]:
normalizedGroupbyHDF5_DF.iloc[0:3, 0:3]

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0726-SM-5GIEN
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
(A)n,7.506043,0.68924,1.611646
(AAA)n,0.0,0.0,0.0
(AAAAAAC)n,0.0,0.0,0.0


In [22]:
%%time
colDataPath = f"{dataRoot}/GTEx_TCGA_ColData.csv"
colDataDF = pd.read_csv(colDataPath, index_col="sample_id")
print(f'colDataDF.shape : {colDataDF.shape}' )
colDataDF.head()

colDataDF.shape : (26337, 5)
CPU times: user 40.5 ms, sys: 13.3 ms, total: 53.9 ms
Wall time: 52.6 ms


Unnamed: 0_level_0,participant_id,category,gender,age,dataSet
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Female,66.0,GTEx
GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Female,66.0,GTEx
GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Female,66.0,GTEx
GTEX-1117F-2826-SM-5GZXL,GTEX-1117F,Breast_Mammary_Tissue,Female,66.0,GTEx
GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,Brain_Cortex,Female,66.0,GTEx


In [27]:
uniqueCountSampleIds = normalizedGroupbyHDF5_DF.columns.unique()
uniqueColDataSampleIds = colDataDF.index.unique()

print(len(uniqueCountSampleIds))
print( len(uniqueColDataSampleIds))



26337
22183


In [24]:
print( sum( normalizedGroupbyHDF5_DF.columns.duplicated() ) )
print( sum( colDataDF.index.duplicated() ) )
print( sum( ~colDataDF.index.duplicated() ) )
sum( normalizedGroupbyHDF5_DF.columns != colDataDF.index ) 

0
4154
22183


4154

In [32]:
%%time
# despite its name, uniqueSampleIds is a boolean mask, not a list of ids:
# True marks a row of colDataDF whose sample id has not appeared in an earlier row
uniqueSampleIds = ~colDataDF.index.duplicated()
# the mask was built from the rows of colDataDF and is applied here to the
# columns of the count matrix by position.
# assumes column i of normalizedGroupbyHDF5_DF is the sample in row i of colDataDF — TODO confirm
noDupsNormalizedGroupbyHDF5_DF = normalizedGroupbyHDF5_DF.loc[:, uniqueSampleIds]
print( noDupsNormalizedGroupbyHDF5_DF.shape )
display( noDupsNormalizedGroupbyHDF5_DF.iloc[0:3, 0:3] )
print()

(74777, 22183)


Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0726-SM-5GIEN
geneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
(A)n,7.506043,0.68924,1.611646
(AAA)n,0.0,0.0,0.0
(AAAAAAC)n,0.0,0.0,0.0



CPU times: user 2.4 s, sys: 36.7 s, total: 39.1 s
Wall time: 41.1 s


In [30]:
%%time
# keep only the rows selected by the boolean mask uniqueSampleIds
noDupsColDataDF = colDataDF.loc[uniqueSampleIds, :]
print( noDupsColDataDF.shape )
# display the de-duplicated frame created above; the cell previously displayed
# the original colDataDF, which does not show the result of the filtering
display( noDupsColDataDF.head() )
print()

(22183, 5)


Unnamed: 0_level_0,participant_id,category,gender,age,dataSet
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F,Adipose_Subcutaneous,Female,66.0,GTEx
GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F,Artery_Tibial,Female,66.0,GTEx
GTEX-1117F-0726-SM-5GIEN,GTEX-1117F,Heart_Atrial_Appendage,Female,66.0,GTEx
GTEX-1117F-2826-SM-5GZXL,GTEX-1117F,Breast_Mammary_Tissue,Female,66.0,GTEx
GTEX-1117F-3226-SM-5N9CT,GTEX-1117F,Brain_Cortex,Female,66.0,GTEx



CPU times: user 16.1 ms, sys: 0 ns, total: 16.1 ms
Wall time: 14.7 ms


# <span style="color:red;background-color:yellow"> save unique samples as a single data set</span>

In [33]:
%%time
def saveAsHDF2( dataRoot, fileName, df, key):
    '''
    Write a single data frame to "<dataRoot>/<fileName>.hdf5".

    arguments:
        dataRoot : output directory
        fileName : file name without the ".hdf5" extension
        df       : object with a to_hdf() method, e.g. a pandas data frame
        key      : passed to df.to_hdf() as the hdf5 key

    The path written is printed. Nothing is returned.
    '''
    outPath = '{}/{}.hdf5'.format(dataRoot, fileName)
    df.to_hdf(outPath, key=key)
    print('saved to : {}'.format(outPath))

# saveAsHDF2( dataRoot, 'GTEx_TCGA_NormalizedGroupby', noDupsNormalizedGroupbyHDF5_DF, 'normalizedGroupbyGenesCounts')
# saveAsHDF2( dataRoot, 'GTEx_TCGA_ColData', noDupsColDataDF, 'colData')

saved to : /scratch/aedavids/tmp/GTEx_TCGA_NormalizedGroupby.hdf5
saved to : /scratch/aedavids/tmp/GTEx_TCGA_ColData.hdf5
CPU times: user 28.4 s, sys: 1min 22s, total: 1min 50s
Wall time: 2min
