# Clean up GTEx data set definitions
```
Andrew E. Davidson
aedavids@ucsc.edu
```

Remove old, unused data sets.

Typically, when running a batch job, a few samples fail, usually because they need more memory. You cannot simply adjust the memory parameter and re-run: Terra will re-run all the samples, which wastes a lot of time and money.

Best practice is to define a sample set containing only the failed samples.

In [1]:
import pandas as pd
from pandas import DataFrame

In [2]:
rootDir = "../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab"

In [3]:
def readDataModel( rootDir, entityName ) :
    """
    Load one data model table from a tab-separated file.

    arguments:
        rootDir: directory that contains the data model TSV files
        entityName: name of the table; the file read is
            <rootDir>/<entityName>.tsv

    returns:
        pandas DataFrame with the contents of the TSV file
    """
    tsvPath = "/".join([rootDir, entityName + ".tsv"])
    return pd.read_csv(tsvPath, sep='\t')

def saveDataModel( rootDir, entityName, dataModelDF ) :
    """
    Write a data model table to <rootDir>/<entityName>.tsv.

    The file is tab separated and the DataFrame index is not written.
    The destination path is printed before the file is written.

    arguments:
        rootDir: directory the TSV file is written to
        entityName: name of the table, used as the file name without ".tsv"
        dataModelDF: pandas DataFrame to save
    """
    tsvPath = "/".join([rootDir, entityName + ".tsv"])
    print("writing {}".format(tsvPath))
    dataModelDF.to_csv(tsvPath, index=False, sep='\t')
    
def deleteSets(sampleSetEntityDF, sampleSetMembershipDF, setList):
    """
    Remove sample sets from the sample set entity and membership tables.

    The data frames passed in are not modified; the local names are rebound
    to filtered selections, which are returned.

    arguments:
        sampleSetEntityDF: DataFrame with an 'entity:sample_set_id' column
        sampleSetMembershipDF: DataFrame with a 'membership:sample_set_id'
            column
        setList: iterable of sample set ids to remove. An id that is not
            present is reported as having 0 items.

    returns:
        tuple (sampleSetEntityDF, sampleSetMembershipDF) without the rows
        whose sample set id is in setList
    """
    for s in setList:
        # one mask serves both the report and, negated, the row selection
        deleteMembersPS = sampleSetMembershipDF['membership:sample_set_id'] == s
        print("removing set '{}' with items {}".format(s, deleteMembersPS.sum()))
        sampleSetMembershipDF = sampleSetMembershipDF.loc[~deleteMembersPS, :]

        entityRowsPS = sampleSetEntityDF['entity:sample_set_id'] != s
        sampleSetEntityDF = sampleSetEntityDF.loc[entityRowsPS, :]

    return (sampleSetEntityDF, sampleSetMembershipDF)

In [4]:
# load the sample set definitions and their sample membership tables
sampleSetDF = readDataModel(rootDir=rootDir, entityName="sample_set_entity")
sampleSetMembershipDF = readDataModel(rootDir=rootDir, entityName="sample_set_membership")

# display the ids of all currently defined sample sets
sampleSetDF['entity:sample_set_id'].tolist()

['Adipose_Subcutaneous',
 'Adipose_Visceral_Omentum',
 'Adrenal_Gland',
 'Artery_Aorta',
 'Artery_Coronary',
 'Artery_Tibial',
 'Bladder',
 'Brain_Amygdala',
 'Brain_Anterior_cingulate_cortex_BA24',
 'Brain_Caudate_basal_ganglia',
 'Brain_Cerebellar_Hemisphere',
 'Brain_Cerebellum',
 'Brain_Cortex',
 'Brain_Frontal_Cortex_BA9',
 'Brain_Hippocampus',
 'Brain_Hypothalamus',
 'Brain_Nucleus_accumbens_basal_ganglia',
 'Brain_Putamen_basal_ganglia',
 'Brain_Spinal_cord_cervical_c-1',
 'Brain_Substantia_nigra',
 'Breast_Mammary_Tissue',
 'Cells_Cultured_fibroblasts',
 'Cells_EBV-transformed_lymphocytes',
 'Cervix_Ectocervix',
 'Cervix_Endocervix',
 'Colon_Sigmoid',
 'Colon_Transverse',
 'Esophagus_Gastroesophageal_Junction',
 'Esophagus_Mucosa',
 'Esophagus_Muscularis',
 'Fallopian_Tube',
 'Heart_Atrial_Appendage',
 'Heart_Left_Ventricle',
 'Kidney_Cortex',
 'Kidney_Medulla',
 'Liver',
 'Lung',
 'Minor_Salivary_Gland',
 'Muscle_Skeletal',
 'Nerve_Tibial',
 'Ovary',
 'Pancreas',
 'Pituitary',

## remove deprecated set definitions

In [5]:
# ids of the deprecated sample sets to delete
setsToRemove = [
    '2021_09_29-re-run',
    '2021_10_03-re-run',
    '2bladderSamples',
    'aug_2021_08_11-re-run',
    'aug_2021_08_23-re-run',
    'salmonQuantWorkflow-resubmission-2021-05-09T23-30-56',
    'salmonQuantWorkflow_2021-05-04T17-11-47',
    'salmonQuantWorkflow_2021-11-17T19-21-18',
    'salmonQuantWorkflow_2022-02-26T18-11-34',
    'salmonQuantWorkflow_2022-02-27T00-49-32',
    'salmonQuantWorkflow_2022-02-27T17-14-32',
    'salmonQuantWorkflow_2022-02-28T14-34-21',
    'salmonQuantWorkflow_2022-02-28T20-52-04',
    'salmonQuantWorkflow_2022-03-02T03-19-10',
    'salmonQuantWorkflow_25_panc',
    'panc-10-fixParatest',
    'Pancreas_10',
    'total_colon',
    'total_esophagus',
]

# drop the deprecated sets from both the entity and the membership tables
(cleanSampleSetEntityDF, cleanSampleSetMembershipDF) = deleteSets(
    sampleSetEntityDF=sampleSetDF,
    sampleSetMembershipDF=sampleSetMembershipDF,
    setList=setsToRemove,
)

# report and display the sample sets that remain
newSetList = cleanSampleSetEntityDF['entity:sample_set_id'].tolist()
print("\n\nnew set list length:{}".format(len(newSetList)))
newSetList

removing set '2021_09_29-re-run with items 0
removing set '2021_10_03-re-run with items 0
removing set '2bladderSamples with items 0
removing set 'aug_2021_08_11-re-run with items 0
removing set 'aug_2021_08_23-re-run with items 0
removing set 'salmonQuantWorkflow-resubmission-2021-05-09T23-30-56 with items 0
removing set 'salmonQuantWorkflow_2021-05-04T17-11-47 with items 0
removing set 'salmonQuantWorkflow_2021-11-17T19-21-18 with items 0
removing set 'salmonQuantWorkflow_2022-02-26T18-11-34 with items 0
removing set 'salmonQuantWorkflow_2022-02-27T00-49-32 with items 0
removing set 'salmonQuantWorkflow_2022-02-27T17-14-32 with items 0
removing set 'salmonQuantWorkflow_2022-02-28T14-34-21 with items 0
removing set 'salmonQuantWorkflow_2022-02-28T20-52-04 with items 0
removing set 'salmonQuantWorkflow_2022-03-02T03-19-10 with items 0
removing set 'salmonQuantWorkflow_25_panc with items 0
removing set 'panc-10-fixParatest with items 0
removing set 'Pancreas_10 with items 0
removing set

['Adipose_Subcutaneous',
 'Adipose_Visceral_Omentum',
 'Adrenal_Gland',
 'Artery_Aorta',
 'Artery_Coronary',
 'Artery_Tibial',
 'Bladder',
 'Brain_Amygdala',
 'Brain_Anterior_cingulate_cortex_BA24',
 'Brain_Caudate_basal_ganglia',
 'Brain_Cerebellar_Hemisphere',
 'Brain_Cerebellum',
 'Brain_Cortex',
 'Brain_Frontal_Cortex_BA9',
 'Brain_Hippocampus',
 'Brain_Hypothalamus',
 'Brain_Nucleus_accumbens_basal_ganglia',
 'Brain_Putamen_basal_ganglia',
 'Brain_Spinal_cord_cervical_c-1',
 'Brain_Substantia_nigra',
 'Breast_Mammary_Tissue',
 'Cells_Cultured_fibroblasts',
 'Cells_EBV-transformed_lymphocytes',
 'Cervix_Ectocervix',
 'Cervix_Endocervix',
 'Colon_Sigmoid',
 'Colon_Transverse',
 'Esophagus_Gastroesophageal_Junction',
 'Esophagus_Mucosa',
 'Esophagus_Muscularis',
 'Fallopian_Tube',
 'Heart_Atrial_Appendage',
 'Heart_Left_Ventricle',
 'Kidney_Cortex',
 'Kidney_Medulla',
 'Liver',
 'Lung',
 'Minor_Salivary_Gland',
 'Muscle_Skeletal',
 'Nerve_Tibial',
 'Ovary',
 'Pancreas',
 'Pituitary',

## explore new set definitions

In [6]:
# number of membership rows per sample set in the table as it was read from disk
sampleSetMembershipDF.groupby(by='membership:sample_set_id').count()

Unnamed: 0_level_0,sample
membership:sample_set_id,Unnamed: 1_level_1
Adipose_Subcutaneous,5304
Adipose_Visceral_Omentum,4328
Adrenal_Gland,2064
Artery_Aorta,3456
Artery_Coronary,1920
Artery_Tibial,5304
Bladder,168
Brain_Amygdala,1216
Brain_Anterior_cingulate_cortex_BA24,1408
Brain_Caudate_basal_ganglia,1968


In [7]:
# number of membership rows per sample set after the deprecated sets were removed
cleanSampleSetMembershipDF.groupby(by='membership:sample_set_id').count()

Unnamed: 0_level_0,sample
membership:sample_set_id,Unnamed: 1_level_1
Adipose_Subcutaneous,5304
Adipose_Visceral_Omentum,4328
Adrenal_Gland,2064
Artery_Aorta,3456
Artery_Coronary,1920
Artery_Tibial,5304
Bladder,168
Brain_Amygdala,1216
Brain_Anterior_cingulate_cortex_BA24,1408
Brain_Caudate_basal_ganglia,1968


## remove duplicates
There was a bug in Terra that introduced duplicate entries in the sample set membership table.

In [8]:
# membership rows per sample set; the grand total exposes the duplicated rows
groupedDF = cleanSampleSetMembershipDF.groupby(by='membership:sample_set_id').count()
print("total number of sample is 17382 count with duplicates is {}".format(groupedDF['sample'].sum()))

total number of sample is 17382 count with duplicates is 141024


In [9]:
# https://realpython.com/pandas-groupby/
def removeDups(setMembershipDF) :
    """
    Remove duplicated samples from each sample set.

    For every sample set id, only the unique values of the 'sample' column
    are kept. The number of unique samples is printed for each set, followed
    by the total over all sets.

    arguments:
        setMembershipDF: DataFrame with the columns
            'membership:sample_set_id' and 'sample'

    returns:
        DataFrame with the columns 'membership:sample_set_id' and 'sample',
        holding one row per unique (set id, sample) pair. Sets appear in the
        order produced by groupby. The row index restarts at 0 for every set.
    """
    numSamples = 0
    perSetFrames = []
    bySetId = setMembershipDF.groupby('membership:sample_set_id')
    for setId, df in bySetId:
        sampleIds = df['sample'].unique()
        numUniqueIds = len(sampleIds)
        print("setId:{} numUniqueIds:{}".format( setId, numUniqueIds))
        numSamples += numUniqueIds

        memberSetId = [setId] * numUniqueIds
        perSetFrames.append(
            pd.DataFrame( {'membership:sample_set_id':memberSetId, 'sample':sampleIds } ) )

    print("\nnumber of unique samples: {}".format(numSamples))

    if not perSetFrames:
        # pd.concat() raises on an empty list; return an empty table instead
        return pd.DataFrame( {'membership:sample_set_id':[], 'sample':[] } )

    # DataFrame.append() was removed in pandas 2.0. A single concat also
    # avoids re-copying the accumulated result on every iteration.
    return pd.concat(perSetFrames)
    
    
# testSelectRows = cleanSampleSetMembershipDF['membership:sample_set_id'].isin( ['Testis', 'Vagina'] )
# aedwipDF = removeDups( cleanSampleSetMembershipDF.loc[testSelectRows, :] )
# print("\n\n***********")
# aedwipDF

# replace the membership table with its de-duplicated version
cleanSampleSetMembershipDF = removeDups(setMembershipDF=cleanSampleSetMembershipDF)

setId:Adipose_Subcutaneous numUniqueIds:663
setId:Adipose_Visceral_Omentum numUniqueIds:541
setId:Adrenal_Gland numUniqueIds:258
setId:Artery_Aorta numUniqueIds:432
setId:Artery_Coronary numUniqueIds:240
setId:Artery_Tibial numUniqueIds:663
setId:Bladder numUniqueIds:21
setId:Brain_Amygdala numUniqueIds:152
setId:Brain_Anterior_cingulate_cortex_BA24 numUniqueIds:176
setId:Brain_Caudate_basal_ganglia numUniqueIds:246
setId:Brain_Cerebellar_Hemisphere numUniqueIds:215
setId:Brain_Cerebellum numUniqueIds:241
setId:Brain_Cortex numUniqueIds:255
setId:Brain_Frontal_Cortex_BA9 numUniqueIds:209
setId:Brain_Hippocampus numUniqueIds:197
setId:Brain_Hypothalamus numUniqueIds:202
setId:Brain_Nucleus_accumbens_basal_ganglia numUniqueIds:246
setId:Brain_Putamen_basal_ganglia numUniqueIds:205
setId:Brain_Spinal_cord_cervical_c-1 numUniqueIds:159
setId:Brain_Substantia_nigra numUniqueIds:139
setId:Breast_Mammary_Tissue numUniqueIds:459
setId:Cells_Cultured_fibroblasts numUniqueIds:504
setId:Cells_EBV

# Save data models

In [10]:
# overwrite the data model TSV files with the cleaned tables
saveDataModel(rootDir=rootDir, entityName="sample_set_entity", dataModelDF=cleanSampleSetEntityDF)
saveDataModel(rootDir=rootDir, entityName="sample_set_membership", dataModelDF=cleanSampleSetMembershipDF)

writing ../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/sample_set_entity.tsv
writing ../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/sample_set_membership.tsv
