# Create tissue specific data sets for GTEx

```
Andrew Davidson
aedavids@ucsc.edu
2/25/22
```


- [Make a set of data table from scratch](https://support.terra.bio/hc/en-us/articles/360047611871#h_01EJXZMM6GA3481YRRQBR65MY3)
- [Data Tables QuickStart Part 3: Understanding sets of data](https://support.terra.bio/hc/en-us/articles/360047611871)
- [Adding data to a workspace with a template](https://support.terra.bio/hc/en-us/articles/360059242671). See "Sets of data - sample_set table".

In [1]:
from datetime import datetime

# Stamp the notebook output with the wall-clock time of this run.
now = datetime.now()
print("run on {}".format(now.strftime('%Y-%m-%d %H:%M:%S')))

import numpy as np
import pandas as pd

run on 2022-02-25 15:30:35


## Load data and create list of tissue ids

In [2]:
# Directory holding the workspace's entity TSV files (relative to this notebook).
rootDir = "../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab"

# The sample table is read from <rootDir>/sample.tsv as tab-separated text.
entityName = "sample"
sampleTSV = "/".join([rootDir, entityName + ".tsv"])
sampleDF = pd.read_csv(sampleTSV, sep='\t')
sampleDF.head()

Unnamed: 0,entity:sample_id,aux_info,bam_file,bam_index,firstEndFastq,participant,quantFile,secondEndFastq,tissue_id,tissue_site_detail,unpairedFastq
0,GTEX-1117F-0226-SM-5GZZ7,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Adipose_Subcutaneous,Adipose - Subcutaneous,
1,GTEX-1117F-0426-SM-5EGHI,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Muscle_Skeletal,Muscle - Skeletal,
2,GTEX-1117F-0526-SM-5EGHJ,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Artery_Tibial,Artery - Tibial,
3,GTEX-1117F-0626-SM-5N9CS,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Artery_Coronary,Artery - Coronary,
4,GTEX-1117F-0726-SM-5GIEN,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,gs://fc-secure-ff8156a3-ddf3-42e4-9211-0fd89da...,,GTEX-1117F,gs://fc-secure-f5aa8a37-78e5-45f6-9c59-c643016...,,Heart_Atrial_Appendage,Heart - Atrial Appendage,


In [3]:
# Sorted array of the distinct values found in the tissue_id column.
tissueIdNP = np.sort(sampleDF['tissue_id'].unique())
tissueIdNP

array(['Adipose_Subcutaneous', 'Adipose_Visceral_Omentum',
       'Adrenal_Gland', 'Artery_Aorta', 'Artery_Coronary',
       'Artery_Tibial', 'Bladder', 'Brain_Amygdala',
       'Brain_Anterior_cingulate_cortex_BA24',
       'Brain_Caudate_basal_ganglia', 'Brain_Cerebellar_Hemisphere',
       'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex_BA9',
       'Brain_Hippocampus', 'Brain_Hypothalamus',
       'Brain_Nucleus_accumbens_basal_ganglia',
       'Brain_Putamen_basal_ganglia', 'Brain_Spinal_cord_cervical_c-1',
       'Brain_Substantia_nigra', 'Breast_Mammary_Tissue',
       'Cells_Cultured_fibroblasts', 'Cells_EBV-transformed_lymphocytes',
       'Cervix_Ectocervix', 'Cervix_Endocervix', 'Colon_Sigmoid',
       'Colon_Transverse', 'Esophagus_Gastroesophageal_Junction',
       'Esophagus_Mucosa', 'Esophagus_Muscularis', 'Fallopian_Tube',
       'Heart_Atrial_Appendage', 'Heart_Left_Ventricle', 'Kidney_Cortex',
       'Kidney_Medulla', 'Liver', 'Lung', 'Minor_Salivary_Gland',

## Create the sample_set_entity.tsv file

In [4]:
# One row per tissue id: every tissue id becomes the id of a sample set.
entityName = "sample_set_entity"
sampleSetEntityTSV = "/".join([rootDir, entityName + ".tsv"])
sampleSetEntityDF = pd.DataFrame(data=tissueIdNP, columns=['entity:sample_set_id'])
sampleSetEntityDF.head()

Unnamed: 0,entity:sample_set_id
0,Adipose_Subcutaneous
1,Adipose_Visceral_Omentum
2,Adrenal_Gland
3,Artery_Aorta
4,Artery_Coronary


In [5]:
# Output path for the sample_set entity table.
# NOTE(review): the commented-out line below looks like a scratch "tmp-" path for
# trial runs that leave the real file alone -- confirm before relying on it.
#tmp =  rootDir + "/" + "tmp-"  + entityName + ".tsv"
tmp = sampleSetEntityTSV

print("writing file:\n{}".format(tmp))
# Tab-separated, and without the DataFrame's row index as a column.
sampleSetEntityDF.to_csv(path_or_buf=tmp, sep='\t', index=False)

writing file:
../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/sample_set_entity.tsv


## Create the sample_set_membership.tsv file

In [6]:
# Build the sample_set membership table: one row per (tissue id, sample id) pair,
# grouped by tissue in the order given by tissueIdNP.
#
# The per-tissue frames are collected in a list and concatenated once at the end.
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0, and
# appending inside the loop re-copied the accumulated frame on every iteration.
membershipColumns = ['membership:sample_set_id', 'sample']
tissueSampleDFList = []
for tid in tissueIdNP:  # e.g. ['Adipose_Subcutaneous', 'Brain_Caudate_basal_ganglia']
    # Boolean mask selecting the samples that belong to this tissue.
    rows = sampleDF['tissue_id'] == tid
    sampleIdList = sampleDF.loc[rows, 'entity:sample_id'].to_list()
    # Repeat the set id once per member sample so both columns line up.
    setIdList = [tid] * len(sampleIdList)
    tissueSampleDF = pd.DataFrame({'membership:sample_set_id': setIdList,
                                   'sample': sampleIdList})
    tissueSampleDFList.append(tissueSampleDF)

if tissueSampleDFList:
    # ignore_index is deliberately left off: as with the previous append loop,
    # each tissue's rows keep their own 0..k-1 labels.
    setMembershipDF = pd.concat(tissueSampleDFList)
else:
    # pd.concat() raises ValueError on an empty list; produce an empty table
    # with the expected columns instead.
    setMembershipDF = pd.DataFrame(columns=membershipColumns)

In [7]:
# Inspect the assembled membership table; long frames are shown abbreviated by pandas.
setMembershipDF

Unnamed: 0,membership:sample_set_id,sample
0,Adipose_Subcutaneous,GTEX-1117F-0226-SM-5GZZ7
1,Adipose_Subcutaneous,GTEX-111CU-1826-SM-5GZYN
2,Adipose_Subcutaneous,GTEX-111FC-0226-SM-5N9B8
3,Adipose_Subcutaneous,GTEX-111VG-2326-SM-5N9BK
4,Adipose_Subcutaneous,GTEX-111YS-2426-SM-5GZZQ
...,...,...
750,Whole_Blood,GTEX-ZVTK-0006-SM-57WBK
751,Whole_Blood,GTEX-ZVZP-0006-SM-51MSW
752,Whole_Blood,GTEX-ZVZQ-0006-SM-51MR8
753,Whole_Blood,GTEX-ZXES-0005-SM-57WCB


In [8]:
# Output path for the sample_set membership table: <rootDir>/sample_set_membership.tsv.
entityName = "sample_set_membership"
sampleSetMembershipTSV = "/".join([rootDir, entityName + ".tsv"])

# NOTE(review): the commented-out line below looks like a scratch "tmp-" path for
# trial runs that leave the real file alone -- confirm before relying on it.
#tmp = rootDir + "/" + "tmp-" + entityName + ".tsv"
tmp = sampleSetMembershipTSV

print("writing file:\n{}".format(tmp))
# Tab-separated, and without the DataFrame's row index as a column.
setMembershipDF.to_csv(path_or_buf=tmp, sep='\t', index=False)

writing file:
../../../terraDataModels/test-aedavids-proj/AnVIL_GTEx_V8_hg38_edu_ucsc_kim_lab/sample_set_membership.tsv
