In [27]:
pwd

'/mnt/4TB/TCGA_Liver/scripts/3-processing'

In [28]:
import json
from pathlib import Path

import pandas as pd

These are the cases that satisfy cohort criteria

In [29]:
# Read the cohort selection table and keep only its case identifiers.
df_select = pd.read_csv('../2-clinical/2-select.csv')
case_ids = df_select['case_id']

In [30]:
# The manifest built below holds one row per case_id, so duplicates in the
# selection would silently be processed twice. Enforce the invariant instead
# of only displaying it, then show the result as before.
assert case_ids.is_unique, "duplicate case_id values in the cohort selection"
case_ids.is_unique

True

In [31]:
# Cohort size: number of selected case ids
len(case_ids)

109

These are all of the advanced liver cancer files from the portal (`metadata.json`). We need to get the subset of these that satisfy the cohort inclusion criteria.

Sometimes there are multiple files per case; when that happens, only the first matching file in `metadata.json` is kept for the case.

In [32]:
# Parse the portal's file-metadata export into Python objects.
with open('metadata.json') as metadata_file:
    metadata = json.loads(metadata_file.read())

In [33]:
# Number of file records in the metadata export
len(metadata)

322

In [34]:
# Peek at the first record to see which fields are available
metadata[0]

{'data_format': 'BAM',
 'access': 'controlled',
 'associated_entities': [{'entity_submitter_id': 'TCGA-DD-AAW1-10A-01D-A40P-10',
   'entity_type': 'aliquot',
   'case_id': '31c7d56b-4af7-4fdb-ba83-63bf615fcc3d',
   'entity_id': '7730846e-e3d4-44d3-a53c-47ab352eaf6e'}],
 'file_name': '7730846e-e3d4-44d3-a53c-47ab352eaf6e_wxs_gdc_realn.bam',
 'submitter_id': 'eb8ba4f9-133a-4279-b297-5bc7fbff9aaf',
 'data_category': 'Sequencing Reads',
 'downstream_analyses': [{'output_files': [{'data_format': 'VCF',
     'access': 'controlled',
     'file_name': 'ec983056-1048-4497-9868-4d41d09bca4c.wxs.mutect2.raw_somatic_mutation.vcf.gz',
     'file_id': '55dcade6-ca3b-4a2a-b5ee-ce0012535848',
     'data_type': 'Raw Simple Somatic Mutation',
     'data_category': 'Simple Nucleotide Variation',
     'file_size': 334007}],
   'workflow_type': 'MuTect2'},
  {'output_files': [{'data_format': 'VCF',
     'access': 'controlled',
     'file_name': 'ec983056-1048-4497-9868-4d41d09bca4c.wxs.varscan2.raw_somatic

In [35]:
# Build one manifest record per cohort case from the portal metadata.
#
# A case can own several files in `metadata`; the FIRST entry whose first
# associated entity belongs to the case is the one that is kept.

# Index the metadata once (case_id -> first matching entry) instead of
# rescanning every entry for every case.
first_entry_by_case = {}
for entry in metadata:
    entities = entry.get('associated_entities') or []
    if not entities:
        # No entity attached, so this file cannot be matched to a case.
        continue
    # setdefault keeps the earliest entry; later files of the same case are ignored.
    first_entry_by_case.setdefault(entities[0].get('case_id'), entry)

case_dikts = []
for case_id in case_ids:
    entry = first_entry_by_case.get(case_id)
    if entry is None:
        # Report cohort cases that have no file rather than aborting the run.
        print(case_id)
        continue

    case_dikts.append({
        'case_id':     case_id,
        'file_id':     entry['file_id'],
        'file_name':   entry['file_name'],
        # file_size divided by 10**9 (bytes -> GB, presumably), one decimal place
        'size_bam_GB': round(float(entry['file_size']) / (10**9), 1),
    })

In [36]:
# Number of cohort cases for which a file record was found
len(case_dikts)

109

In [37]:
# Inspect the first manifest record
case_dikts[0]

{'case_id': '0004d251-3f70-4395-b175-c94c2f5b1b81',
 'file_id': '83f5edc7-c91c-453c-8c21-3519d63c2049',
 'file_name': 'c86ff390-e3fe-44f9-8f26-e7c7dd6c5536_wxs_gdc_realn.bam',
 'size_bam_GB': 16.0}

In [38]:
# One row per record, one column per record key.
df_manifest = pd.DataFrame(case_dikts)

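Boolean values get weird when any non-boolean values are used alongside them in the same column, so the status columns below are stored as text (`'no'`) rather than as `False`.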

In [39]:
# Every status column starts out as the string 'no'.
# (A text-based 'checked_out' status will be introduced for 'downloaded' later.)
for status_column in ('downloaded', 'called', 'annotated', 'qc'):
    df_manifest[status_column] = 'no'

# Placeholder column: no VCF size is recorded yet.
df_manifest['size_vcf_MB'] = None

---

For prototyping the differentiation scripts

In [40]:
# Full clinical table for the cohort. This is the same file that `case_ids`
# is read from, so use the same relative path rather than a machine-specific
# absolute one.
df_clinical = pd.read_csv('../2-clinical/2-select.csv')

In [41]:
# case_ids whose clinical vital_status is exactly 'Alive'
is_alive = df_clinical['vital_status'] == 'Alive'
cases_alive = df_clinical.loc[is_alive, 'case_id'].tolist()

In [42]:
# Number of cohort cases recorded as alive
len(cases_alive)

44

In [43]:
# Default every row to 'Dead'; rows whose case_id is in cases_alive are
# switched to 'Alive' in the next cell. Note that any case whose clinical
# status is not exactly 'Alive' (including missing values) stays 'Dead'.
df_manifest['vital_status'] = 'Dead'

In [44]:
# Overwrite the label for rows belonging to a case listed in cases_alive.
alive_rows = df_manifest['case_id'].isin(cases_alive)
df_manifest.loc[alive_rows, 'vital_status'] = 'Alive'

In [45]:
# Tally of the vital_status labels across manifest rows
df_manifest['vital_status'].value_counts()

Dead     65
Alive    44
Name: vital_status, dtype: int64

---

For prototyping the handling of allosomes (sex chromosomes)

In [46]:
# case_ids whose clinical gender is exactly 'female'
is_female = df_clinical['gender'] == 'female'
cases_female = df_clinical.loc[is_female, 'case_id'].tolist()

In [47]:
# Number of cohort cases recorded as female
len(cases_female)

32

In [48]:
# Default every row to 'male'; rows whose case_id is in cases_female are
# switched to 'female' in the next cell. Note that any case whose clinical
# gender is not exactly 'female' (including missing values) stays 'male'.
df_manifest['gender'] = 'male'

In [49]:
# Overwrite the label for rows belonging to a case listed in cases_female.
female_rows = df_manifest['case_id'].isin(cases_female)
df_manifest.loc[female_rows, 'gender'] = 'female'

---

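Writes the manifest to `1-manifest.csv`. Once the manifest exists, keep this cell commented out (or otherwise guarded) so that the manifest doesn't get wiped by accident.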

In [53]:
# Write the manifest only if it does not exist yet. Re-running this notebook
# would otherwise overwrite the file and reset every status column to 'no'
# (presumably those columns are updated by later pipeline steps).
manifest_path = Path('1-manifest.csv')
if manifest_path.exists():
    print(f"{manifest_path} already exists; not overwriting. Delete it first to regenerate.")
else:
    df_manifest.to_csv(manifest_path, index=False)

---

Figuring out container size for job

In [51]:
# Summary statistics of the size_bam_GB column
df_manifest['size_bam_GB'].describe()

count    109.000000
mean      21.273394
std        7.596328
min       13.100000
25%       16.900000
50%       19.800000
75%       21.600000
max       49.600000
Name: size_bam_GB, dtype: float64

In [52]:
# Full list of selected file ids
# NOTE(review): presumably dumped in full so it can be pasted into the download job — confirm
df_manifest['file_id'].tolist()

['83f5edc7-c91c-453c-8c21-3519d63c2049',
 '36e340e3-b6c2-416c-ba4e-72fb08101902',
 '4ac8b0a1-7189-4f66-b81e-2bbec2177da2',
 '4822b3f2-9507-4eda-bafd-54180b51a923',
 '127bca9f-ef8e-4f57-88f3-337316c133a6',
 'aef22fa7-3523-4b6a-aefd-25c4ff4b8b89',
 'fd1fcda0-1ddb-4529-9167-882f27cb55a6',
 'b2988829-a865-459e-a429-1a862a90caf0',
 'c1b72ba1-cbd9-4a2f-8021-700d8280762f',
 '2b48029a-0c8a-40e6-a2ba-2ed49593d321',
 '599b3e0b-7266-4ffb-a66c-f9419f19cfa2',
 '7610e1fe-19bd-42e5-83c3-c94f963c7073',
 '1c90a4ff-9be7-4504-a32e-103b4222eeba',
 'e096c2a4-d2c0-41ea-9816-8f9c6ab9d08d',
 '741a0a0e-05e4-4416-bfaa-5b5227d9f0c3',
 '0cbe00aa-9e39-481d-b406-f40a632867a6',
 '352db63b-1310-4973-8963-917764a381ee',
 '775c8434-b0d7-44f2-a202-9c9e9a4291cd',
 'e10f7cb0-f800-4c3a-a6e4-ddb838015b04',
 '7c9d4303-ad33-4253-b1ee-0688129c8646',
 '54e26323-bee2-4e25-ab7a-3dfd40f900ab',
 'a6c4b90a-063c-45aa-9ffc-08b51ede2a42',
 '1d886cbc-6654-4145-aff1-ac910aac41d7',
 '9f2e96bd-b555-4438-980b-45e4443284a5',
 '62670acf-a512-