In [1]:
import os
import re
import sys
import logging

from pathlib import Path
import requests
import pandas as pd
ix = pd.IndexSlice
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [7]:
# Sample-sheet headers mapped onto the column names used in this notebook.
samplesheet_renames = {
    'File ID': 'maf/file_id',
    'File Name': 'maf/filename',
    'Sample ID': 'maf/sample',
    'Sample Type': 'tissue',
}
# Columns retained after renaming; 'participant' becomes the index below.
samplesheet_columns = [*samplesheet_renames.values(), 'participant']

# Read the sample sheet, derive the participant from the first entry of the
# comma-separated 'Case ID' field, then index and sort by participant.
dna_samplesheet = (
    pd.read_csv('../data/gdc_sample_sheet.2023-08-06.tsv', sep='\t')
      .assign(
        participant=lambda sheet: sheet['Case ID'].str.split(', ', expand=True)[0]
      )
      .rename(columns=samplesheet_renames)
      .loc[:, samplesheet_columns]
      .set_index('participant')
      .sort_index()
)

dna_samplesheet.head()


Unnamed: 0_level_0,maf/file_id,maf/filename,maf/sample,tissue
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11LU013,ac9aba0d-dc3c-4d84-9fda-8405a5908ad0,29612340-6316-4225-a5a3-9c6dc9a66db4.wxs.aliqu...,"9f905736-f662-41d6-b3ac-16758d, c7788b82-8190-...","Primary Tumor, Blood Derived Normal"
11LU016,71eca4a5-ca99-491d-9c3a-e0c6ec41b044,b2d4e73f-1809-4f0e-ab08-116c0026c512.wxs.aliqu...,"93e30fd5-e57e-4503-a175-863c7d, 953d4247-a33a-...","Primary Tumor, Blood Derived Normal"
11LU022,49c4d3e5-acd7-4690-b141-797ff60126be,d478e504-a423-4c1f-8455-7946f02f6c9e.wxs.aliqu...,"a55e011d-f91f-4dbd-98e1-328e05, 5a84eae1-197e-...","Blood Derived Normal, Primary Tumor"
11LU035,b18d9782-7ddc-4ad5-83c7-1ccebd8d66a3,cf6f7b60-8614-44cf-b5c6-051a4857c683.wxs.aliqu...,"2f2e5477-42a4-4906-a943-bf7f80, 6b328a7b-2d97-...","Primary Tumor, Blood Derived Normal"
C3L-00001,bb59f841-bf82-47d1-ad2d-173a648f2b68,59c962fd-c193-4418-8d00-d165d6054431.wxs.aliqu...,"C3L-00001-32, C3L-00001-02","Blood Derived Normal, Primary Tumor"


# Convert the case ID to the patient/submitter ID

In [10]:
import requests
def case2submitter(case_id, timeout=30):
    """Look up the submitter ID for a case via the GDC `cases` endpoint.

    Parameters
    ----------
    case_id : str
        Case identifier appended to the `cases` endpoint URL.
    timeout : float, optional
        Seconds to wait for the API before giving up. Without a timeout,
        `requests.get` can block indefinitely on a stalled connection.

    Returns
    -------
    The value stored under ``['data']['submitter_id']`` in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    requests.Timeout
        If the API does not answer within `timeout` seconds.
    """
    url = 'https://api.gdc.cancer.gov'
    service = 'cases'
    resp = requests.get(f'{url}/{service}/{case_id}', timeout=timeout)
    # Fail loudly on HTTP errors instead of hitting an opaque KeyError or
    # JSON decoding error further down.
    resp.raise_for_status()
    return resp.json()['data']['submitter_id']

case2submitter('c6c75c5d-b3c9-4918-8dc2-02849933cd3c')

'C3N-01220'

In [12]:
# MAF columns kept from each per-sample file.
colnames = [
    'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 
    'Strand', 'Variant_Classification', 'dbSNP_RS', 'SYMBOL', 'CANONICAL',
    'SWISSPROT', 'TREMBL', 'UNIPROT_ISOFORM', 'RefSeq', 'MANE', 'case_id', 
    'IMPACT'
]

# Build one (variant x patient) IMPACT table per MAF file, then stack them.
variant_frames = []
for _, row in dna_samplesheet.iterrows():
    filepath = f'../data/snv/{row["maf/file_id"]}/{row["maf/filename"]}'
    maf = (
        pd.read_csv(filepath, delimiter='\t', comment='#', compression='gzip')
          .loc[:, colnames]
    )
    if maf.empty:
        # The original message interpolated an undefined name `f`, which raised
        # NameError on the first empty MAF instead of logging and skipping it.
        logging.warning('MAF %s is empty!', filepath)
        continue
    # The first case_id in the file is used to label the whole file.
    case_id = maf['case_id'].iloc[0]
    # One API round-trip per file to translate the case ID to a submitter ID.
    patient_impacts = (
        maf.assign(PatientID=case2submitter(case_id))
           .pivot(
               index=['Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position'],
               columns=['PatientID'],
               values='IMPACT',
           )
    )
    variant_frames.append(patient_impacts)

# NOTE(review): frames are stacked row-wise, so a variant shared by several
# patients appears once per patient with NaN in the other patients' columns.
# Confirm whether a column-wise join (axis=1) was intended.
variants = pd.concat(variant_frames)
variants.to_csv('../data/variants.csv')
    

In [23]:
# This patient's tumor had 76 mutations, three of which are highly likely to severely affect the protein.
variants['C3L-00001'].dropna().sort_values()

Hugo_Symbol  Chromosome  Start_Position  End_Position
ABCB1        chr7        87600138        87600138        MODERATE
ABCG1        chr21       42259371        42259371        MODIFIER
AC013489.1   chr15       88530849        88530849        MODERATE
AC244197.3   chrX        149483000       149483000       MODERATE
ADCY2        chr5        7709248         7709248         MODERATE
                                                           ...   
XIRP2        chr2        167239888       167239888       MODERATE
ZFP92        chrX        153421328       153421328       MODERATE
ZNF320       chr19       52881956        52881956        MODERATE
ZNF423       chr16       49636578        49636578             LOW
ZNF513       chr2        27378646        27378646        MODERATE
Name: C3L-00001, Length: 76, dtype: object

Hugo_Symbol  Chromosome  Start_Position  End_Position
DPP6         chr7        154772888       154772888           HIGH
DIXDC1       chr11       111996144       111996144           HIGH
CACNB1       chr17       39183814        39183814            HIGH
SPRED1       chr15       38351313        38351313             LOW
SLC28A3      chr9        84313383        84313383             LOW
                                                           ...   
SFRP4        chr7        37916551        37916551        MODIFIER
SPTBN1       chr2        54529417        54529417        MODIFIER
SHANK1       chr19       50687039        50687039        MODIFIER
ABCG1        chr21       42259371        42259371        MODIFIER
AL049777.1   chr14       28778159        28778159        MODIFIER
Name: C3L-00001, Length: 76, dtype: object