# Annotate cohorts with OncoTree cancer types and create cancer types

Gather information from cohorts and OncoTree cancer types to generate tables linking both: 

- For each of the cohorts in our dataset, we have annotated and curated the available OncoTree cancer types from (http://oncotree.mskcc.org). This has resulted in 'oncotree_levels.txt' table. 


- For each available cancer type in the OncoTree, this code annotates the cohorts and samples linked to it (step 1). This generates 'cancer_types_raw.csv'


- Then, it removes duplicates (two cancer types comprising the same samples) from 'cancer_types_raw.csv' based on the OncoTree hierarchy (step 2) and classifies cancer types as level A (specific) or B (meta-cancer type). This is done manually and generates 'cancer_types_curated.csv' 


- Read 'cancer_types_curated.csv' and annotate number of samples and mutations (step 3). This generates 'cancer_types_annotated.csv'


- Read 'cancer_types_annotated.csv' and write cohorts table with information from cancer types level A and B (step 4). Generate 'cohorts_to_cancer_types.txt' table.


- Generate 'cohorts_annotated.tsv' table including data from cohorts' full name, samples, mutations and cancer types that they match to (step 5).


In [1]:
import os
from collections import defaultdict

import pandas as pd

In [2]:
main_dir = ''

In [3]:
cancer_types_f = os.path.join(main_dir, 'oncotree_levels.txt')     # cohorts to cancer types from OncoTree (manually annotated)
cohorts_annot_f = os.path.join(main_dir, 'cohorts_samples_mutations.txt')    # cohorts information about samples and mutations

### Step 1) Identify cancer types for analysis

In [6]:
output_f = os.path.join(main_dir, 'cancer_types_raw.csv')

In [7]:
# Read cancer types per cohort as annotated from OncoTree
ctypes_df = pd.read_csv(cancer_types_f, sep='\t', header=0)
ctypes_df.head()

Unnamed: 0,COHORT,LEVEL_1,LEVEL_2,LEVEL_3,LEVEL_4,LEVEL_5
0,D_ACC,Adrenal Gland (ADRENAL_GLAND),Adrenocortical Carcinoma (ACC),,,
1,D_ALL,Lymphoid (LYMPH),Lymphoid Neoplasm (LNM),Acute lymphoblastic leukemia (ALL),,
2,PCAWG_WGS_MYELOID_AML,Myeloid (MYELOID),Myeloid Neoplasm (MNM),Acute Myeloid Leukemia (AML),,
3,D_AML,Myeloid (MYELOID),Myeloid Neoplasm (MNM),Acute Myeloid Leukemia (AML),,
4,HARTWIG_ANUS,Bowel (BOWEL),Anal Cancer (AN),,,


In [8]:
# Annotate cohorts for each cancer type 
ctypes_data = defaultdict(list)
ctypes_levels = dict()
for _, row in ctypes_df.iterrows(): 
    for column in ctypes_df.columns[1:]: 
        if row[column] == row[column]:    # no NaN
            ctypes_data[row[column]] += [row['COHORT']]  
            ctypes_levels[row[column]] = column[-1]

In [9]:
ctypes_data['Bowel (BOWEL)']

['HARTWIG_ANUS',
 'HARTWIG_SMALL_INTESTINE',
 'HARTWIG_NET_SMALL_INTESTINAL',
 'HARTWIG_COLONRECTUM',
 'PCAWG_WGS_COLORECT_ADENOCA']

In [10]:
len(ctypes_data.keys())

93

In [11]:
annotate_cohorts_df = pd.read_csv(cohorts_annot_f, sep='\t', header=0)
cohort_samples = dict(list(zip(annotate_cohorts_df['COHORT'], annotate_cohorts_df['SAMPLES'])))
annotate_cohorts_df.head()

Unnamed: 0,COHORT,SAMPLES,MUTATIONS_TOTAL,MUTATIONS_SNV,MUTATIONS_MNV,MUTATIONS_INS,MUTATIONS_DEL
0,HARTWIG_MESOTHELIOMA,33,111336,100261,1310,6031,3734
1,HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MULTIFORME,54,371363,335086,1413,24920,9944
2,PCAWG_WGS_UTERUS_ADENOCA,38,331657,288049,1288,30366,11954
3,PCAWG_WGS_THY_ADENOCA,48,63303,59334,245,2801,923
4,PCAWG_WGS_SKIN_MELANOMA,98,7339443,7140676,121007,57184,20576


In [12]:
annotate_cohorts_df['SAMPLES'].sum()

7507

In [13]:
annotate_cohorts_df['MUTATIONS_TOTAL'].sum()

83410018

In [14]:
# Annotate number of samples
ctypes_samples = defaultdict(int)
for ctype, cohorts in ctypes_data.items(): 
    for cohort in cohorts: 
        ctypes_samples[ctype] += cohort_samples.get(cohort, 0)

In [15]:
# Merge information 
# This table contains duplicated entities
lines = []
for ctype, cohorts in ctypes_data.items(): 
    samples = ctypes_samples[ctype]
    cohorts = ', '.join(cohorts)
    lines.append(pd.DataFrame([[ctype, int(ctypes_levels[ctype]), samples, cohorts]]))
table = pd.concat(lines)
table.columns = ['CANCER_TYPE', 'LEVEL', 'SAMPLES', 'COHORTS']

In [16]:
table.head()

Unnamed: 0,CANCER_TYPE,LEVEL,SAMPLES,COHORTS
0,Adrenal Gland (ADRENAL_GLAND),1,20,D_ACC
0,Adrenocortical Carcinoma (ACC),2,20,D_ACC
0,Lymphoid (LYMPH),1,1420,"D_ALL, PCAWG_WGS_LYMPH_CLL, HARTWIG_LYMPHOID, ..."
0,Lymphoid Neoplasm (LNM),2,1420,"D_ALL, PCAWG_WGS_LYMPH_CLL, HARTWIG_LYMPHOID, ..."
0,Acute lymphoblastic leukemia (ALL),3,278,D_ALL


In [17]:
len(table)

93

In [18]:
# Remove duplicated entities 
lines = []
for group, data in table.groupby('COHORTS'): 
    if len(data) == 1: 
        lines.append(data)
    else: 
        print(data['CANCER_TYPE'].unique())
        # Select the most specific level in the hierachy (e.g, select level 3 over level 2)
        data_filter = data.sort_values('LEVEL', ascending=True, inplace=False)
        data_filter = data_filter.iloc[[-1]]
        print(data_filter['LEVEL'].iloc[0], data_filter['CANCER_TYPE'].iloc[0])
        lines.append(data_filter)
table2 = pd.concat(lines)
table2.columns = ['CANCER_TYPE', 'LEVEL', 'SAMPLES', 'COHORTS']

['Adrenal Gland (ADRENAL_GLAND)' 'Adrenocortical Carcinoma (ACC)']
2 Adrenocortical Carcinoma (ACC)
['Lymphoid (LYMPH)' 'Lymphoid Neoplasm (LNM)']
2 Lymphoid Neoplasm (LNM)
['Ependymomal Tumor (EPMT)' 'Ependymoma (EPM)']
3 Ependymoma (EPM)
['Peripheral Nervous System (PNS)' 'Neuroblastoma (NBL)']
2 Neuroblastoma (NBL)
['Eye (EYE)' 'Retinoblastoma (RBL)']
2 Retinoblastoma (RBL)
['Pleura (PLEURA)' 'Pleural Mesothelioma (PLMESO)']
2 Pleural Mesothelioma (PLMESO)
['Glioblastoma (GB)' 'Glioblastoma Multiforme (GBM)']
4 Glioblastoma Multiforme (GBM)
['Gastrointestinal Neuroendocrine Tumors (GINET)'
 'Small Bowel Neuroendocrine Tumor (SBNET)']
3 Small Bowel Neuroendocrine Tumor (SBNET)
['Ovary/Fallopian Tube (OVARY)' 'Ovarian Cancer (OV)']
2 Ovarian Cancer (OV)
['Melanoma (MEL)' 'Cutaneous Melanoma (SKCM)']
3 Cutaneous Melanoma (SKCM)
['Vulva/Vagina (VULVA_VAGINA)' 'Vulva (VULVA)']
2 Vulva (VULVA)
['Bone (BONE)' 'Osteosarcoma (OS)']
3 Osteosarcoma (OS)
['Breast (BREAST)' 'Invasive Breast Carc

In [19]:
table2.to_csv(output_f, sep='\t', header=True, index=False)

In [20]:
len(table2)

76

### Step 2) Curate cancer types manually

Manually review "cancer_types_raw.csv" to remove replicated entities and save as "cancer_types_curated.csv"

### Step 3) Annotate cancer types

In [23]:
output_f = os.path.join(main_dir, 'cancer_types_annotated.tsv')

In [24]:
# Read final table of curated cancer types
input_f = os.path.join(main_dir, 'cancer_types_curated.csv')
ctypes_for_analysis_df = pd.read_csv(input_f, sep='\t', header=0)
ctypes_for_analysis_df.head()

Unnamed: 0,CANCER_TYPE,LEVEL,LEVEL_ANALYSIS,SAMPLES,COHORTS
0,Adrenocortical Carcinoma (ACC),2,A,20,D_ACC
1,Acute lymphoblastic leukemia (ALL),3,A,278,D_ALL
2,Lymphoid Neoplasm (LNM),2,B,1420,"D_ALL, PCAWG_WGS_LYMPH_CLL, HARTWIG_LYMPHOID, ..."
3,Ependymoma (EPM),3,A,39,D_EPD
4,CNS/Brain (BRAIN),1,B,475,"D_EPD, HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MUL..."


In [25]:
len(ctypes_for_analysis_df)

48

In [26]:
ctypes_for_analysis_df.loc[ctypes_for_analysis_df['LEVEL_ANALYSIS'] == 'A']['SAMPLES'].sum()

7507

In [27]:
# Annotate cancer types with total mutations
cohort_muts = {}
for mtype in ['TOTAL', 'SNV', 'MNV', 'INS', 'DEL']: 
    cohort_muts[mtype] = dict(list(zip(annotate_cohorts_df['COHORT'], annotate_cohorts_df[f'MUTATIONS_{mtype}'])))

In [28]:
ctype_muts = defaultdict(lambda: defaultdict(int))
for mtype in ['TOTAL', 'SNV', 'MNV', 'INS', 'DEL']: 
    for ctype, cohorts in ctypes_data.items(): 
        if ctype in ctypes_for_analysis_df['CANCER_TYPE'].tolist(): 
            for cohort in cohorts: 
                ctype_muts[mtype][ctype] += cohort_muts[mtype][cohort]

In [29]:
for mtype in ['TOTAL', 'SNV', 'MNV', 'INS', 'DEL']: 

    ctypes_for_analysis_df[f'MUTATIONS_{mtype}'] = ctypes_for_analysis_df.apply(
    lambda x: ctype_muts[mtype][x['CANCER_TYPE']], axis=1)
ctypes_for_analysis_df.head()

Unnamed: 0,CANCER_TYPE,LEVEL,LEVEL_ANALYSIS,SAMPLES,COHORTS,MUTATIONS_TOTAL,MUTATIONS_SNV,MUTATIONS_MNV,MUTATIONS_INS,MUTATIONS_DEL
0,Adrenocortical Carcinoma (ACC),2,A,20,D_ACC,16649,16614,0,26,9
1,Acute lymphoblastic leukemia (ALL),3,A,278,D_ALL,154660,154095,198,145,222
2,Lymphoid Neoplasm (LNM),2,B,1420,"D_ALL, PCAWG_WGS_LYMPH_CLL, HARTWIG_LYMPHOID, ...",1723396,1596077,6705,74059,46555
3,Ependymoma (EPM),3,A,39,D_EPD,9158,9114,0,34,10
4,CNS/Brain (BRAIN),1,B,475,"D_EPD, HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MUL...",958034,877915,2716,50214,27189


In [30]:
# Reformat names
ctypes_for_analysis_df = ctypes_for_analysis_df.rename(
    columns={'CANCER_TYPE': 'CANCER_TYPE_LONG', 'LEVEL': 'LEVEL_ONCOTREE'})

In [31]:
short_names = {}
for ctype in ctypes_for_analysis_df['CANCER_TYPE_LONG'].tolist(): 
    short_names[ctype] = ctype[ctype.find("(")+1:ctype.find(")")]

In [32]:
ctypes_for_analysis_df['CANCER_TYPE'] = ctypes_for_analysis_df.apply(lambda x: short_names[x['CANCER_TYPE_LONG']], axis=1)

In [33]:
ctypes_for_analysis_df = ctypes_for_analysis_df[[
    'CANCER_TYPE',
    'CANCER_TYPE_LONG',
    'LEVEL_ONCOTREE',
    'LEVEL_ANALYSIS',
    'SAMPLES',
    'COHORTS',
    'MUTATIONS_TOTAL',
    'MUTATIONS_SNV',
    'MUTATIONS_MNV',
    'MUTATIONS_INS',
    'MUTATIONS_DEL',
]]

In [34]:
# Add pancancer
pancan_df = pd.DataFrame([[
    'PANCANCER', 
    'Pancancer (PANCANCER)', 
    'None', 
    'B', 
    ctypes_for_analysis_df.loc[ctypes_for_analysis_df['LEVEL_ANALYSIS'] == 'A']['SAMPLES'].sum(), 
    ', '.join(sorted(annotate_cohorts_df['COHORT'].tolist())),
    ctypes_for_analysis_df.loc[ctypes_for_analysis_df['LEVEL_ANALYSIS'] == 'A']['MUTATIONS_TOTAL'].sum(),
    ctypes_for_analysis_df.loc[ctypes_for_analysis_df['LEVEL_ANALYSIS'] == 'A']['MUTATIONS_SNV'].sum(),
    ctypes_for_analysis_df.loc[ctypes_for_analysis_df['LEVEL_ANALYSIS'] == 'A']['MUTATIONS_MNV'].sum(),
    ctypes_for_analysis_df.loc[ctypes_for_analysis_df['LEVEL_ANALYSIS'] == 'A']['MUTATIONS_INS'].sum(),
    ctypes_for_analysis_df.loc[ctypes_for_analysis_df['LEVEL_ANALYSIS'] == 'A']['MUTATIONS_DEL'].sum(),
]])
pancan_df.columns = ctypes_for_analysis_df.columns
ctypes_for_analysis_df = pd.concat([ctypes_for_analysis_df, pancan_df])

In [35]:
ctypes_for_analysis_df.head()

Unnamed: 0,CANCER_TYPE,CANCER_TYPE_LONG,LEVEL_ONCOTREE,LEVEL_ANALYSIS,SAMPLES,COHORTS,MUTATIONS_TOTAL,MUTATIONS_SNV,MUTATIONS_MNV,MUTATIONS_INS,MUTATIONS_DEL
0,ACC,Adrenocortical Carcinoma (ACC),2,A,20,D_ACC,16649,16614,0,26,9
1,ALL,Acute lymphoblastic leukemia (ALL),3,A,278,D_ALL,154660,154095,198,145,222
2,LNM,Lymphoid Neoplasm (LNM),2,B,1420,"D_ALL, PCAWG_WGS_LYMPH_CLL, HARTWIG_LYMPHOID, ...",1723396,1596077,6705,74059,46555
3,EPM,Ependymoma (EPM),3,A,39,D_EPD,9158,9114,0,34,10
4,BRAIN,CNS/Brain (BRAIN),1,B,475,"D_EPD, HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MUL...",958034,877915,2716,50214,27189


In [36]:
ctypes_for_analysis_df.to_csv(output_f, sep='\t', header=True, index=False)

### Step 4) Annotate cohorts with information from cancer types level A and B

In [37]:
output_f = os.path.join(main_dir, 'cohorts_to_cancer_types.txt')

In [38]:
annotate_cohorts_df.head()

Unnamed: 0,COHORT,SAMPLES,MUTATIONS_TOTAL,MUTATIONS_SNV,MUTATIONS_MNV,MUTATIONS_INS,MUTATIONS_DEL
0,HARTWIG_MESOTHELIOMA,33,111336,100261,1310,6031,3734
1,HARTWIG_NERVOUS_SYSTEM_GLIOBLASTOMA_MULTIFORME,54,371363,335086,1413,24920,9944
2,PCAWG_WGS_UTERUS_ADENOCA,38,331657,288049,1288,30366,11954
3,PCAWG_WGS_THY_ADENOCA,48,63303,59334,245,2801,923
4,PCAWG_WGS_SKIN_MELANOMA,98,7339443,7140676,121007,57184,20576


In [39]:
cohort_to_levels = defaultdict(dict)
for ctype, data in ctypes_for_analysis_df.groupby('CANCER_TYPE_LONG'): 
    if 'PANCANCER' not in ctype: 
        level = data['LEVEL_ANALYSIS'].iloc[0]
        for cohort in data['COHORTS'].iloc[0].split(', '): 
            cohort_to_levels[cohort][level] = ctype

In [40]:
cohort_to_levels['PCAWG_WGS_MYELOID_AML']

{'A': 'Acute Myeloid Leukemia (AML)', 'B': 'Myeloid Neoplasm (MNM)'}

In [41]:
len(cohort_to_levels.keys())

78

In [42]:
lines = []
for cohort, cohort_levels in cohort_to_levels.items(): 
    level_a = cohort_levels.get('A', 'None')
    level_a_short = level_a[level_a.find("(")+1:level_a.find(")")] if level_a != 'None' else 'None'
    level_b = cohort_levels.get('B', 'None')
    level_b_short = level_b[level_b.find("(")+1:level_b.find(")")] if level_b != 'None' else 'None'
    lines.append(pd.DataFrame([[cohort, level_a, level_a_short, level_b, level_b_short]]))
results = pd.concat(lines)
results.columns = ['COHORT', 'LEVEL_A_LONG', 'LEVEL_A', 'LEVEL_B_LONG', 'LEVEL_B']
results.head()

Unnamed: 0,COHORT,LEVEL_A_LONG,LEVEL_A,LEVEL_B_LONG,LEVEL_B
0,PCAWG_WGS_MYELOID_AML,Acute Myeloid Leukemia (AML),AML,Myeloid Neoplasm (MNM),MNM
0,D_AML,Acute Myeloid Leukemia (AML),AML,Myeloid Neoplasm (MNM),MNM
0,D_ALL,Acute lymphoblastic leukemia (ALL),ALL,Lymphoid Neoplasm (LNM),LNM
0,D_ACC,Adrenocortical Carcinoma (ACC),ACC,,
0,HARTWIG_ANUS,Anal Cancer (AN),AN,Bowel (BOWEL),BOWEL


In [43]:
results.to_csv(output_f, sep='\t', header=True, index=False)

### Step 5) Merge all cohort level information into a single table

In [44]:
output_f = os.path.join(main_dir, 'cohorts_annotated.tsv')

In [46]:
original_cohorts = os.path.join(main_dir, 'cohorts_names.txt')
original_cohorts = pd.read_csv(original_cohorts, sep='\t', header=0)

cohorts_sample_muts = os.path.join(main_dir, 'cohorts_samples_mutations.txt')
cohorts_sample_muts = pd.read_csv(cohorts_sample_muts, sep='\t', header=0)

cohorts_cancertypes_AB = os.path.join(main_dir, 'cohorts_to_cancer_types.txt')
cohorts_cancertypes_AB = pd.read_csv(cohorts_cancertypes_AB, sep='\t', header=0)

cohorts_cancertypes_oncotree = os.path.join(main_dir, 'oncotree_levels.txt')
cohorts_cancertypes_oncotree = pd.read_csv(cohorts_cancertypes_oncotree, sep='\t', header=0)

In [47]:
original_cohorts.head(1)

Unnamed: 0,COHORT,COHORT_MANUSCRIPT_LONG_NAME,SOURCE,PLATFORM,REFERENCE_DOI,TYPE,TREATED,AGE
0,D_ACC,Adrenocortical carcinomas (St. Jude Children's...,STJUDE,WGS,10.1038/ncomms7302,Primary,Untreated,Pediatric


In [48]:
cohorts_sample_muts.head(1)

Unnamed: 0,COHORT,SAMPLES,MUTATIONS_TOTAL,MUTATIONS_SNV,MUTATIONS_MNV,MUTATIONS_INS,MUTATIONS_DEL
0,HARTWIG_MESOTHELIOMA,33,111336,100261,1310,6031,3734


In [49]:
cohorts_cancertypes_AB.head(1)

Unnamed: 0,COHORT,LEVEL_A_LONG,LEVEL_A,LEVEL_B_LONG,LEVEL_B
0,PCAWG_WGS_MYELOID_AML,Acute Myeloid Leukemia (AML),AML,Myeloid Neoplasm (MNM),MNM


In [50]:
cohorts_cancertypes_oncotree.head(1)

Unnamed: 0,COHORT,LEVEL_1,LEVEL_2,LEVEL_3,LEVEL_4,LEVEL_5
0,D_ACC,Adrenal Gland (ADRENAL_GLAND),Adrenocortical Carcinoma (ACC),,,


In [51]:
cohorts_full_table = pd.merge(original_cohorts, cohorts_sample_muts, on='COHORT')
cohorts_full_table = pd.merge(cohorts_full_table, cohorts_cancertypes_AB, on='COHORT')
cohorts_full_table = pd.merge(cohorts_full_table, cohorts_cancertypes_oncotree, on='COHORT')

In [52]:
cohorts_full_table.head()

Unnamed: 0,COHORT,COHORT_MANUSCRIPT_LONG_NAME,SOURCE,PLATFORM,REFERENCE_DOI,TYPE,TREATED,AGE,SAMPLES,MUTATIONS_TOTAL,...,MUTATIONS_DEL,LEVEL_A_LONG,LEVEL_A,LEVEL_B_LONG,LEVEL_B,LEVEL_1,LEVEL_2,LEVEL_3,LEVEL_4,LEVEL_5
0,D_ACC,Adrenocortical carcinomas (St. Jude Children's...,STJUDE,WGS,10.1038/ncomms7302,Primary,Untreated,Pediatric,20,16649,...,9,Adrenocortical Carcinoma (ACC),ACC,,,Adrenal Gland (ADRENAL_GLAND),Adrenocortical Carcinoma (ACC),,,
1,D_ALL,Acute lymphoblastic leukemias (St. Jude Childr...,STJUDE,WGS,10.1038/ncomms7604,Primary,Untreated,Pediatric,278,154660,...,222,Acute lymphoblastic leukemia (ALL),ALL,Lymphoid Neoplasm (LNM),LNM,Lymphoid (LYMPH),Lymphoid Neoplasm (LNM),Acute lymphoblastic leukemia (ALL),,
2,PCAWG_WGS_MYELOID_AML,Acute myeloid leukemias (PCAWG),PCAWG,WGS,10.1101/162784,Primary,Untreated,Adult,13,18413,...,327,Acute Myeloid Leukemia (AML),AML,Myeloid Neoplasm (MNM),MNM,Myeloid (MYELOID),Myeloid Neoplasm (MNM),Acute Myeloid Leukemia (AML),,
3,D_AML,Acute myeloid leukemias (St. Jude Children's R...,STJUDE,WGS,10.1038/ng.3709,Primary,Untreated,Pediatric,21,15291,...,14,Acute Myeloid Leukemia (AML),AML,Myeloid Neoplasm (MNM),MNM,Myeloid (MYELOID),Myeloid Neoplasm (MNM),Acute Myeloid Leukemia (AML),,
4,HARTWIG_ANUS,Anal cancers (Hartwig Medical Foundation),HARTWIG,WGS,10.1038/s41586-019-1689-y,Metastasis,Treated,Adult,14,208041,...,8994,Anal Cancer (AN),AN,Bowel (BOWEL),BOWEL,Bowel (BOWEL),Anal Cancer (AN),,,


In [53]:
cohorts_full_table.to_csv(output_f, sep='\t', header=True, index=False)