This notebook processes IntOGen results to provide a list of cancer genes annotated with their relevant biological involvement (biological processes, pathways, protein family). This list is provided in Supplementary Table 3 and used in Figure 1d and Supplementary Figure 2.

In [None]:
import pandas as pd
import os
import numpy as np
import glob
import seaborn
from collections import defaultdict
import json

# Show complete frames during manual inspection: no limit on the number of
# displayed columns/rows and no truncation of long cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None disables truncation; the former value -1 is deprecated since pandas 1.0
# and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

### prepare black and white lists of cancer genes

In [None]:
# Folder with the external resources needed to build the gene lists.
ext_files_path = "../ext_files/process_intogen/"

# CancerMine table, used further down to look up genes with very little information.
cancermine_file = os.path.join(ext_files_path, "cancermine_collated_02102019.tsv")
cancer_mine = pd.read_csv(cancermine_file, sep='\t')

# IntOGen's manually curated list of genes that are most likely sequencing artifacts.
artifacts_file = os.path.join(ext_files_path, 'artifacts_intogen_24_02_2020.json')
with open(artifacts_file) as json_file:
    intogen_black_list = json.load(json_file)

# Cancer Gene Census acts as white list: it rescues genes known to be drivers in
# other cancer types that IntOGen places in Tier 3 and 4. Only somatic entries are kept.
cgc_file = os.path.join(ext_files_path, "cancer_gene_census_parsed.tsv")
cgc_genes = pd.read_csv(cgc_file, sep='\t')
cgc_genes = cgc_genes.loc[cgc_genes['Somatic'] == 'yes']

# Cohort metadata, used later to annotate the per-cohort results.
clinical = pd.read_csv("../ext_files/all_cohort_clinical_groups.tsv", sep='\t')
clinical.head()

In [None]:
# Tumour-type acronyms that are treated as hematological malignancies when
# matching the `acronym_cgc` column of the Cancer Gene Census table.
hemato_acronyms = ['aCML','AITL', 'AL', 'ALL','AML', 'APL', 'B-ALL', 'B-CLL', 'B-NHL','CLL','CML',
                  'CMML', 'CNL', 'DLBCL', 'DLCL', 'ETP-ALL','JMML', 'L','MALT', 'MCL', 'MDS', 'MM',
                  'NHL', 'NK/T', 'PMBL', 'pre-B ALL','sAML','SMZL','T-ALL','T-CLL','T-PLL']

def filter_cgc_acronym(rw):
    """Flag a CGC row as hematological or not.

    Parameters
    ----------
    rw : pandas.Series (or mapping with a ``copy`` method)
        One row of the CGC table; ``rw['acronym_cgc']`` is a comma-separated
        string of tumour-type acronyms.

    Returns
    -------
    A copy of ``rw`` with an extra boolean entry ``'keep'``: True when at least
    one acronym is listed in ``hemato_acronyms``, False otherwise. A missing
    (non-string) ``acronym_cgc`` yields False instead of raising.
    """
    raw_acronyms = rw['acronym_cgc']
    if isinstance(raw_acronyms, str):
        # strip blanks so that both "AML,T-ALL" and "AML, T-ALL" are recognised
        acronyms = {token.strip() for token in raw_acronyms.split(",")}
    else:
        # NaN / None: no tumour type annotated for this gene
        acronyms = set()
    # work on a copy: mutating the object handed over by DataFrame.apply is unsafe
    rw = rw.copy()
    rw['keep'] = not acronyms.isdisjoint(hemato_acronyms)
    return rw

In [None]:
# Add the 'keep' flag to every CGC row, then retain only the rows flagged True.
cgc_genes = cgc_genes.apply(filter_cgc_acronym, axis=1)
is_hemato = cgc_genes['keep'] == True
cgc_genes_hemato = cgc_genes[is_hemato]

In [None]:
# Combined IntOGen black list: the 'known' entries followed by the 'suspects'.
# A new list is built on purpose: extending intogen_black_list['known'] in place
# would alter the loaded JSON content and append the suspects again every time
# this cell is re-run.
black_list = list(intogen_black_list['known']) + list(intogen_black_list['suspects'])

In [None]:
# Sanity check: number of entries in the combined black list.
len(black_list)

### read results and manually inspect them to check for false positive suspects

In [None]:
# Empty frame that will hold the filtered driver candidates of all cohorts.
dff_drivers_cohort = pd.DataFrame()

In [None]:
## Inés manually defined black list
# Gene symbols judged to be false positives after manual inspection of the results.
suspects_FP = {
    'CCDC190', 'RETSAT', 'TMEM67', 'RIBC2', 'KRTAP26-1', 'MAML2', 'TGM3',
    'MAGEL2', 'ISY1', 'MUC16', 'AP3D1', 'ACIN1', 'KLRF1', 'AGPAT2', 'RIMS2',
    'TOPBP1', 'CHIT1', 'NPY5R', 'OR4X1', 'CELSR2', 'ZNF274', 'NGEF', 'ISY1-RAB43',
    'ZFYVE26', 'PIK3R6', 'SV2A', 'ZNF780B', 'C10orf113', 'PABPC1', 'ARMC9',
    'SLITRK4', 'OR6Q1', 'CCDC93', 'GOLGA2', 'PTPRZ1', 'ST6GALNAC6', 'EME1',
    'PABPC3', 'MX2', 'PCDHB9', 'ZNF880', 'URI1', 'RTKN2', 'FAM71E2', 'PRR12',
    'SCAMP1', 'LTV1', 'AXDND1', 'GPATCH1', 'IARS2', 'IQSEC3', 'COL4A6', 'DCDC2B',
    'GPR61', 'IL13RA2', 'LUZP2', 'SLC7A9', 'TRPV3', 'ITK', 'OGFR', 'LNP1', 'TP53TG5',
}

In [None]:
# Show which manually black-listed genes are also hematological CGC genes.
suspects_FP.intersection(cgc_genes_hemato['Gene Symbol'])

In [None]:
# paths to results

# Root folder of the IntOGen runs; left empty here, set it to the local location.
intogen_parent_path = ""

# Cohort name prefix -> sub-folder of the run (under intogen_parent_path) that
# holds its results. The values look like run dates (ddmmyyyy) — TODO confirm.
# Note: the 'BALL_HYPO_PED' entry carries a trailing slash; os.path.join
# tolerates it, so the resulting path is the same as without it.
intogen_subpaths = {'BALL_DUX4ERG_PED':"26022020",
                'BALL_HYPER_PED':"26022020",
                'BALL_HYPO_PED':"26022020/",
                'BALL_IAMP21_PED':"26022020",
                'BALL_INFANT_PED':"03032020",
                'BALL_PH_LIKE_PED':"26022020",
                'TALL_PED':"03032020",
                'BALL_OSHIMA_PED':"04032020",
                'TALL_OSHIMA_PED':"04032020",
                'TALL_ADULT':"29072020"}

In [None]:
# relate cohort intogen names with subtype annotation used in the analysis

# Keys are cohort names including the stage suffix (_PRY / _REL); values are
# the SUBTYPE labels looked up in the clinical table.
# NOTE(review): only some cohorts have a _REL entry; a result file for a
# cohort/stage missing here would raise a KeyError when looked up.
subtype_names = {'BALL_DUX4ERG_PED_PRY':"DUX4-ERG",
                'BALL_HYPER_PED_PRY':"Hyperdiploid",
                'BALL_HYPO_PED_PRY':"Hypodiploid",
                'BALL_IAMP21_PED_PRY':"iAMP21",
                'BALL_INFANT_PED_PRY':"Infant_MLL-R",
                'BALL_PH_LIKE_PED_PRY':"PHALL",
                'TALL_PED_PRY':"TALL_Pediatric_pry",
                'BALL_OSHIMA_PED_PRY':"BALL_Pediatric_WXS_pry",
                'TALL_OSHIMA_PED_PRY':"TALL_Pediatric_WXS_pry",
                'BALL_OSHIMA_PED_REL':"BALL_Pediatric_WXS_rel",
                'TALL_OSHIMA_PED_REL':"TALL_Pediatric_WXS_rel",
                'TALL_ADULT_PRY':"TALL_Adult_pry",
                'TALL_ADULT_REL':"TALL_Adult_rel"}

In [None]:
# read results of the combination of methods of intogen
#
# For every cohort file the selection is:
#   * all Tier 1 and Tier 2 genes,
#   * Tier 3 and Tier 4 genes only if they are hematological CGC genes,
#   * minus anything in the IntOGen black list.
# The per-file tables are collected in a list and concatenated once, so the
# cell rebuilds dff_drivers_cohort from scratch and can be re-run without
# duplicating rows (DataFrame.append is also gone in pandas >= 2.0).

cgc_hemato_symbols = cgc_genes_hemato['Gene Symbol'].unique()  # computed once, not per file
cohort_frames = []

for k, v in intogen_subpaths.items():
    files = glob.glob(os.path.join(intogen_parent_path, v, 'combination', k + '*.05.out.gz'))
    for f in files:
        # file name pattern: <COHORT>_<STAGE>.05.out.gz; basename() is separator-agnostic
        cohort = os.path.basename(f).replace(".05.out.gz", "")
        stage = cohort.split("_")[-1]
        clinical_subtype = clinical[clinical['SUBTYPE'] == subtype_names[cohort]].reset_index(drop=True)

        df_original = pd.read_csv(f, compression='gzip', sep='\t')
        in_cgc_hemato = df_original['SYMBOL'].isin(cgc_hemato_symbols)
        df = pd.concat(
            [df_original[df_original['TIER'].isin([1, 2])],          # keep tier 1 and 2
             df_original[(df_original['TIER'] == 3) & in_cgc_hemato],  # only cancer genes of hematopoietic cancers in CGC
             df_original[(df_original['TIER'] == 4) & in_cgc_hemato]], # only cancer genes of hematopoietic cancers in CGC
            ignore_index=True)

        not_black_listed = ~df['SYMBOL'].isin(black_list)  # remove genes from black list of any tier
        # .assign() returns a new frame, avoiding writes into a filtered slice
        df_filt = df.loc[not_black_listed, ['SYMBOL', 'TIER', 'ROLE']].assign(
            SUBTYPE=subtype_names[cohort],
            SUBTYPE_LABEL=clinical_subtype.loc[0, 'SUBTYPE_LABEL'],
            TYPE=clinical_subtype.loc[0, 'TYPE'],
            STAGE=stage,
        )
        cohort_frames.append(df_filt)

if cohort_frames:
    dff_drivers_cohort = pd.concat(cohort_frames, ignore_index=True, sort=False)
else:
    # no result file matched the glob patterns
    dff_drivers_cohort = pd.DataFrame()

In [None]:
# Spell out the stage codes taken from the file names.
stage_labels = {"PRY": "primary", "REL": "relapse"}
dff_drivers_cohort['STAGE'] = dff_drivers_cohort['STAGE'].replace(stage_labels)

In [None]:
# most of the FP suspects are not even in cancer mine
cancer_mine_genes = set(cancer_mine['gene_normalized'].unique())
suspects_FP - cancer_mine_genes

In [None]:
# from those in cancer mine most are not related to any hematological neoplasms 
# (displays the CancerMine rows of the manually black-listed genes for inspection)
cancer_mine[cancer_mine['gene_normalized'].isin(suspects_FP)]

In [None]:
# finally filter the manually checked FP suspects
is_manual_fp = dff_drivers_cohort['SYMBOL'].isin(suspects_FP)
dff_drivers_cohort = dff_drivers_cohort[~is_manual_fp]
# display the remaining gene symbols
dff_drivers_cohort['SYMBOL'].unique()

In [None]:
## PRIMARY AND RELAPSE CHINESE COHORT

chin_wgs_into_run = os.path.join(intogen_parent_path, "18032020", "combination")


def _read_chinese_tall_drivers(file_name, stage):
    """Load one IntOGen combination file of the Chinese T-ALL cohort and filter it.

    Keeps Tier 1-2 genes plus Tier 3-4 genes that are hematological CGC genes,
    then removes the IntOGen black list and the manually checked false
    positives.

    Parameters
    ----------
    file_name : str
        File name inside ``chin_wgs_into_run``.
    stage : str
        Value written to the STAGE column ('primary' or 'relapse').

    Returns
    -------
    pandas.DataFrame with columns SYMBOL, TIER, ROLE, SUBTYPE, SUBTYPE_LABEL,
    STAGE and TYPE.
    """
    df_original = pd.read_csv(os.path.join(chin_wgs_into_run, file_name), compression='gzip', sep='\t')
    in_cgc_hemato = df_original['SYMBOL'].isin(cgc_genes_hemato['Gene Symbol'].unique())
    df = pd.concat(
        [df_original[df_original['TIER'].isin([1, 2])],
         df_original[(df_original['TIER'] == 3) & in_cgc_hemato],
         df_original[(df_original['TIER'] == 4) & in_cgc_hemato]],
        ignore_index=True)
    excluded = (df['SYMBOL'].isin(black_list)       # general one of IntoGen
                | df['SYMBOL'].isin(suspects_FP))   # manually filtered FP candidates
    # .assign() returns a new frame, avoiding writes into a filtered slice
    return df.loc[~excluded, ['SYMBOL', 'TIER', 'ROLE']].assign(
        SUBTYPE="TALL_ped",
        SUBTYPE_LABEL="ALL Pediatric Chinese Study",
        STAGE=stage,
        TYPE="TALL",
    )


# primary first, then relapse, appended after the cohorts already collected
dff_drivers_cohort = pd.concat(
    [dff_drivers_cohort,
     _read_chinese_tall_drivers("TALL_CHINESE_PRY.05.out.gz", 'primary'),
     _read_chinese_tall_drivers("TALL_CHINESE_REL.05.out.gz", 'relapse')],
    ignore_index=True, sort=False)

### complete intogen list with literature

In [None]:
# Literature-derived gene list; ROLE spellings are normalised to "Act"/"LoF".
df_list_lite = pd.read_csv("../ext_files/literature/mutations_lite.tsv", sep='\t')
role_spelling = {"act": "Act", "lof": "LoF", "lof ": "LoF"}
# any value not listed above is left untouched
df_list_lite['ROLE'] = df_list_lite['ROLE'].map(lambda role: role_spelling.get(role, role))
df_list_lite['SUBTYPE'] = 'literature'
df_list_lite.head()

In [None]:
# join both
# pd.concat replaces DataFrame.append (removed in pandas 2.0); literature rows
# have no SUBTYPE_LABEL column, so that column is NaN for them.
df_final_list = pd.concat(
    [dff_drivers_cohort[['SYMBOL', 'ROLE', 'SUBTYPE', 'SUBTYPE_LABEL']],
     df_list_lite[['SYMBOL', 'ROLE', 'SUBTYPE']]],
    ignore_index=True, sort=False)

# correct NOTCH1: its role is forced to 'Act' in every row
df_final_list.loc[df_final_list['SYMBOL'] == 'NOTCH1', 'ROLE'] = 'Act'
df_final_list[df_final_list['SYMBOL'] == 'NOTCH1']

In [None]:
# Number of distinct (non-null) roles recorded per gene, highest first; values
# above 1 reveal genes with conflicting roles.
df_final_list[['SYMBOL', 'ROLE']].drop_duplicates().groupby('SYMBOL').count().sort_values(by='ROLE',ascending=False)

In [None]:
## Correct for wrong roles of genes
#
# When a gene has more than one distinct ROLE and also appears in the
# literature list, the role of its first literature row overrides all rows.
# Genes with conflicting roles but no literature entry are printed for manual
# review and kept unchanged. The row counts printed before and after allow
# checking that nothing was lost.

print(len(df_final_list))
final_columns = df_final_list.columns
grps_genes = df_final_list.groupby('SYMBOL')

corrected_groups = []
for g in grps_genes.groups:
    df_gene = grps_genes.get_group(g)
    if len(df_gene['ROLE'].unique()) > 1:
        literature_rows = df_gene[df_gene['SUBTYPE'] == 'literature']
        if not literature_rows.empty:
            # .assign() builds a new frame instead of writing into the group slice
            df_gene = df_gene.assign(ROLE=literature_rows['ROLE'].iloc[0])
        else:
            print(df_gene)
    corrected_groups.append(df_gene)

if corrected_groups:
    # single concatenation instead of growing the frame group by group
    df_final_list = pd.concat(corrected_groups, ignore_index=True, sort=False)
else:
    df_final_list = pd.DataFrame(columns=final_columns)
print(len(df_final_list))

### Annotations

#### GO terms by REACTOME 

In [None]:
# Reactome mapping file, read without header; columns are named here.
# NOTE(review): 'STH2' is filtered on 'TAS', presumably the evidence-code
# column of the Ensembl2Reactome file — confirm against the Reactome docs.
reactome = pd.read_csv("../ext_files/process_intogen/annotations/Ensembl2Reactome.txt", sep='\t',
                       names=['ENSEMBL', 'STH', 'PATH', 'PATHWAY_REACTOME', 'STH2', 'ORGANISM'])
is_tas_human = (reactome['STH2'] == 'TAS') & (reactome['ORGANISM'] == 'Homo sapiens')
reactome = reactome[is_tas_human]

# BioMart export: only the Ensembl gene id -> gene symbol mapping is needed.
biomart = (
    pd.read_csv("../ext_files/mart_export_grch37.txt", sep='\t')
    [['Gene stable ID', 'Gene name']]
    .rename(columns={'Gene stable ID':'ENSEMBL', 'Gene name':'SYMBOL'})
)

# Attach pathway names to symbols; genes without any pathway are dropped.
reactome_pathways = reactome[['ENSEMBL', 'PATHWAY_REACTOME']].drop_duplicates()
all_go_terms = (
    biomart
    .merge(reactome_pathways, how='left', on='ENSEMBL')
    [['SYMBOL', 'PATHWAY_REACTOME']]
    .dropna()
)

#### PANCAN ATLAS pathway list from supplementary

In [None]:
# Gene -> pathway table from sheet "Table S2" of the supplementary workbook;
# the first three rows of the sheet are skipped.
pathway_list = (
    pd.read_excel("../ext_files/process_intogen/annotations/supp_from_paper_pathwayPANCAN.xlsx",
                  sheet_name="Table S2", skiprows=[0,1,2])
    [['Gene','Pathway']]
    .drop_duplicates()
    .rename(columns={'Gene':'SYMBOL', 'Pathway':'PATHWAY_PANCAN'})
)

In [None]:
# Normalise the PANCAN table to the columns SYMBOL / PATHWAY_PANCAN without
# duplicates. Renaming first makes this safe whether or not the columns were
# already renamed: selecting ['Gene', 'Pathway'] on an already-renamed frame
# raised a KeyError and broke a top-to-bottom run.
pathway_list = (
    pathway_list
    .rename(columns={'Gene':'SYMBOL', 'Pathway':'PATHWAY_PANCAN'})
    [['SYMBOL', 'PATHWAY_PANCAN']]
    .drop_duplicates()
)

#### PANTHER PATHWAYS

In [None]:
# Load the PANTHER pathway analysis results (JSON) into `data`; the cell below
# reads data['overrepresentation']['group'].
with open('../ext_files/process_intogen/annotations/analysis_panther_pathways.json') as json_file:
    data = json.load(json_file)

In [None]:
# Flatten the PANTHER result groups into one (SYMBOL, PATHWAY_PANTHER) row per
# mapped gene. Records are gathered in a list and converted once, instead of
# growing a DataFrame row by row (quadratic, and DataFrame.append is removed in
# pandas 2.0).
panther_records = []

for r in data['overrepresentation']['group']:
    try:
        mapped_ids = r['result']['input_list']['mapped_id_list']['mapped_id']
        pathway_label = r['result']['term']['label']
    except KeyError:
        # best effort, as before: groups lacking any of these keys are skipped
        continue
    # 'mapped_id' is either a list of ids or a single id
    if not isinstance(mapped_ids, list):
        mapped_ids = [mapped_ids]
    panther_records.extend(
        {'SYMBOL': mapped_id, 'PATHWAY_PANTHER': pathway_label} for mapped_id in mapped_ids
    )

# explicit columns keep the frame usable even when no record was collected
panther_results_gene = pd.DataFrame(panther_records, columns=['SYMBOL', 'PATHWAY_PANTHER']).drop_duplicates()
# display only: the sorted view is not stored
panther_results_gene.sort_values(by=["SYMBOL"], ascending=False)

In [None]:
# check individually
# Look up one gene in the three annotation sources: PANTHER and PANCAN are
# printed, the Reactome rows are shown as the cell output.
gene = 'CSF2RA'
print(panther_results_gene[panther_results_gene['SYMBOL'] == gene])
print(pathway_list[pathway_list['SYMBOL'] == gene])
all_go_terms[all_go_terms['SYMBOL'] == gene]

In [None]:
# Preview of the PANTHER gene/pathway table.
panther_results_gene.head()

In [None]:
# Preview of the PANCAN gene/pathway table.
pathway_list.head()

In [None]:
# Preview of the Reactome gene/pathway table.
all_go_terms.head()

In [None]:
# Left-join the three annotation sources (Reactome, PANCAN, PANTHER) onto the
# gene symbols of the final list; every matching annotation adds a row.
cancer_gene_annotated = (
    df_final_list[['SYMBOL']]
    .merge(all_go_terms, how='left', on=['SYMBOL'])
    .merge(pathway_list, how='left', on=['SYMBOL'])
    .merge(panther_results_gene, how='left', on=['SYMBOL'])
)

In [None]:
# Number of rows after joining all annotation sources.
len(cancer_gene_annotated)

Since merging all the annotation sources yields many rows, we prioritize PANTHER > PANCAN > Reactome to obtain a reduced list of annotations. Afterwards, we manually unify them into a single term that makes sense, even though the source of information differs.

In [None]:
def _pathways_for_gene(gene):
    """Return the prioritised annotations of one gene as (pathway, source) pairs.

    Priority: PANTHER > PANCAN > REACTOME.
      * PANTHER: "UNCLASSIFIED" entries are ignored; if any remaining label
        contains "signaling pathway" only those are returned, otherwise all
        remaining labels.
      * PANCAN: only the first pathway listed for the gene.
      * REACTOME: every pathway listed for the gene.
    A gene without any annotation yields a single (NaN, NaN) pair.
    """
    gene_panther = panther_results_gene[panther_results_gene['SYMBOL'] == gene].copy()
    gene_panther['PATHWAY_PANTHER'] = gene_panther['PATHWAY_PANTHER'].replace("UNCLASSIFIED", np.nan)
    gene_panther = gene_panther.dropna()
    if not gene_panther.empty:
        # prioritize signaling pathways over any other annotation of biological processes
        signaling = gene_panther[gene_panther['PATHWAY_PANTHER'].str.contains("signaling pathway")]
        chosen = gene_panther if signaling.empty else signaling
        # iterate over values: label-based .loc[0] failed when the row with
        # index 0 had been dropped as UNCLASSIFIED
        return [(pathway, 'PANTHER') for pathway in chosen['PATHWAY_PANTHER']]

    gene_pancan = pathway_list[pathway_list['SYMBOL'] == gene]
    if not gene_pancan.empty:
        return [(gene_pancan['PATHWAY_PANCAN'].iloc[0], 'PANCAN_PAPER')]

    gene_reactome = all_go_terms[all_go_terms['SYMBOL'] == gene]
    if not gene_reactome.empty:
        return [(pathway, 'REACTOME') for pathway in gene_reactome['PATHWAY_REACTOME']]

    return [(np.nan, np.nan)]


# One record per (gene, pathway); the frame is built once from the collected
# records instead of being grown row by row.
gene_pathway = pd.DataFrame(
    [{'SYMBOL': symbol, 'PATHWAY': pathway, 'PATHWAY_SOURCE': source}
     for symbol in df_final_list['SYMBOL'].unique()
     for pathway, source in _pathways_for_gene(symbol)],
    columns=['SYMBOL', 'PATHWAY', 'PATHWAY_SOURCE'])

In [None]:
# manually decide for repeated rows (genes) and use GeneCards (https://www.genecards.org/) 
# to unify terms and complete empty information
# The table is written to Excel so that it can be curated by hand; the row
# count is printed for reference.
print(len(gene_pathway))
gene_pathway.to_excel("../intermediate_files/driver_candidate_gene_pathways.xlsx",index=False)

In [None]:
# read unified and revised biological processes
# NOTE(review): this reads a tab-separated .csv, not the .xlsx written above —
# presumably the manually curated version saved under this name; confirm.
bio_proces = pd.read_csv("../intermediate_files/driver_candidate_gene_pathways.csv",
                         sep='\t')
bio_proces.head()

In [None]:
# Add the curated annotations to the final list. The length is printed before
# and after the left merge: a larger second number means some SYMBOL occurs
# more than once in bio_proces.
print(len(df_final_list))
df_final_list = df_final_list.merge(bio_proces, on='SYMBOL', how='left')
print(len(df_final_list))

In [None]:
# Preview of the annotated final gene list.
df_final_list.head()

In [None]:
# results of this can be found in Table S3
# Output folder for the final table. The assignment previously had no value,
# which is a syntax error; an empty string (same convention as
# intogen_parent_path) writes to the current working directory until a real
# folder is filled in.
out_path = ""
df_final_list.to_csv(os.path.join(out_path, "cancer_genes_ALL.csv"), sep='\t', index=False)