# Create a list of cancer driver genes

The list of driver genes "cancerdrivers.txt" is obtained by merging the IntOGen and COSMIC Cancer Gene Census (CGC) cancer driver genes, excluding CGC genes whose only annotated role is fusion partner.  

In [1]:
import os

import pandas as pd

In [2]:
# Root directory of the project; all input and output paths are joined onto it.
# Left empty, the paths resolve relative to the current working directory.
main_dir = ''

In [3]:
# Destination file for the merged driver-gene table.
output_f = os.path.join(main_dir, 'data', 'cancerdrivers.txt')

In [4]:
# Collects (gene ID, gene symbol) tuples from both sources; being a set, a gene
# reported by both CGC and IntOGen is stored only once.
driver_genes = set()

#### Load Gene IDs

In [5]:
# Build the gene symbol -> gene ID lookup from the Ensembl v104 export.
geneid_f = os.path.join(main_dir, 'data', 'Ensembl_v104_geneids_2021_08_24.txt')
geneid_d = {}
with open(geneid_f, 'r') as handle:
    next(handle)  # skip the header row
    for row in handle:
        fields = row.strip().split('\t')
        # Only rows with exactly four columns are used; some genes do not
        # have a symbol and are skipped.
        if len(fields) != 4:
            continue
        ensembl_id, _, _, gene_symbol = fields
        geneid_d[gene_symbol] = ensembl_id

In [6]:
# Add those that have changed their names in the latest Ensembl version
# Each row holds: old symbol, new symbol, gene ID of the renamed gene.
synonyms_f = os.path.join(main_dir, 'data', 'failed_genes_synonyms.txt')
synonyms_d = {}
with open(synonyms_f, 'r') as handle:
    for row in handle:
        old_name, new_name, ensid = row.strip().split('\t')
        # Remember the rename and make the new symbol resolvable to its ID.
        synonyms_d[old_name] = new_name
        geneid_d[new_name] = ensid
        print(old_name, new_name)

CARS CARS1
FAM46C TENT5C
H3F3A H3-3A
H3F3B H3-3B
HIST1H3B H3C2
HIST1H4I H4C9
SEPT9 SEPTIN9


#### Read CGC and remove fusion partners

In [7]:
# Load the Cancer Gene Census export: tab-separated, first row is the header.
cgc_f = os.path.join(main_dir, 'data', 'Census_all_Tue_Aug_24_08_2021.tsv')
cgc_df = pd.read_csv(cgc_f, sep='\t', header=0)

In [8]:
# Number of rows in the CGC table.
len(cgc_df)

723

In [9]:
# Preview the first rows to check the column layout.
cgc_df.head()

Unnamed: 0,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,Germline,Tumour Types(Somatic),Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
0,A1CF,APOBEC1 complementation factor,29974.0,10:50799421-50885675,2,,11.23,yes,,melanoma,,,E,,oncogene,Mis,,,,"29974,A1CF,ACF,ACF64,ACF65,APOBEC1CF,ASP,ENSG0..."
1,ABI1,abl-interactor 1,10006.0,10:26746593-26860935,1,Yes,12.1,yes,,AML,,,L,Dom,"TSG, fusion",T,KMT2A,,,"10006,ABI-1,ABI1,E3B1,ENSG00000136754.17,Q8IZP..."
2,ABL1,v-abl Abelson murine leukemia viral oncogene h...,25.0,9:130713946-130885683,1,Yes,34.12,yes,,"CML, ALL, T-ALL",,,L,Dom,"oncogene, fusion","T, Mis","BCR, ETV6, NUP214",,,"25,ABL,ABL1,ENSG00000097007.17,JTK7,P00519,c-A..."
3,ABL2,"c-abl oncogene 2, non-receptor tyrosine kinase",27.0,1:179099327-179229601,1,,25.2,yes,,AML,,,L,Dom,"oncogene, fusion",T,ETV6,,,"27,ABL2,ABLL,ARG,ENSG00000143322.19,P42684"
4,ACKR3,atypical chemokine receptor 3,57007.0,2:236569641-236582358,1,Yes,37.3,yes,,lipoma,,,M,Dom,"oncogene, fusion",T,HMGA2,,,"57007,ACKR3,CMKOR1,CXCR7,ENSG00000144476.5,GPR..."


In [10]:
# Distinct values of the role annotation, including missing values.
cgc_df['Role in Cancer'].unique()

array(['oncogene', 'TSG, fusion', 'oncogene, fusion', 'fusion', 'TSG',
       nan, 'oncogene, TSG', 'oncogene, TSG, fusion'], dtype=object)

In [11]:
# Add every CGC gene except those whose role is exactly 'fusion'.
# Genes with combined roles (e.g. 'oncogene, fusion') are kept, and so are rows
# with a missing role, because NaN compares unequal to 'fusion'.
not_fusion_only = cgc_df['Role in Cancer'] != 'fusion'
for symbol in cgc_df.loc[not_fusion_only, 'Gene Symbol'].dropna().unique():
    if symbol not in geneid_d:
        # The symbol is absent from the Ensembl table: translate it through
        # the list of renamed genes, failing loudly if it is unknown there too.
        if symbol not in synonyms_d:
            raise KeyError(
                'CGC gene symbol {!r} has no gene ID and no known synonym'.format(symbol)
            )
        symbol = synonyms_d[symbol]
    driver_genes.add((geneid_d[symbol], symbol))

In [12]:
# Number of driver genes collected so far (CGC only at this point).
len(driver_genes)

589

#### Read IntOGen

In [13]:
# Load the IntOGen compendium: tab-separated, first row is the header.
intogen_f = os.path.join(main_dir, 'data', 'Compendium_Cancer_Genes.tsv')
intogen_df = pd.read_csv(intogen_f, sep='\t', header=0)

In [14]:
# Number of distinct values in the SYMBOL column.
len(intogen_df['SYMBOL'].unique())

568

In [15]:
# Add every IntOGen gene. The SYMBOL column repeats the same symbol on several
# rows, so iterate the distinct symbols only.
for symbol in intogen_df['SYMBOL'].unique():
    if symbol not in geneid_d:
        # The symbol is absent from the Ensembl table: translate it through
        # the list of renamed genes, failing loudly if it is unknown there too.
        if symbol not in synonyms_d:
            raise KeyError(
                'IntOGen gene symbol {!r} has no gene ID and no known synonym'.format(symbol)
            )
        symbol = synonyms_d[symbol]
    driver_genes.add((geneid_d[symbol], symbol))

In [16]:
# Size of the driver set after adding the IntOGen genes.
len(driver_genes)

782

#### Save

In [17]:
# Write the merged driver list as a two-column, tab-separated table.
# Every element of driver_genes is a (gene ID, symbol) tuple, matching the
# ID / SYMBOL header. Rows are sorted so the file is identical between runs;
# iterating the set directly gives an order that can change from run to run.
with open(output_f, 'w') as ofd:
    ofd.write('{}\n'.format('\t'.join(['ID', 'SYMBOL'])))
    for geneid, symbol in sorted(driver_genes):
        ofd.write('{}\n'.format('\t'.join([geneid, symbol])))