In [39]:
#Import modules
import os
import pandas as pd
import numpy as np
import pybedtools
import re

In [7]:
#Create a bedtools of the relevant regions and append their score at the end of it
def get_regions(feature_scores_table, data_table):
    """Build a BedTool of scored regions from a feature-score file.

    Parameters
    ----------
    feature_scores_table : str
        Path of a CSV file read with its first column as the index. Each
        index label is used to look up a row of ``data_table``, and the first
        remaining column supplies that row's score.
    data_table : str
        Path of a delimited file read with ``pd.read_table``; it must provide
        the columns 'Chromosome', 'Start' and 'End'.

    Returns
    -------
    pybedtools.BedTool
        One interval per row of the feature-score file, in file order, with
        fields ``[Chromosome, Start, End, str(score)]``.

    Raises
    ------
    KeyError
        If a feature label is missing from the data table's index, or one of
        the three coordinate columns is missing.
    """
    feature_scores = pd.read_csv(feature_scores_table, index_col=0)
    data = pd.read_table(data_table)

    # The score is taken *by position* (first column after the index).
    # `.iloc[0]` states that explicitly; plain `row[0]` on a label-indexed
    # Series is deprecated positional access in recent pandas releases.
    scored_intervals = [
        list(data.loc[index, ['Chromosome', 'Start', 'End']]) + [str(row.iloc[0])]
        for index, row in feature_scores.iterrows()
    ]
    return pybedtools.BedTool(scored_intervals)


# One scored-region BedTool per method (XGBoost, mutual information, logistic
# regression, random forest), all looked up in the same training call table.
xg_cv_regions, mi_cv_regions, lr_cv_regions, rf_cv_regions = (
    get_regions(scores_csv, '../data/Train_call.txt')
    for scores_csv in (
        'XGB_feature_importance.csv',
        'MI_scores.csv',
        'LR_feature_importance.csv',
        'RF_feature_importance.csv',
    )
)

In [71]:
# Build BedTool interval sets for the genes of interest.
cancer_genes_table = pd.read_csv('../data/census_cancer_genes.csv')

# Each 'Genome Location' is split on ':' and '-' into interval fields and the
# gene symbol is appended; locations containing ':-' are left out.
# NOTE(review): `cancer_genes` is not referenced by the cells visible here.
cancer_genes = pybedtools.BedTool([
    re.split(r'[:\-]', location) + [symbol]
    for location, symbol in zip(
        cancer_genes_table['Genome Location'],
        cancer_genes_table['Gene Symbol'],
    )
    if ':-' not in location
])

# Every mapped gene: coordinates first, then the Ensembl id and HGNC symbol as
# extra fields (fields[3] and fields[4] of each interval).
all_genes_table = pd.read_table('../data/BasepairToGeneMap.tsv')
all_genes = pybedtools.BedTool(
    all_genes_table[
        ['Chromosome', 'Gene_start', 'Gene_end', 'ENSEMBL_gene_id', 'HGNC_symbol']
    ].values.tolist()
)

In [74]:
#Intersect with the regions to identify relevant cancer genes
def intersect(regions, genes, all_genes_table, cancer_genes_table = None):
    """Pair every gene with the scored regions it overlaps.

    Parameters
    ----------
    regions : object exposing ``all_hits(gene)``
        Queried once per gene. Every returned hit must expose ``fields`` laid
        out as ``[chromosome, start, end, score, ...]``. The calls in this
        notebook pass a ``pybedtools.BedTool``.
    genes : iterable
        Gene intervals exposing ``fields``. ``fields[3]`` is reported as the
        gene identifier; when no cancer table is given, ``fields[4]`` is
        reported as the gene symbol.
    all_genes_table
        Not used by this function; kept so existing call sites keep working.
    cancer_genes_table : pandas.DataFrame, optional
        When given, a (gene, hit) pair is only reported if the gene identifier
        occurs as a substring of a row's 'Synonyms' string, and the full row is
        appended to the result. Rows whose 'Synonyms' is not a string are
        skipped. A pair is reported once per matching row.

    Returns
    -------
    pandas.DataFrame
        Columns 'Gene Ensembl ID', 'CV Region', 'CV Region Score', followed by
        either all columns of ``cancer_genes_table`` or a single 'Gene Symbol'
        column. Rows are ordered by descending *numeric* score; the score
        column itself keeps the values exactly as found in the hit fields.
        Scores that cannot be parsed as numbers are placed last.
    """
    base_columns = ['Gene Ensembl ID', 'CV Region', 'CV Region Score']

    # Loop-invariant: flatten the usable cancer rows once instead of walking
    # the table with iterrows() for every single hit.
    cancer_rows = None
    if cancer_genes_table is not None:
        cancer_rows = [
            (row['Synonyms'], list(row))
            for _, row in cancer_genes_table.iterrows()
            if isinstance(row['Synonyms'], str)
        ]

    hits = []
    for gene in genes:
        matching_rows = None  # resolved lazily, only for genes with a hit
        for hit in regions.all_hits(gene):
            gene_id = gene.fields[3]
            hit_prefix = [
                gene_id,
                f'{hit.fields[0]}:{hit.fields[1]}-{hit.fields[2]}',
                hit.fields[3],
            ]

            if cancer_rows is None:
                hits.append(hit_prefix + [gene.fields[4]])
                continue

            # The matching cancer rows depend on the gene only, not on the hit.
            if matching_rows is None:
                matching_rows = [
                    values for synonyms, values in cancer_rows
                    if gene_id in synonyms
                ]
            hits.extend(hit_prefix + values for values in matching_rows)

    if cancer_genes_table is not None:
        columns = base_columns + list(cancer_genes_table.columns.values)
    else:
        columns = base_columns + ['Gene Symbol']
    hits_table = pd.DataFrame(data=hits, columns=columns)

    # Scores arrive as strings; sorting them directly is lexicographic
    # ('9.5' > '1e-05' > '10.25'). Rank on the parsed values instead and leave
    # the stored column untouched. mergesort keeps the ordering deterministic.
    numeric_scores = pd.to_numeric(hits_table['CV Region Score'], errors='coerce')
    ranked_index = numeric_scores.sort_values(ascending=False, kind='mergesort').index
    return hits_table.loc[ranked_index]

In [75]:
# Cancer-census genes overlapping the regions scored by XGBoost
xg_inter = intersect(
    regions=xg_cv_regions,
    genes=all_genes,
    all_genes_table=all_genes_table,
    cancer_genes_table=cancer_genes_table,
)
display(xg_inter)

Unnamed: 0,Gene Ensembl ID,CV Region,CV Region Score,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,...,Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
6,ENSG00000141736,17:35076296-35282086,3.693419933319092,ERBB2,v-erb-b2 erythroblastic leukemia viral oncogen...,2064.0,17:39700080-39728662,1,Yes,12.0,...,,,E,Dom,"oncogene, fusion","A, Mis, O",,,,"2064,CD340,ENSG00000141736.13,ERBB2,HER-2,HER2..."
7,ENSG00000161405,17:35076296-35282086,3.693419933319092,IKZF3,IKAROS Family Zinc Finger 3,22806.0,17:39757715-39864188,1,,21.1,...,,,L,,"oncogene, TSG","D, Mis, F, N",,,,"22806,Aiolos,ENSG00000161405.16,IKZF3,Q9UKT9,Z..."
3,ENSG00000145819,5:140433878-142819410,0.8109956979751587,ARHGAP26,Rho GTPase activating protein 26,23092.0,5:142770377-143229011,1,Yes,31.3,...,,,L,Dom,"TSG, fusion","T, F, S",KMT2A,,,"23092,ARHGAP26,ENSG00000145819.16,GRAF,KIAA062..."
0,ENSG00000116560,1:35336344-35895515,0.7100714445114136,SFPQ,splicing factor proline/glutamine rich(polypyr...,6421.0,1:35182932-35193148,1,Yes,34.3,...,,,E,Dom,"TSG, fusion",T,TFE3,,,"6421,ENSG00000116560.10,P23246,PPP1R140,PSF,SFPQ"
5,ENSG00000140262,15:51919876-55250930,0.3569802939891815,TCF12,"transcription factor 12 (HTF4, helix-loop-heli...",6938.0,15:56918623-57288514,1,,21.3,...,,,M,Dom,fusion,T,NR4A3,,,"6938,ENSG00000140262.17,HEB,HTF4,HsT17266,Q990..."
1,ENSG00000158711,1:203539204-205954108,0.2540072798728943,ELK4,"ELK4, ETS-domain protein (SRF accessory protei...",2005.0,1:205607943-205631962,1,Yes,32.1,...,,,E,Dom,"oncogene, fusion",T,SLC45A3,,,"2005,ELK4,ENSG00000158711.13,P28324,SAP1"
2,ENSG00000158715,1:203539204-205954108,0.2540072798728943,SLC45A3,"solute carrier family 45, member 3",85414.0,1:205657851-205680459,1,Yes,32.1,...,,,E,Dom,fusion,T,"ETV1, ETV5, ELK4, ERG, BRAF",,,"85414,ENSG00000158715.5,IPCA-2,IPCA-6,IPCA-8,P..."
4,ENSG00000140396,8:71114730-72584340,0.2506601214408874,NCOA2,nuclear receptor coactivator 2 (TIF2),10499.0,8:70109762-70403805,1,,13.3,...,,,"L, M",Dom,"oncogene, fusion",T,"KAT6A, HEY1, NCOA2",,,"10499,ENSG00000140396.12,GRIP1,KAT13C,NCOA2,NC..."


In [65]:
# Cancer-census genes overlapping the regions scored by mutual information
mi_inter = intersect(
    regions=mi_cv_regions,
    genes=all_genes,
    all_genes_table=all_genes_table,
    cancer_genes_table=cancer_genes_table,
)
display(mi_inter)

Unnamed: 0,Gene Ensembl ID,CV Region,CV Region Score,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,...,Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
6,ENSG00000141736,17:35076296-35282086,0.6275767412423157,ERBB2,v-erb-b2 erythroblastic leukemia viral oncogen...,2064.0,17:39700080-39728662,1,Yes,12.0,...,,,E,Dom,"oncogene, fusion","A, Mis, O",,,,"2064,CD340,ENSG00000141736.13,ERBB2,HER-2,HER2..."
7,ENSG00000161405,17:35076296-35282086,0.6275767412423157,IKZF3,IKAROS Family Zinc Finger 3,22806.0,17:39757715-39864188,1,,21.1,...,,,L,,"oncogene, TSG","D, Mis, F, N",,,,"22806,Aiolos,ENSG00000161405.16,IKZF3,Q9UKT9,Z..."
4,ENSG00000140396,8:71114730-72584340,0.2436973725601476,NCOA2,nuclear receptor coactivator 2 (TIF2),10499.0,8:70109762-70403805,1,,13.3,...,,,"L, M",Dom,"oncogene, fusion",T,"KAT6A, HEY1, NCOA2",,,"10499,ENSG00000140396.12,GRIP1,KAT13C,NCOA2,NC..."
0,ENSG00000116560,1:35336344-35895515,0.2104239596754651,SFPQ,splicing factor proline/glutamine rich(polypyr...,6421.0,1:35182932-35193148,1,Yes,34.3,...,,,E,Dom,"TSG, fusion",T,TFE3,,,"6421,ENSG00000116560.10,P23246,PPP1R140,PSF,SFPQ"
5,ENSG00000140262,15:51919876-55250930,0.208057000385615,TCF12,"transcription factor 12 (HTF4, helix-loop-heli...",6938.0,15:56918623-57288514,1,,21.3,...,,,M,Dom,fusion,T,NR4A3,,,"6938,ENSG00000140262.17,HEB,HTF4,HsT17266,Q990..."
1,ENSG00000158711,1:203539204-205954108,0.1853945301831827,ELK4,"ELK4, ETS-domain protein (SRF accessory protei...",2005.0,1:205607943-205631962,1,Yes,32.1,...,,,E,Dom,"oncogene, fusion",T,SLC45A3,,,"2005,ELK4,ENSG00000158711.13,P28324,SAP1"
2,ENSG00000158715,1:203539204-205954108,0.1853945301831827,SLC45A3,"solute carrier family 45, member 3",85414.0,1:205657851-205680459,1,Yes,32.1,...,,,E,Dom,fusion,T,"ETV1, ETV5, ELK4, ERG, BRAF",,,"85414,ENSG00000158715.5,IPCA-2,IPCA-6,IPCA-8,P..."
3,ENSG00000145819,5:140433878-142819410,0.1849441706091821,ARHGAP26,Rho GTPase activating protein 26,23092.0,5:142770377-143229011,1,Yes,31.3,...,,,L,Dom,"TSG, fusion","T, F, S",KMT2A,,,"23092,ARHGAP26,ENSG00000145819.16,GRAF,KIAA062..."


In [64]:
# Cancer-census genes overlapping the regions scored by logistic regression
lr_inter = intersect(
    regions=lr_cv_regions,
    genes=all_genes,
    all_genes_table=all_genes_table,
    cancer_genes_table=cancer_genes_table,
)
display(lr_inter)

Unnamed: 0,Gene Ensembl ID,CV Region,CV Region Score,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,...,Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
6,ENSG00000141736,17:35076296-35282086,0.536930513497084,ERBB2,v-erb-b2 erythroblastic leukemia viral oncogen...,2064.0,17:39700080-39728662,1,Yes,12.0,...,,,E,Dom,"oncogene, fusion","A, Mis, O",,,,"2064,CD340,ENSG00000141736.13,ERBB2,HER-2,HER2..."
7,ENSG00000161405,17:35076296-35282086,0.536930513497084,IKZF3,IKAROS Family Zinc Finger 3,22806.0,17:39757715-39864188,1,,21.1,...,,,L,,"oncogene, TSG","D, Mis, F, N",,,,"22806,Aiolos,ENSG00000161405.16,IKZF3,Q9UKT9,Z..."
4,ENSG00000140396,8:71114730-72584340,0.0423165181528808,NCOA2,nuclear receptor coactivator 2 (TIF2),10499.0,8:70109762-70403805,1,,13.3,...,,,"L, M",Dom,"oncogene, fusion",T,"KAT6A, HEY1, NCOA2",,,"10499,ENSG00000140396.12,GRIP1,KAT13C,NCOA2,NC..."
5,ENSG00000140262,15:51919876-55250930,0.0161501686772575,TCF12,"transcription factor 12 (HTF4, helix-loop-heli...",6938.0,15:56918623-57288514,1,,21.3,...,,,M,Dom,fusion,T,NR4A3,,,"6938,ENSG00000140262.17,HEB,HTF4,HsT17266,Q990..."
1,ENSG00000158711,1:203539204-205954108,0.0065370807502261,ELK4,"ELK4, ETS-domain protein (SRF accessory protei...",2005.0,1:205607943-205631962,1,Yes,32.1,...,,,E,Dom,"oncogene, fusion",T,SLC45A3,,,"2005,ELK4,ENSG00000158711.13,P28324,SAP1"
2,ENSG00000158715,1:203539204-205954108,0.0065370807502261,SLC45A3,"solute carrier family 45, member 3",85414.0,1:205657851-205680459,1,Yes,32.1,...,,,E,Dom,fusion,T,"ETV1, ETV5, ELK4, ERG, BRAF",,,"85414,ENSG00000158715.5,IPCA-2,IPCA-6,IPCA-8,P..."
0,ENSG00000116560,1:35336344-35895515,0.001897034589146,SFPQ,splicing factor proline/glutamine rich(polypyr...,6421.0,1:35182932-35193148,1,Yes,34.3,...,,,E,Dom,"TSG, fusion",T,TFE3,,,"6421,ENSG00000116560.10,P23246,PPP1R140,PSF,SFPQ"
3,ENSG00000145819,5:140433878-142819410,-0.0351102825062545,ARHGAP26,Rho GTPase activating protein 26,23092.0,5:142770377-143229011,1,Yes,31.3,...,,,L,Dom,"TSG, fusion","T, F, S",KMT2A,,,"23092,ARHGAP26,ENSG00000145819.16,GRAF,KIAA062..."


In [67]:
# Cancer-census genes overlapping the regions scored by the random forest
rf_inter = intersect(
    regions=rf_cv_regions,
    genes=all_genes,
    all_genes_table=all_genes_table,
    cancer_genes_table=cancer_genes_table,
)
display(rf_inter)

Unnamed: 0,Gene Ensembl ID,CV Region,CV Region Score,Gene Symbol,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,...,Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
6,ENSG00000141736,17:35076296-35282086,0.3466000737517017,ERBB2,v-erb-b2 erythroblastic leukemia viral oncogen...,2064.0,17:39700080-39728662,1,Yes,12.0,...,,,E,Dom,"oncogene, fusion","A, Mis, O",,,,"2064,CD340,ENSG00000141736.13,ERBB2,HER-2,HER2..."
7,ENSG00000161405,17:35076296-35282086,0.3466000737517017,IKZF3,IKAROS Family Zinc Finger 3,22806.0,17:39757715-39864188,1,,21.1,...,,,L,,"oncogene, TSG","D, Mis, F, N",,,,"22806,Aiolos,ENSG00000161405.16,IKZF3,Q9UKT9,Z..."
3,ENSG00000145819,5:140433878-142819410,0.0664386983457401,ARHGAP26,Rho GTPase activating protein 26,23092.0,5:142770377-143229011,1,Yes,31.3,...,,,L,Dom,"TSG, fusion","T, F, S",KMT2A,,,"23092,ARHGAP26,ENSG00000145819.16,GRAF,KIAA062..."
5,ENSG00000140262,15:51919876-55250930,0.0145620925463143,TCF12,"transcription factor 12 (HTF4, helix-loop-heli...",6938.0,15:56918623-57288514,1,,21.3,...,,,M,Dom,fusion,T,NR4A3,,,"6938,ENSG00000140262.17,HEB,HTF4,HsT17266,Q990..."
1,ENSG00000158711,1:203539204-205954108,0.0111803883282212,ELK4,"ELK4, ETS-domain protein (SRF accessory protei...",2005.0,1:205607943-205631962,1,Yes,32.1,...,,,E,Dom,"oncogene, fusion",T,SLC45A3,,,"2005,ELK4,ENSG00000158711.13,P28324,SAP1"
2,ENSG00000158715,1:203539204-205954108,0.0111803883282212,SLC45A3,"solute carrier family 45, member 3",85414.0,1:205657851-205680459,1,Yes,32.1,...,,,E,Dom,fusion,T,"ETV1, ETV5, ELK4, ERG, BRAF",,,"85414,ENSG00000158715.5,IPCA-2,IPCA-6,IPCA-8,P..."
0,ENSG00000116560,1:35336344-35895515,0.0,SFPQ,splicing factor proline/glutamine rich(polypyr...,6421.0,1:35182932-35193148,1,Yes,34.3,...,,,E,Dom,"TSG, fusion",T,TFE3,,,"6421,ENSG00000116560.10,P23246,PPP1R140,PSF,SFPQ"
4,ENSG00000140396,8:71114730-72584340,0.0,NCOA2,nuclear receptor coactivator 2 (TIF2),10499.0,8:70109762-70403805,1,,13.3,...,,,"L, M",Dom,"oncogene, fusion",T,"KAT6A, HEY1, NCOA2",,,"10499,ENSG00000140396.12,GRIP1,KAT13C,NCOA2,NC..."


In [68]:
# Gene symbols reported by all four methods
print(set.intersection(
    set(xg_inter['Gene Symbol']),
    mi_inter['Gene Symbol'],
    lr_inter['Gene Symbol'],
    rf_inter['Gene Symbol'],
))

{'TCF12', 'ERBB2', 'SFPQ', 'ARHGAP26', 'NCOA2', 'SLC45A3', 'ELK4', 'IKZF3'}


In [5]:
# Count the feature labels that appear in the score files of all four methods
xg_features, mi_features, lr_features, rf_features = (
    pd.read_csv(scores_csv, index_col=0)
    for scores_csv in (
        'XGB_feature_importance.csv',
        'MI_scores.csv',
        'LR_feature_importance.csv',
        'RF_feature_importance.csv',
    )
)
feature_overlap = set.intersection(
    set(xg_features.index.values),
    mi_features.index.values,
    lr_features.index.values,
    rf_features.index.values,
)
print(len(feature_overlap))

30


| Gene | Mutual information | XGBoost | Logistic Regression | Random Forest |
| --- | --- | --- | --- | --- |
|  | 1 | 1 | 5 | 2 |
|  | 1 | 1 | 5 | 2 |
|  | 2 | 2 | 1 | 3 |
| ARHGAP26 | 5 | 3 | 4 | 1 |
|  | 3 | 4 | 2 | 4 |
| ELK4 | 4 | 5 | 3 | 5 |
|  | 4 | 5 | 3 | 5 |
|  | 4 | 5 | 3 | 5 |

_Ranking of each gene by the importance of its region, per machine learning method_

ERBB2 = HER2 <br>
NCOA2: Amplified: https://www.frontiersin.org/articles/10.3389/fonc.2019.00164/full <br>
ARHGAP26: Downregulation?: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5569970/ <br>
ELK4: No influence?: https://pubmed.ncbi.nlm.nih.gov/23329352/ <br>

Interestingly, ERBB2 (HER2) and IKZF3 lie in the same region, as do ELK4 and SLC45A3.

The region 17:35076296-35282086 seems to be the most important feature and overlaps the cancer-related genes ERBB2 (HER2) and IKZF3. However, numerous other important oncogenes are located on chromosome 17, and polysomy of chromosome 17 is often seen:
https://pubmed.ncbi.nlm.nih.gov/22016618/

That HER2 lies in this region is to be expected, as patients in the clinic are often divided into one of the three tumor subtypes based on HER2 positivity in addition to PR and ER positivity.

Some other genes in this region with possible significance are listed in the cell below.

In [79]:
# List every gene overlapping the region 17:35076296-35282086. No cancer table
# is passed, so the result is not restricted to cancer-census genes. The score
# attached to the single region is a hard-coded literal.
most_important_inter = intersect(
    regions=pybedtools.BedTool([[17, 35076296, 35282086, str(0.3466000737517017)]]),
    genes=all_genes,
    all_genes_table=all_genes_table,
)
display(most_important_inter)

Unnamed: 0,Gene Ensembl ID,CV Region,CV Region Score,Gene Symbol
0,ENSG00000173991,17:35076296-35282086,0.3466000737517017,TCAP
1,ENSG00000141744,17:35076296-35282086,0.3466000737517017,PNMT
2,ENSG00000161395,17:35076296-35282086,0.3466000737517017,PERLD1
3,ENSG00000141736,17:35076296-35282086,0.3466000737517017,ERBB2
4,ENSG00000141741,17:35076296-35282086,0.3466000737517017,C17orf37
5,ENSG00000141738,17:35076296-35282086,0.3466000737517017,GRB7
6,ENSG00000161405,17:35076296-35282086,0.3466000737517017,IKZF3
7,ENSG00000218970,17:35076296-35282086,0.3466000737517017,AC079199.2
8,ENSG00000186075,17:35076296-35282086,0.3466000737517017,ZPBP2
