In [1]:
#Import modules
import os
import pandas as pd
import pybedtools
import re

In [2]:
#Create a bedtools of the relevant regions and append their score at the end of it
def get_regions(feature_scores_table, data_table):
    feature_scores = pd.read_csv(feature_scores_table, index_col=0)
    data = pd.read_table(data_table)
    
    
    cv_regions = pybedtools.BedTool(
        [list(data.loc[index, ['Chromosome', 'Start', 'End']]) + [str(row[0])]
         for index, row in feature_scores.iterrows()]
    )
    return cv_regions


xg_cv_regions = get_regions('XGB_feature_importance.csv', '../data/Train_call.txt')
mi_cv_regions = get_regions('MI_scores.csv', '../data/Train_call.txt')
lr_cv_regions = get_regions('LR_feature_importance.csv', '../data/Train_call.txt')
rf_cv_regions = get_regions('RF_feature_importance.csv', '../data/Train_call.txt')

In [11]:
#Create a bed tools of the relevant genes
cancer_genes_table = pd.read_csv('../data/census_cancer_genes.csv')
cancer_genes = pybedtools.BedTool(
    [re.split(r'[:\-]', row['Genome Location']) + [row['Gene Symbol']]
    for i, row in cancer_genes_table.iterrows()
         if ':-' not in row['Genome Location']
])

In [37]:
#Intersect with the regions to identify relevant genes
def intersect(regions, genes, genes_table):
    hits = []
    for gene in genes:
        for hit in regions.all_hits(gene):
            hits.append([
                gene.fields[3], 
                f'{hit.fields[0]}:{hit.fields[1]}-{hit.fields[2]}',
                hit.fields[3]
            ])
    hits = pd.DataFrame(data=hits, columns=['Gene Symbol', 'CV Region', 'CV Region Score'])

    return hits.merge(genes_table, how='inner', on='Gene Symbol')\
                .sort_values(by='CV Region Score', ascending=False)

In [39]:
#Intersect with XGBoost regions
xg_inter = intersect(xg_cv_regions, cancer_genes, cancer_genes_table)
display(xg_inter)

Unnamed: 0,Gene Symbol,CV Region,CV Region Score,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,...,Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
2,HMGA2,12:64727853-66012212,1.2138625383377075,high mobility group AT-hook 2 (HMGIC),8091.0,12:65824460-65915447,1,,14.3,yes,...,,,M,Dom,"oncogene, fusion",T,"LHFP, RAD51B, LPP, COX6C, ACKR3, NFIB, ALDH2, ...",,,"8091,BABL,ENSG00000149948.13,HMGA2,HMGIC,LIPO"
7,WIF1,12:64727853-66012212,1.2138625383377075,WNT inhibitory factor 1,11197.0,12:65050626-65121566,1,Yes,14.3,yes,...,,,E,Dom,"TSG, fusion",T,HMGA2,,,"11197,ENSG00000156076.9,Q9Y5W5,WIF1"
5,PDGFRB,5:150148732-150201145,1.1297380924224854,"platelet-derived growth factor receptor, beta ...",5159.0,5:150113837-150155860,1,,32.0,yes,...,,,L,Dom,"oncogene, fusion",T,"ETV6, TRIP11, HIP1, RABEP1, H4, NIN, SPECC1, P...",,,"5159,CD140b,ENSG00000113721.13,JTK12,P09619,PD..."
0,ARHGAP26,5:140433878-142819410,0.8109956979751587,Rho GTPase activating protein 26,23092.0,5:142770377-143229011,1,Yes,31.3,yes,...,,,L,Dom,"TSG, fusion","T, F, S",KMT2A,,,"23092,ARHGAP26,ENSG00000145819.16,GRAF,KIAA062..."
4,MYO5A,15:51919876-55250930,0.3569802939891815,"myosin VA (heavy chain 12, myoxin)",4644.0,15:52307283-52529050,1,,21.2,yes,...,,,E,Dom,fusion,T,ROS1,yes,Griscelli syndrome,"4644,ENSG00000197535.14,GS1,MYH12,MYO5,MYO5A,M..."
1,ELK4,1:203539204-205954108,0.2540072798728943,"ELK4, ETS-domain protein (SRF accessory protei...",2005.0,1:205607943-205631962,1,Yes,32.1,yes,...,,,E,Dom,"oncogene, fusion",T,SLC45A3,,,"2005,ELK4,ENSG00000158711.13,P28324,SAP1"
3,MDM4,1:203539204-205954108,0.2540072798728943,Mdm4 p53 binding protein homolog,4194.0,1:204516383-204558120,1,,32.1,yes,...,,,M,Dom,oncogene,A,,,,"4194,ENSG00000198625.12,HDMX,MDM4,MDMX,O15151"
6,SLC45A3,1:203539204-205954108,0.2540072798728943,"solute carrier family 45, member 3",85414.0,1:205657851-205680459,1,Yes,32.1,yes,...,,,E,Dom,fusion,T,"ETV1, ETV5, ELK4, ERG, BRAF",,,"85414,ENSG00000158715.5,IPCA-2,IPCA-6,IPCA-8,P..."


In [40]:
#Intersect with MI regions
mi_inter = intersect(mi_cv_regions, cancer_genes, cancer_genes_table)
display(mi_inter)

Unnamed: 0,Gene Symbol,CV Region,CV Region Score,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,...,Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
2,HMGA2,12:64727853-66012212,0.2591695967246044,high mobility group AT-hook 2 (HMGIC),8091.0,12:65824460-65915447,1,,14.3,yes,...,,,M,Dom,"oncogene, fusion",T,"LHFP, RAD51B, LPP, COX6C, ACKR3, NFIB, ALDH2, ...",,,"8091,BABL,ENSG00000149948.13,HMGA2,HMGIC,LIPO"
7,WIF1,12:64727853-66012212,0.2591695967246044,WNT inhibitory factor 1,11197.0,12:65050626-65121566,1,Yes,14.3,yes,...,,,E,Dom,"TSG, fusion",T,HMGA2,,,"11197,ENSG00000156076.9,Q9Y5W5,WIF1"
5,PDGFRB,5:150148732-150201145,0.238038177117174,"platelet-derived growth factor receptor, beta ...",5159.0,5:150113837-150155860,1,,32.0,yes,...,,,L,Dom,"oncogene, fusion",T,"ETV6, TRIP11, HIP1, RABEP1, H4, NIN, SPECC1, P...",,,"5159,CD140b,ENSG00000113721.13,JTK12,P09619,PD..."
4,MYO5A,15:51919876-55250930,0.208057000385615,"myosin VA (heavy chain 12, myoxin)",4644.0,15:52307283-52529050,1,,21.2,yes,...,,,E,Dom,fusion,T,ROS1,yes,Griscelli syndrome,"4644,ENSG00000197535.14,GS1,MYH12,MYO5,MYO5A,M..."
1,ELK4,1:203539204-205954108,0.1853945301831827,"ELK4, ETS-domain protein (SRF accessory protei...",2005.0,1:205607943-205631962,1,Yes,32.1,yes,...,,,E,Dom,"oncogene, fusion",T,SLC45A3,,,"2005,ELK4,ENSG00000158711.13,P28324,SAP1"
3,MDM4,1:203539204-205954108,0.1853945301831827,Mdm4 p53 binding protein homolog,4194.0,1:204516383-204558120,1,,32.1,yes,...,,,M,Dom,oncogene,A,,,,"4194,ENSG00000198625.12,HDMX,MDM4,MDMX,O15151"
6,SLC45A3,1:203539204-205954108,0.1853945301831827,"solute carrier family 45, member 3",85414.0,1:205657851-205680459,1,Yes,32.1,yes,...,,,E,Dom,fusion,T,"ETV1, ETV5, ELK4, ERG, BRAF",,,"85414,ENSG00000158715.5,IPCA-2,IPCA-6,IPCA-8,P..."
0,ARHGAP26,5:140433878-142819410,0.1849441706091821,Rho GTPase activating protein 26,23092.0,5:142770377-143229011,1,Yes,31.3,yes,...,,,L,Dom,"TSG, fusion","T, F, S",KMT2A,,,"23092,ARHGAP26,ENSG00000145819.16,GRAF,KIAA062..."


In [41]:
#Intersect with LR regions
lr_inter = intersect(lr_cv_regions, cancer_genes, cancer_genes_table)
display(lr_inter)

Unnamed: 0,Gene Symbol,CV Region,CV Region Score,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,...,Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
5,PDGFRB,5:150148732-150201145,0.0236900970567131,"platelet-derived growth factor receptor, beta ...",5159.0,5:150113837-150155860,1,,32.0,yes,...,,,L,Dom,"oncogene, fusion",T,"ETV6, TRIP11, HIP1, RABEP1, H4, NIN, SPECC1, P...",,,"5159,CD140b,ENSG00000113721.13,JTK12,P09619,PD..."
4,MYO5A,15:51919876-55250930,0.0161501686772575,"myosin VA (heavy chain 12, myoxin)",4644.0,15:52307283-52529050,1,,21.2,yes,...,,,E,Dom,fusion,T,ROS1,yes,Griscelli syndrome,"4644,ENSG00000197535.14,GS1,MYH12,MYO5,MYO5A,M..."
1,ELK4,1:203539204-205954108,0.0065370807502261,"ELK4, ETS-domain protein (SRF accessory protei...",2005.0,1:205607943-205631962,1,Yes,32.1,yes,...,,,E,Dom,"oncogene, fusion",T,SLC45A3,,,"2005,ELK4,ENSG00000158711.13,P28324,SAP1"
3,MDM4,1:203539204-205954108,0.0065370807502261,Mdm4 p53 binding protein homolog,4194.0,1:204516383-204558120,1,,32.1,yes,...,,,M,Dom,oncogene,A,,,,"4194,ENSG00000198625.12,HDMX,MDM4,MDMX,O15151"
6,SLC45A3,1:203539204-205954108,0.0065370807502261,"solute carrier family 45, member 3",85414.0,1:205657851-205680459,1,Yes,32.1,yes,...,,,E,Dom,fusion,T,"ETV1, ETV5, ELK4, ERG, BRAF",,,"85414,ENSG00000158715.5,IPCA-2,IPCA-6,IPCA-8,P..."
0,ARHGAP26,5:140433878-142819410,-0.0351102825062545,Rho GTPase activating protein 26,23092.0,5:142770377-143229011,1,Yes,31.3,yes,...,,,L,Dom,"TSG, fusion","T, F, S",KMT2A,,,"23092,ARHGAP26,ENSG00000145819.16,GRAF,KIAA062..."
2,HMGA2,12:64727853-66012212,-0.0105106986619249,high mobility group AT-hook 2 (HMGIC),8091.0,12:65824460-65915447,1,,14.3,yes,...,,,M,Dom,"oncogene, fusion",T,"LHFP, RAD51B, LPP, COX6C, ACKR3, NFIB, ALDH2, ...",,,"8091,BABL,ENSG00000149948.13,HMGA2,HMGIC,LIPO"
7,WIF1,12:64727853-66012212,-0.0105106986619249,WNT inhibitory factor 1,11197.0,12:65050626-65121566,1,Yes,14.3,yes,...,,,E,Dom,"TSG, fusion",T,HMGA2,,,"11197,ENSG00000156076.9,Q9Y5W5,WIF1"


In [42]:
#Intersect with RF regions
rf_inter = intersect(rf_cv_regions, cancer_genes, cancer_genes_table)
display(rf_inter)

Unnamed: 0,Gene Symbol,CV Region,CV Region Score,Name,Entrez GeneId,Genome Location,Tier,Hallmark,Chr Band,Somatic,...,Tumour Types(Germline),Cancer Syndrome,Tissue Type,Molecular Genetics,Role in Cancer,Mutation Types,Translocation Partner,Other Germline Mut,Other Syndrome,Synonyms
0,ARHGAP26,5:140433878-142819410,0.0664386983457401,Rho GTPase activating protein 26,23092.0,5:142770377-143229011,1,Yes,31.3,yes,...,,,L,Dom,"TSG, fusion","T, F, S",KMT2A,,,"23092,ARHGAP26,ENSG00000145819.16,GRAF,KIAA062..."
2,HMGA2,12:64727853-66012212,0.0541739158999621,high mobility group AT-hook 2 (HMGIC),8091.0,12:65824460-65915447,1,,14.3,yes,...,,,M,Dom,"oncogene, fusion",T,"LHFP, RAD51B, LPP, COX6C, ACKR3, NFIB, ALDH2, ...",,,"8091,BABL,ENSG00000149948.13,HMGA2,HMGIC,LIPO"
7,WIF1,12:64727853-66012212,0.0541739158999621,WNT inhibitory factor 1,11197.0,12:65050626-65121566,1,Yes,14.3,yes,...,,,E,Dom,"TSG, fusion",T,HMGA2,,,"11197,ENSG00000156076.9,Q9Y5W5,WIF1"
5,PDGFRB,5:150148732-150201145,0.0382735842328934,"platelet-derived growth factor receptor, beta ...",5159.0,5:150113837-150155860,1,,32.0,yes,...,,,L,Dom,"oncogene, fusion",T,"ETV6, TRIP11, HIP1, RABEP1, H4, NIN, SPECC1, P...",,,"5159,CD140b,ENSG00000113721.13,JTK12,P09619,PD..."
4,MYO5A,15:51919876-55250930,0.0145620925463143,"myosin VA (heavy chain 12, myoxin)",4644.0,15:52307283-52529050,1,,21.2,yes,...,,,E,Dom,fusion,T,ROS1,yes,Griscelli syndrome,"4644,ENSG00000197535.14,GS1,MYH12,MYO5,MYO5A,M..."
1,ELK4,1:203539204-205954108,0.0111803883282212,"ELK4, ETS-domain protein (SRF accessory protei...",2005.0,1:205607943-205631962,1,Yes,32.1,yes,...,,,E,Dom,"oncogene, fusion",T,SLC45A3,,,"2005,ELK4,ENSG00000158711.13,P28324,SAP1"
3,MDM4,1:203539204-205954108,0.0111803883282212,Mdm4 p53 binding protein homolog,4194.0,1:204516383-204558120,1,,32.1,yes,...,,,M,Dom,oncogene,A,,,,"4194,ENSG00000198625.12,HDMX,MDM4,MDMX,O15151"
6,SLC45A3,1:203539204-205954108,0.0111803883282212,"solute carrier family 45, member 3",85414.0,1:205657851-205680459,1,Yes,32.1,yes,...,,,E,Dom,fusion,T,"ETV1, ETV5, ELK4, ERG, BRAF",,,"85414,ENSG00000158715.5,IPCA-2,IPCA-6,IPCA-8,P..."


In [44]:
#Overlap between all methods:
set(xg_inter['Gene Symbol']).intersection(mi_inter['Gene Symbol'], lr_inter['Gene Symbol'], rf_inter['Gene Symbol'])

{'ARHGAP26', 'ELK4', 'HMGA2', 'MDM4', 'MYO5A', 'PDGFRB', 'SLC45A3', 'WIF1'}

HMGA2/WIF1: Translocation: https://www.nature.com/articles/s41523-018-0101-7 <br>
PDGFRB: Overexpression: https://link.springer.com/article/10.1007/s10549-021-06136-4 https://aacrjournals.org/cancerres/article/77/13_Supplement/2966/618518/Abstract-2966-Stromal-platelet-derived-growth <br>
ARHGAP26: Downregulation?: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5569970/ <br>
MYO5A: ?: https://www.sciencedirect.com/topics/medicine-and-dentistry/myo5a <br>
ELK4: No influence?: https://pubmed.ncbi.nlm.nih.gov/23329352/ <br>
MDM4: Overexpression: https://breast-cancer-research.biomedcentral.com/articles/10.1186/s13058-018-1094-8 <br>

The region 17:35076296-35282086 seems to be the most important feature, 
but has no overlap with the most common cancer genes, although numerous other important oncogenes (HER2, BRCA) are located on there and polysomy of chromosome 17 is often seen:
https://pubmed.ncbi.nlm.nih.gov/22016618/
Some other genes in this region with possible significance are:
SLFN5: https://www.nature.com/articles/s41416-020-0873-z