# Notebook 8: Comparison with known gene signatures
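Goal: Compare the genes that the SHAP analysis associates with tumor or normal cells (positive or negative mean SHAP value, respectively) with existing gene signatures: ikarus (pre-trained model signature) and Parker 2009 (breast cancer microarrays).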

In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Set up working directory
# Every relative path used in the cells below (signature files, shap_vals/) is resolved against this folder.
# NOTE(review): input() needs an interactive kernel; a headless "Run All" will stop here.
folder_path = os.path.expanduser(
    input('Please enter the path of the folder containing datasets: ').strip()
)
# Fail early with a readable message instead of letting os.chdir choke on '' or a mistyped path
if not os.path.isdir(folder_path):
    raise FileNotFoundError(f'Dataset folder not found: {folder_path!r}')
os.chdir(folder_path)

Please enter the path of the folder containing datasets:  ../data/


In [3]:
# Read the ikarus gene signature (exported from the pre-trained model)
# Source file: https://github.com/BIMSBbioinfo/ikarus/blob/master/tutorials/out/signatures.gmt
signatures_path = Path('signatures_ikarus.gmt')
# Tab-delimited, no header row; the first field of each line becomes the row label
gene_sig = pd.read_table(signatures_path, header=None, index_col=0)
display(gene_sig.head())

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,1305,1306,1307,1308,1309,1310,1311,1312,1313,1314
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Normal,ikarus,RP11-128M1.1,TRAV8-2,PTPRD-AS1,MEOX2,CXCL12,KLRC4-KLRK1,BCAS1,SCNN1A,HCST,...,C22ORF15,CYP4F11,AK8,LRRC18,LMO2,COL12A1,ITGA11,EGFL6,RGS11,PCDHB15
Tumor,ikarus,RP11-277P12.10,RP13-895J2.6,BNC1,MAGEA6,ISX,MAGEA3,RP13-614K11.2,CDH7,CALML3,...,,,,,,,,,,


In [4]:
# Reshape the signature so that the tumor and normal gene lists become columns
gene_sig_transpose = (
    gene_sig.T
    # Row 1 holds the second field of each signature line ('ikarus'), not a gene
    .drop(index=1)
    .reset_index()
    .rename(columns={'index': 'gene_order'})
    # Shift the position counter so that the first gene gets order 1
    .assign(gene_order=lambda sig: sig['gene_order'] - 1)
)
display(gene_sig_transpose.head())

Unnamed: 0,gene_order,Normal,Tumor
0,1,RP11-128M1.1,RP11-277P12.10
1,2,TRAV8-2,RP13-895J2.6
2,3,PTPRD-AS1,BNC1
3,4,MEOX2,MAGEA6
4,5,CXCL12,ISX


In [5]:
# Extract the ikarus tumor and normal genes as plain lists, skipping the missing
# entries that pad the shorter of the two signature columns
tumor_column = gene_sig_transpose['Tumor']
normal_column = gene_sig_transpose['Normal']
ikarus_tumor_genes = tumor_column[tumor_column.notna()].tolist()
ikarus_norm_genes = normal_column[normal_column.notna()].tolist()
print(f'Number of tumor genes: {len(ikarus_tumor_genes)}, number of normal genes: {len(ikarus_norm_genes)}')

Number of tumor genes: 162, number of normal genes: 1313


In [6]:
# Read the Parker 2009 gene list (Supplemental Table 5)
# https://ascopubs.org/doi/10.1200/JCO.2008.18.1370
parker_genes = (
    pd.read_csv('parker2009_genes.csv')
    # Use the same column name ('gene') as the SHAP tables it is compared against
    .rename(columns={'Gene Symbol': 'gene'})
)
display(parker_genes.head())

Unnamed: 0,gene,Name,num_papers
0,AADACL1,Arylacetamide deacetylase-like 1,1
1,ABAT,4-aminobutyrate aminotransferase,1
2,ABCC11,"ATP-binding cassette, sub-family C (CFTR/MRP),...",1
3,ABCC3,"ATP-binding cassette, sub-family C (CFTR/MRP),...",2
4,ABCD3,"ATP-binding cassette, sub-family D (ALD), memb...",2


In [9]:
# Folder of SHAP value dataframes (generated from notebook 4) - 200 genes, Pearson, DGE, random
SHAP_DIR = Path('shap_vals')
# Column order of the summary table built at the end of this cell
OVERLAP_COLUMNS = ['file_name', 'feature_type', 'num_feat_genesig',
                   'num_feat_featsel', 'perc_genesig', 'overlap_genes']


def _overlap_record(file_name, feature_type, overlap_genes, selected_genes):
    """Return one summary row for a (SHAP file, reference signature) comparison.

    Parameters
    ----------
    file_name : str
        Name of the SHAP file the selected genes came from.
    feature_type : str
        Label of the comparison (e.g. 'ikarus_tumor').
    overlap_genes : array-like
        Selected genes that are also present in the reference signature.
    selected_genes : list
        All genes of the matching direction (tumor or normal) in the SHAP file.

    Returns
    -------
    dict
        One value per entry of ``OVERLAP_COLUMNS``.
    """
    n_overlap = len(overlap_genes)
    n_selected = len(selected_genes)
    return {
        'file_name': file_name,
        'feature_type': feature_type,
        'num_feat_genesig': n_overlap,
        'num_feat_featsel': n_selected,
        # Fraction (0-1) of the selected genes found in the signature;
        # NaN instead of a ZeroDivisionError when no gene was selected
        'perc_genesig': n_overlap / n_selected if n_selected else np.nan,
        'overlap_genes': overlap_genes,
    }


def _ikarus_hits(ikarus_signature, column, selected_genes):
    """Return the ``gene_order`` and ``column`` entries of the ikarus table found in ``selected_genes``."""
    return ikarus_signature.loc[ikarus_signature[column].isin(selected_genes), ['gene_order', column]]


def compare_with_signatures(file_name, shap_values, ikarus_signature, parker_table):
    """Compare the genes of one SHAP table with the ikarus and Parker 2009 signatures.

    A gene is labelled 'tumor' when its mean SHAP value (over rows) is > 0 and
    'normal' when it is < 0; genes whose mean is 0 or NaN are left out.
    The overlap counts are printed as a side effect.

    Parameters
    ----------
    file_name : str
        Name stored in the ``file_name`` field of the returned records.
    shap_values : pandas.DataFrame
        SHAP values with one column per gene.
    ikarus_signature : pandas.DataFrame
        Table with ``gene_order``, ``Tumor`` and ``Normal`` columns.
    parker_table : pandas.DataFrame
        Parker 2009 gene table with a ``gene`` column.

    Returns
    -------
    records : list of dict
        Three summary rows, in order: 'ikarus_tumor', 'ikarus_norm', 'parker_tumor'.
    annotated : pandas.DataFrame
        One row per retained gene (``gene``, ``mean_shap``, ``feature_type``),
        left-joined with the ikarus gene orders and the Parker columns.
    """
    # Calculate mean feature importance values per gene (column), largest first
    mean_shap = (
        shap_values.mean(axis=0)
        .to_frame('mean_shap')
        # An all-NaN column has no sign; without this it would fall through to 'normal'
        .dropna()
        .sort_values(by='mean_shap', ascending=False)
    )

    # Drop rows with mean = 0
    mean_shap = mean_shap[mean_shap['mean_shap'] != 0]

    # Feature means > 0 are tumor genes, < 0 are normal genes
    # (.assign returns a new frame, so the filtered slice above is never written to)
    mean_shap = mean_shap.assign(
        feature_type=np.where(mean_shap['mean_shap'] > 0, 'tumor', 'normal')
    )
    tumor_genes = mean_shap[mean_shap['feature_type'] == 'tumor'].index.to_list()
    normal_genes = mean_shap[mean_shap['feature_type'] == 'normal'].index.to_list()

    # Number of features overlapping with ikarus tumor signature
    ikarus_tumor_overlap = _ikarus_hits(ikarus_signature, 'Tumor', tumor_genes)
    print(f'Number of features in ikarus tumor: {len(ikarus_tumor_overlap)} out of {len(tumor_genes)} features in tumor_genes')

    # Number of features overlapping with ikarus normal signature
    ikarus_norm_overlap = _ikarus_hits(ikarus_signature, 'Normal', normal_genes)
    print(f'Number of features in ikarus normal: {len(ikarus_norm_overlap)} out of {len(normal_genes)} features in normal_genes')

    # Number of features overlapping with Parker 2009
    parker_tumor_overlap = parker_table.loc[parker_table['gene'].isin(tumor_genes)]
    print(f'Number of features in Parker tumor: {len(parker_tumor_overlap)} out of {len(tumor_genes)} features in tumor_genes')

    records = [
        _overlap_record(file_name, 'ikarus_tumor', ikarus_tumor_overlap['Tumor'].values, tumor_genes),
        _overlap_record(file_name, 'ikarus_norm', ikarus_norm_overlap['Normal'].values, normal_genes),
        _overlap_record(file_name, 'parker_tumor', parker_tumor_overlap['gene'].values, tumor_genes),
    ]

    # Attach the signature information to every retained gene (NaN where a gene is not in a signature)
    annotated = (
        mean_shap.rename_axis('gene').reset_index()
        .merge(ikarus_tumor_overlap, left_on='gene', right_on='Tumor', how='left')
        .rename(columns={'gene_order': 'gene_order_ikarus_tumor'})
        .drop(columns='Tumor')
        .merge(ikarus_norm_overlap, left_on='gene', right_on='Normal', how='left')
        .rename(columns={'gene_order': 'gene_order_ikarus_norm'})
        .drop(columns='Normal')
        .merge(parker_tumor_overlap, on='gene', how='left')
    )
    return records, annotated


# Sorted for a reproducible processing order; hidden files and sub-folders
# (e.g. .ipynb_checkpoints) are skipped because they are not SHAP tables
shap_files = sorted(
    path for path in SHAP_DIR.iterdir()
    if path.is_file() and not path.name.startswith('.')
)

overlap_records = []
# Per-file gene tables annotated with signature membership, keyed by file name
annotated_shap_by_file = {}

for shap_path in shap_files:
    curr_file = shap_path.name

    # Load in file
    curr_shap = pd.read_csv(shap_path, index_col = 0)

    curr_records, curr_shap_mean = compare_with_signatures(curr_file, curr_shap,
                                                           gene_sig_transpose, parker_genes)
    overlap_records.extend(curr_records)
    annotated_shap_by_file[curr_file] = curr_shap_mean
    print()

# Build the summary once from all records (one row per file and signature, clean 0..n-1 index)
gene_overlap_stats = pd.DataFrame(overlap_records, columns = OVERLAP_COLUMNS)

#gene_overlap_stats.to_csv('gene_overlap_stats.csv', index = False)

Number of features in ikarus tumor: 3 out of 84 features in tumor_genes
Number of features in ikarus normal: 29 out of 155 features in normal_genes
Number of features in Parker tumor: 31 out of 84 features in tumor_genes

Number of features in ikarus tumor: 0 out of 28 features in tumor_genes
Number of features in ikarus normal: 4 out of 79 features in normal_genes
Number of features in Parker tumor: 5 out of 28 features in tumor_genes

Number of features in ikarus tumor: 1 out of 49 features in tumor_genes
Number of features in ikarus normal: 25 out of 112 features in normal_genes
Number of features in Parker tumor: 12 out of 49 features in tumor_genes

