# Notebook 8: Test ikarus and XGBoost models with swapped features
Question: Does swapping feature sets affect performance for ikarus and XGBoost models?

Tested the following:
* ikarus model using top 200 DGEs for our model
* XGBoost model using ikarus gene signatures (identified from training set)

Code uses 6_ikarus.ipynb and [ikarus tutorial](https://github.com/BIMSBbioinfo/ikarus/blob/master/tutorials/tutorial.ipynb) as references

In [1]:
import main_functions as mf
from pathlib import Path
import os
import pandas as pd
import numpy as np
import anndata
import scanpy as sc
from ikarus import classifier, gene_list, utils, data
import upsetplot
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedGroupKFold
from xgboost import XGBClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set up working directory
folder_path = input('Please enter the path of the folder containing datasets: ')
os.chdir(folder_path)

Please enter the path of the folder containing datasets:  ../data/


In [3]:
# Constants and global variables
RANDOM_STATE = 42
metrics_df_models = pd.DataFrame()

In [4]:
# Function to calculate and output evaluation metrics
def calc_eval_metrics(test_labels, y_pred):
    recall = recall_score(test_labels, y_pred)
    precision = precision_score(test_labels, y_pred)
    accuracy = accuracy_score(test_labels, y_pred)
    f1 = f1_score(test_labels, y_pred)

    metrics_df = pd.DataFrame({'recall': [recall], 'precision': [precision],
                               'f1': [f1], 'accuracy': [accuracy]})
    metrics_df = metrics_df.round(3)
    conf_matrix = confusion_matrix(test_labels, y_pred)
        
    print(conf_matrix)
    display(metrics_df)
    return metrics_df, conf_matrix

In [5]:
# Load training and two test sets
adatas = {}

train_adata = sc.read_h5ad('train.h5ad')
train_adata.obs['CellType'] = np.where(train_adata.obs.orig_cancer_label == 1, 'Cancer', 'Other')
train_adata.var['gene_symbol'] = train_adata.var.gene_ids
adatas['train'] = train_adata

# Load in test sets 1 and 2
test1_adata = sc.read_h5ad('test1.h5ad')
test1_adata.obs['CellType'] = np.where(test1_adata.obs.orig_cancer_label == 1, 'Cancer', 'Other')
test1_adata.var['gene_symbol'] = test1_adata.var.gene_ids

test2_adata = sc.read_h5ad('test2.h5ad')
test2_adata.obs['CellType'] = np.where(test2_adata.obs.orig_cancer_label == 1, 'Cancer', 'Other')
test2_adata.var['gene_symbol'] = test2_adata.var.gene_ids

adatas['test1'] = test1_adata
adatas['test2'] = test2_adata

In [6]:
# Get top 200 DGEs from training data - 2 lists

cancer_dge, norm_dge = mf.get_diff_exp_genes(train_adata)
cancer_dge_top100 = cancer_dge[:100]
norm_dge_top100 = norm_dge[:100]
model_dge_top200  = list(set(cancer_dge_top100).union(norm_dge_top100))

In [7]:
# Save DGE signature as .gmt file.
gene_list.save_gmt([norm_dge_top100, cancer_dge_top100],
                   ['Normal', 'Tumor'],
                   out_dir='train_model_dge/')

In [8]:
# Create gene signatures using ikarus method

dfs = [train_adata]
names = ['train']
obs_names = ['CellType']
label_upregs = ['Cancer']
label_downregs = ['Other']

signatures_cancer = gene_list.create_all(
    label_upregs_list=label_upregs,
    label_downregs_list=label_downregs,
    adatas_dict=adatas,
    names_list=names,
    obs_names_list=obs_names,
    integration_fun=utils.intersection_fun,
    top_x=300
)

signatures_norm = gene_list.create_all(
    label_upregs_list=label_downregs,
    label_downregs_list=label_upregs,
    adatas_dict=adatas,
    names_list=names,
    obs_names_list=obs_names,
    integration_fun=utils.intersection_fun,
    top_x=300
)

In [9]:
# Get lists of tumor and normal genes from ikarus signature (generated with Qian dataset)

contents_cancer = upsetplot.from_contents(signatures_cancer)
contents_norm = upsetplot.from_contents(signatures_norm)

tumor_genes = contents_cancer['id'].values.ravel().tolist()
normal_genes = contents_norm['id'].values.ravel().tolist()
print(f'num tumor_genes: {len(tumor_genes)}, num normal_genes: {len(normal_genes)}')
overlap = list(set(tumor_genes).union(normal_genes))
print(len(overlap)) # Equals sum of tumor_genes and normal_genes, so no overlap

# Save signatures as .gmt file
gene_list.save_gmt([normal_genes, tumor_genes], ['Normal', 'Tumor'], out_dir='../out/train_ikarus_dge/')

num tumor_genes: 300, num normal_genes: 300
600


In [10]:
# Run baseline ikarus - confirms similar results as 6_ikarus.ipynb
# Train with training set, using features from ikarus approach

signatures_path_qian_ikarus = Path('../out/train_ikarus_baseline/signatures.gmt')

model_qian_ikarus = classifier.Ikarus(signatures_gmt=signatures_path_qian_ikarus,
                                      out_dir='../out/train_ikarus_baseline', adapt_signatures = True)

train_adata_list = [adatas['train']]
train_names_list = ['train']
obs_columns_list = ['orig_cancer_label']

model_qian_ikarus.fit(train_adata_list, train_names_list, obs_columns_list, save=True)

# Run on test sets 1 and 2

ikarus_baseline_test1 = model_qian_ikarus.predict(test1_adata, 'test1', save=True)
y_test1 = test1_adata.obs.orig_cancer_label
y_pred1 = np.where(ikarus_baseline_test1 == 'Tumor', 1, 0)
ikarus_baseline_metrics1, ikarus_baseline_cm1 = calc_eval_metrics(y_test1, y_pred1)
ikarus_baseline_metrics1['model'] = 'ikarus'
ikarus_baseline_metrics1['feat_set'] = 'ikarus'
ikarus_baseline_metrics1['dataset'] = 'test1'

ikarus_baseline_test2 = model_qian_ikarus.predict(test2_adata, 'test2', save=True)
y_test2 = test2_adata.obs.orig_cancer_label
y_pred2 = np.where(ikarus_baseline_test2 == 'Tumor', 1, 0)
ikarus_baseline_metrics2, ikarus_baseline_cm2 = calc_eval_metrics(y_test2, y_pred2)
ikarus_baseline_metrics2['model'] = 'ikarus'
ikarus_baseline_metrics2['feat_set'] = 'ikarus'
ikarus_baseline_metrics2['dataset'] = 'test2'

# Save metrics to metrics_df_models
metrics_df_models = pd.concat([metrics_df_models, ikarus_baseline_metrics1])
metrics_df_models = pd.concat([metrics_df_models, ikarus_baseline_metrics2])

Less than 80% of signature genes are available in data set. A temporary signature is stored where non-overlapping genes are removed. It is proceeded with the temporary signature.
Less than 80% of signature genes are available in data set. A temporary signature is stored where non-overlapping genes are removed. It is proceeded with the temporary signature.


  adata.uns["hvg"] = {"flavor": flavor}


converged at iteration step: 32 with 0.0008 < 0.001
[[1336 2634]
 [  26 6363]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.996,0.707,0.827,0.743


Less than 80% of signature genes are available in data set. A temporary signature is stored where non-overlapping genes are removed. It is proceeded with the temporary signature.
Less than 80% of signature genes are available in data set. A temporary signature is stored where non-overlapping genes are removed. It is proceeded with the temporary signature.


  adata.uns["hvg"] = {"flavor": flavor}


converged at iteration step: 44 with 0.0009 < 0.001
[[5542 2541]
 [   6 1767]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.997,0.41,0.581,0.742


In [11]:
# Run ikarus using our model's DGE gene signature
signatures_path_qian_model = Path('../out/train_model_dge/signatures.gmt')

model_qian_dge = classifier.Ikarus(signatures_gmt=signatures_path_qian_model,
                                      out_dir='../out/train_ikarus_dge/', adapt_signatures = True)

model_qian_dge.fit(train_adata_list, train_names_list, obs_columns_list, save=True)

# Run on test sets 1 and 2 - compare with Bryan's results

model_val_test1 = model_qian_dge.predict(test1_adata, 'test1', save=True)
y_test1 = test1_adata.obs.orig_cancer_label
y_pred1 = np.where(model_val_test1 == 'Tumor', 1, 0)
print(f'Metrics for ikarus model using custom DGE features, test set 1')
ikarus_dge_metrics1, ikarus_dge_cm1 = calc_eval_metrics(y_test1, y_pred1)
ikarus_dge_metrics1['model'] = 'ikarus'
ikarus_dge_metrics1['feat_set'] = 'dge'
ikarus_dge_metrics1['dataset'] = 'test1'

model_val_test2 = model_qian_dge.predict(test2_adata, 'test2', save=True)
y_test2 = test2_adata.obs.orig_cancer_label
y_pred2 = np.where(model_val_test2 == 'Tumor', 1, 0)
print(f'Metrics for ikarus model using custom DGE features, test set 2')
ikarus_dge_metrics2, ikarus_dge_cm2 = calc_eval_metrics(y_test2, y_pred2)
ikarus_dge_metrics2['model'] = 'ikarus'
ikarus_dge_metrics2['feat_set'] = 'dge'
ikarus_dge_metrics2['dataset'] = 'test2'

metrics_df_models = pd.concat([metrics_df_models, ikarus_dge_metrics1])
metrics_df_models = pd.concat([metrics_df_models, ikarus_dge_metrics2])

  adata.uns["hvg"] = {"flavor": flavor}


converged at iteration step: 7 with 0.0008 < 0.001
Metrics for ikarus model using custom DGE features, test set 1
[[3690  280]
 [  19 6370]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.997,0.958,0.977,0.971


  adata.uns["hvg"] = {"flavor": flavor}


converged at iteration step: 8 with 0.0004 < 0.001
Metrics for ikarus model using custom DGE features, test set 2
[[8083    0]
 [  29 1744]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.984,1.0,0.992,0.997


# Test our model using ikarus gene signature (using training set)

In [12]:
# Load gene signatures using training set (ikarus approach)
gene_sig = pd.read_csv(signatures_path_qian_ikarus, sep='\t', header=None, index_col = 0)
display(gene_sig.head())

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,292,293,294,295,296,297,298,299,300,301
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Normal,ikarus,TRBV21-1,KLRC4,IGKV1OR22-5,TRDV3,TRAV40,TRBV12-5,TRBV23-1,CTC-490G23.2,RP11-428G5.5,...,AC093326.3,RP11-397O8.7,TRERNA1,RP11-318A15.8,AC016994.2,MYF6,RP11-255M2.2,HCFC1-AS1,IGKV1OR2-1,OTOS
Tumor,ikarus,RP3-340B19.3,CTD-2589M5.4,AC012485.2,MCCD1,KRTAP5-3,RP11-107I14.2,RP11-429J17.5,AC097468.4,LINC01194,...,LRTM1,CHST9,ADGRD2,BARX1-AS1,EVPLL,ANXA8,MCIDAS,GRHL2,OPRK1,PRSS8


In [13]:
# Transpose, get tumor and normal gene lists as columns
gene_sig_transpose = gene_sig.T
gene_sig_transpose = gene_sig_transpose.drop(index = 1).reset_index().rename(columns = {'index': 'gene_order'})
gene_sig_transpose['gene_order'] = gene_sig_transpose['gene_order'] - 1
display(gene_sig_transpose.head())

Unnamed: 0,gene_order,Normal,Tumor
0,1,TRBV21-1,RP3-340B19.3
1,2,KLRC4,CTD-2589M5.4
2,3,IGKV1OR22-5,AC012485.2
3,4,TRDV3,MCCD1
4,5,TRAV40,KRTAP5-3


In [14]:
# Get lists of tumor and normal genes from ikarus signatures
ikarus_tumor_genes = gene_sig_transpose['Tumor'].dropna().to_list()
ikarus_norm_genes = gene_sig_transpose['Normal'].dropna().to_list()
len(ikarus_tumor_genes), len(ikarus_norm_genes)

(300, 300)

In [15]:
# Confirm tumor and normal genes do not contain overlapping genes (total genes = 600)
ikarus_genes = list(set(ikarus_tumor_genes).union(ikarus_norm_genes))
len(ikarus_genes)

600

In [16]:
# Concatenate all datasets to create a dataframe with all the same features (for training and testing purposes)
X1, X2, X3 = adatas['train'].to_df(), adatas['test1'].to_df(), adatas['test2'].to_df()
concat = pd.concat([X1,X2,X3]).fillna(0)

# Extracting relevant columns from concatenated dataset
X1 = concat.iloc[0:len(X1)]
X2 = concat.iloc[len(X1):len(X1)+len(X2)]
X3 = concat.iloc[len(X1)+len(X2):len(X1)+len(X2)+len(X3)]

y1 = adatas['train'].obs.orig_cancer_label
y2 = adatas['test1'].obs.orig_cancer_label
y3 = adatas['test2'].obs.orig_cancer_label

groups = adatas['train'].obs.PatientNumber
group_kfold = StratifiedGroupKFold(n_splits=5) #Using stratified K-fold to make sure labels are stratified
cv = group_kfold.get_n_splits(X1,y1, groups)

# Creating predictions dataframes to export later
test1_pred = pd.DataFrame()
test2_pred = pd.DataFrame()

In [17]:
# Run XGBoost model using DGE hyperparameters on the two test sets
clf_dge = XGBClassifier(eta = 0.3, max_depth = 3, n_estimators = 200, random_state = RANDOM_STATE) 

print(f'XGBoost model, ikarus features, test set 1')
xgboost_ikarus_metrics1, xgboost_ikarus_cm1 = mf.train_test_model(clf_dge, X1, y1, X2, y2, ikarus_genes)
xgboost_ikarus_metrics1['model'] = 'xgboost'
xgboost_ikarus_metrics1['feat_set'] = 'ikarus'
xgboost_ikarus_metrics1['dataset'] = 'test1'
print(f'XGBoost model, ikarus features, test set 2')
xgboost_ikarus_metrics2, xgboost_ikarus_cm2 = mf.train_test_model(clf_dge, X1, y1, X3, y3, ikarus_genes)
xgboost_ikarus_metrics2['model'] = 'xgboost'
xgboost_ikarus_metrics2['feat_set'] = 'ikarus'
xgboost_ikarus_metrics2['dataset'] = 'test1'

metrics_df_models = pd.concat([metrics_df_models, xgboost_ikarus_metrics1])
metrics_df_models = pd.concat([metrics_df_models, xgboost_ikarus_metrics2])
metrics_df_models.to_csv('SuppTableS4_ikarus_swapped_feat_metrics.csv')

XGBoost model, ikarus features, test set 1
# cells in training: 33043, # cells in test: 10359
[[3533  437]
 [ 102 6287]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.984,0.935,0.959,0.948


XGBoost model, ikarus features, test set 2
# cells in training: 33043, # cells in test: 9856
[[7766  317]
 [  20 1753]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.989,0.847,0.912,0.966


In [18]:
# Run confusion matrices with labels without network propagation for ikarus

baseline_ikarus_path = '../out/train_ikarus_dge/' # Change to 'train_ikarus_baseline'
dge_ikarus_path = '../out/train_model_dge/' # change to ../out/train_ikarus_dge/

# Baseline ikarus
y_test1 = test1_adata.obs.orig_cancer_label
y_pred1 = pd.read_csv(baseline_ikarus_path + 'test1/prediction.csv', index_col = 0)['core_pred']
ikarus_baseline_metrics1, ikarus_baseline_cm1 = calc_eval_metrics(y_test1, y_pred1)

y_test2 = test2_adata.obs.orig_cancer_label
y_pred2 = pd.read_csv(baseline_ikarus_path + 'test2/prediction.csv', index_col = 0)['core_pred']
ikarus_baseline_metrics2, ikarus_baseline_cm2 = calc_eval_metrics(y_test2, y_pred2)

# ikarus with DGE features
y_pred1 = pd.read_csv(dge_ikarus_path + 'test1/prediction.csv', index_col = 0)['core_pred']
ikarus_baseline_metrics1, ikarus_baseline_cm1 = calc_eval_metrics(y_test1, y_pred1)

y_pred2 = pd.read_csv(dge_ikarus_path + 'test2/prediction.csv', index_col = 0)['core_pred']
ikarus_baseline_metrics2, ikarus_baseline_cm2 = calc_eval_metrics(y_test2, y_pred2)

[[3485  485]
 [ 852 5537]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.867,0.919,0.892,0.871


[[7587  496]
 [ 100 1673]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.944,0.771,0.849,0.94


[[3657  313]
 [   8 6381]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.999,0.953,0.975,0.969


[[8079    4]
 [  10 1763]]


Unnamed: 0,recall,precision,f1,accuracy
0,0.994,0.998,0.996,0.999


In [None]:
# Also run logistic regression model with ikarus features - same metrics as without network propagation?