# Model Evaluation of NeoPrecis-Integrated

This notebook evaluates the integration of multi-dimensional metrics (abundance, presentation, and recognition features).

The data is presented in Table S3, and the results are shown in Figure 4.

In [None]:
from nci_utils import *

In [None]:
### path
# Input table, output directory and the figure-saving switch read by the cells below.

# Portable alternative: point nci_file at a local copy of Table S3.
#nci_file = 'TableS3.csv' # fill in the path of TableS3.csv
# NOTE(review): the two paths below are absolute and machine-specific; edit them before running elsewhere.
nci_file = '/cellar/users/kol004/neoantigen/manuscript/tables/nci.round.csv'
out_dir = '/cellar/users/kol004/neoantigen/manuscript/plots/np_integrated_eval'
savefig = True  # checked by the `if savefig:` guards before figures are written to out_dir

# columns
# Feature names grouped by category; the three lists are passed to BuildDataset below.
abundance_features = ['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT']
presentation_features = ['Robustness', 'PHBR']
recognition_features = ['Agretopicity', 'Foreignness', 'NP-Immuno']

# load
# Build the dataset object and keep an independent copy of its DataFrame in nci_df.
nci_data = BuildDataset(nci_file, abundance_features, presentation_features, recognition_features)
nci_df = nci_data.df.copy()

### Feature association

In [None]:
from matplotlib.colors import ListedColormap
from matplotlib.patches import Patch
def CreateSidebar(series, colors):
    """Encode a categorical series as integer codes for a colour sidebar.

    Parameters
    ----------
    series : pandas.Series
        Category label of each entry.
    colors : sequence
        Colours handed to ``ListedColormap``.

    Returns
    -------
    tuple
        ``(cats, sidebar, cmap)``: the distinct categories as returned by
        ``series.unique()``, a single-column DataFrame holding the position
        of each entry's category in ``cats``, and the ``ListedColormap``
        built from ``colors``.
    """
    cats = series.unique().tolist()
    # The position of a category in ``cats`` is its integer code.
    index_of = dict(zip(cats, range(len(cats))))
    codes = series.map(index_of)
    return cats, pd.DataFrame(codes), ListedColormap(colors)

In [None]:
# columns
# common_cols are used as-is; every entry of specific_cols appears twice,
# once with an '-I' suffix and once with an '-II' suffix.
common_cols = ['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT']
specific_cols = ['Robustness', 'PHBR', 'Agretopicity', 'Foreignness', 'NP-Immuno']
cols = common_cols + [f'{col}-I' for col in specific_cols] + [f'{col}-II' for col in specific_cols]
y_cols = ['CD4', 'CD8']

# stat test
# Every column is tested against both labels, so the NA-filled table is
# prepared once rather than once per label (fillna returns a new frame,
# leaving nci_df untouched).
tmp_df = nci_df.fillna(0) # fill NA with 0, especially for those with masking

# NonParamTest is called with alternative='greater' for each (label, column)
# pair; the result is a nested dict {label: {column: p-value}}.
stat_results = {
    y_col: {
        col: NonParamTest(tmp_df, col, y_col, alternative='greater') # PHBR has been transformed to 0-1
        for col in cols
    }
    for y_col in y_cols
}

# rows = feature columns, columns = labels
stat_results = pd.DataFrame(stat_results)
stat_results = -np.log10(stat_results) # -log10(P): larger values correspond to smaller p-values

In [None]:
# Layout: the left column stacks a thin annotation strip on top of the main
# heatmap; the right column stacks the legend panel on top of the colour bar.
fig = plt.figure(layout='tight', figsize=(6, 3), dpi=dpi)
gs = fig.add_gridspec(1, 2, width_ratios=(5, 1), wspace=0)
gs0 = gs[0].subgridspec(2, 1, height_ratios=(0.2, 2.8), hspace=0)
gs1 = gs[1].subgridspec(2, 1, height_ratios=(0.5, 0.5), hspace=0)

# axes
# axes[0], axes[1]: left column (top, bottom); axes[2], axes[3]: right column (top, bottom)
axes = [fig.add_subplot(grid[row]) for grid in (gs0, gs1) for row in range(2)]

# main heatmap (axes[1])
plot_df = stat_results.T
sns.heatmap(plot_df, cmap='Blues', annot=True, linewidths=0.3,
            annot_kws={'fontsize':9}, ax=axes[1],
            cbar_kws={'label': '-logP'}, cbar_ax=axes[3])
axes[3].set_aspect(0.5)

# annotation (axes[0])
palette = sns.color_palette('pastel')
colors = [palette[i] for i in range(3)] # three colors
# One letter per heatmap column: A for the abundance features, then the
# P/R block twice.
# NOTE(review): this must line up with the column order of stat_results — confirm if the feature lists change.
annots = pd.Series(['A']*len(abundance_features) + (['P']*len(presentation_features) + ['R']*len(recognition_features))*2)
cats, sidebar, cmap = CreateSidebar(annots, colors)
# `linewidths` is the documented seaborn parameter and matches the main heatmap call above.
sns.heatmap(sidebar.T, cmap=cmap, yticklabels=False, xticklabels=False, linewidths=0.3, ax=axes[0], cbar=False)
handles = [Patch(facecolor=colors[i], label=cat) for i, cat in enumerate(cats)] # legend
axes[2].legend(handles=handles)

# remove ticks
# axes[2] only hosts the legend, so its ticks and frame are hidden.
axes[2].set_xticks([])
axes[2].set_yticks([])
for spine in axes[2].spines.values():
    spine.set_visible(False)

if savefig:
    fig.savefig(f'{out_dir}/feature_association.png')

### Feature selection

In [None]:
# (MHC class suffix, label column) pairs to run feature selection for
tasks = [('I', 'CD8'), ('II', 'CD4')]
plot_df = list()
for mhc, y_col in tasks:
    # RFECV
    x_cols = abundance_features + [f'{s}-{mhc}' for s in presentation_features] + [f'{s}-{mhc}' for s in recognition_features]
    rfecv = FeatureSelection(nci_data.df, x_cols, y_col)
    print(f'MHC-{mhc}')
    print('All features:', x_cols)
    print('Feature rank:', rfecv.ranking_)
    print('#Selected features =', rfecv.n_features_)
    print('Selected features:', [col for col, keep in zip(x_cols, rfecv.support_) if keep])
    print('-----------------------')

    # for plot
    tmp_df = pd.DataFrame(rfecv.cv_results_)
    # Melt only the per-split score columns. Newer scikit-learn releases also
    # store an 'n_features' entry in cv_results_, which would otherwise be
    # melted into the 'score' column and distort the plot.
    split_cols = [col for col in tmp_df.columns if col.startswith('split')]
    tmp_df['MHC'] = mhc
    # assumes row r of cv_results_ corresponds to r+1 features — TODO confirm against the RFECV settings inside FeatureSelection
    tmp_df['#Features'] = range(1, tmp_df.shape[0]+1)
    tmp_df = tmp_df.melt(id_vars=['MHC', '#Features', 'mean_test_score', 'std_test_score'],
                         value_vars=split_cols, var_name='split', value_name='score')
    plot_df.append(tmp_df)

# one long table: a row per (MHC class, #features, CV split)
plot_df = pd.concat(plot_df, axis=0, ignore_index=True)

In [None]:
# plot
# Point plot of the per-split CV scores against the number of features,
# one series per MHC class.

fig, ax = plt.subplots(1, 1, figsize=(4, 3), dpi=dpi)
sns.pointplot(data=plot_df, x='#Features', y='score', hue='MHC', dodge=True, ax=ax, palette='pastel')
fig.tight_layout()
# Honour the notebook-wide savefig switch, as the other figure cells do.
if savefig:
    fig.savefig(f'{out_dir}/feature_selection.png')

### Data

In [None]:
### loading
# Rebuild the dataset with a reduced feature set (one presentation and one
# recognition feature). This rebinds the feature lists and nci_data that were
# defined earlier in the notebook.

# features
abundance_features = ['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT']
presentation_features = ['PHBR']
recognition_features = ['NP-Immuno']

# dataset obj
nci_data = BuildDataset(nci_file, abundance_features, presentation_features, recognition_features)

In [None]:
### tasks
# Each task names a label column and the feature groups used to predict it.
# Both tables enumerate the same seven combinations of abundance (A),
# presentation (P) and recognition (R) feature groups.

def _make_task_dict(label, mhc, digit):
    """Return the ordered task table for one label / MHC class.

    label: label column name ('CD8' or 'CD4').
    mhc:   class suffix used in the feature-group names ('I' or 'II').
    digit: class digit used in the task names (1 or 2).
    """
    # single-letter code -> (tag used in the task name, feature-group name)
    parts = {
        'A': ('A', 'abundance'),
        'P': (f'P{digit}', f'presentation-{mhc}'),
        'R': (f'R{digit}', f'recognition-{mhc}'),
    }
    task_dict = OrderedDict()
    for combo in ('A', 'P', 'R', 'PR', 'AP', 'AR', 'APR'):
        tags = [parts[code][0] for code in combo]
        groups = [parts[code][1] for code in combo]
        task_dict[f'{label}-' + '+'.join(tags)] = {'label': label, 'feature_group': groups}
    return task_dict

mhci_task_dict = _make_task_dict('CD8', 'I', 1)

mhcii_task_dict = _make_task_dict('CD4', 'II', 2)

### Cross validation

In [None]:
### setting
# Shared settings for the two cross-validation cells below.

n_fold = 4  # forwarded as n_fold= (presumably the number of CV folds — confirm in nci_utils)
n_exp = 100  # forwarded as n_exp= (presumably the number of repeated experiments — confirm in nci_utils)
model = LogisticRegression()  # the same estimator instance is handed to both CrossValidation objects

In [None]:
### MHC-I : CD8
# Run cross-validation over every task in mhci_task_dict and save the performance table.

CV = CrossValidation(nci_data, model, importance=False)
mhci_cv_perf_df, mhci_cv_imp_df = CV(mhci_task_dict, n_fold=n_fold, n_exp=n_exp, normalized=True)

# rename PPV@k
# k is the sum of the CD8 label column (the number of positives if the label is 0/1 — TODO confirm);
# the matching 'top{k}_precision' column is exposed under the generic name 'PPV@k'.
# NOTE(review): rename() silently ignores a missing key, so if k is formatted differently from the
# column name (e.g. as a float) nothing is renamed — confirm.
k = nci_data.df['CD8'].sum()
mhci_cv_perf_df = mhci_cv_perf_df.rename(columns={f'top{k}_precision': 'PPV@k'})

# save
# NOTE(review): written unconditionally; the savefig switch is not consulted here.
mhci_cv_perf_df.to_csv(f'{out_dir}/mhci_cv_perf.csv', index=False)

In [None]:
### MHC-II : CD4
# Run cross-validation over every task in mhcii_task_dict and save the performance table.

CV = CrossValidation(nci_data, model, importance=False)
mhcii_cv_perf_df, mhcii_cv_imp_df = CV(mhcii_task_dict, n_fold=n_fold, n_exp=n_exp, normalized=True)

# rename PPV@k
# k is the sum of the CD4 label column (the number of positives if the label is 0/1 — TODO confirm);
# the matching 'top{k}_precision' column is exposed under the generic name 'PPV@k'.
# NOTE(review): rename() silently ignores a missing key, so if k is formatted differently from the
# column name (e.g. as a float) nothing is renamed — confirm.
k = nci_data.df['CD4'].sum()
mhcii_cv_perf_df = mhcii_cv_perf_df.rename(columns={f'top{k}_precision': 'PPV@k'})

# save
# NOTE(review): written unconditionally; the savefig switch is not consulted here.
mhcii_cv_perf_df.to_csv(f'{out_dir}/mhcii_cv_perf.csv', index=False)

In [None]:
### comparison plot
# Tag each performance table with its MHC class, then stack them.
mhci_cv_perf_df['MHC'] = 'MHC-I'
mhcii_cv_perf_df['MHC'] = 'MHC-II'
cv_perf_df = pd.concat([mhci_cv_perf_df, mhcii_cv_perf_df], axis=0, ignore_index=True)
# e.g. 'CD8-A+P1+R1' -> 'A+P+R': keep the part after the label and drop the class digit
cv_perf_df['features'] = [
    task.split('-')[1].replace('1', '').replace('2', '')
    for task in cv_perf_df['task']
]

hue_order = ['P','R','A','P+R','A+P','A+R','A+P+R']

metric = 'AUROC'
fig, ax = plt.subplots(1, 1, figsize=(6, 3), dpi=dpi)
sns.barplot(data=cv_perf_df, x='MHC', y=metric, hue='features', hue_order=hue_order, palette='pastel', ax=ax)

# annotation
# Write each bar's height inside the bar, at 70% of its height.
for bar in (patch for container in ax.containers for patch in container):
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2
    ax.text(x_center, height*0.7, f'{height:.3f}', ha='center', va='bottom', fontsize=5)

ax.legend(loc='lower left', bbox_to_anchor=(1, 0))
ax.set_xlabel('')
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/cv_perf_{metric}.png')

In [None]:
### AUPRC and PPV@k in one figure
# Left panel: AUPRC; right panel: PPV@k. Both use the same grouping and palette.

fig, ax = plt.subplots(1, 2, figsize=(10, 4), dpi=dpi)
for tmp_ax, panel_metric in zip(ax, ['AUPRC', 'PPV@k']):
    sns.barplot(data=cv_perf_df, x='MHC', y=panel_metric, hue='features', hue_order=hue_order, palette='pastel', ax=tmp_ax)

    # annotation
    # Write each bar's height inside the bar, at 70% of its height.
    for bar in (patch for container in tmp_ax.containers for patch in container):
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2
        tmp_ax.text(x_center, height*0.7, f'{height:.3f}', ha='center', va='bottom', fontsize=5)
    tmp_ax.set_xlabel('')

# Keep a single legend, placed to the right of the second panel.
ax[0].legend_.remove()
sns.move_legend(ax[1], loc='center left', bbox_to_anchor=(1, 0.5), title="")

fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/cv_perf.png')

### Feature importance

In [None]:
# Fit a logistic regression on z-scored features for each MHC class and
# feature set, and collect the coefficients: one row per feature, one column
# per 'MHC-<class>_<feature set>' setting.
model = LogisticRegression()
imp_df = pd.DataFrame(index=['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT', 'PHBR', 'NP-Immuno'])
info_dict = {'I': 'CD8', 'II': 'CD4'}  # MHC class suffix -> label column

for mhc, label in info_dict.items():
    # 'features' are the row names the coefficients are stored under;
    # 'feature_groups' is what GetData is asked for.
    # NOTE(review): assumes GetData returns its columns in the same order as 'features' — confirm.
    feature_dict = {
        'A+P': {
            'features': ['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT', 'PHBR'],
            'feature_groups': ['abundance', f'presentation-{mhc}'],
        },
        'A+R': {
            'features': ['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT', 'NP-Immuno'],
            'feature_groups': ['abundance', f'recognition-{mhc}'],
        },
        'A+P+R': {
            'features': ['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT', 'PHBR', 'NP-Immuno'],
            'feature_groups': ['abundance', f'presentation-{mhc}', f'recognition-{mhc}'],
        }
    }

    for n, d in feature_dict.items():
        x, y = nci_data.GetData(label, feature_groups=d['feature_groups'])
        # z-score every feature column so the coefficients share a scale
        mean_arr = x.mean(axis=0)
        std_arr = x.std(axis=0)
        x_norm = (x - mean_arr) / std_arr
        model.fit(x_norm,y)
        # An explicit Series aligns on imp_df's index (features absent from
        # this setting become NaN) and raises if the number of names and
        # fitted coefficients differ, instead of relying on implicit
        # dict-to-column conversion.
        imp_df[f'MHC-{mhc}_{n}'] = pd.Series(model.coef_[0], index=d['features'])

imp_df.to_csv(f'{out_dir}/feature_coef.csv')

In [None]:
### plot df
# Long format: one row per (Metric, Setting) coefficient.

plot_df = (
    imp_df
    .reset_index(names=['Metric'])
    .melt(id_vars='Metric', var_name='Setting', value_name='Coefficient')
)
# Setting has the form '<MHC class>_<feature set>', e.g. 'MHC-I_A+P'
plot_df['MHC'] = [setting.split('_')[0] for setting in plot_df['Setting']]
plot_df['FeatureSet'] = [setting.split('_')[1] for setting in plot_df['Setting']]

In [None]:
### plot: single
# Coefficients of the A+P+R setting only, with one bar per MHC class.

is_full_set = plot_df['FeatureSet'] == 'A+P+R'
tmp_df = plot_df[is_full_set]

fig, ax = plt.subplots(figsize=(4, 3), dpi=dpi)
sns.barplot(data=tmp_df, x='Coefficient', y='Metric', hue='MHC', palette='pastel', ax=ax)
# label every bar with its coefficient value
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.3f', padding=0, fontsize=8)
ax.set_ylabel('')
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/coef.png')

In [None]:
### plot: comparison
# One panel per MHC class, comparing coefficients across feature sets.

fig, ax = plt.subplots(1, 2, figsize=(6, 3), dpi=dpi)
for panel, mhc_name in zip(ax, ['MHC-I', 'MHC-II']):
    panel_df = plot_df[plot_df['MHC']==mhc_name]
    sns.barplot(data=panel_df, x='Coefficient', y='Metric', hue='FeatureSet', ax=panel, palette='Set2')
    panel.set_title(mhc_name)
    panel.set_ylabel('')

# The right panel reuses the left panel's y labels and carries the only legend.
ax[1].tick_params(axis='y', labelleft=False)
ax[0].legend_.remove()
sns.move_legend(ax[1], loc='center left', bbox_to_anchor=(1, 0.5))

fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/coef_comp.png')