# Model Evaluation of NeoPrecis-Integrated

This notebook evaluates how integrating multi-dimensional metrics (abundance, presentation, and recognition) affects prediction of the CD8 and CD4 labels.

The data is presented in Table S3, and the results are shown in Figure 4.

In [None]:
from nci_utils import *

# figure resolution; passed as dpi= to the plt.subplots calls in the plotting cells
dpi = 600

### Data

In [None]:
### function

def _print_label_counts(df):
    """Print how many rows of ``df`` have CD8 == 1 and how many have CD4 == 1."""
    for label in ('CD8', 'CD4'):
        print(f'#{label}: {(df[label]==1).sum()}')


def BuildDataset(file,
                 abundance_features,
                 presentation_features,
                 recognition_features,
                 sample_col='Patient',
                 index_cols=['Patient', 'Mutation_Index', 'Mutation_ID'],
                 missense_name='missense_variant'):
    """Load a mutation table, keep one consequence class, and wrap it in NeoAgData.

    The CSV must contain the columns ``CD8``, ``CD4``, ``Consequence``,
    ``PHBR-I`` and ``PHBR-II``. Row counts and CD8/CD4 positive counts are
    printed before and after filtering.

    Parameters
    ----------
    file : path or buffer accepted by ``pd.read_csv``
    abundance_features, presentation_features, recognition_features :
        Forwarded unchanged to ``NeoAgData``.
    sample_col : forwarded to ``NeoAgData`` (individual ID column).
    index_cols : forwarded to ``NeoAgData`` (columns identifying a unique
        neoantigen). A list is copied before being forwarded so the shared
        default list can never be mutated downstream.
    missense_name : value of ``Consequence`` that a row must have to be kept.

    Returns
    -------
    NeoAgData built from the filtered, rescaled table.
    """
    df = pd.read_csv(file)

    ### filtering
    print('#Mutations')
    print(f'Before filtering: {df.shape[0]}')
    _print_label_counts(df)
    # substitution mutations
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the raw table (avoids SettingWithCopyWarning)
    df = df[df['Consequence']==missense_name].copy()
    print(f'Drop non-SNVs: {df.shape[0]}')
    _print_label_counts(df)

    # normalization
    # alternative transform, left disabled:
    #df['PHBR-I'] = -np.log((df['PHBR-I']+1e-3)/100)
    #df['PHBR-II'] = -np.log((df['PHBR-II']+1e-3)/100)
    # NOTE(review): the /100 suggests raw PHBR is on a 0-100 scale — confirm
    df['PHBR-I'] = 1 - df['PHBR-I']/100
    df['PHBR-II'] = 1 - df['PHBR-II']/100

    # never hand the shared default list itself to NeoAgData
    if isinstance(index_cols, list):
        index_cols = list(index_cols)

    ### data object
    data = NeoAgData(
        df,
        sample_col=sample_col, # individual ID
        index_cols=index_cols, # unique neoantigen
        abundance_features=abundance_features,
        presentation_features=presentation_features,
        recognition_features=recognition_features,
    )

    return data

In [None]:
### loading

# feature columns, one list per feature group
abundance_features = ['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT']
presentation_features = ['PHBR']
recognition_features = ['NP-Immuno']

# dataset obj
nci_file = 'TableS3.csv' # fill in the path of TableS3.csv
nci_data = BuildDataset(
    file=nci_file,
    abundance_features=abundance_features,
    presentation_features=presentation_features,
    recognition_features=recognition_features,
)

In [None]:
### tasks
# Each task maps a name to the label column it predicts and the feature
# groups it uses. Insertion order is preserved and reused for plotting.

# CD8 label with the class-I presentation / recognition groups
mhci_task_dict = OrderedDict([
    ('CD8-A', dict(label='CD8', feature_group=['abundance'])),
    ('CD8-P1', dict(label='CD8', feature_group=['presentation-I'])),
    ('CD8-R1', dict(label='CD8', feature_group=['recognition-I'])),
    ('CD8-P1+R1', dict(label='CD8', feature_group=['presentation-I', 'recognition-I'])),
    ('CD8-A+P1', dict(label='CD8', feature_group=['abundance', 'presentation-I'])),
    ('CD8-A+R1', dict(label='CD8', feature_group=['abundance', 'recognition-I'])),
    ('CD8-A+P1+R1', dict(label='CD8', feature_group=['abundance', 'presentation-I', 'recognition-I'])),
])

# CD4 label with the class-II presentation / recognition groups
mhcii_task_dict = OrderedDict([
    ('CD4-A', dict(label='CD4', feature_group=['abundance'])),
    ('CD4-P2', dict(label='CD4', feature_group=['presentation-II'])),
    ('CD4-R2', dict(label='CD4', feature_group=['recognition-II'])),
    ('CD4-P2+R2', dict(label='CD4', feature_group=['presentation-II', 'recognition-II'])),
    ('CD4-A+P2', dict(label='CD4', feature_group=['abundance', 'presentation-II'])),
    ('CD4-A+R2', dict(label='CD4', feature_group=['abundance', 'recognition-II'])),
    ('CD4-A+P2+R2', dict(label='CD4', feature_group=['abundance', 'presentation-II', 'recognition-II'])),
])

### Cross validation

In [None]:
### setting

# forwarded as n_fold= / n_exp= to the CrossValidation calls in the cells below
# (presumably folds per run and number of repeated runs — confirm in nci_utils)
n_fold = 4
n_exp = 100
# classifier instance built with default arguments; shared by both CV cells
model = LogisticRegression()

In [None]:
### MHC-I : CD8

# run cross-validation for every CD8 task in mhci_task_dict
CV = CrossValidation(nci_data, model, importance=False)
mhci_cv_perf_df, mhci_cv_imp_df = CV(
    mhci_task_dict,
    n_fold=n_fold,
    n_exp=n_exp,
    normalized=True,
)

# plot the performance table, tasks in dictionary order
mhci_tasks = list(mhci_task_dict)
CV.eval._performance_plot(mhci_cv_perf_df, mhci_tasks)

In [None]:
### MHC-II : CD4

# run cross-validation for every CD4 task in mhcii_task_dict
CV = CrossValidation(nci_data, model, importance=False)
mhcii_cv_perf_df, mhcii_cv_imp_df = CV(
    mhcii_task_dict,
    n_fold=n_fold,
    n_exp=n_exp,
    normalized=True,
)

# plot the performance table, tasks in dictionary order
mhcii_tasks = list(mhcii_task_dict)
CV.eval._performance_plot(mhcii_cv_perf_df, mhcii_tasks)

In [None]:
### comparison plot

def _feature_combo(task):
    """Map a task name to its feature combo, e.g. 'CD8-A+P1+R1' -> 'A+P+R'."""
    combo = task.split('-')[1]
    return combo.replace('1', '').replace('2', '')


# tag each performance table with its MHC class, then stack them
mhci_cv_perf_df['MHC'] = 'MHC-I'
mhcii_cv_perf_df['MHC'] = 'MHC-II'
cv_perf_df = pd.concat(
    [mhci_cv_perf_df, mhcii_cv_perf_df],
    axis=0,
    ignore_index=True,
)
cv_perf_df['features'] = cv_perf_df['task'].apply(_feature_combo)

hue_order = ['P','R','A','P+R','A+P','A+R','A+P+R']

# one grouped bar chart per metric (AUROC, then AUPRC)
for metric in ('AUROC', 'AUPRC'):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 3), dpi=dpi)
    sns.barplot(
        data=cv_perf_df,
        x='MHC',
        y=metric,
        hue='features',
        hue_order=hue_order,
        palette='pastel',
        ax=ax,
    )
    # anchor the legend outside the axes, at the lower right edge
    ax.legend(loc='lower left', bbox_to_anchor=(1, 0))
    ax.set_xlabel('')
    fig.tight_layout()

### Feature importance

In [None]:
### model coef

model = LogisticRegression()
# row labels for the fitted coefficients
# NOTE(review): assumes GetData returns its feature columns in this order — confirm
metric_list = ['DNA_AF', 'RNA_AF', 'RNA_EXP_QRT', 'PHBR', 'Immgen']
imp_df = pd.DataFrame(index=metric_list)

info_dict = {'I': 'CD8', 'II': 'CD4'}
for mhc, label in info_dict.items():
    feature_groups = ['abundance', f'presentation-{mhc}', f'recognition-{mhc}']
    x, y = nci_data.GetData(label, feature_groups=feature_groups)
    # standardize each feature so the coefficients are on a comparable scale
    x_norm = (x - x.mean(axis=0)) / x.std(axis=0)
    model.fit(x_norm, y)
    # one coefficient column per MHC class
    imp_df[f'MHC-{mhc}'] = model.coef_[0]

In [None]:
### plot

# wide table (one column per MHC class) -> long table (Metric, MHC, Coefficient)
plot_df = (
    imp_df
    .reset_index(names=['Metric'])
    .melt(id_vars='Metric', var_name='MHC', value_name='Coefficient')
)
# show the recognition metric under its display name
plot_df['Metric'] = plot_df['Metric'].replace({'Immgen': 'NP-Immuno'})

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3), dpi=dpi)
sns.barplot(
    data=plot_df,
    x='Coefficient',
    y='Metric',
    hue='MHC',
    palette='pastel',
    ax=ax,
)
ax.set_ylabel('')
fig.tight_layout()