## Immunotherapy Response Prediction

This notebook is for immunotherapy response prediction. 

The data are provided in Table S7, and the results are shown in Figures 5 and 6.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ici_utils import (
    MetricPerformance,
    FourGroupsPerf,
    PerformanceBarPlot,
    ROCCurve,
    BurdenBoxPlot,
    BurdenCliffPlot,
    SurvivalCurvePlot,
    HRPlot,
    TwoGroupSurvivalCurvePlot
)
from utils import set_font_style

# figure setting
# Figure sizes below are expressed as multiples of `unit_length`:
# a 180 mm wide page split into `num_units` equal units, converted to inches.
dpi = 600
base_fontsize = 7
num_units = 10
page_width_mm = 180
mm_per_inch = 25.4
unit_length = page_width_mm / mm_per_inch / num_units  # inches per layout unit
set_font_style(base_fontsize=base_fontsize)

In [None]:
### loading

# Input table and output location; `savefig` toggles writing figures to disk.
savefig = True
ici_file = 'SuppData5.csv' # fill in the path of Supplementary Data 5
out_dir = './'
"""
work_dir = ''
ici_file = f'{work_dir}/manuscript/tables/ici.csv'
out_dir = f'{work_dir}/manuscript/plots/np_landscape/'
"""

# Column holding the response label used by every performance plot below.
label_col = 'ICI response'
ici_df = pd.read_csv(ici_file)

# Fixed display order of the four sGini/pGini heterogeneity groups.
hetero_orders = [
    'high sGini - high pGini',
    'high sGini - low pGini',
    'low sGini - high pGini',
    'low sGini - low pGini',
]
# Store the group column as an ordered categorical so plots follow `hetero_orders`.
ici_df['heterogeneity_group'] = pd.Categorical(
    ici_df['heterogeneity_group'],
    categories=hetero_orders,
    ordered=True,
)

### Allele benefit scores

In [None]:
### Cox model

# arguments
cancers = ['melanoma', 'NSCLC']
confounders = ['sex', 'age']
# underscore alias -> hyphenated column name (also used as the display name)
method_rename_dict = {
    'benefitScore_I':'benefitScore-I',
    'benefitScore_II':'benefitScore-II',
    'benefitScore_dual':'benefitScore-dual',
}
metrics = ['benefitScore_I', 'benefitScore_II', 'benefitScore_dual', 'logTMB', 'combinedScore']

# "-" is not a valid symbol in the Cox model function, so every hyphenated
# score column is duplicated under its underscore alias.
for alias, hyphenated in method_rename_dict.items():
    ici_df[alias] = ici_df[hyphenated]

# plot: one hazard-ratio panel per cancer type
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7*unit_length, 3*unit_length), dpi=dpi)
for panel, cancer in zip(ax, cancers):
    # melanoma is analysed on OS, every other cancer on PFS
    if cancer == 'melanoma':
        event = 'OS'
    else:
        event = 'PFS'
    duration = f'{event}.time'
    required_cols = [duration, event] + confounders
    # restrict to this cancer and drop samples missing survival data or confounders
    cancer_df = ici_df[ici_df['cancer']==cancer].dropna(subset=required_cols)
    print(f'#Samples in {cancer} =', cancer_df.shape[0])
    HRPlot(cancer_df, metrics, confounder_cols=confounders, duration_col=duration,
           method_rename_dict=method_rename_dict, event_col=event, ax=panel)
    panel.set_title(cancer)
    panel.set_xlabel('')

# the right panel drops its y tick labels; a single x label is set for the figure
ax[1].set_yticklabels('')
fig.supxlabel('Hazard Ratio (95% CI)')
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/benefit_score.png')

### Neoantigen landscape w/o tumor clonality

In [None]:
### Burden

# burden column -> display name; the keys are also the columns to plot
method_rename_dict = {'TMB': 'TMB', 'TNB': 'TNB', 'NP-Immuno-dual.burden': 'NPB'}
x_cols = list(method_rename_dict)

# box plot: melanoma on the left (legend disabled), NSCLC on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10*unit_length, 3*unit_length), dpi=dpi)
panel_specs = [
    ('melanoma', ax[0], {'legend': False}),
    ('NSCLC', ax[1], {}),
]
for cancer, panel, extra_kwargs in panel_specs:
    BurdenBoxPlot(ici_df, x_cols, label_col, cancer, method_rename_dict=method_rename_dict,
                  annot_fontsize=base_fontsize, ax=panel, **extra_kwargs)
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/burden_box.png')

# bar plot for Cliff's delta, both cancers in a single panel
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(3*unit_length, 3*unit_length), dpi=dpi)
BurdenCliffPlot(ici_df, x_cols, label_col, ['melanoma', 'NSCLC'],
                method_rename_dict=method_rename_dict, ax=ax)
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/burden_cliff.png')

In [None]:
### Sum

x_cols = ['TMB', 'TNB', 'NP-LandscapeSum']
method_rename_dict = {'NP-LandscapeSum': 'NP-Sum'}

# plot: one ROC figure per cancer; a file is written only when `savefig` is set
roc_figsize = (3*unit_length, 3*unit_length)
for cancer in ('melanoma', 'NSCLC'):
    if savefig:
        figfile = f'{out_dir}/sum_{cancer.lower()}_perf.png'
    else:
        figfile = None
    ROCCurve(ici_df, x_cols, label_col, cancer, method_rename_dict=method_rename_dict,
             legend_fontsize=base_fontsize, figfile=figfile, figsize=roc_figsize)

In [None]:
### NP-Integrated in RNA-available samples

method_rename_dict = {'NP-LandscapeSum': 'NP-Sum', 'NP-LandscapeSum(Integrated)': 'NP-Integrated'}
x_cols = ['TMB', 'TNB', 'NP-LandscapeSum', 'NP-LandscapeSum(Integrated)']
# per-cancer performance of each score against the response label
perf_df = MetricPerformance(ici_df, x_cols, label_col, group_col='cancer')

# plot: AUROC bars with the legend moved above the axes in a single row
fig, ax = plt.subplots(figsize=(6*unit_length, 3*unit_length), dpi=dpi)
PerformanceBarPlot(
    perf_df,
    'AUROC',
    size_col='size',
    method_rename_dict=method_rename_dict,
    fontsize=base_fontsize,
    ax=ax,
)
sns.move_legend(ax, loc='lower center', bbox_to_anchor=(0.5, 1), ncol=4, columnspacing=1)
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/np_integrated_perf.png')

In [None]:
### comparing approaches of combining I and II

# score column -> short label describing how class I and II are combined
method_rename_dict = {
    'NP-LandscapeSum-I': 'I',
    'NP-LandscapeSum-II': 'II',
    'NP-LandscapeSum-add': 'I+II',
    'NP-LandscapeSum-max': 'max(I,II)',
    'NP-LandscapeSum': 'IxII',
}
# evaluate exactly the columns listed above, in the same order
x_cols = list(method_rename_dict)
perf_df = MetricPerformance(ici_df, x_cols, label_col, group_col='cancer')

# plot: one bar plot per metric
bar_figsize = (5*unit_length, 2.5*unit_length)
for metric in ('AUROC', 'AUPRC'):
    if savefig:
        figfile = f'{out_dir}/dual_perf_{metric.lower()}.png'
    else:
        figfile = None
    PerformanceBarPlot(perf_df, metric, method_rename_dict=method_rename_dict, ncol=5,
                       size_col='size', figsize=bar_figsize, figfile=figfile,
                       fontsize=base_fontsize-2)

In [None]:
### survival curve

# (cancer, event column, duration column, score column) for each survival figure
tasks = [
    ('melanoma', 'OS', 'OS.time', 'TMB'),
    ('melanoma', 'OS', 'OS.time', 'NP-LandscapeSum'),
    ('NSCLC', 'PFS', 'PFS.time', 'TMB'),
    ('NSCLC', 'PFS', 'PFS.time', 'NP-LandscapeSum'),
]

for cancer, event, duration, method in tasks:
    # File-name-friendly tag for the score: 'npsum' for NP-LandscapeSum, else lower-cased.
    method_name = 'npsum' if method == 'NP-LandscapeSum' else method.lower()
    # Fix: the tag was computed but the raw column name was interpolated into the path.
    figfile = f'{out_dir}/surv_{cancer}_{method_name}.png' if savefig else None
    # TMB figures are one unit wider than the NP-LandscapeSum ones
    figsize = (5*unit_length, 3*unit_length) if method == 'TMB' else (4*unit_length, 3*unit_length)
    SurvivalCurvePlot(ici_df, method, cancer, event_col=event, duration_col=duration,
                      figfile=figfile, figsize=figsize)

### Clonal analysis

In [None]:
### purity distribution (only on RNA-available samples)

# keep the samples that have an RNA SRA id
has_rna = ici_df['rna_sra_id'].notna()
plot_df = ici_df[has_rna]

# per-cancer purity histogram, each cancer normalised on its own
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4*unit_length, 3*unit_length), dpi=dpi)
sns.histplot(
    data=plot_df,
    x='purity',
    hue='cancer',
    hue_order=['NSCLC', 'melanoma'],
    stat='probability',
    common_norm=False,
    bins=20,
    ax=ax,
)
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/purity_rna_aval.png')

In [None]:
### sGini vs. pGini

g = sns.JointGrid(data=ici_df, x='sGini', y='pGini', hue='cancer', height=4*unit_length, ratio=4)
joint_ax = g.ax_joint

# Scatterplot: colour encodes the cancer type, marker shape encodes the response label
sns.scatterplot(data=ici_df, x='sGini', y='pGini', hue='cancer', style=label_col,
                ax=joint_ax, s=10)

# Median guide lines per cancer type (vertical: sGini, horizontal: pGini).
# Colours are taken from the current palette by the cancer's position in the
# order of first appearance in the data.
cancer_types = ici_df['cancer'].unique().tolist()
palette = sns.color_palette()
for color_idx, category in enumerate(cancer_types):
    subset = ici_df[ici_df['cancer'] == category]
    line_color = palette[color_idx]
    joint_ax.axvline(subset['sGini'].median(), color=line_color,
                     linestyle='--', alpha=0.5, lw=0.5, label='')
    joint_ax.axhline(subset['pGini'].median(), color=line_color,
                     linestyle='-.', alpha=0.5, lw=0.5, label='')

# Marginal histograms share every setting except the axis they are drawn on
marginal_kwargs = dict(data=ici_df, hue='cancer', stat='probability', common_norm=False,
                       bins=30, element='step', legend=False)
sns.histplot(x='sGini', ax=g.ax_marg_x, **marginal_kwargs)
sns.histplot(y='pGini', ax=g.ax_marg_y, **marginal_kwargs)

# Adjust legend: keep entries 1-2 and 4-5 only
# NOTE(review): presumably entries 0 and 3 are the hue/style section titles — confirm.
handles, labels = joint_ax.get_legend_handles_labels()
handles = handles[1:3] + handles[4:6]
labels = [
    text.replace('0', 'Negative').replace('1', 'Positive')
    for text in labels[1:3] + labels[4:6]
]
legend = joint_ax.legend(handles, labels, title='', bbox_to_anchor=(-.2, 1.1),
                         loc='lower left', ncol=2, fontsize=base_fontsize-1)
g.fig.add_artist(legend)
if savefig:
    g.fig.savefig(f'{out_dir}/gini_dist.png', bbox_inches='tight', dpi=dpi)

In [None]:
### immunoediting: smoking status

# Smoking status is analysed within the NSCLC cohort.
# Fix: `nsclc_df` was built but never used; both plots and the counts read the full
# `ici_df`, so any non-NSCLC sample carrying a smoking status would have been mixed in.
# NOTE(review): if smoking_status is only recorded for NSCLC the figures are unchanged — confirm.
nsclc_df = ici_df[ici_df['cancer']=='NSCLC']
hue_order = ['never', 'former', 'current']


### clonal ratio
fig, ax = plt.subplots(1, 1, figsize=(4*unit_length, 3*unit_length), dpi=dpi)
sns.kdeplot(data=nsclc_df, x='%ClonalMuts', hue='smoking_status', hue_order=hue_order,
            common_norm=False, clip=(0, 1), fill=True, ax=ax)
sns.move_legend(ax, loc='upper right', bbox_to_anchor=(1, 1))

# add sample size to each legend entry whose text matches a smoking status
counts = nsclc_df['smoking_status'].value_counts()
if ax.legend_:
    for text_obj in ax.legend_.get_texts():
        label = text_obj.get_text()
        if label in counts:
            text_obj.set_text(f"{label} (n={counts[label]})")

fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/clonal_mut_smoking.png')

### binding ratio
fig, ax = plt.subplots(1, 1, figsize=(4*unit_length, 3*unit_length), dpi=dpi)
sns.kdeplot(data=nsclc_df, x='%Binding-I', hue='smoking_status', hue_order=hue_order,
            common_norm=False, clip=(0, 1), fill=True, ax=ax)
# the legend is dropped on this panel; guard in case none was created
if ax.legend_:
    ax.legend_.remove()
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/bind_ratio_smoking.png')

In [None]:
### immunoediting: melanoma vs. NSCLC

# settings shared by both density plots
kde_kwargs = dict(data=ici_df, hue='cancer', common_norm=False, clip=(0, 1), fill=True)
kde_figsize = (5*unit_length, 3*unit_length)

### clonal ratio
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=kde_figsize, dpi=dpi)
sns.kdeplot(x='%ClonalMuts', ax=ax, **kde_kwargs)
sns.move_legend(ax, loc='upper center', bbox_to_anchor=(0.5, 1))

# add sample size: append "(n=...)" to legend entries that name a cancer type
counts = ici_df['cancer'].value_counts()
legend_texts = ax.legend_.get_texts() if ax.legend_ else []
for entry in legend_texts:
    name = entry.get_text()
    if name not in counts:
        continue
    entry.set_text(f"{name} (n={counts[name]})")

fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/clonal_mut_cancer.png')

### binding ratio
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=kde_figsize, dpi=dpi)
sns.kdeplot(x='%Binding-I', ax=ax, **kde_kwargs)
ax.legend_.remove()  # no legend on this panel
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/bind_ratio_cancer.png')

### Neoantigen landscape w/ tumor clonality

In [None]:
### MHC integration

# clone-aware score column -> short label describing how class I and II are combined
method_rename_dict = {
    'NP-LandscapeClone-I': 'I',
    'NP-LandscapeClone-II': 'II',
    'NP-LandscapeClone-add': 'I+II',
    'NP-LandscapeClone-max': 'max(I,II)',
    'NP-LandscapeClone': 'IxII',
}
# evaluate exactly the columns listed above, in the same order
x_cols = list(method_rename_dict)
perf_df = MetricPerformance(ici_df, x_cols, label_col, group_col='cancer')

# plot: one bar plot per metric
bar_figsize = (5*unit_length, 3*unit_length)
for metric in ('AUROC', 'AUPRC'):
    if savefig:
        figfile = f'{out_dir}/dual_cluster_perf_{metric.lower()}.png'
    else:
        figfile = None
    PerformanceBarPlot(perf_df, metric, method_rename_dict=method_rename_dict, ncol=5,
                       size_col='size', figsize=bar_figsize, figfile=figfile,
                       fontsize=base_fontsize-2)

In [None]:
### performance

method_rename_dict = {
    'NP-LandscapeSum': 'NP-Sum',
    'NP-LandscapeCCF': 'NP-CCF',
    'NP-LandscapeClone': 'NP-Clone',
}
x_cols = ['TMB','TNB', 'CSiN', 'ioTNL', 'NP-LandscapeSum', 'NP-LandscapeCCF', 'NP-LandscapeClone']
# NOTE(review): perf_df is computed here but not read anywhere in this cell.
perf_df = MetricPerformance(ici_df, x_cols, label_col, group_col='cancer')

# ROC curves: melanoma on the left panel, NSCLC on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10*unit_length, 3*unit_length), dpi=dpi)
for panel, cancer in zip(ax, ('melanoma', 'NSCLC')):
    ROCCurve(ici_df, x_cols, label_col, cancer, method_rename_dict=method_rename_dict,
             legend_fontsize=base_fontsize, ax=panel)

fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/clone_perf.png')

In [None]:
### group by heterogeneity

# the two columns passed to FourGroupsPerf to form the four groups
split_col1 = 'sGini'
split_col2 = 'pGini'

# performance
x_cols = ['TMB', 'CSiN', 'ioTNL', 'NP-LandscapeSum', 'NP-LandscapeCCF', 'NP-LandscapeClone']
method_rename_dict = {}
perf_df = FourGroupsPerf(ici_df, split_col1, split_col2, x_cols, label_col)

# expand the short group codes into the full "<level> sGini - <level> pGini" labels
group_labels = {
    'low-low': 'low sGini - low pGini',
    'low-high': 'low sGini - high pGini',
    'high-low': 'high sGini - low pGini',
    'high-high': 'high sGini - high pGini',
}
perf_df['group'] = perf_df['group'].replace(group_labels)

# plot: one row of AUROC bars per cancer type
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(10*unit_length, 4*unit_length), dpi=dpi)
cancers = perf_df['cancer'].unique().tolist()
for row, cancer in enumerate(cancers):
    cancer_perf_df = perf_df[perf_df['cancer']==cancer]
    PerformanceBarPlot(cancer_perf_df, 'AUROC', group_col='group', ax=ax[row], size_col='size',
                       method_rename_dict=method_rename_dict, fontsize=base_fontsize-2)
    ax[row].set_title(cancer, loc='left')

# a single legend above the top panel serves both rows
sns.move_legend(ax[0], loc='lower center', bbox_to_anchor=(0.53, 1), ncol=6, columnspacing=1)
ax[1].get_legend().remove()
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/hetero_perf.png')

In [None]:
### hetero vs. homo

# Keep only the two extreme groups of `perf_df`.
# Fix: take an explicit copy so the column assignment below writes to an independent
# frame instead of a slice of `perf_df` (avoids SettingWithCopyWarning and leaves
# `perf_df` untouched).
plot_df = perf_df[perf_df['group'].isin(['high sGini - high pGini', 'low sGini - low pGini'])].copy()
# relabel: high/high -> homogeneous, low/low -> heterogeneous
plot_df['group'] = plot_df['group'].replace({
    'high sGini - high pGini': 'homogeneous',
    'low sGini - low pGini': 'heterogeneous'
})


# one row of AUROC bars per cancer type
fig, ax = plt.subplots(2, 1, figsize=(6*unit_length, 6*unit_length), dpi=dpi)
cancers = perf_df['cancer'].unique().tolist()
for i, cancer in enumerate(cancers):
    tmp_perf_df = plot_df[plot_df['cancer']==cancer]
    PerformanceBarPlot(tmp_perf_df, 'AUROC', group_col='group', ax=ax[i], size_col='size',
                       method_rename_dict=method_rename_dict, ncol=2, fontsize=base_fontsize-2)
    ax[i].set_title(cancer, loc='left')
# only the top panel keeps its legend
ax[1].get_legend().remove()

fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/hetero_two_perf.png')

In [None]:
### survival

# preparing df: cancer type x extreme heterogeneity group
is_nsclc = ici_df['cancer']=='NSCLC'
is_melanoma = ici_df['cancer']=='melanoma'
is_low_low = ici_df['heterogeneity_group']=='low sGini - low pGini'      # plotted as 'hetero.'
is_high_high = ici_df['heterogeneity_group']=='high sGini - high pGini'  # plotted as 'homo.'

lung_gp1_df = ici_df[is_nsclc & is_low_low]
lung_gp2_df = ici_df[is_nsclc & is_high_high]
skin_gp1_df = ici_df[is_melanoma & is_low_low]
skin_gp2_df = ici_df[is_melanoma & is_high_high]

# plot: NSCLC (PFS) on the left, melanoma (OS) on the right
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10*unit_length, 3*unit_length), dpi=dpi)
group_names = dict(gp1_name='hetero.', gp2_name='homo.')

TwoGroupSurvivalCurvePlot(lung_gp1_df, lung_gp2_df, event_col='PFS', duration_col='PFS.time',
                          ax=ax[0], **group_names)
TwoGroupSurvivalCurvePlot(skin_gp1_df, skin_gp2_df, event_col='OS', duration_col='OS.time',
                          ax=ax[1], **group_names)

for panel, title in zip(ax, ('PFS in NSCLC', 'OS in melanoma')):
    panel.set_title(title)

fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/surv_hetero.png')

### Smoking status

In [None]:
### NSCLC data

# display order of smoking categories, shared by the cells below
smoking_order = ['never', 'former', 'current']
# NSCLC-only subset used by the smoking-status cells below
is_nsclc = ici_df['cancer']=='NSCLC'
cancer_df = ici_df.loc[is_nsclc]

In [None]:
### TMB

# TMB box plot per smoking status, boxes drawn in `smoking_order`
fig, ax = plt.subplots(1, 1, figsize=(3*unit_length, 4*unit_length), dpi=dpi)
sns.boxplot(data=cancer_df, x='smoking_status', y='TMB', hue='smoking_status',
            hue_order=smoking_order, order=smoking_order, palette='muted', ax=ax)

# tick labels carry the per-category sample size (0 when a category is absent)
counts = cancer_df['smoking_status'].value_counts()
new_labels = [f"{status}\n(n={counts.get(status, 0)})" for status in smoking_order]
# Fix: pin the tick positions before relabelling. set_xticklabels without fixed ticks
# is documented as unsupported and triggers a matplotlib warning.
# One box per entry of `order`, so positions are 0..len(smoking_order)-1.
ax.set_xticks(list(range(len(smoking_order))))
ax.set_xticklabels(new_labels)
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/smoking_tmb.png')

In [None]:
### heterogeneity

# stacked-to-1 histogram: share of each smoking status within every heterogeneity group
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4*unit_length, 4*unit_length), dpi=dpi)
sns.histplot(
    data=cancer_df,
    x='heterogeneity_group',
    hue='smoking_status',
    hue_order=smoking_order,
    multiple='fill',
    palette='muted',
    ax=ax,
)
ax.set(xlabel='', ylabel='Proportion')
ax.tick_params(axis='x', rotation=90)
# legend placed outside the axes, to the right
sns.move_legend(ax, loc='center left', bbox_to_anchor=(1, 0.5))
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/smoking_hetero.png')

In [None]:
### immunotherapy prediction performance

x_cols = ['TMB', 'CSiN', 'ioTNL', 'NP-LandscapeSum', 'NP-LandscapeCCF', 'NP-LandscapeClone']

# Evaluate every score separately within each smoking-status subgroup.
# Fix: collect the per-subgroup frames and concatenate once, instead of growing a
# DataFrame with pd.concat inside the loop from an empty seed frame (quadratic copying,
# and concatenating empty frames is deprecated in recent pandas).
perf_frames = []
for smoke in smoking_order:
    tmp_df = cancer_df[cancer_df['smoking_status']==smoke]
    tmp_perf_df = MetricPerformance(tmp_df, x_cols, label_col)
    tmp_perf_df['smoking_status'] = smoke  # tag rows with their subgroup
    perf_frames.append(tmp_perf_df)
perf_df = pd.concat(perf_frames, axis=0)

# plot: AUROC per smoking status, one bar per method
fig, ax = plt.subplots(1, 1, figsize=(3*unit_length, 4*unit_length), dpi=dpi)
sns.barplot(data=perf_df, x='smoking_status', y='AUROC', hue='method', palette='pastel', ax=ax)
sns.move_legend(ax, title='', ncol=1, loc='lower center', bbox_to_anchor=(0.5, 1))
fig.tight_layout()
if savefig:
    fig.savefig(f'{out_dir}/smoking_perf.png')