# Imports

In [None]:
%reload_ext autoreload
%autoreload 2

from peptdeep.pretrained_models import ModelManager
from peptdeep import settings
from alphabase.psm_reader import psm_reader_provider

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as sk

%run alpha_pept_deep_methods.ipynb

# Input

In [None]:
# Read Data
df = pd.read_csv("evidence_freshfrozen.txt",  sep="\t")

# Gain as many useable data as possible
%run alpha_pept_deep_methods.ipynb
df  = fill_modified_sequence(df)
df['MS/MS scan number'] = df['MS/MS scan number'].fillna(-1)
df = df.dropna(subset = ['Modified sequence'])

# Load Model

In [36]:
# load model
# Create the peptdeep ModelManager with device 'gpu' and load the models
# that ship with the installation (presumably the pretrained ones -- confirm
# against the peptdeep documentation).
models = ModelManager(device = 'gpu')
models.load_installed_models()

In [None]:
# Show the global peptdeep settings, one key per line followed by its
# tab-indented value.
# The mapping is bound to its own name on purpose: rebinding `settings`
# would overwrite the imported `peptdeep.settings` module, and running this
# cell a second time would then fail because the mapping has no
# `global_settings` attribute.
global_settings = settings.global_settings
for key, value in global_settings.items():
    print(key)
    print(f"\t{value}")

# Prediction

In [None]:
# predict
# Re-run the helper notebook so that mq_to_ab is defined in this session.
%run alpha_pept_deep_methods.ipynb
# Convert the evidence table with the helper mq_to_ab (presumably into the
# column layout the model expects -- confirm in the helper notebook).
df_ab = mq_to_ab(df)
# Ask the loaded models for mobility predictions on the converted table.
prediction = models.predict_mobility(df_ab)

In [39]:
# merge prediction into original df for comparison of prediction to experimental values
# Re-run the helper notebook so that ab_to_mq is defined in this session.
%run alpha_pept_deep_methods.ipynb
# df_comp is the table all evaluation cells below read from; they use its
# columns 'ccs_pred', 'mobility_pred', 'ccs_error' and 'IM_error'
# (presumably added by ab_to_mq -- confirm in the helper notebook).
df_comp = ab_to_mq(df, prediction)

In [63]:
# Persist the comparison table as a tab-separated text file.
df_comp.to_csv('prediction.txt', sep = '\t')

# Evaluation of Model Performance

### R2

In [None]:
# Coefficient of determination between experimental and predicted CCS
ccs_r2 = sk.r2_score(df_comp['CCS'], df_comp['ccs_pred'])
print(ccs_r2)

In [None]:
# Coefficient of determination between experimental and predicted 1/K0
mobility_r2 = sk.r2_score(df_comp['1/K0'], df_comp['mobility_pred'])
print(mobility_r2)

### Error

In [None]:
# Mean and standard deviation of the signed prediction error
# Table 4.3
for label, column in (("CCS", "ccs_error"), ("IM", "IM_error")):
    errors = df_comp[column]
    print(f"{label} error:{errors.mean()}")
    print(f"{label} standard deviation:{errors.std()}")

In [None]:
# Mean of the absolute prediction error, next to the standard deviation
# NOTE(review): the standard deviation is taken over the signed error, not
# over its absolute value -- confirm this is the intended statistic.
for label, column in (("CCS", "ccs_error"), ("IM", "IM_error")):
    errors = df_comp[column]
    print(f"{label} error:{errors.abs().mean()}")
    print(f"{label} standard deviation:{errors.std()}")

In [None]:
# Distribution of the CCS error, restricted to [-100, 100] in 50 bins
# Figure 4.3
plt.hist(df_comp['ccs_error'], bins=50, range=(-100, 100))
plt.xlabel('CCS Difference')

### Percentiles

In [None]:
# percentile of CCS Error
# Table 4.4
%run alpha_pept_deep_methods.ipynb
delta95 = percentiles(df_comp, 'ccs_error')
print(f"2.5 Percentile: {delta95[0]}, 97.5 Percentile: {delta95[1]}, Delta95: {delta95[2]}")

In [None]:
# percentile of IM Error
# Table 4.4
%run alpha_pept_deep_methods.ipynb
delta95 = percentiles(df_comp, 'IM_error')
print(f"2.5 Percentile: {delta95[0]}, 97.5 Percentile: {delta95[1]}, Delta95: {delta95[2]}")

# Evaluation of Influences on the Model

## Biological Replicates

In [None]:
# CCS error distribution per experiment; violins are scaled by row count
# Figure 4.4
plt.figure(figsize=(10, 6))
sns.violinplot(
    data=df_comp,
    x='Experiment',
    y='ccs_error',
    density_norm='count',
)
plt.xlabel('Biological Replicate')
plt.ylabel('CCS Error')
# Dashed zero-error reference line across the three violins
plt.hlines(0.0, -0.5, 2.5, colors='grey', linestyles='--')
plt.show()

In [None]:
# CCS error per biological replicate: row count, mean, standard deviation
# Table 4.15 a
experiment_col = df_comp["Experiment"]
df_51 = df_comp[experiment_col == 'P064051']
df_64 = df_comp[experiment_col == 'P064064']
df_28 = df_comp[experiment_col == 'P064428']
for replicate in (df_51, df_64, df_28):
    replicate_error = replicate['ccs_error']
    print(len(replicate))
    print(replicate_error.mean())
    print(replicate_error.std())

In [None]:
# Ion mobility error per biological replicate: row count, mean, standard deviation
# Table 4.15 b
experiment_col = df_comp["Experiment"]
df_51 = df_comp[experiment_col == 'P064051']
df_64 = df_comp[experiment_col == 'P064064']
df_28 = df_comp[experiment_col == 'P064428']
for replicate in (df_51, df_64, df_28):
    replicate_error = replicate['IM_error']
    print(len(replicate))
    print(replicate_error.mean())
    print(replicate_error.std())

In [None]:
# Delta 95 per Biological Replicate
# Table 4.16 a
%run alpha_pept_deep_methods.ipynb
perc_51 = percentiles(df_comp[df_comp['Experiment']== 'P064051'], 'ccs_error')
print(f"P064051; 2.5 Percentile: {delta95[0]}, 97.5 Percentile: {delta95[1]}, Delta95: {delta95[2]}")
perc_64 = percentiles(df_comp[df_comp['Experiment']== 'P064064'], 'ccs_error')
print(f"P064064; 2.5 Percentile: {delta95[0]}, 97.5 Percentile: {delta95[1]}, Delta95: {delta95[2]}")
perc_28 = percentiles(df_comp[df_comp['Experiment']== 'P064428'], 'ccs_error')
print(f"P064428; 2.5 Percentile: {delta95[0]}, 97.5 Percentile: {delta95[1]}, Delta95: {delta95[2]}")


In [None]:
# Delta 95 per Biological Replicate
# Table 4.16 b
%run alpha_pept_deep_methods.ipynb
perc_51 = percentiles(df_comp[df_comp['Experiment']== 'P064051'], 'IM_error')
print(f"P064051; 2.5 Percentile: {delta95[0]}, 97.5 Percentile: {delta95[1]}, Delta95: {delta95[2]}")
perc_64 = percentiles(df_comp[df_comp['Experiment']== 'P064064'], 'IM_error')
print(f"P064064; 2.5 Percentile: {delta95[0]}, 97.5 Percentile: {delta95[1]}, Delta95: {delta95[2]}")
perc_28 = percentiles(df_comp[df_comp['Experiment']== 'P064428'], 'IM_error')
print(f"P064428; 2.5 Percentile: {delta95[0]}, 97.5 Percentile: {delta95[1]}, Delta95: {delta95[2]}")


## Raw File

In [52]:
# Split the comparison table into the 3 experiments.
# The subsets are taken from df_comp directly. Rebinding `df` to df_comp
# would replace the evidence table loaded in the input cell, so re-running
# the prediction cells afterwards would silently operate on the merged table.
df_51 = df_comp[df_comp["Experiment"]=='P064051']
df_64 = df_comp[df_comp["Experiment"]=='P064064']
df_28 = df_comp[df_comp["Experiment"]=='P064428']

In [None]:
# Violin Plots per Experiment across Raw Files for CCS Error
# One figure is drawn per experiment and titled with its experiment id so
# the three figures can be told apart.
list_df = [df_51, df_64, df_28]
list_exp = ['P064051', 'P064064', 'P064428']
# The loop variable has its own name so the global `df` is not overwritten.
for experiment, df_exp in zip(list_exp, list_df):
    plt.figure(figsize=(18, 6))
    sns.violinplot(x='Raw file', y='ccs_error', data=df_exp, density_norm='count')
    plt.title(experiment)
    plt.xlabel('Raw file')
    plt.ylabel('CCS Error')
    plt.xticks(rotation = 90)
    # axhline spans the full axis width, so the zero line is correct for
    # any number of raw files instead of a hard-coded x extent.
    plt.axhline(y = 0.0, linestyle='--', color='grey')
    plt.show()

## Confidence of Identification

In [None]:
# Joint scatter of Andromeda score against CCS error, coloured by charge,
# with KDE curves drawn on the marginal axes
# Figure 4.6
g = sns.jointplot(
    data=df_comp,
    x='Score',
    y='ccs_error',
    hue='Charge',
    kind="scatter",
    palette='viridis',
    alpha=0.5,
)
g.plot_marginals(sns.kdeplot, common_norm=True)
g.set_axis_labels('Andromeda Score', 'CCS Error')
plt.show()

## PTMs

In [None]:
# Row count plus mean/std of the CCS and ion mobility error per modification
# Table 4.5
for mod in df_comp['Modifications'].unique():
    df_mod = df_comp.loc[df_comp['Modifications'] == mod]
    ccs_err = df_mod['ccs_error']
    im_err = df_mod['IM_error']
    print(f"{mod}: {len(df_mod)}")
    print(f"{ccs_err.mean()}, {ccs_err.std()};  {im_err.mean()}, {im_err.std()}")

## Charge

In [None]:
# Violin Plot: CCS Error per Charge
# Figure 4.7
plt.figure(figsize=(10, 6))
# `density_norm` is the current name of the deprecated `scale` parameter and
# matches the other violin plots in this notebook.
sns.violinplot(x='Charge', y='ccs_error', data=df_comp, density_norm='count')
plt.ylabel('CCS Error')
# Dashed zero-error reference line
plt.hlines(y = 0.0, xmin = -0.1, xmax= 3.1, linestyles='--', colors='grey')
plt.tight_layout()
plt.show()


## Retention Time

In [None]:
# Joint scatter of retention time against CCS error, coloured by charge,
# with KDE curves drawn on the marginal axes
# Figure 4.8 a
g = sns.jointplot(
    data=df_comp,
    x='Retention time',
    y='ccs_error',
    hue='Charge',
    kind="scatter",
    palette='viridis',
    alpha=0.5,
)
g.plot_marginals(sns.kdeplot, common_norm=True)
g.set_axis_labels('Retention time', 'CCS Error')
plt.show()

In [None]:
# Joint scatter of retention length against CCS error, coloured by charge,
# with KDE curves drawn on the marginal axes
# Figure 4.8 b
g = sns.jointplot(
    data=df_comp,
    x='Retention length',
    y='ccs_error',
    hue='Charge',
    kind="scatter",
    palette='viridis',
    alpha=0.5,
)
g.plot_marginals(sns.kdeplot, common_norm=True)
g.set_axis_labels('Retention length', 'CCS Error')
plt.show()

## Ion Mobility

In [None]:
# Joint scatter of experimental CCS against CCS error, coloured by charge,
# with KDE curves drawn on the marginal axes
# Figure 4.9 a
g = sns.jointplot(
    data=df_comp,
    x='CCS',
    y='ccs_error',
    hue='Charge',
    kind="scatter",
    palette='viridis',
    alpha=0.5,
)
g.plot_marginals(sns.kdeplot, common_norm=True)
g.set_axis_labels('Experimental CCS', 'CCS Error')
plt.show()

In [None]:
# Joint scatter of experimental 1/K0 against CCS error, coloured by charge,
# with KDE curves drawn on the marginal axes
# Figure 4.9 b
g = sns.jointplot(
    data=df_comp,
    x='1/K0',
    y='ccs_error',
    hue='Charge',
    kind="scatter",
    palette='viridis',
    alpha=0.5,
)
g.plot_marginals(sns.kdeplot, common_norm=True)
g.set_axis_labels('1/K0', 'CCS Error')
plt.show()

## Peptide Length

In [None]:
# Violin Plot: Length vs CCS Error
# Figure 4.10
plt.figure(figsize=(18, 4))
# `density_norm` is the current name of the deprecated `scale` parameter and
# matches the other violin plots in this notebook.
sns.violinplot(x='Length', y='ccs_error', data=df_comp, density_norm='count', width=0.9)
plt.xlabel('Length')
plt.ylabel('CCS Error')
# Dashed zero-error reference line
plt.hlines(y = 0.0, xmin = -0.5, xmax= 45.5, linestyles='--', colors='grey')
plt.tight_layout()
plt.show()

# AlphaPeptDeep Training Set

Short Analysis of the Training Set used for AlphaPeptDeep's CCS Model

In [None]:
# Read the tab-separated evidence table stored under alpha_trainset/Hela_lys
df_lys = pd.read_csv('alpha_trainset/Hela_lys/evidence.txt', sep ='\t')

In [None]:
# Read the tab-separated evidence table stored under alpha_trainset/Hela_trypsin
df_tryp = pd.read_csv('alpha_trainset/Hela_trypsin/evidence.txt', sep = '\t')

In [None]:
# Stack both evidence tables row-wise into one training-set table
# (the original row labels of each table are kept)
df_train = pd.concat((df_lys, df_tryp))

In [None]:
# Number of training rows per charge state, in order of first appearance
charge_col = df_train['Charge']
for charge in charge_col.unique():
    n_rows = int((charge_col == charge).sum())
    print(f"Charge {charge}:{n_rows}")

In [None]:
# Histogram of the 'Length' column of the training set plus its extremes
lengths = df_train['Length']
plt.hist(x=lengths, bins=60)
print(lengths.min())
print(lengths.max())

In [None]:
# Smallest and largest CCS value in the training set, and their span
ccs_min = df_train['CCS'].min()
ccs_max = df_train['CCS'].max()
print(ccs_min)
print(ccs_max)
print(ccs_max - ccs_min)

In [None]:
# Smallest and largest value of the 'K0' column in the training set, and their span
# NOTE(review): the evaluation cells read the column '1/K0' while this table
# is read via 'K0' -- confirm both hold the same quantity.
k0_min = df_train['K0'].min()
k0_max = df_train['K0'].max()
print(k0_min)
print(k0_max)
print(k0_max - k0_min)

In [None]:
# List the column names available in the combined training-set table
print(df_train.columns)