In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
from peptdeep.pretrained_models import ModelManager
from peptdeep import settings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import sklearn.metrics as sk
import sklearn.linear_model as sk_lm
from alphabase.psm_reader import psm_reader_provider

In [None]:
# Load the tab-separated MaxQuant evidence table that every later section starts from.
df_base = pd.read_csv("evidence_freshfrozen_base.txt",  sep="\t")

In [None]:
# Split the evidence table into the three experiments (plotted later as biological replicates).
# `df` stays an alias of `df_base` because the sampling cell below reads `len(df)`.
df = df_base
# Take explicit copies: later cells add columns to these per-experiment frames, and
# writing into a boolean-mask slice triggers pandas' SettingWithCopyWarning.
df_51 = df[df["Experiment"]=='P064051'].copy()
df_64 = df[df["Experiment"]=='P064064'].copy()
df_28 = df[df["Experiment"]=='P064428'].copy()

# Load Model

# Transfer Learning

## Training

In [None]:
# Improved sampling: draw a training set of about `n` rows in which every experiment
# is represented in proportion to its share of the full table.
n = 10000

# Per-experiment sample sizes (each one is rounded, so the total can differ slightly from n).
n_51 = round((len(df_51)/len(df))*n)
n_64 = round((len(df_64)/len(df))*n)
n_28 = round((len(df_28)/len(df))*n)

# Sample into NEW names. df_51 / df_64 / df_28 are reused by the per-experiment and
# raw-file-wise sections further down; overwriting them here would silently shrink
# those analyses to this sample.
df_51_sample = df_51.sample(n = n_51, random_state = 42)
df_64_sample = df_64.sample(n = n_64, random_state = 42)
df_28_sample = df_28.sample(n = n_28, random_state = 42)

df_train = pd.concat(objs=[df_51_sample, df_64_sample, df_28_sample])
print(len(df_train))


In [None]:
# Convert the sampled training rows into the AlphaBase PSM format.
# Keep the current row index as an 'Original index' column.
df_train.loc[:,'Original index'] = df_train.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df_train)
mq_reader._transform_table(df_train)
mq_reader._translate_decoy(df_train)
mq_reader._translate_score(df_train)
mq_reader._load_modifications(df_train)
mq_reader._translate_modifications()
mq_reader._post_process(df_train)
# Converted table as exposed by the reader.
df_train_ab = mq_reader.psm_df

In [None]:
# Create a model manager on the GPU and load the installed models as the starting point.
# NOTE(review): the device is hard-coded to 'gpu'.
models = ModelManager(device = 'gpu')
models.load_installed_models()

In [None]:
# Fine-tune the CCS model on the converted training sample.
models.train_ccs_model(df_train_ab)

## Testing

In [None]:
# Convert the complete evidence table (not only the training sample) into the AlphaBase PSM format.
# NOTE(review): `df_mq` is the same object as `df_base`, so the column added below lands on `df_base` too.
df_mq = df_base
# Keep the current row index as an 'Original index' column.
df_mq.loc[:,'Original index'] = df_mq.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df_mq)
mq_reader._transform_table(df_mq)
mq_reader._translate_decoy(df_mq)
mq_reader._translate_score(df_mq)
mq_reader._load_modifications(df_mq)
mq_reader._translate_modifications()
mq_reader._post_process(df_mq)
# Converted table as exposed by the reader.
df_ab = mq_reader.psm_df

In [None]:
# Predict for the whole dataset with the fine-tuned model; the result carries the
# 'ccs_pred' and 'mobility_pred' columns used in the merge cell below.
prediction = models.predict_mobility(df_ab)

In [None]:
# Persist the raw predictions as a tab-separated file (the path has no file extension).
prediction.to_csv('predictions/pred_trans_v2', sep = '\t')

In [None]:
# Merge the AlphaBase-format predictions back onto the MaxQuant table.
#df_ab = pd.read_csv('comparisons/comp_all_trans.txt', sep = '\t')
df_ab = prediction
df_comp = df_base
# NOTE(review): `df_comp` is the same object as `df_base`, so these two columns are added to
# `df_base` as well. The assignment is index-aligned; whether both frames share an index
# is not visible here — confirm. Later cells only use the `_y` columns from the merge.
df_comp['ccs_pred'] = df_ab['ccs_pred']
df_comp['mobility_pred'] = df_ab['mobility_pred']
# AlphaBase column name -> MaxQuant column name.
mapping_dict = {
            'sequence': 'Sequence',
            'charge': 'Charge',
            'rt': 'Retention time',
            'ccs': 'CCS',
            'mobility': '1/K0',
            'scan_num': 'MS/MS scan number',
            'raw_name': 'Raw file',
            'precursor_mz': 'm/z',
            'score': 'Score',
            'proteins': 'Proteins',
            'genes': 'Gene names',
            'decoy': 'Reverse',
            'intensity': 'Intensity',
            'nAA':'Length'}
# Columns that must all match for a prediction row to be joined to a MaxQuant row.
merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time', 'Proteins', 'Gene names','1/K0' , 'MS/MS scan number', 'Raw file'
                                                  , 'm/z', 'Intensity' ]
# Rename the AlphaBase columns according to the mapping dict
# (in place, so the `prediction` frame is renamed as well).
df_ab.rename(columns=mapping_dict, inplace=True)
# Setting the original index as index is disabled here:
#df_ab.set_index('Original index', inplace=True)
# Both frames carry 'ccs_pred' / 'mobility_pred', so the merge suffixes them:
# `_x` comes from df_comp, `_y` from the predictions.
df_merged = pd.merge( df_comp,df_ab, on = merging_list, how = 'inner')
df_merged.to_csv('comparisons/comp_trans_v2_orig.txt', sep = '\t')

In [None]:
# Load a stored comparison table.
# NOTE(review): this reads 'comp_trans_orig.txt', while the merge cell above writes
# 'comp_trans_v2_orig.txt' — confirm which file is intended.
df_comp = pd.read_csv('comparisons/comp_trans_orig.txt', sep = '\t')

In [None]:
# Evaluate on the merged frame built above (same object, not a copy).
df_pred = df_merged

In [None]:
# Signed prediction errors: measured value minus the prediction from the merged `_y` columns.
df_pred['ccs_diff'] = df_pred['CCS'] - df_pred['ccs_pred_y']
df_pred['IM_diff'] = df_pred['1/K0'] - df_pred['mobility_pred_y']

In [None]:
# Histogram of the signed 1/K0 error with a dashed reference line at zero.
plt.hist(df_pred['IM_diff'], bins=100)
plt.axvline(x = 0.0, c = 'black', linestyle = '--')
plt.xlabel('1/K0 Difference')
plt.title('Adjusted Error(Transfer Learning)')

In [None]:
# Mean and standard deviation of the signed CCS error, one value per line.
print(df_pred['ccs_diff'].mean(), df_pred['ccs_diff'].std(), sep='\n')

In [None]:
# Mean absolute CCS error.
print(df_pred['ccs_diff'].abs().mean())

In [None]:
# 2.5th and 97.5th percentiles of the CCS error, computed in one call.
perc_low, perc_up = np.percentile(df_pred['ccs_diff'], [2.5, 97.5])
print(f'({perc_low}, {perc_up})')


In [None]:
# Width of the interval between the two percentiles computed above.
window = perc_up - perc_low
print(window)

In [None]:
# Mean and standard deviation of the signed 1/K0 error, one value per line.
print(df_pred['IM_diff'].mean(), df_pred['IM_diff'].std(), sep='\n')


In [None]:
# 2.5th and 97.5th percentiles of the 1/K0 error, computed in one call.
perc_low, perc_up = np.percentile(df_pred['IM_diff'], [2.5, 97.5])
print(f'({perc_low}, {perc_up})')


In [None]:
# Width of the interval between the two percentiles computed above.
window = perc_up - perc_low
print(window)

## Plots

In [None]:
# Violin plot: CCS error per experiment.
plt.figure(figsize=(10, 6))
# NOTE(review): density_norm='count' is the seaborn option that scales violins by
# observation count — confirm against the installed seaborn version.
sns.violinplot(x='Experiment', y='ccs_diff', data=df_pred, density_norm='count')

# Axis labels; the experiments are presented as biological replicates.
plt.xlabel('Biological Replicate')
plt.ylabel('CCS Error')
# Dashed zero-error reference line; the x-range is presumably chosen to span the three violins.
plt.hlines(y = 0.0, xmin = -0.5, xmax= 2.5, linestyles='--', colors='grey')

# Display the plot
plt.show()

In [None]:
# Mean signed CCS error per experiment, printed in the order 51, 64, 28.
print(df_pred[df_pred['Experiment']=='P064051']['ccs_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064064']['ccs_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064428']['ccs_diff'].mean())

In [None]:
# Mean signed 1/K0 error per experiment, printed in the order 51, 64, 28.
print(df_pred[df_pred['Experiment']=='P064051']['IM_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064064']['IM_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064428']['IM_diff'].mean())

In [None]:
# Violin plot: CCS error per fraction.
plt.figure(figsize=(15, 6))
# Plot from `df_pred`, the merged frame that holds 'ccs_diff'. The frame named here
# before (`df_comp_trans`) is not created by any earlier cell, so the cell failed on a
# fresh kernel. `density_norm` is the spelling the other violin plots in this notebook
# use; `scale` is its deprecated predecessor.
# NOTE(review): assumes the evidence table provides a 'Fraction' column — confirm.
sns.violinplot(x='Fraction', y='ccs_diff', data=df_pred, density_norm='count', width=1.0)

# Set plot labels and title
plt.xlabel('Fraction')
plt.ylabel('CCS Error')
# Dashed zero-error reference line across the fraction axis.
plt.hlines(y = 0.0, xmin = -0.5, xmax= 48, linestyles='--', colors='grey')

# Display the plot
plt.show()

# Transfer Learning per Experiment

## Ex 51

In [None]:
# Experiment 51: draw 30 % of the rows of `df_51` as training set (fixed seed).
df = df_51
df_train = df.sample(n = round(0.3*len(df)), random_state=42)
print(len(df_train))

In [None]:
# Convert the training rows into the AlphaBase PSM format.
# Keep the current row index as an 'Original index' column.
df_train.loc[:,'Original index'] = df_train.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df_train)
mq_reader._transform_table(df_train)
mq_reader._translate_decoy(df_train)
mq_reader._translate_score(df_train)
mq_reader._load_modifications(df_train)
mq_reader._translate_modifications()
mq_reader._post_process(df_train)
# Converted table as exposed by the reader.
df_train_ab = mq_reader.psm_df

In [None]:
# Start again from a new model manager with the installed models, replacing the
# previously fine-tuned `models` object.
models = ModelManager(device = 'gpu')
models.load_installed_models()


In [None]:
# Fine-tune the CCS model on the experiment-51 training rows.
models.train_ccs_model(df_train_ab)

In [None]:
# Convert all rows of `df_51` into the AlphaBase PSM format.
df = df_51
# Keep the current row index as an 'Original index' column.
df.loc[:,'Original index'] = df.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df)
mq_reader._transform_table(df)
mq_reader._translate_decoy(df)
mq_reader._translate_score(df)
mq_reader._load_modifications(df)
mq_reader._translate_modifications()
mq_reader._post_process(df)
# Converted table as exposed by the reader.
df_ab = mq_reader.psm_df

In [None]:
# Predict for the converted experiment-51 rows with the model fine-tuned above.
prediction = models.predict_mobility(df_ab)

In [None]:
# Merge the AlphaBase-format predictions back onto the MaxQuant rows of this experiment
# and summarise the CCS error.

# NOTE(review): index-aligned assignment; whether `df` and `prediction` share an index is
# not visible here — confirm. Only the `_y` columns from the merge are used below.
df['ccs_pred'] = prediction['ccs_pred']
df['mobility_pred'] = prediction['mobility_pred']
# AlphaBase column name -> MaxQuant column name.
mapping_dict = {
            'sequence': 'Sequence',
            'charge': 'Charge',
            'rt': 'Retention time',
            'ccs': 'CCS',
            'mobility': '1/K0',
            'scan_num': 'MS/MS scan number',
            'raw_name': 'Raw file',
            'precursor_mz': 'm/z',
            'score': 'Score',
            'proteins': 'Proteins',
            'genes': 'Gene names',
            'decoy': 'Reverse',
            'intensity': 'Intensity',
            'nAA':'Length'}
# Columns that must all match for a prediction row to be joined to a MaxQuant row.
merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time', 'Proteins', 'Gene names','1/K0' , 'MS/MS scan number', 'Raw file'
                                                  , 'm/z', 'Intensity' ]
# Rename the AlphaBase columns according to the mapping dict (in place).
prediction.rename(columns=mapping_dict, inplace=True)
# Setting the original index as index is disabled here:
#prediction.set_index('Original index', inplace=True)
# Both frames carry 'ccs_pred' / 'mobility_pred', so the merge suffixes them:
# `_x` comes from df, `_y` from the predictions.
df_merged = pd.merge(df, prediction, on = merging_list, how = 'inner')
#df_merged.to_csv('comparisons/comp__trans_orig.txt', sep = '\t')

# Signed errors: measured minus predicted.
df_merged['ccs_diff'] = np.subtract(df_merged['CCS'], df_merged['ccs_pred_y'])
df_merged['IM_diff'] = np.subtract(df_merged['1/K0'], df_merged['mobility_pred_y'])

sns.violinplot(x='Experiment', y='ccs_diff', data=df_merged, density_norm='count')
# The f-strings are double-quoted on purpose: reusing the enclosing quote character
# inside the braces is a SyntaxError on Python versions before 3.12.
print(f"Mean Error: {df_merged['ccs_diff'].mean()}")
# 2.5th / 97.5th percentiles of the CCS error
perc_low = np.percentile(df_merged['ccs_diff'], 2.5)
perc_up = np.percentile(df_merged['ccs_diff'], 97.5)
print(f"Percentiles:({perc_low}, {perc_up})")

# Width of the interval between the two percentiles.
print(f"Window:{(perc_low)*(-1)+perc_up}")
window = (perc_low)*(-1)+perc_up


### Run 51 trained model on all experiments

In [None]:
# Re-read the evidence table.
# NOTE(review): the next cell immediately rebinds `df` to `df_base`, so this freshly read
# frame is not used — confirm which frame is intended.
df = pd.read_csv('evidence_freshfrozen_base.txt', sep = '\t')

In [None]:
# Convert the complete evidence table into the AlphaBase PSM format.
# NOTE(review): `df` is rebound to `df_base` here, discarding the frame read in the previous cell.
df = df_base
# Keep the current row index as an 'Original index' column.
df.loc[:,'Original index'] = df.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df)
mq_reader._transform_table(df)
mq_reader._translate_decoy(df)
mq_reader._translate_score(df)
mq_reader._load_modifications(df)
mq_reader._translate_modifications()
mq_reader._post_process(df)
# Converted table as exposed by the reader.
df_ab = mq_reader.psm_df

In [None]:
# Predict for the complete dataset with the model fine-tuned on experiment 51.
prediction = models.predict_mobility(df_ab)

In [None]:
# Merge the AlphaBase-format predictions back onto the MaxQuant table.

# NOTE(review): index-aligned assignment; whether `df` and `prediction` share an index is
# not visible here — confirm. Only the `_y` columns from the merge are used below.
df['ccs_pred'] = prediction['ccs_pred']
df['mobility_pred'] = prediction['mobility_pred']
# AlphaBase column name -> MaxQuant column name.
mapping_dict = {
            'sequence': 'Sequence',
            'charge': 'Charge',
            'rt': 'Retention time',
            'ccs': 'CCS',
            'mobility': '1/K0',
            'scan_num': 'MS/MS scan number',
            'raw_name': 'Raw file',
            'precursor_mz': 'm/z',
            'score': 'Score',
            'proteins': 'Proteins',
            'genes': 'Gene names',
            'decoy': 'Reverse',
            'intensity': 'Intensity',
            'nAA':'Length'}
# Columns that must all match for a prediction row to be joined to a MaxQuant row.
merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time', 'Proteins', 'Gene names','1/K0' , 'MS/MS scan number', 'Raw file'
                                                  , 'm/z', 'Intensity' ]
# Rename the AlphaBase columns according to the mapping dict (in place).
prediction.rename(columns=mapping_dict, inplace=True)
# Move 'Original index' from a column into the index of the prediction frame (in place).
prediction.set_index('Original index', inplace=True)
# Both frames carry 'ccs_pred' / 'mobility_pred', so the merge suffixes them:
# `_x` comes from df, `_y` from the predictions.
df_pred = pd.merge(df, prediction, on = merging_list, how = 'inner')
#df_merged.to_csv('comparisons/comp__trans_orig.txt', sep = '\t')
# Signed errors: measured minus predicted.
df_pred['ccs_diff'] = np.subtract(df_pred['CCS'], df_pred['ccs_pred_y'])
df_pred['IM_diff'] = np.subtract(df_pred['1/K0'], df_pred['mobility_pred_y'])


In [None]:
# Persist the merged comparison table for the experiment-51 model (tab-separated).
df_pred.to_csv('comparisons/comp_trans_51.txt', sep = '\t')

In [None]:
# Violin plot: CCS error per experiment for the model fine-tuned on experiment 51.
plt.figure(figsize=(10, 6))
sns.violinplot(x='Experiment', y='ccs_diff', data=df_pred, density_norm='count')

# Axis labels; the experiments are presented as biological replicates.
plt.xlabel('Biological Replicate')
plt.ylabel('CCS Error')
# Dashed zero-error reference line; the x-range is presumably chosen to span the three violins.
plt.hlines(y = 0.0, xmin = -0.5, xmax= 2.5, linestyles='--', colors='grey')

# Display the plot
plt.show()
# Mean signed CCS error per experiment, printed in the order 51, 64, 28.
print(df_pred[df_pred['Experiment']=='P064051']['ccs_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064064']['ccs_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064428']['ccs_diff'].mean())

In [None]:
# Mean signed 1/K0 error per experiment, printed in the order 51, 64, 28.
print(df_pred[df_pred['Experiment']=='P064051']['IM_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064064']['IM_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064428']['IM_diff'].mean())

In [None]:
# Violin plot: CCS error per charge state.
df_comp = df_pred
plt.figure(figsize=(10, 6))
# `density_norm` is the spelling the other violin plots in this notebook use;
# `scale` is its deprecated predecessor.
sns.violinplot(x='Charge', y='ccs_diff', data=df_comp, density_norm='count')#inner='box',

# Set plot labels and title
plt.xlabel('Charge')
plt.ylabel('CCS Error')
# Dashed zero-error reference line.
plt.hlines(y = 0.0, xmin = -0.1, xmax= 3.1, linestyles='--', colors='grey')
plt.tight_layout()

# Display the plot
plt.show()

# Mean and standard deviation of the error for charge states 1-4: first for CCS, then
# for ion mobility. Per charge state the mean is printed first, then the standard deviation.
for label, column in (('CCS:', 'ccs_diff'), ('IM:', 'IM_diff')):
    print(label)
    for charge in (1, 2, 3, 4):
        errors = df_comp[df_comp['Charge']==charge][column]
        print(errors.mean())
        print(errors.std())

## Ex 64

In [None]:
# Experiment 64: draw 20 % of the rows of `df_64` as training set (fixed seed).
df = df_64
df_train = df.sample(n = round(0.2*len(df)), random_state=42)
print(len(df_train))

In [None]:
# Convert the training rows into the AlphaBase PSM format.
# Keep the current row index as an 'Original index' column.
df_train.loc[:,'Original index'] = df_train.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df_train)
mq_reader._transform_table(df_train)
mq_reader._translate_decoy(df_train)
mq_reader._translate_score(df_train)
mq_reader._load_modifications(df_train)
mq_reader._translate_modifications()
mq_reader._post_process(df_train)
# Converted table as exposed by the reader.
df_train_ab = mq_reader.psm_df

In [None]:
# Start again from a new model manager with the installed models, replacing the
# previously fine-tuned `models` object.
models = ModelManager(device = 'gpu')
models.load_installed_models()


In [None]:
# Fine-tune the CCS model on the experiment-64 training rows.
models.train_ccs_model(df_train_ab)

In [None]:
# Convert all rows of `df_64` into the AlphaBase PSM format.
df = df_64
# Keep the current row index as an 'Original index' column.
df.loc[:,'Original index'] = df.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df)
mq_reader._transform_table(df)
mq_reader._translate_decoy(df)
mq_reader._translate_score(df)
mq_reader._load_modifications(df)
mq_reader._translate_modifications()
mq_reader._post_process(df)
# Converted table as exposed by the reader.
df_ab = mq_reader.psm_df

In [None]:
# Predict for the converted experiment-64 rows with the model fine-tuned above.
prediction = models.predict_mobility(df_ab)

In [None]:
# Merge the AlphaBase-format predictions back onto the MaxQuant rows of this experiment
# and summarise the CCS error.

# NOTE(review): index-aligned assignment; whether `df` and `prediction` share an index is
# not visible here — confirm. Only the `_y` columns from the merge are used below.
df['ccs_pred'] = prediction['ccs_pred']
df['mobility_pred'] = prediction['mobility_pred']
# AlphaBase column name -> MaxQuant column name.
mapping_dict = {
            'sequence': 'Sequence',
            'charge': 'Charge',
            'rt': 'Retention time',
            'ccs': 'CCS',
            'mobility': '1/K0',
            'scan_num': 'MS/MS scan number',
            'raw_name': 'Raw file',
            'precursor_mz': 'm/z',
            'score': 'Score',
            'proteins': 'Proteins',
            'genes': 'Gene names',
            'decoy': 'Reverse',
            'intensity': 'Intensity',
            'nAA':'Length'}
# Columns that must all match for a prediction row to be joined to a MaxQuant row.
merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time', 'Proteins', 'Gene names','1/K0' , 'MS/MS scan number', 'Raw file'
                                                  , 'm/z', 'Intensity' ]
# Rename the AlphaBase columns according to the mapping dict (in place).
prediction.rename(columns=mapping_dict, inplace=True)
# Move 'Original index' from a column into the index of the prediction frame (in place).
prediction.set_index('Original index', inplace=True)
# Both frames carry 'ccs_pred' / 'mobility_pred', so the merge suffixes them:
# `_x` comes from df, `_y` from the predictions.
df_merged = pd.merge(df, prediction, on = merging_list, how = 'inner')
#df_merged.to_csv('comparisons/comp__trans_orig.txt', sep = '\t')

# Signed errors: measured minus predicted.
df_merged['ccs_diff'] = np.subtract(df_merged['CCS'], df_merged['ccs_pred_y'])
df_merged['IM_diff'] = np.subtract(df_merged['1/K0'], df_merged['mobility_pred_y'])

sns.violinplot(x='Experiment', y='ccs_diff', data=df_merged, density_norm='count')
print(f"Mean Error: {df_merged['ccs_diff'].mean()}")
# 2.5th / 97.5th percentiles of the CCS error
perc_low = np.percentile(df_merged['ccs_diff'], 2.5)
perc_up = np.percentile(df_merged['ccs_diff'], 97.5)
print(f"Percentiles:({perc_low}, {perc_up})")

# Width of the interval between the two percentiles.
print(f"Window:{(perc_low)*(-1)+perc_up}")
window = (perc_low)*(-1)+perc_up


### Run 64 trained model on all experiments

In [None]:
# Re-read the evidence table.
# NOTE(review): the next cell immediately rebinds `df` to `df_base`, so this freshly read
# frame is not used — confirm which frame is intended.
df = pd.read_csv('evidence_freshfrozen_base.txt', sep = '\t')

In [None]:
# Convert the complete evidence table into the AlphaBase PSM format.
# NOTE(review): `df` is rebound to `df_base` here, discarding the frame read in the previous cell.
df = df_base
# Keep the current row index as an 'Original index' column.
df.loc[:,'Original index'] = df.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df)
mq_reader._transform_table(df)
mq_reader._translate_decoy(df)
mq_reader._translate_score(df)
mq_reader._load_modifications(df)
mq_reader._translate_modifications()
mq_reader._post_process(df)
# Converted table as exposed by the reader.
df_ab = mq_reader.psm_df

In [None]:
# Predict for the complete dataset with the model fine-tuned on experiment 64.
prediction = models.predict_mobility(df_ab)

In [None]:
# Merge the AlphaBase-format predictions back onto the MaxQuant table.

# NOTE(review): index-aligned assignment; whether `df` and `prediction` share an index is
# not visible here — confirm. Only the `_y` columns from the merge are used below.
df['ccs_pred'] = prediction['ccs_pred']
df['mobility_pred'] = prediction['mobility_pred']
# AlphaBase column name -> MaxQuant column name.
mapping_dict = {
            'sequence': 'Sequence',
            'charge': 'Charge',
            'rt': 'Retention time',
            'ccs': 'CCS',
            'mobility': '1/K0',
            'scan_num': 'MS/MS scan number',
            'raw_name': 'Raw file',
            'precursor_mz': 'm/z',
            'score': 'Score',
            'proteins': 'Proteins',
            'genes': 'Gene names',
            'decoy': 'Reverse',
            'intensity': 'Intensity',
            'nAA':'Length'}
# Columns that must all match for a prediction row to be joined to a MaxQuant row.
merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time', 'Proteins', 'Gene names','1/K0' , 'MS/MS scan number', 'Raw file'
                                                  , 'm/z', 'Intensity' ]
# Rename the AlphaBase columns according to the mapping dict (in place).
prediction.rename(columns=mapping_dict, inplace=True)
# Move 'Original index' from a column into the index of the prediction frame (in place).
prediction.set_index('Original index', inplace=True)
# Both frames carry 'ccs_pred' / 'mobility_pred', so the merge suffixes them:
# `_x` comes from df, `_y` from the predictions.
df_pred = pd.merge(df, prediction, on = merging_list, how = 'inner')
#df_merged.to_csv('comparisons/comp__trans_orig.txt', sep = '\t')
# Signed errors: measured minus predicted.
df_pred['ccs_diff'] = np.subtract(df_pred['CCS'], df_pred['ccs_pred_y'])
df_pred['IM_diff'] = np.subtract(df_pred['1/K0'], df_pred['mobility_pred_y'])


In [None]:
# Violin plot: CCS error per experiment for the model fine-tuned on experiment 64.
plt.figure(figsize=(10, 6))
sns.violinplot(x='Experiment', y='ccs_diff', data=df_pred, density_norm='count')

# Axis labels; the experiments are presented as biological replicates.
plt.xlabel('Biological Replicate')
plt.ylabel('CCS Error')
# Dashed zero-error reference line; the x-range is presumably chosen to span the three violins.
plt.hlines(y = 0.0, xmin = -0.5, xmax= 2.5, linestyles='--', colors='grey')

# Display the plot
plt.show()
# Mean signed CCS error per experiment, printed in the order 51, 64, 28.
print(df_pred[df_pred['Experiment']=='P064051']['ccs_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064064']['ccs_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064428']['ccs_diff'].mean())

In [None]:
# Mean signed 1/K0 error per experiment, printed in the order 51, 64, 28.
print(df_pred[df_pred['Experiment']=='P064051']['IM_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064064']['IM_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064428']['IM_diff'].mean())

## Ex 28

In [None]:
# Experiment 28: draw 20 % of the rows of `df_28` as training set (fixed seed).
df = df_28
df_train = df.sample(n = round(0.2*len(df)), random_state=42)
print(len(df_train))

In [None]:
# Convert the training rows into the AlphaBase PSM format.
# Keep the current row index as an 'Original index' column.
df_train.loc[:,'Original index'] = df_train.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df_train)
mq_reader._transform_table(df_train)
mq_reader._translate_decoy(df_train)
mq_reader._translate_score(df_train)
mq_reader._load_modifications(df_train)
mq_reader._translate_modifications()
mq_reader._post_process(df_train)
# Converted table as exposed by the reader.
df_train_ab = mq_reader.psm_df

In [None]:
# Start again from a new model manager with the installed models, replacing the
# previously fine-tuned `models` object.
models = ModelManager(device = 'gpu')
models.load_installed_models()

In [None]:
# Fine-tune the CCS model on the experiment-28 training rows.
models.train_ccs_model(df_train_ab)

In [None]:
# Convert all rows of `df_28` into the AlphaBase PSM format.
df = df_28
# Keep the current row index as an 'Original index' column.
df.loc[:,'Original index'] = df.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df)
mq_reader._transform_table(df)
mq_reader._translate_decoy(df)
mq_reader._translate_score(df)
mq_reader._load_modifications(df)
mq_reader._translate_modifications()
mq_reader._post_process(df)
# Converted table as exposed by the reader.
df_ab = mq_reader.psm_df

In [None]:
# Predict for the converted experiment-28 rows with the model fine-tuned above.
prediction = models.predict_mobility(df_ab)

In [None]:
# Merge the AlphaBase-format predictions back onto the MaxQuant rows of this experiment
# and summarise the CCS error.

# NOTE(review): index-aligned assignment; whether `df` and `prediction` share an index is
# not visible here — confirm. Only the `_y` columns from the merge are used below.
df['ccs_pred'] = prediction['ccs_pred']
df['mobility_pred'] = prediction['mobility_pred']
# AlphaBase column name -> MaxQuant column name.
mapping_dict = {
            'sequence': 'Sequence',
            'charge': 'Charge',
            'rt': 'Retention time',
            'ccs': 'CCS',
            'mobility': '1/K0',
            'scan_num': 'MS/MS scan number',
            'raw_name': 'Raw file',
            'precursor_mz': 'm/z',
            'score': 'Score',
            'proteins': 'Proteins',
            'genes': 'Gene names',
            'decoy': 'Reverse',
            'intensity': 'Intensity',
            'nAA':'Length'}
# Columns that must all match for a prediction row to be joined to a MaxQuant row.
merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time', 'Proteins', 'Gene names','1/K0' , 'MS/MS scan number', 'Raw file'
                                                  , 'm/z', 'Intensity' ]
# Rename the AlphaBase columns according to the mapping dict (in place).
prediction.rename(columns=mapping_dict, inplace=True)
# Setting the original index as index is disabled here:
#prediction.set_index('Original index', inplace=True)
# Both frames carry 'ccs_pred' / 'mobility_pred', so the merge suffixes them:
# `_x` comes from df, `_y` from the predictions.
df_merged = pd.merge(df, prediction, on = merging_list, how = 'inner')
#df_merged.to_csv('comparisons/comp__trans_orig.txt', sep = '\t')

# Signed errors: measured minus predicted.
df_merged['ccs_diff'] = np.subtract(df_merged['CCS'], df_merged['ccs_pred_y'])
df_merged['IM_diff'] = np.subtract(df_merged['1/K0'], df_merged['mobility_pred_y'])

sns.violinplot(x='Experiment', y='ccs_diff', data=df_merged, density_norm='count')
print(f"Mean Error: {df_merged['ccs_diff'].mean()}")
# 2.5th / 97.5th percentiles of the CCS error
perc_low = np.percentile(df_merged['ccs_diff'], 2.5)
perc_up = np.percentile(df_merged['ccs_diff'], 97.5)
print(f"Percentiles:({perc_low}, {perc_up})")

# Width of the interval between the two percentiles.
print(f"Window:{(perc_low)*(-1)+perc_up}")
window = (perc_low)*(-1)+perc_up


### Run 28 trained model on all experiments

In [None]:
# Re-read the evidence table.
# NOTE(review): the next cell immediately rebinds `df` to `df_base`, so this freshly read
# frame is not used — confirm which frame is intended.
df = pd.read_csv('evidence_freshfrozen_base.txt', sep = '\t')

In [None]:
# Convert the complete evidence table into the AlphaBase PSM format.
# NOTE(review): `df` is rebound to `df_base` here, discarding the frame read in the previous cell.
df = df_base
# Keep the current row index as an 'Original index' column.
df.loc[:,'Original index'] = df.index
# MaxQuant reader, with the extra column registered in its column mapping.
mq_reader = psm_reader_provider.get_reader('maxquant')
mq_reader.column_mapping['Original index'] = 'Original index'
# Run the reader's processing steps directly on the in-memory frame.
# NOTE(review): these are underscore-prefixed reader methods; keep the call order as is.
mq_reader._translate_columns(df)
mq_reader._transform_table(df)
mq_reader._translate_decoy(df)
mq_reader._translate_score(df)
mq_reader._load_modifications(df)
mq_reader._translate_modifications()
mq_reader._post_process(df)
# Converted table as exposed by the reader.
df_ab = mq_reader.psm_df

In [None]:
# Predict for the complete dataset with the model fine-tuned on experiment 28.
prediction = models.predict_mobility(df_ab)

In [None]:
# Merge the AlphaBase-format predictions back onto the MaxQuant table.

# NOTE(review): index-aligned assignment; whether `df` and `prediction` share an index is
# not visible here — confirm. Only the `_y` columns from the merge are used below.
df['ccs_pred'] = prediction['ccs_pred']
df['mobility_pred'] = prediction['mobility_pred']
# AlphaBase column name -> MaxQuant column name.
mapping_dict = {
            'sequence': 'Sequence',
            'charge': 'Charge',
            'rt': 'Retention time',
            'ccs': 'CCS',
            'mobility': '1/K0',
            'scan_num': 'MS/MS scan number',
            'raw_name': 'Raw file',
            'precursor_mz': 'm/z',
            'score': 'Score',
            'proteins': 'Proteins',
            'genes': 'Gene names',
            'decoy': 'Reverse',
            'intensity': 'Intensity',
            'nAA':'Length'}
# Columns that must all match for a prediction row to be joined to a MaxQuant row.
merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time', 'Proteins', 'Gene names','1/K0' , 'MS/MS scan number', 'Raw file'
                                                  , 'm/z', 'Intensity' ]
# Rename the AlphaBase columns according to the mapping dict (in place).
prediction.rename(columns=mapping_dict, inplace=True)
# Move 'Original index' from a column into the index of the prediction frame (in place).
prediction.set_index('Original index', inplace=True)
# Both frames carry 'ccs_pred' / 'mobility_pred', so the merge suffixes them:
# `_x` comes from df, `_y` from the predictions.
df_pred = pd.merge(df, prediction, on = merging_list, how = 'inner')
#df_merged.to_csv('comparisons/comp__trans_orig.txt', sep = '\t')
# Signed errors: measured minus predicted.
df_pred['ccs_diff'] = np.subtract(df_pred['CCS'], df_pred['ccs_pred_y'])
df_pred['IM_diff'] = np.subtract(df_pred['1/K0'], df_pred['mobility_pred_y'])


In [None]:
# Violin plot: CCS error per experiment for the model fine-tuned on experiment 28.
plt.figure(figsize=(10, 6))
sns.violinplot(x='Experiment', y='ccs_diff', data=df_pred, density_norm='count')

# Axis labels; the experiments are presented as biological replicates.
plt.xlabel('Biological Replicate')
plt.ylabel('CCS Error')
# Dashed zero-error reference line; the x-range is presumably chosen to span the three violins.
plt.hlines(y = 0.0, xmin = -0.5, xmax= 2.5, linestyles='--', colors='grey')

# Display the plot
plt.show()
# Mean signed CCS error per experiment, printed in the order 51, 64, 28.
print(df_pred[df_pred['Experiment']=='P064051']['ccs_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064064']['ccs_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064428']['ccs_diff'].mean())

In [None]:
# Mean signed 1/K0 error per experiment, printed in the order 51, 64, 28.
print(df_pred[df_pred['Experiment']=='P064051']['IM_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064064']['IM_diff'].mean())
print(df_pred[df_pred['Experiment']=='P064428']['IM_diff'].mean())

# Rawfile wise

In [None]:
def mq_to_ab(df: pd.DataFrame):
    """Convert a MaxQuant-format frame into the AlphaBase PSM table.

    The function adds an ``'Original index'`` column (a copy of the frame's
    index) to ``df`` in place, registers that column in the column mapping of
    a newly created MaxQuant reader, runs the reader's processing steps on
    ``df`` and returns the reader's ``psm_df``.

    NOTE(review): ``df`` is written to, so passing a slice of another frame
    can raise pandas' SettingWithCopyWarning — consider passing a copy.
    NOTE(review): the processing steps are underscore-prefixed reader
    methods; keep the call order as is.
    """
    # Keep the current row index as a regular column.
    df.loc[:,'Original index'] = df.index
    # MaxQuant reader, with the extra column registered in its column mapping.
    mq_reader = psm_reader_provider.get_reader('maxquant')
    mq_reader.column_mapping['Original index'] = 'Original index'
    # Run the reader's processing steps directly on the in-memory frame.
    mq_reader._translate_columns(df)
    mq_reader._transform_table(df)
    mq_reader._translate_decoy(df)
    mq_reader._translate_score(df)
    mq_reader._load_modifications(df)
    mq_reader._translate_modifications()
    mq_reader._post_process(df)
    # Converted table as exposed by the reader.
    df_ab = mq_reader.psm_df
    return df_ab

In [None]:
def ab_to_mq(df: pd.DataFrame, prediction:pd.DataFrame):
    """Join AlphaBase-format predictions back onto MaxQuant rows and add error columns.

    Neither input frame is modified: all work happens on copies, so the
    function can be called repeatedly with the same arguments and can safely
    be handed a slice of a larger frame.

    Parameters
    ----------
    df : pd.DataFrame
        MaxQuant-format rows. Must contain every column of the merge key
        (``merging_list`` below).
    prediction : pd.DataFrame
        Prediction frame using AlphaBase column names. Must contain the
        AlphaBase counterparts of the merge key plus ``'Original index'``,
        ``'ccs_pred'`` and ``'mobility_pred'``.

    Returns
    -------
    pd.DataFrame
        Inner join of both frames on the merge key. The prediction columns
        appear as ``ccs_pred_y`` / ``mobility_pred_y``; ``ccs_diff`` is
        ``CCS - ccs_pred_y`` and ``IM_diff`` is ``1/K0 - mobility_pred_y``.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    mq_df = df.copy()
    # Index-aligned copy of the prediction columns onto the MaxQuant side. It is kept so
    # that the merge below still yields the `_x` / `_y` suffixed columns.
    # NOTE(review): the two frames are not guaranteed to share an index, so the `_x`
    # values may not belong to their row; only the `_y` columns are used for the errors.
    mq_df['ccs_pred'] = prediction['ccs_pred']
    mq_df['mobility_pred'] = prediction['mobility_pred']
    # AlphaBase column name -> MaxQuant column name.
    mapping_dict = {
                'sequence': 'Sequence',
                'charge': 'Charge',
                'rt': 'Retention time',
                'ccs': 'CCS',
                'mobility': '1/K0',
                'scan_num': 'MS/MS scan number',
                'raw_name': 'Raw file',
                'precursor_mz': 'm/z',
                'score': 'Score',
                'proteins': 'Proteins',
                'genes': 'Gene names',
                'decoy': 'Reverse',
                'intensity': 'Intensity',
                'nAA':'Length'}
    # Columns that must all match for a prediction row to be joined to a MaxQuant row.
    merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time', 'Proteins', 'Gene names','1/K0' , 'MS/MS scan number', 'Raw file'
                                                    , 'm/z', 'Intensity' ]
    # Rename to MaxQuant column names and move 'Original index' into the index.
    # Both calls return new frames, so the caller's `prediction` stays untouched.
    pred_mq = prediction.rename(columns=mapping_dict).set_index('Original index')
    df_merged = pd.merge(mq_df, pred_mq, on = merging_list, how = 'inner')
    # Signed errors: measured minus predicted.
    df_merged['ccs_diff'] = df_merged['CCS'] - df_merged['ccs_pred_y']
    df_merged['IM_diff'] = df_merged['1/K0'] - df_merged['mobility_pred_y']
    return df_merged

In [None]:
# Per-raw-file transfer learning across all experiments: for every raw file a fresh
# model is fine-tuned on 20 % of its rows and evaluated on all rows of that raw file.
df = df_base
# experiment -> raw file -> [mean error, 2.5th percentile, 97.5th percentile, window width]
result_dict = {}
for experiment in df['Experiment'].unique():
    print(experiment)
    df_ex = df[df['Experiment']== experiment]
    result_dict[experiment] = {}
    # Iterate over the DISTINCT raw files. Iterating over the column itself would repeat
    # the whole train/predict cycle once per row.
    for raw_file in df_ex['Raw file'].unique():
        print(raw_file)
        # Copy the slice: mq_to_ab writes an 'Original index' column into the frame it is given.
        df_raw = df_ex[df_ex['Raw file']== raw_file].copy()
        df_ab = mq_to_ab(df_raw)
        # Fixed seed so the training split, and with it the stored statistics, is reproducible.
        df_train = df_ab.sample(n = round(0.2*len(df_ab)), random_state = 42)
        # Start from the installed models for every raw file.
        models = ModelManager(device = 'gpu')
        models.load_installed_models()
        models.train_ccs_model(df_train)
        prediction = models.predict_mobility(df_ab)
        df_result = ab_to_mq(df_raw, prediction)
        mean = df_result['ccs_diff'].mean()
        perc_low = np.percentile(df_result['ccs_diff'], 2.5)
        perc_up = np.percentile(df_result['ccs_diff'], 97.5)
        window = (perc_low)*(-1)+perc_up
        result_dict[experiment][raw_file] = [mean, perc_low, perc_up, window]

        


## 51

In [None]:

# Per-raw-file transfer learning for experiment 51: for every raw file a fresh model is
# fine-tuned on 30 % of its rows and evaluated on all rows of that raw file.
df_ex = df_51
# raw file -> [mean error, 2.5th percentile, 97.5th percentile, window width]
result_dict_51 = {}
for raw_file in df_ex['Raw file'].unique():
    print(raw_file)
    # Copy the slice: mq_to_ab writes an 'Original index' column into the frame it is given.
    df_raw = df_ex[df_ex['Raw file']== raw_file].copy()
    df_ab = mq_to_ab(df_raw)
    # Fixed seed so the training split, and with it the stored statistics, is reproducible.
    df_train = df_ab.sample(n = round(0.3*len(df_ab)), random_state = 42)
    print(len(df_train))
    print(len(df_raw))
    # Start from the installed models for every raw file.
    models = ModelManager(device = 'gpu')
    models.load_installed_models()
    models.train_ccs_model(df_train)
    prediction = models.predict_mobility(df_ab)
    df_result = ab_to_mq(df_raw, prediction)
    mean = df_result['ccs_diff'].mean()
    perc_low = np.percentile(df_result['ccs_diff'], 2.5)
    perc_up = np.percentile(df_result['ccs_diff'], 97.5)
    window = (perc_low)*(-1)+perc_up
    result_dict_51[raw_file] = [mean, perc_low, perc_up, window]


In [None]:
# Persist the per-raw-file results of experiment 51 (computed with the 30 % training share).
import pickle
with open('TL_rawfile_51_30.pkl', 'wb') as f:
    pickle.dump(result_dict_51, f)

In [None]:
# Read back the results file written by the previous cell, as the 64 and 28 sections do
# with their files. This cell used to load 'TL_rawfile_51.pkl', which no cell in this
# notebook writes, and in doing so replaced the freshly computed `result_dict_51`.
with open('TL_rawfile_51_30.pkl', 'rb') as f:
    result_dict_51 = pickle.load(f)

## 64

In [None]:

# Per-raw-file transfer learning for experiment 64: for every raw file a fresh model is
# fine-tuned on 20 % of its rows and evaluated on all rows of that raw file.
df_ex = df_64
# raw file -> [mean error, 2.5th percentile, 97.5th percentile, window width]
result_dict_64 = {}
for raw_file in df_ex['Raw file'].unique():
    print(raw_file)
    # Copy the slice: mq_to_ab writes an 'Original index' column into the frame it is given.
    df_raw = df_ex[df_ex['Raw file']== raw_file].copy()
    df_ab = mq_to_ab(df_raw)
    # Fixed seed so the training split, and with it the stored statistics, is reproducible.
    df_train = df_ab.sample(n = round(0.2*len(df_ab)), random_state = 42)
    # Start from the installed models for every raw file.
    models = ModelManager(device = 'gpu')
    models.load_installed_models()
    models.train_ccs_model(df_train)
    prediction = models.predict_mobility(df_ab)
    df_result = ab_to_mq(df_raw, prediction)
    mean = df_result['ccs_diff'].mean()
    perc_low = np.percentile(df_result['ccs_diff'], 2.5)
    perc_up = np.percentile(df_result['ccs_diff'], 97.5)
    window = (perc_low)*(-1)+perc_up
    result_dict_64[raw_file] = [mean, perc_low, perc_up, window]


In [None]:
# Persist the per-raw-file results of experiment 64.
import pickle
with open('TL_rawfile_64.pkl', 'wb') as f:
    pickle.dump(result_dict_64, f)

In [None]:
# Read the stored experiment-64 results back into `loaded_dict`.
with open('TL_rawfile_64.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

## 28

In [None]:

# Per-raw-file transfer learning for experiment 28: for every raw file a fresh model is
# fine-tuned on 20 % of its rows and evaluated on all rows of that raw file.
df_ex = df_28
# raw file -> [mean error, 2.5th percentile, 97.5th percentile, window width]
result_dict_28 = {}
for raw_file in df_ex['Raw file'].unique():
    print(raw_file)
    # Copy the slice: mq_to_ab writes an 'Original index' column into the frame it is given.
    df_raw = df_ex[df_ex['Raw file']== raw_file].copy()
    df_ab = mq_to_ab(df_raw)
    # Fixed seed so the training split, and with it the stored statistics, is reproducible.
    df_train = df_ab.sample(n = round(0.2*len(df_ab)), random_state = 42)
    # Start from the installed models for every raw file.
    models = ModelManager(device = 'gpu')
    models.load_installed_models()
    models.train_ccs_model(df_train)
    prediction = models.predict_mobility(df_ab)
    df_result = ab_to_mq(df_raw, prediction)
    mean = df_result['ccs_diff'].mean()
    perc_low = np.percentile(df_result['ccs_diff'], 2.5)
    perc_up = np.percentile(df_result['ccs_diff'], 97.5)
    window = (perc_low)*(-1)+perc_up
    result_dict_28[raw_file] = [mean, perc_low, perc_up, window]


In [None]:
import pickle

# Persist the per-raw-file statistics held in result_dict_28.
with open('TL_rawfile_28.pkl', 'wb') as out_file:
    pickle.dump(result_dict_28, out_file)

In [None]:
# Read the saved statistics back as a round-trip check; the result lands in loaded_dict.
with open('TL_rawfile_28.pkl', 'rb') as in_file:
    loaded_dict = pickle.load(in_file)

## Try different training sizes

In [None]:
# Number of distinct raw files in df_51, followed by their names.
raw_files_51 = df_51['Raw file'].unique()
print(len(raw_files_51))
print(raw_files_51)

In [None]:
# Hand-picked raw files used for the training-size experiment.
rawfile_sample = [
    '5471_P064051_R1_U23_GG9_1_2596',
    '5471_P064051_R1_U28_GE10_1_2601',
    '5471_P064051_R1_U33_GA11_1_2606',
    '5471_P064051_R1_U45_GD12_1_2618',
    '5471_P064051_R1_U38_GF11_1_2611',
    '5471_P064051_R1_U2_GB7_1_2575',
    '5471_P064051_R1_U41_GH12_1_2614',
    '5471_P064051_R1_U46_GC12_1_2619',
    '5471_P064051_R1_U1_GA7_1_2574',
    '5471_P064051_R1_U13_GD8_1_2586',
]
# Fractions of each raw file's PSMs used for fine-tuning.
train_size = [0.6, 0.7, 0.8]

In [None]:
# Keep only the PSMs that belong to the selected raw files.
in_sample = df_51['Raw file'].isin(rawfile_sample)
df_try = df_51[in_sample]

In [None]:
# Sanity check: subset size and how many of the selected raw files are present.
print(df_try.shape)
raw_files_try = df_try['Raw file'].unique()
print(len(raw_files_try))

In [None]:

# Training-size experiment: repeat the per-raw-file fine-tuning for each fraction
# in train_size and write one pickle of statistics per fraction.
df_ex = df_try
for size in train_size:
    print(size)
    result_dict = {}
    for raw_file in df_ex['Raw file'].unique():
        df_raw = df_ex[df_ex['Raw file'] == raw_file]
        df_ab = mq_to_ab(df_raw)
        # Fixed seed so each fraction is trained on a reproducible subset and the
        # comparison between fractions is not confounded by sampling noise.
        df_train = df_ab.sample(n=round(size * len(df_ab)), random_state=42)
        # New manager and reloaded installed models for every raw file.
        models = ModelManager(device='gpu')
        models.load_installed_models()
        models.train_ccs_model(df_train)
        prediction = models.predict_mobility(df_ab)
        df_result = ab_to_mq(df_raw, prediction)
        mean = df_result['ccs_diff'].mean()
        # Central 95 % interval of the CCS error and its width.
        perc_low, perc_up = np.percentile(df_result['ccs_diff'], [2.5, 97.5])
        window = perc_up - perc_low
        result_dict[raw_file] = [mean, perc_low, perc_up, window]
    # NOTE(review): size*100 is a float, so the file name contains e.g. "60.0";
    # the directory train_size_test/ must already exist.
    with open(f'train_size_test/51_{size*100}.pkl', 'wb') as f:
        pickle.dump(result_dict, f)
             


In [None]:
import pickle

# NOTE(review): same dump as the cell right after the experiment-51 loop. When the
# notebook runs top to bottom, result_dict_51 has been reloaded from
# 'TL_rawfile_51.pkl' by then, so this overwrites the *_30 file with that content.
# Confirm whether this cell is still needed.
with open('TL_rawfile_51_30.pkl', 'wb') as out_file:
    pickle.dump(result_dict_51, out_file)

## Plots

In [None]:
import pickle

# Statistics for experiment 51, as saved by the raw-file loop.
with open('TL_rawfile_51.pkl', 'rb') as in_file:
    result_dict_51 = pickle.load(in_file)


In [None]:
# Statistics for the other two experiments, as saved by their raw-file loops.
with open('TL_rawfile_64.pkl', 'rb') as in_file_64:
    result_dict_64 = pickle.load(in_file_64)

with open('TL_rawfile_28.pkl', 'rb') as in_file_28:
    result_dict_28 = pickle.load(in_file_28)

In [None]:
# Load the second experiment-51 run (presumably the 30 % training fraction, going by
# the file name) from its own pickle. Aliasing result_dict_51 here would make the
# later 20 % vs 30 % comparison compare the data with itself.
with open('TL_rawfile_51_30.pkl', 'rb') as f:
    result_dict_51_30 = pickle.load(f)

In [None]:
# One row per raw file, one column per stored statistic.
result_51 = (
    pd.DataFrame.from_dict(result_dict_51)
    .transpose()
    .set_axis(['Mean', 'Lower bound', 'Upper bound', 'Window'], axis=1)
)

In [None]:
# One row per raw file, one column per stored statistic.
result_51_30 = (
    pd.DataFrame.from_dict(result_dict_51_30)
    .transpose()
    .set_axis(['Mean', 'Lower bound', 'Upper bound', 'Window'], axis=1)
)

In [None]:
# One row per raw file, one column per stored statistic.
result_64 = (
    pd.DataFrame.from_dict(result_dict_64)
    .transpose()
    .set_axis(['Mean', 'Lower bound', 'Upper bound', 'Window'], axis=1)
)

In [None]:
# One row per raw file, one column per stored statistic.
result_28 = (
    pd.DataFrame.from_dict(result_dict_28)
    .transpose()
    .set_axis(['Mean', 'Lower bound', 'Upper bound', 'Window'], axis=1)
)

In [None]:
# Mean CCS error per raw file of result_51 with its 95 % interval as error bars.
df = result_51
# Distances from the mean to each bound; these columns are added to result_51 itself.
df['error_lower'] = df['Mean'].sub(df['Lower bound'])
df['error_upper'] = df['Upper bound'].sub(df['Mean'])

positions = range(len(df))
asymmetric_err = [df['error_lower'], df['error_upper']]

plt.figure(figsize=(15, 6))
plt.errorbar(x=positions, y=df['Mean'], yerr=asymmetric_err,
             fmt='o', capsize=5, capthick=2, elinewidth=2)
plt.scatter(x=positions, y=df['Mean'], label='CCS prediction', zorder=5)

plt.xlabel('Fractions')
plt.ylabel('Delta 95 bounds')
plt.grid(False)
# Zero-error reference line; the x-extent is hard-coded.
plt.hlines(y=0.0, xmin=-1.5, xmax=48.5, linestyles='--', colors='grey')
plt.show()

In [None]:
# Experiment 51 with the 30 % training fraction: mean CCS error per raw file with
# its 95 % interval as error bars. Uses result_51_30 (not result_51, which the
# previous cell already plots).
df = result_51_30
# Distances from the mean to each bound; these columns are added to result_51_30 itself.
df['error_lower'] = df['Mean'] - df['Lower bound']
df['error_upper'] = df['Upper bound'] - df['Mean']

# Plot setup
plt.figure(figsize=(15, 6))

# Plotting the error bars
plt.errorbar(x=range(len(df)), y=df['Mean'],
             yerr=[df['error_lower'], df['error_upper']],
             fmt='o', capsize=5, capthick=2, elinewidth=2)
plt.scatter(x=range(len(df)), y=df['Mean'], label='CCS prediction', zorder=5)
# Customize the plot
plt.xlabel('Fractions')
plt.ylabel('Delta 95 bounds')
plt.grid(False)
# Zero-error reference line; the x-extent is hard-coded.
plt.hlines(y=0.0, xmin=-1.5, xmax=48.5, linestyles='--', colors='grey')
plt.show()

In [None]:
# Mean CCS error per raw file of result_64 with its 95 % interval as error bars.
df = result_64
# Distances from the mean to each bound; these columns are added to result_64 itself.
df['error_lower'] = df['Mean'].sub(df['Lower bound'])
df['error_upper'] = df['Upper bound'].sub(df['Mean'])

positions = range(len(df))
asymmetric_err = [df['error_lower'], df['error_upper']]

plt.figure(figsize=(15, 6))
plt.errorbar(x=positions, y=df['Mean'], yerr=asymmetric_err,
             fmt='o', capsize=5, capthick=2, elinewidth=2)
plt.scatter(x=positions, y=df['Mean'], label='CCS prediction', zorder=5)

plt.xlabel('Fractions')
plt.ylabel('Delta 95 bounds')
plt.grid(False)
# Zero-error reference line; the x-extent is hard-coded.
plt.hlines(y=0.0, xmin=-1.5, xmax=48.5, linestyles='--', colors='grey')
plt.show()

In [None]:
# Mean CCS error per raw file of result_28 with its 95 % interval as error bars.
df = result_28
# Distances from the mean to each bound; these columns are added to result_28 itself.
df['error_lower'] = df['Mean'].sub(df['Lower bound'])
df['error_upper'] = df['Upper bound'].sub(df['Mean'])

positions = range(len(df))
asymmetric_err = [df['error_lower'], df['error_upper']]

plt.figure(figsize=(15, 6))
plt.errorbar(x=positions, y=df['Mean'], yerr=asymmetric_err,
             fmt='o', capsize=5, capthick=2, elinewidth=2)
plt.scatter(x=positions, y=df['Mean'], label='CCS prediction', zorder=5)

plt.xlabel('Fractions')
plt.ylabel('Delta 95 bounds')
plt.grid(False)
# Zero-error reference line; the x-extent is hard-coded.
plt.hlines(y=0.0, xmin=-1.5, xmax=48.5, linestyles='--', colors='grey')
plt.show()

In [None]:
# Tag every result frame with its experiment id, then stack them into df_trans.
for frame, label in ((result_51, 'P064051'),
                     (result_64, 'P064064'),
                     (result_28, 'P064428')):
    frame['Source'] = label
df_trans = pd.concat([result_51, result_64, result_28], ignore_index=True)

In [None]:
# Relabel for the training-fraction comparison and rebuild df_trans.
# This overwrites both result_51['Source'] and the previous df_trans.
for frame, label in ((result_51, '51_20'), (result_51_30, '51_30')):
    frame['Source'] = label
df_trans = pd.concat([result_51, result_51_30], ignore_index=True)

In [None]:
# Window-size distribution per experiment.
# The frame is built here because `df_trans` is rebound to the 20 % vs 30 %
# comparison by the cell above; on a top-to-bottom run, plotting `df_trans`
# would just repeat the comparison histogram below. `assign` leaves the
# result_* frames untouched.
df_window_by_experiment = pd.concat(
    [
        result_51.assign(Source='P064051'),
        result_64.assign(Source='P064064'),
        result_28.assign(Source='P064428'),
    ],
    ignore_index=True,
)
plt.figure(figsize=(10, 6))
sns.histplot(data=df_window_by_experiment, x='Window', hue='Source',
             multiple='dodge', palette='viridis', bins=48)
plt.xlabel('Window size')

In [None]:
# Comparison of the 20% and 30% training fractions: distribution of window sizes.
plt.figure(figsize=(10, 6))
sns.histplot(
    x=df_trans['Window'],
    hue=df_trans['Source'],
    multiple='dodge',
    palette='viridis',
    bins=48,
)
plt.xlabel('Window size')

# Charge

In [None]:
# Charge-balanced training set: the same number of PSMs for charge 2, 3 and 4.
# Charge 1 is left out because it has too few measurements.
df = df_base
n = 50000

charge = df['Charge']
df_2 = df[charge == 2].sample(n=n, random_state=42)
df_3 = df[charge == 3].sample(n=n, random_state=42)
df_4 = df[charge == 4].sample(n=n, random_state=42)

df_train = pd.concat(objs=[df_2, df_3, df_4])
print(len(df_train))

In [None]:
# Convert the training frame (MaxQuant evidence layout) into the AlphaBase PSM format.
# Keep the MaxQuant row index as a column so it can be mapped through the reader.
df_train.loc[:,'Original index'] = df_train.index
# Run the AlphaBase MaxQuant reader on the in-memory frame by calling its private
# pipeline steps by hand.
# NOTE(review): these are underscore-prefixed methods and the call order presumably
# mirrors what the reader does when importing a file — confirm against the installed
# alphabase version.
mq_reader = psm_reader_provider.get_reader('maxquant')
# Pass the extra column through under the same name.
mq_reader.column_mapping['Original index'] = 'Original index'
mq_reader._translate_columns(df_train)
mq_reader._transform_table(df_train)
mq_reader._translate_decoy(df_train)
mq_reader._translate_score(df_train)
mq_reader._load_modifications(df_train)
mq_reader._translate_modifications()
mq_reader._post_process(df_train)  
# The converted table is exposed on the reader as psm_df.
df_train_ab = mq_reader.psm_df

In [None]:
# Fresh model manager with the installed models loaded, so this section's fine-tuning
# does not reuse the `models` object fine-tuned in earlier cells.
models = ModelManager(device = 'gpu')
models.load_installed_models()

In [None]:
# Fine-tune the CCS model on the charge-balanced training set.
models.train_ccs_model(df_train_ab)

In [None]:
# Convert the full dataset (not only the training subset) into the AlphaBase PSM format.
# NOTE(review): df_mq is an alias of df_base, so the 'Original index' column below is
# added to df_base itself.
df_mq = df_base
df_mq.loc[:,'Original index'] = df_mq.index
# Run the AlphaBase MaxQuant reader on the in-memory frame by calling its private
# pipeline steps by hand.
# NOTE(review): underscore-prefixed methods; the call order presumably mirrors the
# reader's own import routine — confirm against the installed alphabase version.
mq_reader = psm_reader_provider.get_reader('maxquant')
# Pass the extra column through under the same name.
mq_reader.column_mapping['Original index'] = 'Original index'
mq_reader._translate_columns(df_mq)
mq_reader._transform_table(df_mq)
mq_reader._translate_decoy(df_mq)
mq_reader._translate_score(df_mq)
mq_reader._load_modifications(df_mq)
mq_reader._translate_modifications()
mq_reader._post_process(df_mq)  
# The converted table is exposed on the reader as psm_df.
df_ab = mq_reader.psm_df

In [None]:
# Predict the whole dataset with the fine-tuned model; later cells read the
# 'ccs_pred' and 'mobility_pred' columns of the returned frame.
prediction = models.predict_mobility(df_ab)

In [None]:
# Save the predictions as a tab-separated file (the predictions/ directory must exist).
prediction.to_csv('predictions/pred_trans_charge.txt', sep = '\t')

In [None]:
# Merge the AlphaBase-format predictions back onto the MaxQuant evidence table.

# AlphaBase column name -> MaxQuant column name.
mapping_dict = {
    'sequence': 'Sequence',
    'charge': 'Charge',
    'rt': 'Retention time',
    'ccs': 'CCS',
    'mobility': '1/K0',
    'scan_num': 'MS/MS scan number',
    'raw_name': 'Raw file',
    'precursor_mz': 'm/z',
    'score': 'Score',
    'proteins': 'Proteins',
    'genes': 'Gene names',
    'decoy': 'Reverse',
    'intensity': 'Intensity',
    'nAA': 'Length',
}
# Columns that together identify a PSM in both tables.
# NOTE(review): several of these are floats; exact-equality joins on floats are fragile
# and duplicate keys would multiply rows — verify the row count of df_merged.
merging_list = ['Sequence', 'Charge', 'CCS', 'Score', 'Length', 'Retention time',
                'Proteins', 'Gene names', '1/K0', 'MS/MS scan number', 'Raw file',
                'm/z', 'Intensity']

# Rename on a new frame so `prediction` keeps its AlphaBase column names.
df_ab = prediction.rename(columns=mapping_dict)

# Work on a copy so df_base is not modified through an alias.
df_comp = df_base.copy()
# These assignments align on the row index, which is not guaranteed to match between
# the two tables, so the resulting columns should not be trusted. They are kept because
# they make the merge suffix the prediction columns as 'ccs_pred_x'/'ccs_pred_y' and
# 'mobility_pred_x'/'mobility_pred_y'; later cells read the '_y' (right-hand) versions.
df_comp['ccs_pred'] = df_ab['ccs_pred']
df_comp['mobility_pred'] = df_ab['mobility_pred']

df_merged = pd.merge(df_comp, df_ab, on=merging_list, how='inner')
#df_merged.to_csv('comparisons/comp_trans_v2_orig.txt', sep = '\t')

In [None]:
# Row/column count after the inner merge; compare with the input tables to spot
# dropped or duplicated PSMs.
df_merged.shape

In [None]:
# Alternative entry point: load a previously saved comparison table.
# NOTE(review): the next cell rebinds df_comp to df_merged, so on a top-to-bottom run
# this load has no effect — run only one of the two cells.
df_comp = pd.read_csv('comparisons/comp_trans_orig.txt', sep = '\t')

In [None]:
# Use the freshly merged table for the analysis below (alias, not a copy: columns
# added to df_comp also appear on df_merged).
df_comp = df_merged

In [None]:
# Prediction error = measured value minus predicted value. The '_y' columns are the
# prediction columns that came from the right-hand side of the merge.
measured_ccs, predicted_ccs = df_comp['CCS'], df_comp['ccs_pred_y']
df_comp['ccs_diff'] = np.subtract(measured_ccs, predicted_ccs)
measured_im, predicted_im = df_comp['1/K0'], df_comp['mobility_pred_y']
df_comp['IM_diff'] = np.subtract(measured_im, predicted_im)

In [None]:
# Violin plot: charge vs CCS error
plt.figure(figsize=(10, 6))
# `density_norm` is the current name of the deprecated `scale` keyword and matches
# the experiment violin plot further down in this notebook.
sns.violinplot(x='Charge', y='ccs_diff', data=df_comp, density_norm='count')

# Set plot labels and the zero-error reference line
plt.xlabel('Charge')
plt.ylabel('CCS Error')
plt.hlines(y=0.0, xmin=-0.1, xmax=3.1, linestyles='--', colors='grey')
plt.tight_layout()

# Display the plot
plt.show()

# Mean and standard deviation of the error for charge 1-4, first for CCS, then for
# ion mobility (output order: header, then mean/std per charge).
for header, column in (('CCS:', 'ccs_diff'), ('IM:', 'IM_diff')):
    print(header)
    for charge in (1, 2, 3, 4):
        errors = df_comp.loc[df_comp['Charge'] == charge, column]
        print(errors.mean())
        print(errors.std())

In [None]:
# Violin plot: experiment vs CCS error
plt.figure(figsize=(10, 6))
sns.violinplot(data=df_comp, x='Experiment', y='ccs_diff', density_norm='count')

# Axis labels and zero-error reference line
plt.xlabel('Biological Replicate')
plt.ylabel('CCS Error')
plt.hlines(y=0.0, xmin=-0.5, xmax=2.5, linestyles='--', colors='grey')

plt.show()

# Mean CCS error per experiment, in the order 51, 64, 28.
for experiment in ('P064051', 'P064064', 'P064428'):
    print(df_comp[df_comp['Experiment'] == experiment]['ccs_diff'].mean())

In [None]:
# Distribution of the ion-mobility error with a zero reference line.
im_error = df_comp['IM_diff']
plt.hist(im_error, bins=100)
plt.axvline(x=0.0, c='black', linestyle='--')
plt.title('Adjusted Error(Transfer Learning)')
plt.xlabel('1/K0 Difference')

In [None]:
# Overall mean and standard deviation of the CCS error.
ccs_error = df_comp['ccs_diff']
print(ccs_error.mean())
print(ccs_error.std())

In [None]:
# Mean absolute CCS error of the comparison table analysed in this section.
# (`df_pred` is not created anywhere in this section; every neighbouring statistic
# is computed on `df_comp`.)
print(df_comp['ccs_diff'].abs().mean())

In [None]:
# Central 95 % interval of the CCS difference.
perc_low, perc_up = np.percentile(df_comp['ccs_diff'], [2.5, 97.5])
print(f'({perc_low}, {perc_up})')


In [None]:
# Width of the interval computed in the previous cell.
window = -perc_low + perc_up
print(window)

In [None]:
# Overall mean and standard deviation of the ion-mobility error.
im_error = df_comp['IM_diff']
print(im_error.mean())
print(im_error.std())


In [None]:
# Central 95 % interval of the ion-mobility difference.
perc_low, perc_up = np.percentile(df_comp['IM_diff'], [2.5, 97.5])
print(f'({perc_low}, {perc_up})')


In [None]:
# Width of the interval computed in the previous cell.
window = -perc_low + perc_up
print(window)

# AlphaPeptDeep Training Set

In [None]:
# Tab-separated evidence table from the Hela_lys folder of the original training set.
df_lys = pd.read_csv('orig_trainset/Hela_lys/evidence.txt', sep ='\t')

In [None]:
# Tab-separated evidence table from the Hela_trypsin folder of the original training set.
df_tryp = pd.read_csv('orig_trainset/Hela_trypsin/evidence.txt', sep = '\t')

In [None]:
# Shapes of the two evidence tables (lys first, then trypsin).
for evidence in (df_lys, df_tryp):
    print(evidence.shape)

In [None]:
# Stack both evidence tables row-wise.
# NOTE(review): this rebinds df_train, which earlier cells use for the fine-tuning
# subset; the original row indices are kept, so index labels can repeat.
df_train = pd.concat([df_lys, df_tryp], axis = 0)

In [None]:
# Shape of the combined table.
print(df_train.shape)

In [None]:
# Number of rows per charge state 1-5 in the combined table.
for charge in range(1, 6):
    n_rows = len(df_train[df_train['Charge'] == charge])
    print(f"Charge {charge}:{n_rows}")

In [None]:
# All charge states that occur in the combined table.
print(df_train['Charge'].unique())

In [None]:
# Distribution of the 'Length' column in the combined table.
plt.hist(x = df_train['Length'], bins = 60)

In [None]:
# Largest value of the 'Length' column.
print(df_train['Length'].max())

In [None]:
# CCS range: minimum, maximum and span.
ccs_min = df_train['CCS'].min()
ccs_max = df_train['CCS'].max()
print(ccs_min)
print(ccs_max)
print(ccs_max - ccs_min)

In [None]:
# Ion-mobility range: minimum, maximum and span of the 'K0' column.
# NOTE(review): the fresh-frozen evidence table names this column '1/K0'; confirm
# that 'K0' in this older export holds the same quantity.
k0_min = df_train['K0'].min()
k0_max = df_train['K0'].max()
print(k0_min)
print(k0_max)
print(k0_max - k0_min)

In [None]:
# Column names of the combined table (useful for checking the exact mobility column name).
print(df_train.columns)