<a href="https://colab.research.google.com/github/yc386/anubis_palaeoproteomics/blob/main/anubis_position_lv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run Anubis for position-specific deamidation abundance with RF probabilities.<br>BLG-only!

Please note that the following columns are required, in addition to the `sample` and `FileName` columns used for grouping. Have a look at example.csv.


1.   pattern_position
2.   modified_count
3.   ms2_intensity
4.   pep_len
5.   rf_pred_prob



In [None]:
#@title Add path to .csv
#@markdown **Inputs**
# Location of the input CSV; the execution cell passes this value to pd.read_csv.
csv_file = "path to a .csv" #@param {type:"string"}

In [None]:
#@title import libs and functions


import pandas as pd
import regex as re
import numpy as np

"""
calculate position-specific deamidation abundance for samples.

Parameters:
df (pd.DataFrame): Input DataFrame containing the necessary columns.
Please check 'sample', 'FileName', 'pattern_position', 'modified_count', 'ms2_intensity', 'pep_len', 'rf_pred_prob' already in the df,
if not check train_a_RF_for_anubis.ipynb to generate them. Currently, Anubis only offers a BLG-focused model.

Returns:
pd.DataFrame: Transformed DataFrame with computed 'de_abundance', 'de_type', and log10 values.
"""
def anubis_position_lv (df):
  grouped = df.groupby(['sample', 'FileName', 'pattern_position']).agg(
        total_pattern_count=('pattern_position', 'size'),
        modified_count=('modified_count', 'sum'),
        ms2_intensity=('ms2_intensity', 'sum'),
        pep_len=('pep_len', 'sum'),
        rf_pred_prob_mean=('rf_pred_prob', 'mean')
    ).reset_index()

  grouped['de_abundance'] = (
        (grouped['modified_count'] / grouped['total_pattern_count']) *
        (grouped['ms2_intensity'] / grouped['pep_len']) *
        grouped['rf_pred_prob_mean']
    )

  grouped['de_type'] = np.where(
        grouped['pattern_position'].str.contains('N', regex=False), 'N2D', 'Q2E'
    )

  df1 = grouped.pivot_table(
        values="de_abundance", index=['sample', 'FileName'], columns="de_type"
    ).reset_index()

  df2 = df1.fillna(0)
  df3=df2.assign(
        log10_N=lambda x: np.where(x['N2D'] > 0, np.log10(x['N2D']), 0),
        log10_Q=lambda x: np.where(x['Q2E'] > 0, np.log10(x['Q2E']), 0)
  )
  return df3


In [None]:
#@title execute the function

# Load the input table from the csv_file path set in the inputs cell.
df=pd.read_csv(csv_file)
# Compute the per-(sample, FileName) deamidation abundances and their log10 values.
df1=anubis_position_lv(df)
# Bare expression as the last statement so the notebook displays the result table.
df1
# To save the result, uncomment the next line and replace `path` with an output file path.
#df1.to_csv(path, index=False)