# Run `Anubis` for a single ancient protein vs. trypsin at the protein level

In [4]:
#@title Add inputs for `Anubis` -> click `Runtime` -> `Run all` (normal CPU runtime)

#@markdown **Inputs for protein lv deamidation estimations**
# Sage results table. It is read later with pd.read_table and must provide the
# `filename`, `scannr` (containing "scan=<digits>") and `ms2_intensity` columns.
sage_file = "path to a .results.sage.tsv file" #@param {type:"string"}
# mokapot PSM table. It is merged with the Sage table on `FileName` + `ScanNr`
# and must also provide the `Proteins` and `Peptide` columns used downstream.
psm_file = "path to a mokapot.psms.txt" #@param {type:"string"}
# Folder that receives `anubis_protein_lv.csv`; it is created when missing.
output_folder='test'#@param {type:"string"}
# Accession searched for (as a regular expression) in the `Proteins` column.
ancient_protein_target='P02754'#@param {type:"string"}
#@markdown - Provide UniProt ID of the target
#@markdown ---


In [5]:
#@title import libs and functions


import pandas as pd
import regex as re
import numpy as np
import os

def extract_naked_seqs(peptide):
  """Return the bare residue string of ``peptide``.

  Only the uppercase ASCII letters ``A``-``Z`` are kept, in their original
  order; every other character (digits, brackets, signs, dots, lowercase
  letters, ...) is discarded.
  """
  residues = (symbol for symbol in peptide if 'A' <= symbol <= 'Z')
  return ''.join(residues)

def prep_Sage (sage_file):
  """Load a Sage results table and keep only the columns needed for the join.

  Parameters
  ----------
  sage_file : path or file-like object accepted by ``pandas.read_table``.
      The table must contain the columns ``filename``, ``scannr`` and
      ``ms2_intensity``.

  Returns
  -------
  pandas.DataFrame
      Columns ``FileName`` (renamed from ``filename``), ``ScanNr`` (the digits
      following ``scan=`` in ``scannr``, cast to ``int``) and
      ``ms2_intensity``.
  """
  sage = pd.read_table(sage_file)

  # Pull the numeric part out of identifiers such as "... scan=1234".
  scan_text = sage['scannr'].str.extract(r'scan=(\d+)', expand=False)

  keyed = (
      sage
      .assign(ScanNr=scan_text)
      .rename(columns={'filename': 'FileName'})
      [['FileName', 'ScanNr', 'ms2_intensity']]
  )

  # The cast fails if any `scannr` value had no "scan=<digits>" part (NaN).
  keyed = keyed.assign(ScanNr=keyed['ScanNr'].astype(int))
  return keyed

def sage_pot_only(sage_file, psm_file):
  """Attach Sage MS2 intensities to the PSMs listed in ``psm_file``.

  The Sage table is reduced by ``prep_Sage`` and inner-joined with the table
  read from ``psm_file`` on ``FileName`` and ``ScanNr``; rows present in only
  one of the two tables are dropped.

  Parameters
  ----------
  sage_file : input forwarded to ``prep_Sage``.
  psm_file : path or file-like object accepted by ``pandas.read_table``;
      the table must contain ``FileName`` and ``ScanNr`` columns.

  Returns
  -------
  pandas.DataFrame
      The merged table.
  """
  sage_intensities = prep_Sage(sage_file)
  psms = pd.read_table(psm_file)
  return pd.merge(
      left=sage_intensities,
      right=psms,
      how='inner',
      on=['FileName', 'ScanNr'],
  )

def cal_de_v3 (df, q2e_weight=103.18):
  """Add modification counts and intensity-scaled deamidation columns per row.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a string column ``Peptide`` in which modifications are
      written inline as ``N[+0.9848]``, ``Q[+0.9848]`` and ``M[+15.9949]``,
      and a numeric column ``ms2_intensity``. ``df`` itself is not modified.
  q2e_weight : float, default 103.18
      Factor applied to the ``Q2E`` count in the ``*_weighted`` columns.
      NOTE(review): the origin of 103.18 is not stated in this notebook —
      presumably a Gln-vs-Asn deamidation rate ratio; confirm and cite it.

  Returns
  -------
  pandas.DataFrame
      A new frame holding the input columns plus, in this order:

      - ``total_CNQM``: number of ``N``, ``Q`` and ``M`` characters in
        ``Peptide`` (despite the name, ``C`` is not counted).
      - ``total_NQ``: number of ``N`` and ``Q`` characters.
      - ``total_mods``: occurrences of any of the three modification tags.
      - ``ox_M`` / ``N2D`` / ``Q2E``: occurrences of ``M[+15.9949]`` /
        ``N[+0.9848]`` / ``Q[+0.9848]``.
      - ``seq_naked``: ``extract_naked_seqs(Peptide)``.
      - ``adjusted_int``: ``ms2_intensity / len(seq_naked)``.
      - ``total_mods_weighted``: ``ox_M + N2D + q2e_weight * Q2E``.
      - ``total_de_NQ_weighted``: ``N2D + q2e_weight * Q2E``.
      - ``total_de_NQ``: ``N2D + Q2E``.
      - ``mod_int``: ``adjusted_int * total_mods / total_CNQM``.
      - ``weighted_mod_int``: ``adjusted_int * total_mods_weighted / total_CNQM``.
      - ``adjusted_NQ_int``: ``adjusted_int * total_de_NQ / total_NQ``.
      - ``adjusted_NQ_weighted_int``:
        ``adjusted_int * total_de_NQ_weighted / total_NQ``.

      All NaN values in the frame are replaced by 0 before returning.
  """
  peptide = df['Peptide']

  # DataFrame.assign already returns a new frame, so no explicit copy is needed.
  counted = df.assign(
      total_CNQM=peptide.str.count(r'N|Q|M'),
      total_NQ=peptide.str.count(r'N|Q'),
      total_mods=peptide.str.count(r'N\[\+0\.9848\]|Q\[\+0\.9848\]|M\[\+15\.9949\]'),
      ox_M=peptide.str.count(r'M\[\+15\.9949\]'),
      N2D=peptide.str.count(r'N\[\+0\.9848\]'),
      Q2E=peptide.str.count(r'Q\[\+0\.9848\]'),
      seq_naked=peptide.apply(extract_naked_seqs),
  )

  # Computed once and shared by both weighted totals.
  weighted_q2e = counted['Q2E'] * q2e_weight
  totals = counted.assign(
      adjusted_int=counted['ms2_intensity'] / counted['seq_naked'].str.len(),
      total_mods_weighted=counted['ox_M'] + counted['N2D'] + weighted_q2e,
      total_de_NQ_weighted=counted['N2D'] + weighted_q2e,
      total_de_NQ=counted['N2D'] + counted['Q2E'],
  )

  # Peptides without N/Q/M (or without N/Q) give 0/0 = NaN in the ratios below.
  scaled = totals.assign(
      mod_int=totals['adjusted_int'] * (totals['total_mods'] / totals['total_CNQM']),
      weighted_mod_int=totals['adjusted_int'] * (totals['total_mods_weighted'] / totals['total_CNQM']),
      adjusted_NQ_int=totals['adjusted_int'] * (totals['total_de_NQ'] / totals['total_NQ']),
      adjusted_NQ_weighted_int=totals['adjusted_int'] * (totals['total_de_NQ_weighted'] / totals['total_NQ']),
  )

  # Turns those NaN ratios into 0. Note that this is applied to the whole frame,
  # so NaN values in pass-through input columns are replaced by 0 as well.
  return scaled.fillna(0)

In [7]:
#@title Estimating deamidation


# Sage intensities joined onto the mokapot PSMs (on FileName + ScanNr).
c=sage_pot_only(sage_file, psm_file)

# Keep rows whose `Proteins` value matches the target accession or P00761
# (trypsin; cf. the notebook title). na=False drops rows with a missing
# `Proteins` value; without it the mask contains NaN and the indexing raises.
c1 = c[c['Proteins'].str.contains(f"{ancient_protein_target}|P00761", regex=True, na=False)]

c2=cal_de_v3(c1)

# An empty output_folder means "current directory": nothing to create then.
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

# os.path.join keeps the path relative when output_folder is empty; the former
# f"{output_folder}/..." form resolved to "/anubis_protein_lv.csv" in that case.
c2.to_csv(os.path.join(output_folder, "anubis_protein_lv.csv"), index=False)