In [1]:
import numpy as np
import pandas as pd
# --- Monoisotopic masses (Da) used by process_casanovo -----------------------
# mass_H is subtracted once per charge when converting m/z to a neutral mass.
# NOTE(review): 1.0078 looks like the hydrogen *atom* mass; the proton mass is
# ~1.00728 -- confirm which one is intended for the ppm calculation.
mass_H = 1.0078
# The two terminus masses are added to the residue sum of every peptide
# (together they amount to one water, 18.0105).
mass_N_terminus = 1.0078
mass_C_terminus = 17.0027

# Residue masses keyed by the single-character tokens process_casanovo emits.
# Upper-case keys are amino-acid residues; lower-case keys are the modified
# forms that process_casanovo substitutes for the annotated residues.
mass_AA = {
    'A': 71.03711,   # Ala
    'R': 156.10111,  # Arg
    'N': 114.04293,  # Asn
    'n': 115.02695,  # Asn written as N(+.98)
    'D': 115.02694,  # Asp
    # Cys is stored with the +57.02 modification already applied;
    # the unmodified residue would be 103.00919.
    'C': 160.03065,  # Cys, C(+57.02)
    'E': 129.04259,  # Glu
    'Q': 128.05858,  # Gln
    'q': 129.0426,   # Gln written as Q(+.98)
    'G': 57.02146,   # Gly
    'H': 137.05891,  # His
    'I': 113.08406,  # Ile (same mass as Leu)
    'L': 113.08406,  # Leu (same mass as Ile)
    'K': 128.09496,  # Lys
    'M': 131.04049,  # Met
    'm': 147.0354,   # Met written as M(+15.99)
    'F': 147.06841,  # Phe
    'P': 97.05276,   # Pro
    'S': 87.03203,   # Ser
    'T': 101.04768,  # Thr
    'W': 186.07931,  # Trp
    'Y': 163.06333,  # Tyr
    'V': 99.06841,   # Val
}

In [2]:
def _read_mgf_titles_and_rt(mgf_file):
    """Collect TITLE and RTINSECONDS values from an MGF file in one pass.

    :param mgf_file: path to the MGF file
    :return: (titles, rts) -- two parallel lists with one entry per TITLE line,
             in file order; rts[i] is the RTINSECONDS string found in the same
             spectrum block as titles[i], or None when the block has none.
    """
    titles = []
    rts = []
    title_seen = False   # has the current block already produced a TITLE?
    pending_rt = None    # RT read before the TITLE of the current block
    with open(mgf_file, 'r') as fr:
        # Stream the file instead of readlines(): MGF files can be large.
        for line in fr:
            if line.strip() == 'BEGIN IONS':
                title_seen = False
                pending_rt = None
            if 'TITLE=' in line:
                titles.append(line.strip().split('TITLE=')[-1])
                rts.append(pending_rt)
                pending_rt = None
                title_seen = True
            if 'RTINSECONDS=' in line:
                rt_value = line.strip().split('=')[-1]
                if title_seen:
                    rts[-1] = rt_value
                else:
                    # RTINSECONDS may precede TITLE inside a block.
                    pending_rt = rt_value
    return titles, rts


def process_casanovo(casanovo_path,mgf_file,out_file):
    """Convert Casanovo mzTab de novo results into a 20-column CSV table.

    The PSM table of the mzTab file is read, PSMs whose raw ``sequence`` string
    is shorter than 5 characters are dropped, each PSM is linked to its
    spectrum in the MGF file (title and retention time), scores are rescaled
    to a 0-100 range, and rows with a rescaled score of at least 50 are
    written to ``out_file``.

            :param
                casanovo_path: path to the mztab file of Casanovo
                mgf_file: path to the MGF file the spectra were read from
                out_file: path of the CSV file to write
            :return
                df: the dataframe that was written to ``out_file``
            :raises
                ValueError: if the mztab file has no line starting with "PSH"
                KeyError: if a sequence contains a token missing from mass_AA
    """
    titles, retention_times = _read_mgf_titles_and_rt(mgf_file)

    # Locate the PSM section header ("PSH") of the mzTab file.
    casanovo_header = None
    with open(casanovo_path) as f:
        for i, line in enumerate(f):
            if line.startswith("PSH"):
                casanovo_header = i
                break
    if casanovo_header is None:
        raise ValueError("No 'PSH' header line found in {!r}".format(casanovo_path))

    # skiprows counts physical lines, whereas header=<n> ignores blank lines,
    # so skiprows stays correct even if blank lines precede the PSH line.
    df_denovo = pd.read_csv(casanovo_path, sep="\t", skiprows=casanovo_header, header=0)
    # Filter out rows where the sequence string is shorter than 5 characters.
    # NOTE(review): this counts raw characters, including any modification
    # annotation text -- confirm that is the intended length criterion.
    df_denovo = df_denovo[df_denovo['sequence'].str.len() >= 5]

    # NOTE(review): assumes PSM_ID is the 0-based position of the spectrum in
    # the MGF file -- confirm against the Casanovo version that wrote the file.
    spectrum_index = df_denovo['PSM_ID'].astype(int).tolist()
    PSM_ID = [titles[i] if 0 <= i < len(titles) else None for i in spectrum_index]
    # RT is looked up through the same index so it always lines up with its
    # PSM row, whatever the row order, and is None when the spectrum has no RT.
    RT = [retention_times[i] if 0 <= i < len(retention_times) else None
          for i in spectrum_index]
    scan = list(PSM_ID)

    sequence = df_denovo['sequence'].tolist()
    # Rescale the search engine score from [-1, 1] to an integer in [0, 100].
    Score = [int((s * 100 + 100)/2) for s in df_denovo['search_engine_score[1]'].tolist()]
    charge = df_denovo['charge']
    exp_mass_to_charge = df_denovo['exp_mass_to_charge']

    # Per-residue scores: "0.91,0.80" -> "91 80"
    aascore = [
        " ".join(str(int(float(v) * 100)) for v in str(raw).replace(',', ' ').split())
        for raw in df_denovo['opt_ms_run[1]_aa_scores'].tolist()
    ]
    mode = ['HCD' for x in PSM_ID]

    # Collapse annotated residues into the single-character keys of mass_AA.
    # NOTE(review): 'Q(-17.03)' is mapped to 'q', i.e. it is given the same
    # mass as 'Q(+.98)' -- confirm this is intended.
    seq = [str(i).replace('M(+15.99)', 'm').replace('Q(+.98)', 'q').replace('N(+.98)', 'n').replace('Q(-17.03)','q').replace(' ','').replace('C(+57.02)', 'C') for i in sequence]
    length = [len(x) for x in seq]

    Mass_peptide = []
    for raw, se in zip(sequence, seq):
        try:
            residue_mass = sum(mass_AA[aa] for aa in se)
        except KeyError as exc:
            raise KeyError(
                "Unsupported residue or modification {} in sequence {!r}".format(exc, raw)
            ) from exc
        Mass_peptide.append(residue_mass + mass_N_terminus + mass_C_terminus)

    mass_theoretical = np.asarray(Mass_peptide, dtype=float)
    Mass_exp = (exp_mass_to_charge * charge - charge * mass_H).to_numpy()
    ppm = 10**6 * (Mass_exp - mass_theoretical) / mass_theoretical

    # Columns 0 ('Fraction'), 2 ('Feature') and 16 ('PTM') are left empty.
    DF = np.empty((len(PSM_ID), 20),dtype='object')
    DF[:,1] = PSM_ID
    DF[:,3] = sequence
    DF[:,4] = scan
    DF[:,5] = length
    DF[:,6] = Score
    DF[:,7] = Score
    DF[:,8] = length
    DF[:,9] = exp_mass_to_charge
    DF[:,10] = charge
    DF[:,11] = RT
    DF[:,12] = '-'
    DF[:,13] = 0
    DF[:,14] = Mass_peptide
    DF[:,15] = ppm
    DF[:,17] = aascore
    DF[:,18] = sequence
    DF[:,19] = mode
    name = ['Fraction','Source File','Feature','Peptide','Scan','Tag Length','Denovo Score','ALC (%)','length','m/z','z','RT','Predict RT','Area','Mass','ppm','PTM','local confidence (%)','tag (>=0%)','mode']
    df = pd.DataFrame(DF, columns=name)
    # Keep only confident identifications (rescaled score >= 50).
    df = df[df['Denovo Score'] >= 50]
    df.to_csv(out_file,index=False)
    return df

In [4]:
# Inputs and output for the S2P6 50ug HCD run.
# NOTE(review): these are absolute, machine-specific paths -- adjust before
# running elsewhere.
file_denovo = '/mnt/data/antibody/S2P6/50ug/denovo/HCD/denovo_HCD.mztab'
file_mgf = '/mnt/data/antibody/S2P6/50ug/process1/spectrum_S2P6_HCD.mgf'
file_out = '/mnt/data/antibody/S2P6/50ug/denovo/HCD/S2P6_casanovo_stitch_HCD.csv'

# Convert the Casanovo mzTab results and write the CSV to file_out.
# The trailing semicolon keeps the cell output empty.
process_casanovo(
    casanovo_path=file_denovo,
    mgf_file=file_mgf,
    out_file=file_out,
);