# Convert MaxQuant output to Percolator input format
Percolator takes a tsv file as input. The output from MaxQuant can be reformatted with this script to run with Percolator. Before running Percolator, remember to run MQ without any prior FDR filtering. 

#### Feature input and column conversion
Most of the input features need to be renamed. Some have to be calculated from existing columns. The feature space is described on the Crux website: https://crux.ms/index.html 

* SpecId  =  (Identifier of this peptide-spectrum match. If the PIN file was created by Crux, then the ID will be of the form target_0_8000_2_1, where the components are "target" or "decoy," the file index, scan number, charge, and PSM rank.) (103111-Yeast-2hr-01_27_2_1)
* Label = 'Reverse' (should be converted from NaN to 1 for targets and from + to -1 for decoys)
* ScanNr = 'Scan number' 
* ExpMass = 'm/z' (precursor m/z) 
* CalcMass =  'Mass'
* Sp = 'Score'
* ChargeN = 'Charge'
* Peptide	= 'Modified sequence'
* Proteins = 'Proteins' (should be written separated by commas)
* enzN = 'is enzymatic cleavage possible, N term'
* enzC = same but C term
* lnrSp =
* deltLCn =
* deltCn =
* IonFrac = 'Peak coverage'
* PepLen = Length (length of peptide)

#### Example dataframe from Percolator github
SpecId	Label	ScanNr	ExpMass	CalcMass	lnrSp	deltLCn	deltCn	Xcorr	Sp	IonFrac	Mass	PepLen	Charge1	Charge2	Charge3	Charge4	Charge5	enzN	enzC	enzInt	lnNumSP	dM	absdM	Peptide	Proteins
103111-Yeast-2hr-01_27_2_1	1	27	1139.57	1139.57	0.693147	0.0121837	0.0121837	0.757094	98.633	0.375	1139.57	9	0	1	0	0	0	1	1	0	5.34711	0.00275	0.00275	R.LFLVM[16]DEEK.N	sp|P40497|YIJ2_YEAST

#### Digestion Patterns
This script was developed on a dataset with a complex digestion pattern. Feel free to change the digestion rules to fit your dataset. 

* Asp-N cleaves at the N-terminus of aspartic (D) and cysteic acid residues with high specificity (1–3).
* Lys-C is a protease that cleaves proteins on the C-terminal side of lysine residues. (K) 
* trypsin, C-terminal side of lysine (K) and arginine (R)


In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Placeholder locations -- edit these before running the conversion cell.
# MaxQuant msms.txt table (tab-separated).
MSMS_PATH = 'PATH_TO_FILE/msms.txt'
# MaxQuant peptides.txt table (tab-separated).
PEPTIDES_PATH = 'PATH_TO_FILE/peptides.txt'
# Destination for the converted, Percolator-ready table.
OUTPUT_PATH = 'OUTPUT_PATH'

In [4]:
def convert_maxquant_to_percolator(
    msms_path: str,
    peptides_path: str,
    output_file: str,
    digestion: str = 'trypsin',
) -> pd.DataFrame:
    """Convert MaxQuant msms/peptides tables into a Percolator input table.

    The msms table is inner-joined with the peptides table (one row per
    evidence ID) on 'Evidence ID', 'Sequence' and 'Reverse', the columns are
    renamed/derived into Percolator features, and the result is written to
    ``output_file`` as a tab-separated file.

    Args:
        msms_path: Path to the tab-separated msms table.
        peptides_path: Path to the tab-separated peptides table.
        output_file: Path the converted table is written to.
        digestion: Digestion pattern used to derive 'enzN'/'enzC'.
            'trypsin' marks K/R (or a missing neighbouring residue) as an
            enzymatic terminus. 'other_enzyme' is a placeholder that adds no
            'enzN'/'enzC' columns; adapt it to your data.

    Returns:
        The converted DataFrame (the same data that was written to disk).

    Raises:
        ValueError: If ``digestion`` is not one of the supported values.
    """
    supported_digestions = ('trypsin', 'other_enzyme')
    if digestion not in supported_digestions:
        # Fail before any I/O instead of silently writing a file that lacks
        # the enzN/enzC features.
        raise ValueError(
            f"Unsupported digestion {digestion!r}; expected one of {supported_digestions}"
        )

    # Load MaxQuant data
    df = pd.read_csv(msms_path, sep='\t')

    # Read peptides table for the digestion pattern; one row per evidence ID.
    # Cast to str first: a column holding only single IDs is parsed as numeric
    # by read_csv and would not support the .str accessor.
    df_peps = pd.read_csv(peptides_path, sep='\t')
    df_peps['Evidence ID'] = df_peps['Evidence IDs'].astype(str).str.split(';')
    df_peps = df_peps.explode('Evidence ID', ignore_index=True)
    df_peps['Evidence ID'] = df_peps['Evidence ID'].astype(int)

    df = pd.merge(
        df,
        df_peps,
        on=['Evidence ID', 'Sequence', 'Reverse'],
        how='inner',
        suffixes=('', '_peps'),
    )

    # A non-missing 'Reverse' value marks a decoy hit.
    is_decoy = df['Reverse'].notna()
    score = df['Score']
    # Deltas are divided by this PSM's score or 1, whichever is larger.
    score_floor = np.maximum(score, 1)
    # Same str cast as above, for files where every PSM has a single score.
    last_score = df['All scores'].astype(str).str.split(';').str[-1].astype(float)

    percolator_df = pd.DataFrame(index=df.index)
    percolator_df['SpecId'] = [
        f"{'decoy' if decoy else 'target'}_{scan}_{raw_file}_{charge}_1"
        for decoy, scan, raw_file, charge in zip(
            is_decoy, df['Scan number'], df['Raw file'], df['Charge']
        )
    ]
    percolator_df['Label'] = np.where(is_decoy, -1, 1)
    percolator_df['ScanNr'] = df['Scan number']
    percolator_df['ExpMass'] = df['m/z']
    percolator_df['CalcMass'] = df['Mass']
    # Log of the dense rank of the score across all PSMs (best score -> rank 1).
    percolator_df['lnrSp'] = np.log(score.rank(method='dense', ascending=False))
    percolator_df['deltLCn'] = (score - last_score) / score_floor
    percolator_df['deltCn'] = df['Delta score'] / score_floor
    #percolator_df['Xcorr'] = df['Score']
    percolator_df['Sp'] = score
    percolator_df['IonFrac'] = df['Peak coverage']
    percolator_df['Mass'] = df['Mass']
    percolator_df['PepLen'] = df['Length']
    percolator_df['ChargeN'] = df['Charge']

    # If you want charge separated into different columns
    #unique_charges = sorted(df['Charge'].dropna().unique())
    #for charge in unique_charges:
    #    col_name = f'Charge{int(charge)}'
    #    percolator_df[col_name] = (df['Charge'] == charge).astype(int)

    percolator_df['enzInt'] = df['Missed cleavages']
    percolator_df['dM'] = df['Mass error [Da]']
    percolator_df['absdM'] = df['Mass error [Da]'].abs()

    # Simplify modification names, then switch from () to [] notation and
    # from '_' to '.' as terminus marker.
    peptide = (
        df['Modified sequence']
        .str.replace('Acetyl (Protein N-term)', 'ac', regex=False)
        .str.replace('Oxidation (M)', 'ox', regex=False)
        .str.translate(str.maketrans('()_', '[].'))
    )

    if digestion == 'trypsin':
        tryptic_residues = ['K', 'R']
        # N-terminal side: the preceding residue is K/R, or there is none.
        percolator_df['enzN'] = (
            df['Amino acid before'].isna()
            | df['Amino acid before'].isin(tryptic_residues)
        ).astype(int)
        # C-terminal side: the peptide ends in K/R, or nothing follows it.
        percolator_df['enzC'] = (
            df['Last amino acid'].isin(tryptic_residues)
            | df['Amino acid after'].isna()
        ).astype(int)
    elif digestion == 'other_enzyme':
        # Change digestion pattern to fit your data.
        pass

    percolator_df['Peptide'] = peptide
    # Protein IDs are written comma-separated instead of ';'-separated.
    percolator_df['Proteins'] = (
        df['Proteins'].fillna('unknown').astype(str).str.replace(';', ',', regex=False)
    )

    # Save to output file
    percolator_df.to_csv(output_file, sep='\t', index=False)
    print(f'Converted file saved to {output_file}')

    return percolator_df

In [None]:
# Keyword arguments keep the three paths from being mixed up: the function
# expects (msms_path, peptides_path, output_file) in that order.
per_df = convert_maxquant_to_percolator(
    msms_path=MSMS_PATH,
    peptides_path=PEPTIDES_PATH,
    output_file=OUTPUT_PATH,
)