<a href="https://colab.research.google.com/github/yc386/orthrus_metaproteomics/blob/main/orthrus_cloud_stable_v100/orthrus_stable_v100_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src='https://drive.google.com/uc?export=view&id=19rmmQI1H2nIqgU598WROTcUNhOUoXcBP' width='400px' align='right'>

# **Readme**

---
[Orthrus](https://www.biorxiv.org/content/10.1101/2024.11.15.623814v1) 🐾 is a hybrid, two-software pipeline that integrates [Casanovo](https://github.com/Noble-Lab/casanovo) (an AI transformer) with [Sage](https://github.com/lazear/sage) (a fast database search engine with advanced features like retention time alignment and machine learning-based rescoring).

Designed to handle large search spaces in metaproteomics and palaeoproteomics, Orthrus leverages *de novo* sequencing to define sample-specific databases, and uses probability ranking and conventional database searching to control FDRs (false discovery rates).

This notebook is optimised for Google Colab 🥳

# **Quick start**❗️
1. Before walking the dog, please change the runtime type to GPU (A100, L4, or T4; the A100 is the most efficient, but the T4 is free)
2. Click the folder icon 🗂️ on the left and mount your Google Drive (you will be asked to grant permission)
3. Click `File` (top left) to save a local copy
4. **Run the `Install everything, will automatically restart` cell first and wait for the runtime to restart (until you see {'status': 'ok', 'restart': True})**. The restart resolves the NumPy/pandas version conflicts: Casanovo 4.x and Mokapot require NumPy 1.x
5. After restarting, choose the Casanovo, Sage, and Mokapot configurations. Then, **from the `Configure Casanovo` cell, click `Runtime` -> `Run cell and below`**

In [None]:
#@title Install everything, will **automatically restart** to resolve version conflicts

# One-time environment setup: download the Sage binary, pip-install the pinned
# Python packages, then restart the kernel.
import os, sys, subprocess, time, IPython
from pathlib import Path
# "Orthrus_READY" is a marker file in the current working directory; when it
# already exists the download/install steps are skipped.
if not Path("Orthrus_READY").exists():
    print("installing Sage binary and packages📦 ⬇️")
    # Sage v0.14.7 pre-built Linux binary: download, unpack, then delete the archive
    !wget -q https://github.com/lazear/sage/releases/download/v0.14.7/sage-v0.14.7-x86_64-unknown-linux-gnu.tar.gz
    !tar -xzf sage-v0.14.7-x86_64-unknown-linux-gnu.tar.gz && rm sage-v0.14.7-x86_64-unknown-linux-gnu.tar.gz
    # Package versions are pinned (numpy is held at 1.26.4, pandas at 2.1.4)
    pip_packages = ["casanovo==4.3.0", "biopython==1.85", "pyteomics==4.7.5", "mokapot==0.10.0", "numpy==1.26.4", "pandas==2.1.4", "xgboost==3.0.4", "rich[jupyter]"]
    # Install with the interpreter that runs this kernel; raises CalledProcessError on failure
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade",
                           *pip_packages])
    # Only reached when the install succeeded: record that setup is complete
    Path("Orthrus_READY").touch()
else:
    print("Environment already prepared.")

# NOTE: the restart below runs on BOTH branches (fresh install or already prepared).
msg = "Restarting 🫨 ➡️ Don't click any Google prompts"
print(msg, flush=True)
# Short pause before shutting down — presumably so the message is shown first
time.sleep(0.5)
# Ask the running kernel to shut down and restart
IPython.Application.instance().kernel.do_shutdown(restart=True)

In [None]:
#@title Configure `Casanovo`
# Colab form cell: every assignment below defines a notebook-level variable that
# later cells read (run_casanovo and the Casanovo -> .fasta conversion cell).
#@markdown **`Casanovo` inputs**
folder_path=""#@param {type:"string"}
#@markdown - a folder containing one or more `.mzML` or `.mgf` files for `Casanovo`. File names may only contain _ (underscore): no other special characters and no spaces. **Keep all instrument files in a single folder, with no subfolders inside that folder.**
file_type="mzML" #@param ["mzML", "mgf"]
#@markdown - use the drop-down menu to choose the instrument file type

use_default = True #@param {type:"boolean"}
#@markdown - use the default model + configuration yaml from the `Casanovo` GitHub repo; may take time to download

#@markdown **Advanced Options (user-provided model + configuration yaml)**

# `model` and `config` are only passed to Casanovo when use_default is False
model = "" #@param {type:"string"}
#@markdown - a `.ckpt` trained model (checkpoint)
config = "" #@param {type:"string"}
#@markdown - a `.yaml` configuration file (see config_420_precursor_7_ppm.yaml)

#@markdown **Inputs for converting Casanovo results to a `.fasta`**
use_SwissProt = True #@param {type:"boolean"}
#@markdown - use the latest, reviewed SwissProt from the UniProt FTP
# `database_path` is only used when use_SwissProt is False
database_path=""#@param {type:"string"}
#@markdown - path to a user-defined database (`.fasta`)

In [None]:
#@title Configure `SAGE`
# Colab form cell: these notebook-level variables are read by run_sage. The
# AA_n / AA_n_mod pairs are read by run_sage as globals, not as arguments.
json_file_path = '' #@param {type:"string"}
#@markdown - a configuration `.json` file (see config_general_MQ_fixed_CAM_v1.json)
enzyme = "KR" #@param {type:"string"}
#@markdown **`SAGE` PTM plus**
#@markdown - Default `Sage` contains CAM (fixed) (+57.021464) + variable mods: Oxidation(M) (+15.994915), Deamidation(NQ) (+0.984016)
#@markdown - PTM plus allows up to 5 variable mods, and CAM (cysteine carbamidomethylation) can be turned off
#@markdown - PTM mass can have any number of decimals
use_PTM_plus = False #@param {type:"boolean"}
static_CAM = True #@param {type:"boolean"}
# max_variable_mods and missed_cleavages only take effect when use_PTM_plus is
# True; otherwise run_sage uses 3 and 2 respectively.
max_variable_mods = 3 #@param {type: "number"}
#@markdown - please note `SAGE` only allows a maximum of 3 variable mods per PSM
missed_cleavages = 2 #@param {type:"number"}
# Residue / mass-shift pairs for PTM plus; a residue of "None" is skipped by run_sage
AA_1 = "M" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
AA_1_mod = 15.9949 #@param {type:"number"}
AA_2 = "P" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
AA_2_mod = 15.9949 #@param {type:"number"}
AA_3 = "N" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
AA_3_mod = 0.984016 #@param {type:"number"}
AA_4 = "Q" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
AA_4_mod = 0.984016 #@param {type:"number"}
AA_5 = "[" #@param ["None", "[","]","A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]
#@markdown - [ = N-terminus
AA_5_mod = 	42.010565 #@param {type:"number"}

In [None]:
#@title Configure `Mokapot`
# Colab form cell: both flags are passed to brew_mokapot.

# True  -> one model is trained on every .pin file found under folder_path
# False -> one model is trained per sub-folder
joint_modelling= True #@param {type:"boolean"}
#@markdown - a joint model for low-abundance samples; untick for a separate model per experiment
# True -> mokapot.PercolatorModel(); False -> grid-searched XGBClassifier
default_Percolator=True #@param {type:"boolean"}
#@markdown - Python implementation of the Percolator SVM model; otherwise the non-linear XGBoost model

In [None]:
#@title import functions

# ----------------------------- Utility Setup ------------------------------- #

from __future__ import annotations

# Python standard library
import argparse
import datetime
import glob
import gzip
import io
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import warnings
from contextlib import redirect_stderr, redirect_stdout
from itertools import chain
from typing import Dict, List, Optional, Set, Union

# Third-party packages (installed by the install cell)
import mokapot
import numpy as np
import pandas as pd
import requests
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from joblib import Parallel, delayed
from pyteomics import mztab
from rich.logging import RichHandler
from rich.traceback import install as install_rich_traceback
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier

# Named logger for the pipeline; initiate_logger() looks it up via globals()
# and attaches the Rich console handler to it.
LOGGER = logging.getLogger("orthrus_metaproteomics")
# Suppress ALL Python warnings for the rest of the session (quieter output)
warnings.filterwarnings("ignore")


def initiate_logger(level: str = "INFO", name_color: str = "#00D1FF"):
    """Configure and return the pipeline logger with a colourised Rich console handler.

    Args:
        level: Name of a ``logging`` level (case-insensitive). Unknown names
            fall back to ``logging.INFO``.
        name_color: Colour (hex code) used for the logger name in each record.

    Returns:
        The module-level ``LOGGER`` when it exists, otherwise the root logger,
        with exactly one ``RichHandler`` attached and propagation disabled.
    """
    install_rich_traceback(show_locals=False)

    level_value = getattr(logging, level.upper(), logging.INFO)

    # Rich handler: timestamp + level, no source path, markup enabled so the
    # colour tag in the formatter below is rendered.
    rich_console = RichHandler(
        level=level_value,
        show_time=True,
        show_level=True,
        show_path=False,
        markup=True,
        rich_tracebacks=True,
        log_time_format="%Y-%m-%d %H:%M:%S",
    )
    record_format = f"[bold {name_color}]%(name)s[/] | %(message)s"
    rich_console.setFormatter(logging.Formatter(record_format))

    target = globals().get("LOGGER", logging.getLogger())
    target.setLevel(level_value)
    # Drop handlers from earlier calls so re-running the cell does not
    # duplicate every message.
    target.handlers.clear()
    target.propagate = False
    target.addHandler(rich_console)

    # mokapot is chatty; only let its errors through
    logging.getLogger("mokapot").setLevel(logging.ERROR)

    return target


# Shared logger used by every function in this notebook
log = initiate_logger("INFO", name_color="#03cafc")


# Casanovo 4.x mzTab column names:
#   mztab_seq_col - column holding the predicted peptide sequence
#   score_col     - column holding the score read by casa_filter
mztab_seq_col = 'sequence'
score_col = 'search_engine_score[1]'


def prep_mztab(mztab_path: str):
    """Load a Casanovo mzTab file into a DataFrame of unique unmodified peptides.

    Adds two columns to the spectrum-match table:
      * ``sequence_naked`` - the predicted sequence with numeric mass tokens removed
      * ``nAA``            - length of ``sequence_naked``

    Rows are then sorted by ``sequence_naked`` and de-duplicated on it (first
    row kept).

    Args:
        mztab_path: Path to the ``.mztab`` file.

    Returns:
        pd.DataFrame with one row per unique ``sequence_naked``.

    Raises:
        ValueError: the spectrum-match table is missing or empty.
        KeyError: the sequence column is missing.
    """
    log.info("Reading mzTab: %s", mztab_path)
    m = mztab.MzTab(mztab_path)
    df = m.spectrum_match_table
    if df is None or df.empty:
        raise ValueError(f"{mztab_path} is empty")
    if mztab_seq_col not in df.columns:
        raise KeyError(f"'{mztab_seq_col}' column is missing in the file: {mztab_path}")

    # A run of one or more signed/unsigned mass tokens that sits either at the
    # start of the sequence (N-terminal mods) or directly after a residue.
    # Matching the whole run matters for chained N-terminal tokens such as
    # "+43.006-17.027": matching a single token would leave "-17.027" behind.
    mass_tokens = r'(?:(?<=[A-Z])|^)(?:[+-]?\d+(?:\.\d+)?)+'

    psms = df.reset_index(drop=True)
    naked = psms[mztab_seq_col].str.replace(mass_tokens, '', regex=True)
    psms = psms.assign(sequence_naked=naked, nAA=naked.str.len())
    unique_psms = (
        psms.sort_values(by='sequence_naked')
        .drop_duplicates(subset='sequence_naked', keep="first")
        .reset_index(drop=True)
    )
    log.debug("mzTab prepped: %d unique naked sequences", unique_psms.shape[0])
    return unique_psms


def fasta_to_df(fasta_file: str):
    """Read a FASTA file into a DataFrame, one row per record.

    Columns: ``Protein_ID`` (record id), ``Description`` (full header),
    ``Sequence`` and ``UniProt_ID`` (second ``|``-separated field of
    ``Protein_ID``; missing when the id has no ``|``).

    Args:
        fasta_file: Path to the FASTA file.

    Returns:
        pd.DataFrame with the four columns above.

    Raises:
        ValueError: a record has an empty sequence.
    """
    log.info("Reading FASTA: %s", fasta_file)
    rows = []
    for rec in SeqIO.parse(fasta_file, "fasta"):
        residues = str(rec.seq)
        if not residues:
            raise ValueError(f"Record with ID '{rec.id}' has no sequence in the fasta file.")
        rows.append((rec.id, rec.description, residues))

    proteins = pd.DataFrame(rows, columns=["Protein_ID", "Description", "Sequence"])
    # Take the second "|" field of the id as the accession
    proteins = proteins.assign(
        UniProt_ID=lambda frame: frame['Protein_ID'].str.split('|').str[1]
    )
    log.debug("FASTA prepped: %d protein entries", proteins.shape[0])
    return proteins


# filter casanovo outputs using the maximum score below zero
def casa_filter(df):
    """Keep PSMs whose score is at least the largest negative score.

    The threshold is the maximum of all scores below zero; rows scoring at or
    above it are kept. When the table contains no negative score there is no
    such threshold, so every row with a non-missing score is kept instead of
    failing on an empty reduction.

    Args:
        df: DataFrame containing the ``score_col`` column.

    Returns:
        The filtered DataFrame (rows with a missing score are dropped).

    Raises:
        KeyError: ``score_col`` is not a column of ``df``.
    """
    if score_col not in df.columns:
        raise KeyError(f"'{score_col}' not found")

    scores = df[score_col].to_numpy()
    negative_scores = scores[scores < 0]

    if negative_scores.size == 0:
        # No negative score -> nothing to derive a threshold from. Mirror the
        # normal path, which drops rows whose score is missing.
        kept = df[df[score_col].notna()]
        log.info(
            "casa_filter: no scores below zero; kept %d/%d PSMs",
            kept.shape[0],
            df.shape[0])
        return kept

    max_below_zero = negative_scores.max()
    kept = df[df[score_col] >= max_below_zero]
    log.info(
        "casa_filter: kept %d/%d PSMs (threshold=%.6f)",
        kept.shape[0],
        df.shape[0],
        max_below_zero)
    return kept


# ----------------------------- matching & Bayes ranking ------------------------------- #


#prepare overlapping sequence tags for string matching
def get_seq_tags(sequence: str, k: int):
    """Return the set of all overlapping substrings of length ``k`` in ``sequence``.

    A sequence shorter than ``k`` yields an empty set.
    """
    window_count = len(sequence) - k + 1
    return {sequence[start:start + k] for start in range(window_count)}


#change chunk size here for memory if needed
def matching_count_v5(fasta_df: pd.DataFrame, casanovo_df: pd.DataFrame, k: int, chunk_size: int=10000):
    """Count, per database protein, how many k-mer tags it shares with the de novo peptides.

    Adds three columns to a copy of ``fasta_df``:
      * ``seq_tags``      - set of k-mers of the protein sequence (I replaced by L)
      * ``matched_count`` - number of those k-mers also present in the Casanovo tag set
      * ``matched``       - 1 when ``matched_count`` >= 2, else 0

    Args:
        fasta_df: Protein table with a ``Sequence`` column.
        casanovo_df: De novo table with a ``sequence_naked`` column.
        k: Tag length.
        chunk_size: Number of proteins processed per chunk (memory control).

    Returns:
        pd.DataFrame with a fresh 0..n-1 index and the three extra columns.
    """
    log.info("Generating sequence tags (k=%d)...", k)
    # I and L are replaced on the protein side below, so the de novo side must
    # be normalised the same way; otherwise every tag containing "I" can never match.
    # NOTE(review): tags are cut from the CONCATENATION of all peptides, so
    # tags spanning two neighbouring peptides are included - confirm intended.
    casanovo_text = ''.join(casanovo_df['sequence_naked'].astype(str)).replace('I', 'L')
    sequence_set = get_seq_tags(casanovo_text, k)
    log.info("Generated %d unique tags from Casanovo outputs.", len(sequence_set))

    # Collect chunks and concatenate once (concat inside the loop is quadratic)
    chunks = []
    for start in range(0, len(fasta_df), chunk_size):
        chunk = fasta_df.iloc[start:start+chunk_size].copy()
        chunk['seq_tags'] = chunk['Sequence'].astype(str).str.replace('I', 'L').apply(lambda x: get_seq_tags(x, k))
        chunk['matched_count'] = chunk['seq_tags'].apply(lambda seq_tags: len(seq_tags & sequence_set))
        chunk['matched'] = (chunk['matched_count'] >= 2).astype(int)
        chunks.append(chunk)

    if chunks:
        result_df = pd.concat(chunks, ignore_index=True)
    else:
        # Empty protein table: return it with the expected (empty) columns
        result_df = fasta_df.assign(
            seq_tags=pd.Series(dtype=object),
            matched_count=pd.Series(dtype='int64'),
            matched=pd.Series(dtype='int64'),
        ).reset_index(drop=True)

    log.info(
        "Tag matching complete: total matched tag counts=%d",
        int(result_df["matched_count"].sum()))
    return result_df


#get tryptic peptides per database entry
def count_tryptic_peptides(sequence: str):
    """Count the fragments of at least 6 residues obtained by cutting after every K or R."""
    fragments = re.split(r'(?<=[KR])', sequence)
    return sum(1 for fragment in fragments if len(fragment) >= 6)


#prepare a dataframe for NB classification
def prep_Bayes(df: pd.DataFrame):
    """Derive the features used by the naive Bayes ranking.

    Expects the columns ``Sequence``, ``seq_tags`` and ``matched_count`` and
    returns a new DataFrame with five extra columns:
      * ``length``        - sequence length
      * ``tryptic_count`` - result of ``count_tryptic_peptides`` per sequence
      * ``tag_count``     - number of tags in ``seq_tags``
      * ``SAF``           - ``matched_count / length``
      * ``try_ratio``     - ``tryptic_count / tag_count``
    """
    with_counts = df.assign(
        length=lambda frame: frame['Sequence'].astype(str).str.len(),
        tryptic_count=lambda frame: frame['Sequence'].apply(count_tryptic_peptides),
        tag_count=lambda frame: frame['seq_tags'].apply(len),
    )
    # Ratios are computed in a second step because they read the counts above
    return with_counts.assign(
        SAF=lambda frame: frame['matched_count'] / frame['length'],
        try_ratio=lambda frame: frame['tryptic_count'] / frame['tag_count'],
    )


# bayes ranking
def get_bayes_ranking_test(df: pd.DataFrame, threshold: float = 0.95):
    """Shortlist proteins by the Gaussian naive Bayes probability of being 'matched'.

    The features ``SAF`` and ``try_ratio`` (from ``prep_Bayes``) are scaled, a
    ``GaussianNB`` is trained on an 80/20 split with ``matched`` as the label,
    hold-out metrics are logged, and every protein is then scored.

    Args:
        df: Output of ``matching_count_v5``.
        threshold: Minimum probability of the positive class to keep a protein.

    Returns:
        pd.DataFrame of proteins with ``tag_count > 0`` and ``pred >= threshold``;
        ``pred`` holds the positive-class probability.

    Raises:
        ValueError: a required feature column is missing.
    """
    m=prep_Bayes(df)
    required_columns = {'SAF', 'try_ratio', 'matched'}
    if not required_columns.issubset(m.columns):
        missing = required_columns - set(m.columns)
        raise ValueError(f"Missing columns in DataFrame: {missing}")

    # Proteins without any tag have an undefined try_ratio (division by zero)
    m1 = m[m['tag_count']>0]
    X = m1[['SAF', 'try_ratio']].to_numpy()
    y = m1['matched'].to_numpy()

    # NOTE(review): both features are flattened and scaled with ONE shared
    # min/max rather than per column - kept as is; confirm this is intended.
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X.reshape(-1, 1)).reshape(*X.shape)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=7)
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)

    y_pred = gnb.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    log.info("GaussianNB ▶ accuracy=%.4f, precision=%.4f, f1=%.4f", accuracy, precision, f1)

    class_probabilities = gnb.predict_proba(X_scaled)
    # predict_proba has one column per class seen during fit, ordered like
    # gnb.classes_. Look the positive class up instead of assuming column 1,
    # which does not exist when the training labels contain a single class.
    positive_columns = np.flatnonzero(gnb.classes_ == 1)
    if positive_columns.size:
        positive_proba = class_probabilities[:, positive_columns[0]]
    else:
        log.warning("No matched proteins in the training data; nothing can be shortlisted.")
        positive_proba = np.zeros(m1.shape[0])

    m2 = m1.assign(pred=positive_proba)
    m3 = m2[m2['pred']>=threshold]
    log.info("Shortlisted %d proteins at ≥ %.2f.", m3.shape[0], threshold)
    return m3


# ----------------------------- De novo -> sample-specific .fastas ------------------------------- #


def matching_ranking_to_fasta_v5(mztab_path: str, fasta_df: pd.DataFrame):
    """Build a sample-specific FASTA from one Casanovo mzTab file.

    Steps: load and filter the de novo peptides, use their median length as
    the tag length ``k``, count shared tags per database protein, rank the
    proteins with naive Bayes, and write the shortlisted proteins next to the
    mzTab file as ``<name>_matched.fasta``.

    Args:
        mztab_path: Path to the Casanovo ``.mztab`` file.
        fasta_df: Reference proteins as returned by ``fasta_to_df``.
    """
    p = prep_mztab(mztab_path)
    casanovo_df = casa_filter(p)
    # Tag length = median peptide length of the filtered de novo results
    k = int(casanovo_df['nAA'].median())
    m = matching_count_v5(fasta_df, casanovo_df, k, chunk_size=10000)
    m1 = get_bayes_ranking_test(m)

    # The full original header becomes the record id; description stays empty
    # so the header is not written twice.
    seq_records = [
        SeqRecord(Seq(sequence), id=f"{header}", description="")
        for header, sequence in zip(m1['Description'], m1['Sequence'])
    ]

    # Swap only the file extension. A plain str.replace('.mztab', ...) would
    # also rewrite '.mztab' inside directory names and, when the suffix is
    # absent, return the input path unchanged so the mzTab would be overwritten.
    path_root, _ = os.path.splitext(mztab_path)
    output_fasta_filepath = f"{path_root}_matched.fasta"

    with open(output_fasta_filepath, 'w') as output_file:
        SeqIO.write(seq_records, output_file, 'fasta')
    log.info("Wrote matched FASTA %s (entries=%d)", output_fasta_filepath, m1.shape[0])


def process_all_mztab_files_v2(folder_path: str, database_path: str):
    """Write a ``_matched.fasta`` for every ``.mztab`` file directly inside ``folder_path``.

    The reference FASTA at ``database_path`` is loaded once and reused for
    every mzTab file.
    """
    mztab_filepaths = glob.glob(f"{folder_path}/*.mztab")
    log.info("Found %d mzTab file(s) in %s", len(mztab_filepaths), folder_path)

    reference_proteins = fasta_to_df(database_path)
    log.info("Reference FASTA loaded from %s (proteins=%d)", database_path, reference_proteins.shape[0])

    # Loop variable deliberately not called `mztab`, which is the pyteomics module name
    for mztab_file in mztab_filepaths:
        matching_ranking_to_fasta_v5(mztab_file, reference_proteins)


# ----------------------------- utilities for Sage ------------------------------- #


def organise_files(directory: str, file_type: str):
    """Move each instrument file and its matched FASTA into a per-run sub-folder.

    For every ``*.<file_type>`` file directly inside ``directory`` a sub-folder
    named after the file (without extension) is created if needed; the file and,
    when present, ``<name>_matched.fasta`` are moved into it. Files that already
    exist at the destination are left where they are.

    Args:
        directory: Folder holding the instrument files.
        file_type: Instrument file extension without the dot.
    """
    if not os.path.isdir(directory):
        log.error("Directory does not exist: %s", directory)
        return

    spectra_paths = glob.glob(os.path.join(directory, f'*.{file_type}'))
    if not spectra_paths:
        log.warning("No %s files found in %s, or files already organised", file_type, directory)

    for spectra_path in spectra_paths:
        log.info("Organising files in %s ...", directory)
        file_name = os.path.basename(spectra_path)
        stem = os.path.splitext(file_name)[0]

        # One sub-folder per run, named after the instrument file
        target_dir = os.path.join(directory, stem)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # Instrument file
        moved_spectra = os.path.join(target_dir, file_name)
        if os.path.exists(moved_spectra):
            log.warning("MS2 file already exists in the destination: %s", moved_spectra)
        else:
            shutil.move(spectra_path, target_dir)
            log.info("Moved %s to %s", spectra_path, target_dir)

        # Matching sample-specific FASTA
        fasta_name = f"{stem}_matched.fasta"
        fasta_source = os.path.join(directory, fasta_name)
        if not os.path.exists(fasta_source):
            log.warning("No matching .fasta file found for %s", stem)
            continue

        fasta_target = os.path.join(target_dir, fasta_name)
        if os.path.exists(fasta_target):
            log.info(".fasta file already exists in the destination: %s", fasta_target)
        else:
            shutil.move(fasta_source, target_dir)
            log.info("Moved %s to %s", fasta_source, target_dir)


def get_sage_config(json_file_path: str,
                    peak_path: Union[str, List[str]],
                    static_mods: Dict[str, float],
                    new_mods: Dict[str, List[float]],
                    missed_cleavages: int,
                    enzyme: str,
                    min_len: int,
                    max_len: int,
                    max_variable_mods: int,
                    output_config_path: str):
    """Write a Sage configuration derived from a template JSON.

    Loads ``json_file_path``, overrides the peak-file list, the modification
    and enzyme settings, forces decoy generation with the ``rev_`` tag, and
    writes the result to ``output_config_path``. Every other key of the
    template is preserved.

    Args:
        json_file_path: Template Sage configuration (``.json``).
        peak_path: One instrument file path or a list of them.
        static_mods: Fixed modifications, residue -> mass shift.
        new_mods: Variable modifications, residue -> list of mass shifts.
        missed_cleavages: Value for ``database.enzyme.missed_cleavages``.
        enzyme: Value for ``database.enzyme.cleave_at``.
        min_len: Value for ``database.enzyme.min_len``.
        max_len: Value for ``database.enzyme.max_len``.
        max_variable_mods: Value for ``database.max_variable_mods``.
        output_config_path: Destination of the generated configuration.
    """
    # Only reading needs the file handle; release it before editing
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)

    if isinstance(peak_path, str):
        peak_path = [peak_path]
    json_data['mzml_paths'] = peak_path

    # Create the sections when the template omits them instead of failing
    # with a KeyError.
    database = json_data.setdefault('database', {})
    enzyme_settings = database.setdefault('enzyme', {})

    database['static_mods'] = static_mods
    database['variable_mods'] = new_mods
    enzyme_settings['missed_cleavages'] = missed_cleavages
    enzyme_settings['cleave_at'] = enzyme
    enzyme_settings['min_len'] = min_len
    enzyme_settings['max_len'] = max_len
    database['max_variable_mods'] = max_variable_mods
    database['decoy_tag'] = "rev_"
    database['generate_decoys'] = True

    with open(output_config_path, 'w') as f:
        json.dump(json_data, f, indent=4)
    log.info("Wrote Sage config: %s", output_config_path)

In [None]:
#@title Run `Casanovo`

def run_casanovo(
        folder_path: str,
        file_type: str,
        use_default: bool,
        model: Optional[str] = None,
        config: Optional[str] = None):
    """Run ``casanovo sequence`` on every instrument file in a folder.

    One ``.mztab`` result is written next to each input file.

    Args:
        folder_path: Folder containing the instrument files.
        file_type: Instrument file extension without the dot.
        use_default: When True, Casanovo is called without ``-m``/``-c``.
        model: Path to a ``.ckpt`` model; required when ``use_default`` is False.
        config: Path to a ``.yaml`` config; required when ``use_default`` is False.

    Raises:
        ValueError: custom mode was requested without both ``model`` and ``config``.
        subprocess.CalledProcessError: Casanovo exited with a non-zero status.
    """
    files = glob.glob(f"{folder_path}/*.{file_type}")
    if not files:
        log.error("No instrument files found in %s with extension .%s", folder_path, file_type)
        return

    # Fail before launching anything: with the form defaults ("") Casanovo
    # would be started with empty -m/-c arguments and its error output is discarded.
    if not use_default and not (model and config):
        raise ValueError(
            "use_default is False, so both a model (.ckpt) and a config (.yaml) path are required.")

    env = os.environ.copy()
    # Quieten TensorFlow logging in the child process
    env["TF_CPP_MIN_LOG_LEVEL"] = "2"
    env["TF_ENABLE_ONEDNN_OPTS"] = "0"

    for instrument_file in files:
        # Swap only the extension; str.replace would also touch a matching
        # ".<file_type>" elsewhere in the path.
        output_path = os.path.splitext(instrument_file)[0] + ".mztab"
        if use_default:
            cmd = ["casanovo", "sequence", instrument_file, "-v", "info", "-o", output_path]
        else:
            cmd = ["casanovo", "sequence", instrument_file, "-m", model, "-c", config, "-v", "info", "-o", output_path]
        log.info("Running Casanovo: %s", " ".join(cmd))
        try:
            subprocess.run(cmd,
                           check=True,
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL,
                           env=env)
        except subprocess.CalledProcessError as exc:
            # Output is discarded above, so at least report which file failed
            log.error("Casanovo failed for %s (exit code %d)", instrument_file, exc.returncode)
            raise
        log.info("Casanovo done 👍")

run_casanovo(folder_path, file_type, use_default, model, config)

In [None]:
#@title Convert `Casanovo` results to .fasta per experiment

# Pick the reference database (latest reviewed SwissProt or a user FASTA) and
# write one sample-specific *_matched.fasta per Casanovo mzTab file.
if use_SwissProt:
    url = "https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/uniprot_sprot.fasta.gz"
    output_file = "uniprot_sprot.fasta.gz"
    decompressed_file = "uniprot_sprot.fasta"

    # Stream the archive to disk. The response is closed by the context
    # manager, and a failed download stops the cell here rather than
    # surfacing later as a confusing gzip/FileNotFound error.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            log.error("Failed to download %s. Status code: %d", output_file, response.status_code)
            raise RuntimeError(
                f"Could not download {url} (HTTP status {response.status_code})")
        with open(output_file, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
    log.info("%s downloaded successfully.", output_file)

    # Decompress next to the archive
    with gzip.open(output_file, 'rb') as f_in, open(decompressed_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    sprot_path = decompressed_file
    process_all_mztab_files_v2(folder_path, sprot_path)

else:
    # The form default for database_path is an empty string
    if not database_path:
        raise ValueError("use_SwissProt is off: please provide database_path (.fasta).")
    process_all_mztab_files_v2(folder_path, database_path)

In [None]:
#@title Run `Sage`

# Location of the Sage v0.14.7 binary inside /content on Colab; matches the
# folder name of the archive unpacked by the install cell.
sage_path="/content/sage-v0.14.7-x86_64-unknown-linux-gnu/sage"


def run_sage (
        folder_path: str,
        file_type: str,
        sage_path: str,
        json_file_path: str,
        enzyme: str,
        use_PTM_plus: bool,
        missed_cleavages: int,
        max_variable_mods: int,
        static_CAM: bool):
    """Run Sage once per experiment sub-folder of ``folder_path``.

    Instrument files and their matched FASTA files are first moved into
    per-run sub-folders (``organise_files``). For each sub-folder a Sage
    config is written next to the instrument file and Sage is run with
    ``--write-pin``, with outputs going to that sub-folder.

    In PTM-plus mode the variable modifications come from the notebook-level
    form variables ``AA_1..AA_5`` / ``AA_1_mod..AA_5_mod`` (read as globals).

    Args:
        folder_path: Parent folder holding the instrument files / run sub-folders.
        file_type: Instrument file extension without the dot.
        sage_path: Path to the Sage executable.
        json_file_path: Template Sage configuration.
        enzyme: Residues to cleave at (written to ``cleave_at``).
        use_PTM_plus: Use the user-defined modifications and limits.
        missed_cleavages: Used only when ``use_PTM_plus`` is True (else 2).
        max_variable_mods: Used only when ``use_PTM_plus`` is True (else 3).
        static_CAM: Add the fixed +57.021464 modification on C.

    Raises:
        subprocess.CalledProcessError: Sage exited with a non-zero status.
    """
    organise_files(folder_path, file_type)

    # The message promises both checks, so test both
    if not (os.path.isfile(sage_path) and os.access(sage_path, os.X_OK)):
        log.error("Sage binary not found or not executable: %s", sage_path)
        return

    # constants
    min_len, max_len = 6, 30
    static_mods = {"C": 57.021464} if static_CAM else {}

    if use_PTM_plus:
        AAs = [AA_1, AA_2, AA_3, AA_4, AA_5]
        mods = [AA_1_mod, AA_2_mod, AA_3_mod, AA_4_mod, AA_5_mod]
        PTMs = {}
        for AA, mod in zip(AAs, mods):
            if AA != "None":
                # Append rather than assign: the same residue may be selected
                # in several slots and plain assignment keeps only the last.
                PTMs.setdefault(AA, []).append(mod)
    else:
        # Defaults: the form values for these two limits are ignored
        missed_cleavages = 2
        max_variable_mods = 3
        PTMs = {"M": [15.994915], "N": [0.984016], "Q": [0.984016]}

    # iterate each sub-folder
    big_folder = [p for p in glob.glob(f"{folder_path}/*") if os.path.isdir(p)]
    if not big_folder:
        # No extra positional argument here: the message has no placeholder,
        # so passing one makes the logging module report a formatting error.
        log.warning("No subfolders found.")
        return

    for folder in big_folder:
        files = glob.glob(f"{folder}/*.{file_type}")
        if not files:
            log.warning("No %s files in %s; skipping.", file_type, folder)
            continue

        # Only the first instrument file of a sub-folder is searched
        peak_path = files[0]
        # Swap only the extension (str.replace would touch every occurrence)
        output_json = os.path.splitext(peak_path)[0] + ".json"

        get_sage_config(
            json_file_path, peak_path, static_mods, PTMs,
            missed_cleavages, enzyme,
            min_len, max_len,
            max_variable_mods, output_json
        )

        fasta_files = glob.glob(f"{folder}/*.fasta")
        if not fasta_files:
            log.warning("No FASTA in %s.", folder)
            continue

        fasta_path = fasta_files[0]
        cmd = [sage_path, output_json, "--fasta", fasta_path,
               "--write-pin",
               "--output_directory", folder
               ]
        log.info("Running Sage for %s", folder)
        try:
            subprocess.run(cmd,
                           check=True,
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)
        except subprocess.CalledProcessError as exc:
            # Output is discarded above, so at least report which run failed
            log.error("Sage failed for %s (exit code %d)", folder, exc.returncode)
            raise
        log.info("Sage finished casting spells 🧙 for %s", folder)


run_sage(
    folder_path,
    file_type,
    sage_path,
    json_file_path,
    enzyme,
    use_PTM_plus,
    missed_cleavages,
    max_variable_mods,
    static_CAM)

In [None]:
#@title Brew `Mokapot`


def get_all_pin_files(folder_path):
    """Return the paths of all ``.pin`` files under ``folder_path``, searched recursively."""
    return [
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(folder_path)
        for file_name in file_names
        if file_name.endswith('.pin')
    ]


def brew_mokapot(folder_path: str,
                joint_modelling: bool,
                default_Percolator: bool):
    """Rescore Sage ``.pin`` results with mokapot and write the results as text files.

    Args:
        folder_path: Parent folder containing the per-experiment sub-folders.
        joint_modelling: True - one model over every ``.pin`` file found
            recursively under ``folder_path``, results written to
            ``folder_path``. False - one model per direct sub-folder (first
            ``.pin`` file of each), results written to that sub-folder.
        default_Percolator: True - ``mokapot.PercolatorModel``; False - an
            ``XGBClassifier`` tuned by grid search.
    """
    #XGBoost schema from Fondrie & Noble (2021).A non-linear XGBoost seems to be better for rescoring open search results.
    grid = {
        "scale_pos_weight": np.logspace(0, 2, 3),
        "max_depth": [1, 3, 6],
        "min_child_weight": [1, 10, 100],
        "gamma": [0, 1, 10]}
    xgb_mod = GridSearchCV(
        XGBClassifier(),
        param_grid=grid,
        n_jobs=-1,
        cv=3,
        scoring="roc_auc")

    def _brew_quietly(psms):
        # Build a fresh model per call and silence mokapot's console output
        model = mokapot.PercolatorModel() if default_Percolator else mokapot.Model(xgb_mod)
        with open(os.devnull, "w") as devnull, redirect_stdout(devnull), redirect_stderr(devnull):
            results, _models = mokapot.brew(psms, model)
        return results

    if joint_modelling:
        psm_files = get_all_pin_files(folder_path)
        if not psm_files:
            log.warning("No .pin files found.")
            return
        log.info("Brewing all .pins in folder: %s", folder_path)
        _brew_quietly(mokapot.read_pin(psm_files)).to_txt(folder_path)
        log.info("Mokapot (joint modelling) brewed ☕️ (output: %s)", folder_path)
    else:
        big_folder = [p for p in glob.glob(f"{folder_path}/*") if os.path.isdir(p)]
        if not big_folder:
            # Mirror the joint branch, which warns when there is nothing to do
            log.warning("No subfolders found in %s.", folder_path)
            return
        for folder in big_folder:
            log.info("Brewing folder: %s", folder)
            pin_files = glob.glob(f"{folder}/*.pin")
            if not pin_files:
                log.warning("No .pin files in %s; skipping.", folder)
                continue
            # Only the first .pin file of a sub-folder is used
            _brew_quietly(mokapot.read_pin(pin_files[0])).to_txt(folder)
            log.info("Mokapot (single model per experiment) brewed ☕️ (output: %s)", folder)


brew_mokapot(folder_path,
             joint_modelling,
             default_Percolator)