diff --git a/.gitignore b/.gitignore index f7e62d9..d2748b4 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,9 @@ cython_debug/ .qodo /tests/test_data/RD139_Narrow_UPS1_0_1fmol_inj1.mzML + +# DIA-NN config file generated by tests +diann_config.cfg + +# Parquet files generated by tests +*.parquet diff --git a/environment.yml b/environment.yml index 31f1371..157604c 100644 --- a/environment.yml +++ b/environment.yml @@ -6,8 +6,8 @@ channels: - nodefaults dependencies: - click - - sdrf-pipelines>=0.0.31 - - pyopenms>=3.3.0 + - sdrf-pipelines==0.0.33 + - pyopenms>=3.2.0 - pandas - pyarrow>=16.1.0 - scipy \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 93ade61..be05058 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,8 +31,8 @@ packages = [ [tool.poetry.dependencies] python = "*" click = "*" -sdrf-pipelines = ">=0.0.32" -pyopenms = ">=3.2.0" +sdrf-pipelines = "==0.0.33" +pyopenms = ">=3.3.0" pandas = "*" pyarrow = ">=16.1.0" scipy = "*" diff --git a/quantmsutils/__init__.py b/quantmsutils/__init__.py index 40b07ef..5681085 100644 --- a/quantmsutils/__init__.py +++ b/quantmsutils/__init__.py @@ -1 +1 @@ -__version__ = "0.0.23" +__version__ = "0.0.24" diff --git a/quantmsutils/diann/dianncfg.py b/quantmsutils/diann/dianncfg.py index 1b6003e..20ba92f 100644 --- a/quantmsutils/diann/dianncfg.py +++ b/quantmsutils/diann/dianncfg.py @@ -7,13 +7,38 @@ import logging import re from typing import List, Tuple - +from collections import defaultdict import click from sdrf_pipelines.openms.unimod import UnimodDatabase logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG) logger = logging.getLogger(__name__) +# Lazy initialization of UnimodDatabase for improved testability. +# The database is created on first access rather than at module import time, +# which allows tests to mock or replace it more easily. +_unimod_database = None + + +def get_unimod_database(): + """ + Get the UnimodDatabase instance, creating it lazily on first access. + + This pattern improves testability by avoiding database initialization at module + import time. For testing purposes, the internal _unimod_database variable can be + set to None to force re-initialization on the next call. + + :return: The UnimodDatabase instance. + """ + global _unimod_database + if _unimod_database is None: + _unimod_database = UnimodDatabase() + return _unimod_database + + +# Met-loss modification constant (UniMod:765) with mass shift and site specification +MET_LOSS_MODIFICATION = "UniMod:765,-131.040485,*nM" + @click.command("dianncfg", short_help="Create DIA-NN config file with enzyme and PTMs") @click.option("--enzyme", "-e", help="") @@ -32,8 +57,7 @@ def dianncfg(ctx, enzyme, fix_mod, var_mod): :param var_mod: A string of variable modifications, separated by commas. """ cut = enzyme_cut(enzyme) - unimod_database = UnimodDatabase() - fix_ptm, var_ptm = convert_mod(unimod_database, fix_mod, var_mod) + fix_ptm, var_ptm = convert_mod(fix_mod, var_mod) var_ptm_str = " --var-mod " fix_ptm_str = " --fixed-mod " @@ -42,83 +66,106 @@ def dianncfg(ctx, enzyme, fix_mod, var_mod): for mod in fix_ptm: diann_fix_ptm += fix_ptm_str + mod for mod in var_ptm: - diann_var_ptm += var_ptm_str + mod + if mod == MET_LOSS_MODIFICATION: + diann_var_ptm += " --met-excision " + else: + diann_var_ptm += var_ptm_str + mod with open("diann_config.cfg", "w") as file: file.write("--cut " + cut + diann_fix_ptm + diann_var_ptm) -def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List]: +def get_mod(mod, mod_type): + """ + Retrieve and format a modification from the Unimod database for DIA-NN compatibility. + + :param mod: The modification string, typically containing the modification name and site. + :param mod_type: The type of modification ('fixed_mod' or 'var_mod'). + :return: A tuple (diann_mod_accession, site), where diann_mod_accession is a formatted string + for DIA-NN and site is the modification site. + :raises SystemExit: If the modification is not found in the Unimod database, logs an error and exits. + """ pattern = re.compile(r"\((.*?)\)") + modification_found = 0 + diann_mod_accession = None + diann_mod_name = None + for modification in get_unimod_database().modifications: + if modification.get_name() == mod.split(" ")[0]: + diann_mod_accession = modification.get_accession().replace("UNIMOD:", "UniMod:") + "," + str(modification._delta_mono_mass) + diann_mod_name = modification.get_name() + modification_found = 1 + break + + if modification_found == 0: + logging.error( + f"Only Unimod modifications are currently supported for the DIA pipeline. Unsupported modification: {mod}" + ) + exit(1) + + # TODO support DIA multiplex + if ( + "TMT" in diann_mod_name + or "Label:" in diann_mod_name + or "iTRAQ" in diann_mod_name + or "mTRAQ" in diann_mod_name + or "Dimethyl:" in diann_mod_name + ): + logging.error( + "quantms DIA-NN workflow only supports LFQ now! Unsupported modifications: " + + mod + ) + exit(1) + + sites = re.findall(pattern, " ".join(mod.split(" ")[1:])) + if not sites: + logging.error( + f"No site specification found in modification string: {mod}" + ) + exit(1) + site = sites[0] + if site == "Protein N-term": + site = "*n" + elif site == "N-term": + site = "n" + elif len(site.split(" ")) >= 2: + pp = " ".join(site.split(" ")[:-1]) + if pp == "Protein N-term": + pp = "*n" + elif pp == "N-term": + pp = "n" + aa = site.split(" ")[-1] + site = pp + aa + if site == "*nM" and diann_mod_name == "Met-loss" and mod_type == "var_mod": + return diann_mod_accession, site + else: + logging.error("Restricting to certain terminal AAs isn't directly supported. Please see https://github.com/vdemichev/DiaNN/issues/1791") + exit(1) + return diann_mod_accession, site + + +def convert_mod(fix_mod: str, var_mod: str) -> Tuple[List, List]: var_ptm = [] fix_ptm = [] - - if fix_mod != "": + if fix_mod: + merged = defaultdict(list) for mod in fix_mod.split(","): - tag = 0 - diann_mod = None - for modification in unimod_database.modifications: - if modification.get_name() == mod.split(" ")[0]: - diann_mod = modification.get_name() + "," + str(modification._delta_mono_mass) - tag = 1 - break - if tag == 0: - logging.info( - "Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: " - + mod - ) - continue - site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] - if site == "Protein N-term": - site = "*n" - elif site == "N-term": - site = "n" - - if ( - "TMT" in diann_mod - or "Label" in diann_mod - or "iTRAQ" in diann_mod - or "mTRAQ" in diann_mod - ): - fix_ptm.append(diann_mod + "," + site + "," + "label") - elif diann_mod is not None: - fix_ptm.append(diann_mod + "," + site) - else: - print( - "Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: " - + mod - ) - - if var_mod != "": + diann_mod, site = get_mod(mod, "fixed_mod") + merged[diann_mod].append(site) + + # merge same modification for different sites + for name, site_list in merged.items(): + site_str = "".join(sorted(set(site_list))) + fix_ptm.append(f"{name},{site_str}") + + if var_mod: + merged = defaultdict(list) for mod in var_mod.split(","): - tag = 0 - diann_mod = None - for modification in unimod_database.modifications: - if modification.get_name() == mod.split(" ")[0]: - diann_mod = modification.get_name() + "," + str(modification._delta_mono_mass) - tag = 1 - break - if tag == 0: - print( - "Warning: Currently only supported unimod modifications for DIA pipeline. Skipped: " - + mod - ) - continue - site = re.findall(pattern, " ".join(mod.split(" ")[1:]))[0] - if site == "Protein N-term": - site = "*n" - elif site == "N-term": - site = "n" - - if ( - "TMT" in diann_mod - or "Label" in diann_mod - or "iTRAQ" in diann_mod - or "mTRAQ" in diann_mod - ): - var_ptm.append(diann_mod + "," + site + "," + "label") - else: - var_ptm.append(diann_mod + "," + site) + diann_mod, site = get_mod(mod, "var_mod") + merged[diann_mod].append(site) + # merge same modification for different sites + for name, site_list in merged.items(): + site_str = "".join(sorted(set(site_list))) + var_ptm.append(f"{name},{site_str}") return fix_ptm, var_ptm diff --git a/requirements.txt b/requirements.txt index 1a53263..b95ff25 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ click -sdrf-pipelines>=0.0.31 -pyopenms>=3.2.0 +sdrf-pipelines==0.0.33 +pyopenms>=3.3.0 pandas pyarrow>=16.1.0 scipy diff --git a/tests/test_commands.py b/tests/test_commands.py index 70885fe..c80380f 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -84,7 +84,20 @@ def test_diann2mztab_example(self): ] result = run_cli_command("diann2mztab", args) assert result.exit_code == 0 - # Additional assertions could check for expected output files + + def test_dianncfg_example(self): + """Test generating the DIA-NN config with example data""" + args = [ + "--enzyme", + "Trypsin", + "--fix_mod", + "Carbamidomethyl (C)", + "--var_mod", + "Oxidation (M),Phospho (S),Phospho (T),Phospho (Y),Acetyl (Protein N-term),Acetyl (K),Acetyl (R),Met-loss (Protein N-term M)", + ] + result = run_cli_command("dianncfg", args) + + assert result.exit_code == 0 class TestSamplesheetCommands: