diff --git a/setup.py b/setup.py index c18af08..d904244 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ # For a discussion on single-sourcing the version across setup.py and the # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='0.3.4', # Required + version='0.3.5', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the "Summary" metadata field: diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 612525a..3ac7984 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -160,6 +160,23 @@ def test_parses_freq_from_freq(self): p = special_parser(line) assert p.alt_allele_freq == 0.25, "Parses frequency as is" + def test_pathological_chromosome_fails(self): + # Regression test for a badly specified user input file: restrict chromosomes to a whitelist, because sometimes + # people manage to slip non-categorical fields to tabix (and then tabix eats all the RAM on the server) + line = '1\tchr1:722408:G:C\t722408\tc\tg\t0.9298\tchr1:722408' + parser_options = { + "alt_col": 5, "pos_col": 3, "ref_col": 4, "beta_col": None, "chrom_col": 2, "pvalue_col": 6, + "stderr_beta_col": None, "is_neg_log_pvalue": None + } + + parser = parsers.GenericGwasLineParser(**parser_options) + with pytest.raises( + exceptions.LineParseException, + match="Chromosome 1:722408:G:C is not a valid option. Must be one of: '1 2 3 4 5 6 7 8 9 10 11 12 13 " + "14 15 16 17 18 19 20 21 22 23 24 25 M MT X Y'" + ): + parser(line) + class TestStandardGwasParser: def test_parses_locuszoom_standard_format(self, standard_gwas_parser): diff --git a/zorp/__init__.py b/zorp/__init__.py index df61579..32c1263 100644 --- a/zorp/__init__.py +++ b/zorp/__init__.py @@ -1,6 +1,6 @@ from distutils.version import LooseVersion -__version__ = '0.3.4' +__version__ = '0.3.5' __version_info__ = tuple(LooseVersion(__version__).version) __all__ = [ diff --git a/zorp/parser_utils.py b/zorp/parser_utils.py index daedc2a..3b54f4f 100644 --- a/zorp/parser_utils.py +++ b/zorp/parser_utils.py @@ -119,3 +119,10 @@ def human_to_zero(value): return value else: return value - 1 + + +def natural_sort(items: ty.Iterable): + """Natural sort a list of strings. Used for human-friendly error messages, eg, from a `set` of allowed strings""" + convert = lambda text: int(text) if text.isdigit() else text.lower() # noqa: E731 + alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] # noqa: E731 + return sorted(items, key=alphanum_key) diff --git a/zorp/parsers.py b/zorp/parsers.py index 4899347..7b0bf5f 100644 --- a/zorp/parsers.py +++ b/zorp/parsers.py @@ -13,6 +13,15 @@ from .const import MISSING_VALUES from . import exceptions, parser_utils as utils +# Whitelist of allowed chromosomes. It's ok to add more values, as long as we have some kind of whitelist. +# The generic parser uses these as a safeguard, because when people slip a non-categorical value into the chrom field, +# tabix uses all the RAM on the system and then crashes horribly. +ALLOWED_CHROMS = frozenset({ + '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', + 'X', 'Y', 'M', 'MT' +}) + class BasicVariant: """ @@ -160,6 +169,11 @@ def inner(line): chrom = chrom.upper() + if chrom not in ALLOWED_CHROMS: + options = ' '.join(utils.natural_sort(ALLOWED_CHROMS)) + raise exceptions.LineParseException( + f"Chromosome {chrom} is not a valid option. Must be one of: '{options}'") + # Explicit columns will override a value from the marker, by design if _ref_col is not None: ref = fields[_ref_col]