From 84c7aa9d2af1e9d82a99825548026bd315a0ba0a Mon Sep 17 00:00:00 2001 From: Andy Boughton Date: Wed, 4 May 2022 19:52:18 -0400 Subject: [PATCH 1/5] Enforce chromosomes from whitelist to prevent a pathological edge case of non-categorical chromosomes --- tests/test_parsers.py | 9 +++++++++ zorp/parser_utils.py | 7 +++++++ zorp/parsers.py | 11 +++++++++++ 3 files changed, 27 insertions(+) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 612525a..98f5cfe 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -160,6 +160,15 @@ def test_parses_freq_from_freq(self): p = special_parser(line) assert p.alt_allele_freq == 0.25, "Parses frequency as is" + def test_pathological_chromosome_fails(self): + # Regression test for a badly specified user input file: restrict chromosomes to a whitelist, because sometimes + # people manage to slip non-categorical fields to tabix (and then tabix eats all the RAM on the server) + line = '1\tchr1:722408:G:C\t722408\tc\tg\t0.9298\tchr1:722408' + parser_options = {"alt_col": 5, "pos_col": 3, "ref_col": 4, "beta_col": None, "chrom_col": 2, "pvalue_col": 6, "stderr_beta_col": None, "is_neg_log_pvalue": None} + + parser = parsers.GenericGwasLineParser(**parser_options) + with pytest.raises(exceptions.LineParseException, match="Chromosome 1:722408:G:C is not a valid option. Must be one of: '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 M MT X Y'"): + parser(line) class TestStandardGwasParser: def test_parses_locuszoom_standard_format(self, standard_gwas_parser): diff --git a/zorp/parser_utils.py b/zorp/parser_utils.py index daedc2a..d9eecf8 100644 --- a/zorp/parser_utils.py +++ b/zorp/parser_utils.py @@ -119,3 +119,10 @@ def human_to_zero(value): return value else: return value - 1 + + +def natural_sort(l): + """Nastural sort a list of strings. Used for human-friendly error messages, eg, from a `set` of allowed strings""" + convert = lambda text: int(text) if text.isdigit() else text.lower() + alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] + return sorted(l, key=alphanum_key) diff --git a/zorp/parsers.py b/zorp/parsers.py index 4899347..faf6049 100644 --- a/zorp/parsers.py +++ b/zorp/parsers.py @@ -13,6 +13,11 @@ from .const import MISSING_VALUES from . import exceptions, parser_utils as utils +# Whitelist of allowed chromosomes. It's ok to add more values, as long as we have some kind of whitelist. +# The generic parser uses these as a safeguard, because when people slip a non-categorical value into the chrom field, +# tabix uses all the RAM on the system and then crashes horribly. +ALLOWED_CHROMS = frozenset({'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', 'X', 'Y', 'M', 'MT'}) + class BasicVariant: """ @@ -160,6 +165,12 @@ def inner(line): chrom = chrom.upper() + if chrom not in ALLOWED_CHROMS: + options = ' '.join(utils.natural_sort(ALLOWED_CHROMS)) + raise exceptions.LineParseException( + # NOTE: Future python versions might preserve set insertion order, but until then super-fast membership lookups mean that the set isn't ordered by default + f"Chromosome {chrom} is not a valid option. Must be one of: '{options}'") + # Explicit columns will override a value from the marker, by design if _ref_col is not None: ref = fields[_ref_col] From d09f49d866025636dbae1a5ef4ff9b64c0629193 Mon Sep 17 00:00:00 2001 From: Andy Boughton Date: Fri, 27 May 2022 15:26:39 -0400 Subject: [PATCH 2/5] Appease linters --- tests/test_parsers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 98f5cfe..3ac7984 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -164,12 +164,20 @@ def test_pathological_chromosome_fails(self): # Regression test for a badly specified user input file: restrict chromosomes to a whitelist, because sometimes # people manage to slip non-categorical fields to tabix (and then tabix eats all the RAM on the server) line = '1\tchr1:722408:G:C\t722408\tc\tg\t0.9298\tchr1:722408' - parser_options = {"alt_col": 5, "pos_col": 3, "ref_col": 4, "beta_col": None, "chrom_col": 2, "pvalue_col": 6, "stderr_beta_col": None, "is_neg_log_pvalue": None} + parser_options = { + "alt_col": 5, "pos_col": 3, "ref_col": 4, "beta_col": None, "chrom_col": 2, "pvalue_col": 6, + "stderr_beta_col": None, "is_neg_log_pvalue": None + } parser = parsers.GenericGwasLineParser(**parser_options) - with pytest.raises(exceptions.LineParseException, match="Chromosome 1:722408:G:C is not a valid option. Must be one of: '1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 M MT X Y'"): + with pytest.raises( + exceptions.LineParseException, + match="Chromosome 1:722408:G:C is not a valid option. Must be one of: '1 2 3 4 5 6 7 8 9 10 11 12 13 " + "14 15 16 17 18 19 20 21 22 23 24 25 M MT X Y'" + ): parser(line) + class TestStandardGwasParser: def test_parses_locuszoom_standard_format(self, standard_gwas_parser): line = '1\t100\tA\tC\t10\t0.5\t0.5\t0.25' From ab5765eb4f03ebf47108680a8dc8ce92f2c8683b Mon Sep 17 00:00:00 2001 From: Andy Boughton Date: Fri, 27 May 2022 15:36:43 -0400 Subject: [PATCH 3/5] Adjust linting rules --- setup.cfg | 1 + zorp/parser_utils.py | 6 +++--- zorp/parsers.py | 7 +++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/setup.cfg b/setup.cfg index 5c86bb4..c2eeb03 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,4 +1,5 @@ [flake8] +ignore=E731 exclude = .git,env,venv,.venv,node_modules,docs max-line-length = 120 diff --git a/zorp/parser_utils.py b/zorp/parser_utils.py index d9eecf8..7dbaae3 100644 --- a/zorp/parser_utils.py +++ b/zorp/parser_utils.py @@ -121,8 +121,8 @@ def human_to_zero(value): return value - 1 -def natural_sort(l): - """Nastural sort a list of strings. Used for human-friendly error messages, eg, from a `set` of allowed strings""" +def natural_sort(items: ty.Iterable): + """Natural sort a list of strings. Used for human-friendly error messages, eg, from a `set` of allowed strings""" convert = lambda text: int(text) if text.isdigit() else text.lower() alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] - return sorted(l, key=alphanum_key) + return sorted(items, key=alphanum_key) diff --git a/zorp/parsers.py b/zorp/parsers.py index faf6049..7b0bf5f 100644 --- a/zorp/parsers.py +++ b/zorp/parsers.py @@ -16,7 +16,11 @@ # Whitelist of allowed chromosomes. It's ok to add more values, as long as we have some kind of whitelist. # The generic parser uses these as a safeguard, because when people slip a non-categorical value into the chrom field, # tabix uses all the RAM on the system and then crashes horribly. -ALLOWED_CHROMS = frozenset({'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', 'X', 'Y', 'M', 'MT'}) +ALLOWED_CHROMS = frozenset({ + '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', + '21', '22', '23', '24', '25', + 'X', 'Y', 'M', 'MT' +}) class BasicVariant: @@ -168,7 +172,6 @@ def inner(line): if chrom not in ALLOWED_CHROMS: options = ' '.join(utils.natural_sort(ALLOWED_CHROMS)) raise exceptions.LineParseException( - # NOTE: Future python versions might preserve set insertion order, but until then super-fast membership lookups mean that the set isn't ordered by default f"Chromosome {chrom} is not a valid option. Must be one of: '{options}'") # Explicit columns will override a value from the marker, by design From eac89b619e4ff1399ab8d0552589d655c3078d4b Mon Sep 17 00:00:00 2001 From: Andy Boughton Date: Fri, 27 May 2022 15:45:13 -0400 Subject: [PATCH 4/5] Update some local package versions to align with GH actions --- setup.cfg | 1 - zorp/parser_utils.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index c2eeb03..5c86bb4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,4 @@ [flake8] -ignore=E731 exclude = .git,env,venv,.venv,node_modules,docs max-line-length = 120 diff --git a/zorp/parser_utils.py b/zorp/parser_utils.py index 7dbaae3..3b54f4f 100644 --- a/zorp/parser_utils.py +++ b/zorp/parser_utils.py @@ -123,6 +123,6 @@ def human_to_zero(value): def natural_sort(items: ty.Iterable): """Natural sort a list of strings. Used for human-friendly error messages, eg, from a `set` of allowed strings""" - convert = lambda text: int(text) if text.isdigit() else text.lower() - alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] + convert = lambda text: int(text) if text.isdigit() else text.lower() # noqa: E731 + alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] # noqa: E731 return sorted(items, key=alphanum_key) From d6aec3a2ef4e48708227be4d7cba1cb077e30a1c Mon Sep 17 00:00:00 2001 From: Andy Boughton Date: Fri, 27 May 2022 15:52:20 -0400 Subject: [PATCH 5/5] Bump for v0.3.5 --- setup.py | 2 +- zorp/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c18af08..d904244 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ # For a discussion on single-sourcing the version across setup.py and the # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='0.3.4', # Required + version='0.3.5', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the "Summary" metadata field: diff --git a/zorp/__init__.py b/zorp/__init__.py index df61579..32c1263 100644 --- a/zorp/__init__.py +++ b/zorp/__init__.py @@ -1,6 +1,6 @@ from distutils.version import LooseVersion -__version__ = '0.3.4' +__version__ = '0.3.5' __version_info__ = tuple(LooseVersion(__version__).version) __all__ = [