Merge branch 'release/0.3.4'

abought · Dec 17, 2021 · d151504 · d151504
2 parents e926ce9 + 9e7d1bd
commit d151504
Show file tree

Hide file tree

Showing 9 changed files with 86 additions and 34 deletions.
diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml
@@ -0,0 +1,47 @@
+name: Python unit tests
+
+on:
+  push:
+    branches:
+      - master
+      - develop
+  pull_request:
+    branches:
+      - master
+      - develop
+  release:
+    types:
+      - created
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.6, 3.8]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - uses: actions/cache@v1
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
+        restore-keys: |
+          ${{ runner.os }}-pip-
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        # Install `wheel` so that pip can cache wheels it builds to save a few minutes on each run.
+        # Pip keeps a cache of HTTPS requests in `~/.cache/pip/http/`.  For packages that have only
+        # a tar file available (and no wheel), pip builds the package itself.  It only caches to
+        # `~/.cache/pip/wheels/` if `wheel` is installed.
+        pip install wheel
+        pip install -e '.[test,perf,lookups]'
+    - name: Run tests
+      run: |
+        mypy .
+        pytest --flake8 .
diff --git a/.travis.yml b/.travis.yml
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # ZORP: A helpful GWAS parser
 
-[![Build Status](https://api.travis-ci.org/abought/zorp.svg?branch=develop)](https://api.travis-ci.org/abought/zorp)
+![Build Status](https://github.com/abought/zorp/workflows/Python%20unit%20tests/badge.svg?branch=develop)
 
 ## Why?
 ZORP is intended to abstract away differences in file formats, and help you work with GWAS data from many 

diff --git a/bin/zorp_convert.py b/bin/zorp_convert.py
@@ -60,7 +60,7 @@ def main(source: ty.Union[str, ty.Iterable],
          skip_rows=None,
          skip_errors=True,
          max_errors=100,
-         make_tabix: bool = False) -> str:
+         make_tabix: bool = False):
     try:
         parser = parsers.GenericGwasLineParser(**parser_options)
     except exceptions.ConfigurationException:
@@ -85,7 +85,6 @@ def main(source: ty.Union[str, ty.Iterable],
         logger.exception('Conversion failed due to unknown error')
     else:
         logger.info('Conversion succeeded! Results written to: {}'.format(dest_fn))
-        return dest_fn
     finally:
         for n, reason, _ in reader.errors:
             logger.error('Excluded row {} from output due to parse error: {}'.format(n, reason))

diff --git a/setup.cfg b/setup.cfg
@@ -3,6 +3,7 @@ exclude = .git,env,venv,.venv,node_modules,docs
 max-line-length = 120
 
 [mypy]
+exclude = tests/*
 python_version = 3.6
 check_untyped_defs = True
 ignore_errors = False

diff --git a/setup.py b/setup.py
@@ -43,7 +43,7 @@
     # For a discussion on single-sourcing the version across setup.py and the
     # project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='0.3.3',  # Required
+    version='0.3.4',  # Required
 
     # This is a one-line description or tagline of what your project does. This
     # corresponds to the "Summary" metadata field:

diff --git a/tests/test_sniffers.py b/tests/test_sniffers.py
@@ -200,6 +200,24 @@ def test_can_guess_standard_format(self):
         assert actual._parser._allele_freq_col is None, 'Sniffer does not try to detect allele freq'
         # ...yet. Allele freqs are hard to guess reliably, but maybe we will improve.
 
+    def test_can_guess_gwas_catalog_mostly(self):
+        data = _fixture_to_strings([
+            ['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'odds_ratio', 'ci_lower', 'ci_upper', 'standard_error', 'p_value'], # noqa
+            ['1', '1108138', 'A', 'G', '1.081', '0.8822', '1.325', '0.1038', '0.4517']
+        ])
+        actual = sniffers.guess_gwas_generic(data)
+
+        assert h(actual._parser._chrom_col) == 1, 'Found index of chr col'
+        assert h(actual._parser._pos_col) == 2, 'Found index of pos col'
+        # The EBI GWAS catalog labels alleles as "effect_allele" and "other_allele", not ref/alt.
+        #   Which one matches the reference genome varies between analyses, so the user must decide.
+        assert actual._parser._ref_col is None, 'Did NOT identify ref col, b/c GWAS catalog uses ambiguous "effect"'
+        assert actual._parser._alt_col is None, 'Did NOT identify alt col, b/c GWAS catalog uses ambiguous "effect"'
+
+        assert h(actual._parser._stderr_col) == 8, 'stderr_beta field detected'
+        assert h(actual._parser._pvalue_col) == 9, 'Found index of pval col'
+        assert actual._parser._is_neg_log_pvalue is False, 'Determined whether is log'
+
     def test_can_guess_bolt_lmm(self):
         data = _fixture_to_strings([
             ['SNP', 'CHR', 'BP', 'A1', 'A0', 'MAF', 'HWEP', 'INFO', 'BETA', 'SE', 'P'],

diff --git a/zorp/__init__.py b/zorp/__init__.py
@@ -1,6 +1,6 @@
 from distutils.version import LooseVersion
 
-__version__ = '0.3.3'
+__version__ = '0.3.4'
 __version_info__ = tuple(LooseVersion(__version__).version)
 
 __all__ = [

diff --git a/zorp/sniffers.py b/zorp/sniffers.py
@@ -93,7 +93,7 @@ def get_pval_column(header_names: list, data_rows: ty.Iterable, overrides: dict
     overrides = overrides or {}
 
     LOGPVALUE_FIELDS = ('neg_log_pvalue', 'log_pvalue', 'log_pval', 'logpvalue')
-    PVALUE_FIELDS = ('pvalue', 'p.value', 'p-value', 'pval', 'p_score', 'p')
+    PVALUE_FIELDS = ('pvalue', 'p.value', 'p-value', 'pval', 'p_score', 'p', 'p_value')
 
     data = itertools.islice(data_rows, 100)
 
@@ -135,12 +135,11 @@ def get_chrom_pos_ref_alt_columns(header_names: list, data_rows: ty.Iterable, ov
 
     # Get from either a marker, or 4 separate columns
     MARKER_FIELDS = ('snpid', 'marker', 'markerid', 'snpmarker', 'chr:position')
-    CHR_FIELDS = ('chrom', 'chr')
-    POS_FIELDS = ('position', 'pos', 'begin', 'beg', 'bp', 'end', 'ps')
+    CHR_FIELDS = ('chrom', 'chr', 'chromosome')
+    POS_FIELDS = ('position', 'pos', 'begin', 'beg', 'bp', 'end', 'ps', 'base_pair_location')
 
     data = itertools.islice(data_rows, 100)
 
-    # TODO: How to handle orienting ref vs effect?
     # Order matters: consider ambiguous field names for ref before alt
     REF_FIELDS = ('A1', 'ref', 'reference', 'allele0', 'allele1')
     ALT_FIELDS = ('A2', 'alt', 'alternate', 'allele1', 'allele2')
@@ -155,21 +154,21 @@ def get_chrom_pos_ref_alt_columns(header_names: list, data_rows: ty.Iterable, ov
     #  be found for this function to report a match.
     headers_marked = header_names.copy()
     to_find = [
-        ['chrom_col', CHR_FIELDS],
-        ['pos_col', POS_FIELDS],
-        ['ref_col', REF_FIELDS],
-        ['alt_col', ALT_FIELDS],
+        ['chrom_col', CHR_FIELDS, True],
+        ['pos_col', POS_FIELDS, True],
+        ['ref_col', REF_FIELDS, False],
+        ['alt_col', ALT_FIELDS, False],
     ]
     config = {}
-    for col_name, col_choices in to_find:
+    for col_name, col_choices, is_required in to_find:
         col = utils.human_to_zero(overrides.get(col_name)) or \
               find_column(col_choices, headers_marked, threshold=1)  # type: ignore
-        if col is None:
+        if col is None and is_required:
             return {}
-
-        config[col_name] = col + 1
-        # Once a column has been assigned, remove it from consideration for future matches
-        headers_marked[col] = None
+        if col is not None:
+            config[col_name] = col + 1
+            # Once a column has been assigned, remove it from consideration for future matches
+            headers_marked[col] = None
 
     return config
 
@@ -178,7 +177,7 @@ def get_effect_size_columns(header_names: list, data_rows: ty.Iterable, override
     overrides = overrides or {}
 
     BETA_FIELDS = ('beta', 'effect_size', 'alt_effsize', 'effect')
-    STDERR_BETA_FIELDS = ('stderr_beta', 'stderr', 'sebeta', 'effect_size_sd', 'se')
+    STDERR_BETA_FIELDS = ('stderr_beta', 'stderr', 'sebeta', 'effect_size_sd', 'se', 'standard_error')
 
     data = itertools.islice(data_rows, 100)
 
@@ -333,10 +332,11 @@ def guess_gwas_standard(filename: ty.Union[ty.Iterable, str], *,
     column_config = {}
 
     required_cols = [
-        ['chrom', 'chrom_col'], ['pos', 'pos_col'], ['ref', 'ref_col'], ['alt', 'alt_col'],
+        ['chrom', 'chrom_col'], ['pos', 'pos_col'],
         ['neg_log_pvalue', 'pvalue_col']
     ]
     optional_cols = [
+        ['ref', 'ref_col'], ['alt', 'alt_col'],
         ['beta', 'beta_col'], ['stderr_beta', 'stderr_beta_col'],
         ['alt_allele_freq', 'allele_freq_col'],
         ['rsid', 'rsid_col']