-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsers.py
289 lines (234 loc) · 11.8 KB
/
parsers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
"""
Parsers: handle the act of reading one entity (such as line)
"""
import math
import numbers
import typing as ty
try:
from fastnumbers import int, float
except ImportError: # pragma: no cover
pass
from .const import MISSING_VALUES
from . import exceptions, parser_utils as utils
# Whitelist of allowed chromosomes. It's ok to add more values, as long as we have some kind of whitelist.
# The generic parser uses these as a safeguard, because when people slip a non-categorical value into the chrom field,
# tabix uses all the RAM on the system and then crashes horribly.
ALLOWED_CHROMS = frozenset({
'1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20',
'21', '22', '23', '24', '25',
'X', 'Y', 'M', 'MT'
})
class BasicVariant:
"""
Store GWAS results in a predictable format, with a minimal set of fields; optimize for name-based attribute access
"""
# Slots specify the data this holds (a performance optimization); _fields is human-curated list
__slots__ = ('chrom', 'pos', 'ref', 'alt', 'neg_log_pvalue', 'beta', 'stderr_beta', 'alt_allele_freq', 'rsid')
_fields = ('chrom', 'pos', 'rsid', 'ref', 'alt', 'neg_log_pvalue', 'beta', 'stderr_beta', 'alt_allele_freq')
def __init__(self, chrom, pos, rsid, ref, alt, neg_log_pvalue, beta, stderr_beta, alt_allele_freq):
self.chrom = chrom # type: str
self.pos = pos # type: int
self.rsid = rsid # type: str
self.ref = ref # type: str
self.alt = alt # type: str
self.neg_log_pvalue = neg_log_pvalue # type: float
# # Optional fields for future expansion
# af: float
self.beta = beta # type: float
self.stderr_beta = stderr_beta # type: float
self.alt_allele_freq = alt_allele_freq
@property
def pvalue(self) -> ty.Union[float, None]:
if self.neg_log_pvalue is None:
return None
elif math.isinf(self.neg_log_pvalue):
# This is an explicit design choice here, since we parse p=0 to infinity
return 0
else:
return 10 ** -self.neg_log_pvalue
@property
def pval(self) -> numbers.Number:
"""A common field name alias"""
return self.pvalue
@property
def maf(self) -> ty.Union[numbers.Number, None]:
af = self.alt_allele_freq
return min(af, 1 - af) if af is not None else None
@property
def marker(self) -> str:
"""Specify the marker in a string format compatible with UM LD server and other variant-specific requests"""
ref_alt = '_{}/{}'.format(self.ref, self.alt) \
if (self.ref and self.alt) else ''
return '{}:{}{}'.format(self.chrom, self.pos, ref_alt)
def to_dict(self):
# Some tools expect the data in a mutable form (eg dicts)
return {s: getattr(self, s, None) for s in self._fields}
def TupleLineParser(*args, container: ty.Callable = tuple, delimiter='\t', **kwargs):
"""
Parse a line of text and return a tuple of the fields. Performs no type coercion
This isn't recommended for everyday parsing, but it is useful internally for format detection (where we need to
split columns of data, but aren't yet ready to clean and assign meaning to the values)
"""
def inner(line: str):
"""Return a stateful closure that actually does the work of parsing"""
try:
values = line.strip().split(delimiter)
return container(values)
except Exception as e:
raise exceptions.LineParseException(str(e), line=line)
return inner
def GenericGwasLineParser(
*args,
delimiter: str = '\t',
container: ty.Callable[..., BasicVariant] = BasicVariant,
# Variant identifiers: marker OR individual
chrom_col: int = None, chr_col: int = None, # Legacy alias
pos_col: int = None,
ref_col: int = None,
alt_col: int = None,
marker_col: int = None,
# Other required data
pvalue_col: int = None, pval_col: int = None, # Legacy alias
# Optional fields
rsid_col: int = None,
beta_col: int = None,
stderr_beta_col: int = None,
allele_freq_col: int = None, # As freq OR by two count fields
allele_count_col: int = None,
n_samples_col: int = None,
# Other configuration options that apply to every row as constants
is_neg_log_pvalue: bool = False, is_log_pval: bool = False, # Legacy alias
is_alt_effect: bool = True, # whether effect allele is oriented towards alt
**kwargs):
"""
A simple parser that extracts GWAS information from a flexible file format.
Constructor expects human-friendly column numbers (first = column 1)
"""
def validate_config():
"""Ensures that a minimally working parser has been created"""
# Some old gwas files may not have ref and alt (incomplete marker, or missing columns). These fields aren't
# strictly required, but we really really like to have them
has_position = (_marker_col is not None) ^ all(x is not None
for x in (_chrom_col, _pos_col))
# If we do have one allele, we must have both
both_markers = (_ref_col is None and _alt_col is None) or \
(_ref_col is not None and _alt_col is not None)
is_valid = has_position and both_markers and (_pvalue_col is not None)
if not is_valid:
raise exceptions.ConfigurationException('GWAS parser must specify how to find all required fields')
if _allele_count_col is not None and _allele_freq_col is not None:
raise exceptions.ConfigurationException('Allele count and frequency options are mutually exclusive')
if _allele_count_col is not None and _n_samples_col is None:
raise exceptions.ConfigurationException(
'To calculate allele frequency from counts, you must also provide n_samples')
return is_valid
def inner(line):
# Return a stateful closure that does the actual work of parsing
try:
fields = line.strip().split(delimiter)
if len(fields) == 1:
raise exceptions.LineParseException(
'Unable to split line into separate fields. This line may have a missing or incorrect delimiter.')
# Fetch values
ref = None
alt = None
if _marker_col is not None:
chrom, pos, ref, alt = utils.parse_marker(fields[_marker_col])
else:
chrom = fields[_chrom_col]
pos = fields[_pos_col]
if chrom.startswith('chr'):
chrom = chrom[3:]
chrom = chrom.upper()
if chrom not in ALLOWED_CHROMS:
options = ' '.join(utils.natural_sort(ALLOWED_CHROMS))
raise exceptions.LineParseException(
f"Chromosome {chrom} is not a valid option. Must be one of: '{options}'")
# Explicit columns will override a value from the marker, by design
if _ref_col is not None:
ref = fields[_ref_col]
if _alt_col is not None:
alt = fields[_alt_col]
pval = fields[_pvalue_col]
# Some optional fields
rsid = None
beta = None
stderr_beta = None
alt_allele_freq = None
allele_count = None
n_samples = None
if _rsid_col is not None:
rsid = fields[_rsid_col]
if rsid in MISSING_VALUES:
rsid = None
elif not rsid.startswith('rs'):
rsid = 'rs' + rsid
if _beta_col is not None:
beta = fields[_beta_col]
if _stderr_col is not None:
stderr_beta = fields[_stderr_col]
if _allele_freq_col is not None:
alt_allele_freq = fields[_allele_freq_col]
if _allele_count_col is not None:
allele_count = fields[_allele_count_col]
n_samples = fields[_n_samples_col]
# Perform type coercion
log_pval = utils.parse_pval_to_log(pval, is_neg_log=_is_neg_log_pvalue)
try:
pos = int(pos)
except ValueError:
# Some programs seem to write long positions using scientific notation, which int cannot handle
try:
pos = int(float(pos))
except ValueError:
# If we still can't parse, it's probably bad data
raise exceptions.LineParseException(
'Positions should be specified as integers. Could not parse value: {}'.format(pos))
if beta is not None:
beta = None if beta in MISSING_VALUES else float(beta)
if stderr_beta is not None:
stderr_beta = None if stderr_beta in MISSING_VALUES else float(stderr_beta)
if _allele_freq_col or _allele_count_col:
alt_allele_freq = utils.parse_allele_frequency(
freq=alt_allele_freq,
allele_count=allele_count,
n_samples=n_samples,
is_alt_effect=_is_alt_effect
)
# Some old GWAS files simply won't provide ref or alt information, and the parser will need to do without
if ref in MISSING_VALUES:
ref = None
if isinstance(ref, str):
ref = ref.upper()
if alt in MISSING_VALUES:
alt = None
if isinstance(alt, str):
alt = alt.upper()
result = container(chrom, pos, rsid, ref, alt, log_pval, beta, stderr_beta, alt_allele_freq)
except Exception as e:
raise exceptions.LineParseException(str(e), line=line)
return result
# Convert the user-provided values to field array indices (0-based), and validate config
# Chrom field has a legacy alias, allowing older parser configs to work.
inner._chrom_col = _chrom_col = utils.human_to_zero(chrom_col) if chrom_col is not None else utils.human_to_zero(chr_col) # type: ignore # noqa
inner._pos_col = _pos_col = utils.human_to_zero(pos_col) # type: ignore
inner._ref_col = _ref_col = utils.human_to_zero(ref_col) # type: ignore
inner._alt_col = _alt_col = utils.human_to_zero(alt_col) # type: ignore
inner._marker_col = _marker_col = utils.human_to_zero(marker_col) # type: ignore
# Support legacy alias for field name
inner._pvalue_col = _pvalue_col = utils.human_to_zero(pvalue_col) if pvalue_col is not None else utils.human_to_zero(pval_col) # type: ignore # noqa
inner._rsid_col = _rsid_col = utils.human_to_zero(rsid_col) # type: ignore
inner._beta_col = _beta_col = utils.human_to_zero(beta_col) # type: ignore
inner._stderr_col = _stderr_col = utils.human_to_zero(stderr_beta_col) # type: ignore
inner._allele_freq_col = _allele_freq_col = utils.human_to_zero(allele_freq_col) # type: ignore
inner._allele_count_col = _allele_count_col = utils.human_to_zero(allele_count_col) # type: ignore
inner._n_samples_col = _n_samples_col = utils.human_to_zero(n_samples_col) # type: ignore
# The latter option is an alias for legacy reasons
inner._is_neg_log_pvalue = _is_neg_log_pvalue = is_neg_log_pvalue or is_log_pval # type: ignore
inner._is_alt_effect = _is_alt_effect = is_alt_effect # type: ignore
# Raise an exception if the provided options are invalid
validate_config()
# Provide the outside world with access to additional named attributes
# We are slightly abusing closures, but the end result is ~10% is faster than a class-based callable
inner.fields = container._fields # type: ignore
return inner