-
Notifications
You must be signed in to change notification settings - Fork 8
/
vcf2npy
executable file
·208 lines (189 loc) · 9.18 KB
/
vcf2npy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/env python
from __future__ import print_function, division, absolute_import
import os
import sys
import argparse
from datetime import datetime
import numpy as np
import pyfasta
import vcfnp
def log(*msg):
print('[vcf2npy] ' +
str(datetime.now()) +
' :: ' +
' '.join(map(str, msg)),
file=sys.stderr)
sys.stderr.flush()
def main(vcf_filename, fasta_filename, output_dir, array_type, chromosome,
task_size, task_index, region, exclude_fields, ploidy, dtypes, arities,
progress, truncate, compress):
assert vcf_filename is not None, 'missing VCF filename, try vcf2npy --help'
assert os.path.exists(vcf_filename), 'VCF file not found'
log('loading', array_type, 'from', vcf_filename)
# determine region to extract
if region is not None:
if (chromosome is not None or
task_index is not None or
task_size is not None ):
raise Exception('region is incompatibile with chromosome, '
'task-index, and task-size arguments')
log(region)
else:
if chromosome is None:
log('extract whole genome')
region = None
elif task_size is None:
log('extract whole chromosome', chromosome)
region = chromosome
else:
log('extract a chromosome split', chromosome, task_size, task_index)
# N.B., make sure regions will work as sortable file names by
# left-padding with zeros
assert fasta_filename is not None, \
'missing FASTA filename, try vcf2npy --help'
assert os.path.exists(fasta_filename), 'FASTA file not found'
genome = pyfasta.Fasta(fasta_filename)
chrom_size = len(genome[chromosome])
n_fill = int(np.ceil(np.log10(chrom_size)))
start = range(0, chrom_size, task_size)[task_index]
startstr = str(start+1).zfill(n_fill)
stop = start + task_size
stopstr = str(stop).zfill(n_fill)
region = str(chromosome) + ':' + startstr + '-' + stopstr
log(region)
if array_type == 'variants':
arr = vcfnp.variants(vcf_filename, region=region, progress=progress,
exclude_fields=exclude_fields, dtypes=dtypes,
arities=arities, flatten_filter=True, cache=True,
cachedir=output_dir, skip_cached=True,
compress_cache=compress, verbose=True,
truncate=truncate)
elif array_type == 'calldata':
arr = vcfnp.calldata(vcf_filename, region=region, progress=progress,
ploidy=ploidy, exclude_fields=exclude_fields,
dtypes=dtypes, arities=arities, cache=True,
cachedir=output_dir, skip_cached=True,
compress_cache=compress, verbose=True,
truncate=truncate)
elif array_type == 'calldata_2d':
arr = vcfnp.calldata_2d(vcf_filename, region=region, progress=progress,
ploidy=ploidy, exclude_fields=exclude_fields,
dtypes=dtypes, arities=arities, cache=True,
cachedir=output_dir, skip_cached=True,
compress_cache=compress, verbose=True,
truncate=truncate)
else:
raise Exception('unexpected array type: %s' % array_type)
if arr is not None:
log('loaded', arr.size, 'items', arr.nbytes, 'bytes')
log('all done')
if __name__ == '__main__':
# handle command line options
pyv = '.'.join(map(str, sys.version_info[:3]))
epilog = "Version: vcfnp %s (Python %s, NumPy %s)" % \
(vcfnp.__version__, pyv, np.__version__)
parser = argparse.ArgumentParser(epilog=epilog)
parser.add_argument('--vcf', dest='vcf_filename', metavar='FILE',
default=None, help='input VCF file')
parser.add_argument('--fasta', dest='fasta_filename', metavar='FASTA',
default=None,
help='reference genome as FASTA file (only required '
'if --task-size is given)')
parser.add_argument('--output-dir', dest='output_dir', metavar='DIR',
default=None,
help='output directory (omit to use default vcfnp '
'cache directory)')
parser.add_argument('--array-type', dest='array_type', default='variants',
help='array type, one of ['
'variants|calldata|calldata_2d]')
parser.add_argument('--chromosome', dest='chromosome', default=None,
help='chromosome to extract (omit to extract whole '
'genome)')
parser.add_argument('--task-size', dest='task_size', metavar='BP',
default=None, type=int,
help='size (in base pairs) of region to extract (omit '
'to extract whole chromosome)')
parser.add_argument('--task-index', dest='task_index', metavar='N',
default=None,
help='task index as integer or string to get task '
'from environment variable (e.g., "SGE_TASK_ID")')
parser.add_argument('--region', default=None,
help='region to extract '
'in (chr:start-end | chr:start | chr) format '
'(omit to extract whole genome) '
'Note: not compatable with --chromosome, '
'--task-size, or --task-index')
parser.add_argument('--exclude-field', dest='exclude_fields',
metavar='FIELD', default=None, action='append',
help='field to exclude')
parser.add_argument('--ploidy', dest='ploidy', default=2, type=int,
help='sample ploidy')
parser.add_argument('--dtype', metavar='FIELD:DTYPE', dest='dtypes',
action='append',
help='override default dtype for a given field, e.g., '
'"MQ:f4"')
parser.add_argument('--arity', metavar='FIELD:ARITY', dest='arities',
action='append',
help='override default arity for a given field, e.g., '
'"AD:2"')
parser.add_argument('--progress', dest='progress', metavar='N',
default=10000, type=int,
help='log progress every N rows')
parser.add_argument('--no-truncate', dest='no_truncate',
action='store_true',
default=False,
help='do not truncate variants to region boundary ('
'i.e., use default tabix behaviour)')
parser.add_argument('--compress', dest='compress', action='store_true',
default=False, help='save in a compressed .npz file')
args = parser.parse_args()
# region or chromosome [[task-index] task-size]
if args.region is not None and (
args.chromosome is not None or
args.task_index is not None or
args.task_size is not None ):
raise Exception('--region is incompatibile with --chromosome, '
'--task-index, and --task-size arguments')
# determine task index
task_index = None
if args.chromosome is not None and args.task_size is not None:
# assume we are running as part of a parallel job array
# want to determine task index
if args.task_index is not None:
try:
task_index = int(args.task_index)
except ValueError:
task_index = str(args.task_index).strip()
assert task_index in os.environ, 'expected %r in environment' % args.task_index
task_index = int(os.environ[task_index])
# if task_index not given in options, assume SGE job array
elif 'SGE_TASK_ID' in os.environ:
task_index = int(os.environ['SGE_TASK_ID'])
else:
raise Exception('could not determine task index; %s', args)
task_index -= 1 # use zero-based indices internally
# determine dtype overrides
if args.dtypes:
dtypes = dict(s.split(':') for s in args.dtypes)
else:
dtypes = None
# determine arity overrides
if args.arities:
arities = dict((s.split(':')[0], int(s.split(':')[1])) for s in args.arities)
else:
arities = None
main(vcf_filename=args.vcf_filename,
fasta_filename=args.fasta_filename,
output_dir=args.output_dir,
array_type=args.array_type,
chromosome=args.chromosome,
task_size=args.task_size,
task_index=task_index,
region=args.region,
exclude_fields=args.exclude_fields,
ploidy=args.ploidy,
dtypes=dtypes,
arities=arities,
progress=args.progress,
truncate=(not args.no_truncate),
compress=args.compress)