/
virsorter_to_anvio.py
executable file
·399 lines (358 loc) · 20.9 KB
/
virsorter_to_anvio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
#!/usr/bin/env python
import collections
import argparse
import sys
"""
Parses viral contig prediction results from VirSorter and generates three files usable by Anvi'o:
1. "virsorter_additional_info.txt" which you can import as an additional data file for splits. This can be helpful while binning using anvi-interactive or anvi-refine to identify contigs that are phages.
2. "virsorter_collection.txt" which is a collection file that will automatically group splits belonging to each VirSorter phage prediction into its own bin. This can be imported using anvi-import-collection.
3. "virsorter_annotations.txt" which is an annotations file containing predicted functions for hallmark genes. This can be imported using anvi-import-functions.
"""
"""
Parses viral contig prediction results from VirSorter and generates three files usable by Anvi'o:
1. "virsorter_additional_info.txt" which you can import as an additional data file for splits. This can be helpful while binning using anvi-interactive or anvi-refine to identify contigs that are phages.
2. "virsorter_collection.txt" which is a collection file that will automatically group splits belonging to each VirSorter phage prediction into its own bin. This can be imported using anvi-import-collection.
3. "virsorter_annotations.txt" which is an annotations file containing predicted functions for hallmark genes. This can be imported using anvi-import-functions.
"""
parser = argparse.ArgumentParser(description="Parses VirSorter predictions for Anvi\'o.")
parser.add_argument("-a","--affi-file", help = "REQUIRED INPUT. VIRSorter_affi-contigs.tab file.")
parser.add_argument("-g","--global-file", help="REQUIRED INPUT. VIRSorter_global_signal.csv file.")
parser.add_argument("--db", help="REQUIRED INPUT. Which VirSorter database you used. Enter 1 for RefSeq, 2 for RefSeq+viromes.")
parser.add_argument("-s","--splits-info", help = "REQUIRED INPUT. splits_basic_info.txt file.\n\n")
parser.add_argument("-n","--anvio-gene-calls", help = "OPTIONAL INPUT, but REQUIRED if you want an output functions file. Anvi'o gene calls exported from the contigs database")
parser.add_argument("-f","--hallmark-functions", help = "OPTIONAL INPUT, but REQUIRED if you want an output functions file. Functional annotation for protein clusters of hallmark genes. You'll want to specify the db1 file if you ran VirSorter against db1, and the db2 file if you ran VirSorter against db2.")
parser.add_argument("-A","--addl-info", default = "virsorter_additional_info.txt", help = "OUTPUT. Additional info file. Default = \"virsorter_additional_info.txt\". You can import this as an additional data file for splits. This can be helpful while binning using anvi-interactive or anvi-refine to identify contigs that are phages.")
parser.add_argument("-C","--phage-collection", default = "virsorter_collection.txt", help = "OUTPUT. An Anvi\'o collections file with splits for each phage gathered into a separate bin. The default name is \"virsorter_collection.txt\".")
parser.add_argument("-F","--output-functions", default = "virsorter_annotations.txt", help = "OPTIONAL OUTPUT. Annotations for hallmark genes in VirSorter-predicted phages / prophages. This file is importable using anvi-import-functions.\n\n")
parser.add_argument("-L","--min-phage-length", type = int, default = 1000, help = "PARAMETER, AFFECTS ALL OUTPUTS. Specify the minimum phage length to report. Default is 1000 bp.")
parser.add_argument("--exclude-cat3", action = "store_true", help = "PARAMETER, AFFECTS ALL OUTPUTS. Excludes all category 3 predictions.")
parser.add_argument("--exclude-prophages", action = "store_true", help = "PARAMETER, AFFECTS ALL OUTPUTS. Exclude all prophage predictions.")
args = parser.parse_args()
arg_dict = vars(args)
#print(arg_dict)
if arg_dict['affi_file'] == None or arg_dict['global_file'] == None or arg_dict['splits_info'] == None or arg_dict['db'] == None:
print("\n***A required option is missing. Try again.***\n\n")
parser.print_help()
sys.exit()
if (arg_dict['anvio_gene_calls'] == None and arg_dict['hallmark_functions'] != None) or (arg_dict['anvio_gene_calls'] != None and arg_dict['hallmark_functions'] == None):
print("\n***You only specified one of the files required to generate an annotations file for predicted phages.\n")
parser.print_help()
sys.exit()
#PART ZERO
#Here I need to read in functional annotations for each hallmark phage cluster
f_dict = collections.defaultdict(dict)
if 'hallmark_functions' in arg_dict:
f_in = open(arg_dict['hallmark_functions'],"r")
line = f_in.readline()
while line != "":
line = line.strip().split('\t')
f_dict[line[0]] = line[1]
line = f_in.readline()
f_in.close()
#PART ONE
contig_set = set()
#Populate contig_set with all of the contigs we're interested in:
virsorter_global = open(arg_dict['global_file'], "r")
line = virsorter_global.readline()
while line != "":
if line[0] == "#":
line = virsorter_global.readline()
else:
line = line.strip().split(',')
#print line
contig = line[0].replace("VIRSorter_","").replace("-circular","")
contig_set.add(contig)
line = virsorter_global.readline()
virsorter_global.close()
#First, I need to read in virsorter_affi, the input to Step3 in Virsorter.
#I need this because I need to translate the gene boundaries of each predicted phage
#to start and stop positions (bp) along the contig.
#columns of VIRSorter_affi-contigs.tab:
# 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11
# gene_id|start|stop|length|strand|affi_phage|score|evalue|category|affi_pfam|score|evalue|
virsorter_affi = open(arg_dict['affi_file'], "r")
affi_dict = {}
affi_dict = collections.defaultdict(dict)
hallmark_dict = collections.defaultdict(dict)
line = virsorter_affi.readline()
while line != "" and line[0] == ">":
header = line[1:].strip().split('|')
line = virsorter_affi.readline()
while line != "" and line[0] != ">":
var = line.strip().split('|')
header2 = header[0].replace("VIRSorter_","").replace("-circular","")
if header2 in contig_set:
gene = var[0].split('-')
gene = gene[-1]
gene_start = var[1]
gene_stop = var[2]
hit = var[5]
cat = var[8]
evalue = var[7]
pfam_eval = var[11]
affi_dict[header2][gene] = {}
affi_dict[header2][gene]['start'] = gene_start
affi_dict[header2][gene]['stop'] = gene_stop
affi_dict[header2][gene]['cat'] = cat
if cat == str(0) or cat == str(3):
#This next if statement deals with the fact that VirSorter doesn't count hallmark genes
#if a hallmark gene has a PFAM hit with a better e-value than the phage cluster hit e-value.
if pfam_eval == "-" or float(pfam_eval) > float(evalue):
affi_dict[header2][gene]['hallmark_function'] = f_dict[hit]
hallmark_dict[header2][gene] = {}
hallmark_dict[header2][gene]['hallmark_function'] = f_dict[hit]
hallmark_dict[header2][gene]['phage_cluster'] = hit
hallmark_dict[header2][gene]['start'] = gene_start
hallmark_dict[header2][gene]['stop'] = gene_stop
hallmark_dict[header2][gene]['cat'] = cat
hallmark_dict[header2][gene]['evalue'] = evalue
hallmark_dict[header2][gene]['pfam_eval'] = pfam_eval
line = virsorter_affi.readline()
virsorter_affi.close()
#PART TWO
#Now we read in the final output of VirSorter, which is VIRSorter_global_signal.csv
#and save it as a dictionary.
#From this, we need: (0) contig_id, (1) genes in contig, (2) Fragment, (3) Nb genes, (4) Category, (5) nb_hallmark
#Column types for virsorter_global:
# 0 / 1 / 2 / 3 / 4 / 5 / 6 / 7 / 8 / 9 / 10 / 11 /
# Contig / Total Nb Genes / Fragment / Size / Type detection / Category / Enrich Phage / Enrich Pfam / Enrich Unch / Enrich Switch / Avg_g_size / Nb Hallmark
virsorter_global = open(arg_dict['global_file'], "r")
global_dict = {}
global_dict = collections.defaultdict(dict)
m = 1
n = 1
line = virsorter_global.readline()
while line != "":
if line[0] == "#" or line.strip().split(',')[0] == "Contig_id":
line = virsorter_global.readline()
else:
line = line.strip().split(',')
#print line
contig = line[0].replace("VIRSorter_","").replace("-circular","")
genes_in_contig = line[1]
fragment = line[2].replace("VIRSorter_","").replace("-circular","")
num_fragment_genes = int(line[3])
nb_hallmark = line[5]
if nb_hallmark == "":
nb_hallmark = 0
nb_hallmark = int(nb_hallmark)
global_dict[contig]['num_genes_in_contig'] = int(genes_in_contig)
global_dict[contig]['fragment'] = fragment
global_dict[contig]['num_fragment_genes'] = int(num_fragment_genes)
global_dict[contig]['nb_hallmark'] = nb_hallmark
global_dict[contig]['length'] = 0
if "-circular" in line[0]:
global_dict[contig]['circular'] = True
else:
global_dict[contig]['circular'] = False
if contig == fragment:
#print category
category = int(line[4])
global_dict[contig]['category'] = str(category)
global_dict[contig]['phage_num'] = "phage_"+str(m)
m += 1
if contig != fragment:
#print category
category = int(line[4])
global_dict[contig]['category'] = str(category)
genes_in_fragment = fragment.split('-')
start_gene = genes_in_fragment[-2]
stop_gene = genes_in_fragment[-1]
global_dict[contig]['start_gene'] = start_gene
global_dict[contig]['stop_gene'] = stop_gene
global_dict[contig]['start_gene_pos'] = int(affi_dict[contig][start_gene]['start'])
global_dict[contig]['stop_gene_pos'] = int(affi_dict[contig][stop_gene]['stop'])
global_dict[contig]['phage_num'] = "prophage_"+str(n)
n += 1
line = virsorter_global.readline()
virsorter_global.close()
#PART THREE
#This writes all phages and prophages, cat1-3, to additional-info and collections files.
splits_input = open(arg_dict['splits_info'], "r")
splits_output = open(arg_dict['addl_info'],'w')
collection_output = open(arg_dict['phage_collection'],'w')
#splits_basic_info column format:
# 0 1 2 3 4 5 6 7
#split order_in_parent start end length gc_content gc_content_parent parent
#is_prev_phage = False
#n = 1
splits_output.write("split\tphage_name\tphage_category\tphage_length\tnum_genes_in_phage\tnum_phage_hallmark_genes_in_phage\n")
line = splits_input.readline()
line = splits_input.readline()
split_length_dict = collections.defaultdict(dict)
while line != "":
line = line.strip().split('\t')
if 'length' not in split_length_dict[line[7]]:
split_length_dict[line[7]]['length'] = 0
split_length_dict[line[7]]['length'] += int(line[4])
line = splits_input.readline()
splits_input.close()
splits_input = open(arg_dict['splits_info'], "r")
line = splits_input.readline()
line = splits_input.readline()
while line != "":
line = line.strip().split('\t')
# print line[7]
split_name = line[0]
split_start = int(line[2])
split_stop = int(line[3])
split_parent = line[7]
parent_length = split_length_dict[split_parent]['length']
if split_parent in global_dict:
nb_hallmark = global_dict[split_parent]['nb_hallmark']
nb_genes = global_dict[split_parent]['num_fragment_genes']
phage_name = global_dict[split_parent]['phage_num']
if split_parent == global_dict[split_parent]['fragment']:
category = "cat"+str(global_dict[split_parent]['category'])+"_phage"
global_dict[split_parent]['length'] = int(parent_length)
phage_length = global_dict[split_parent]['length']
if split_parent != global_dict[split_parent]['fragment']:
#This is where it gets hairy....
start_gene_pos = global_dict[split_parent]['start_gene_pos']
stop_gene_pos = global_dict[split_parent]['stop_gene_pos']
#print(split_name+" prophage start/stop "+str(start_gene_pos)+" "+str(stop_gene_pos)+" split start/stop "+str(split_start)+" "+str(split_stop))
if (start_gene_pos <= split_start <= stop_gene_pos or start_gene_pos <= split_stop <= stop_gene_pos) or \
(split_start <= start_gene_pos <= stop_gene_pos <= split_stop):
category = "cat"+str(global_dict[split_parent]['category'])+"_prophage"
global_dict[split_parent]['length'] = stop_gene_pos - start_gene_pos
phage_length = global_dict[split_parent]['length']
#print(split_parent)
else:
category = ""
nb_genes = 0
nb_hallmark = 0
phage_name = ""
phage_length = 0
else:
nb_genes = 0
nb_hallmark = 0
category = ""
phage_name = ""
phage_length = 0
#Output format: split_name phage_name category phage_length nb_genes nb_hallmark
if phage_length >= int(arg_dict['min_phage_length']):
if arg_dict['exclude_cat3'] == True:
if category == "cat1_phage" or category == "cat2_phage" or category == "cat1_prophage" or category == "cat2_prophage":
if arg_dict['exclude_prophages'] == True:
if category == "cat1_phage" or category == "cat2_phage":
splits_output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
split_name, phage_name, category, str(phage_length), str(nb_genes), str(nb_hallmark)))
collection_output.write("%s\t%s\n" %(split_name, phage_name))
else:
splits_output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
split_name, phage_name, category, str(phage_length), str(nb_genes), str(nb_hallmark)))
collection_output.write("%s\t%s\n" %(split_name, phage_name))
### THIS PART NEEDS LOTS OF WORK TO GET ALL ARGS TO WORK CORRECTLY. ###
else:
if arg_dict['exclude_prophages'] == True:
if category == "cat1_phage" or category == "cat2_phage" or category == "cat3_phage":
splits_output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
split_name, phage_name, category, str(phage_length), str(nb_genes), str(nb_hallmark)))
collection_output.write("%s\t%s\n" %(split_name, phage_name))
else:
splits_output.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (
split_name, phage_name, category, str(phage_length), str(nb_genes), str(nb_hallmark)))
collection_output.write("%s\t%s\n" %(split_name, phage_name))
line = splits_input.readline()
splits_input.close()
splits_output.close()
collection_output.close()
#Part 4a:
#Here I need to read in gene calls from Anvi'o
#File Format:
#gene_callers_id contig start stop direction partial source version
#We just read in contigs to gene_dict that were annotated as being all or partially viral.
gene_dict = collections.defaultdict(dict)
if 'anvio_gene_calls' in arg_dict:
g_in = open(arg_dict['anvio_gene_calls'],'r')
line = g_in.readline()
line = g_in.readline()
while line != "":
line = line.strip().split('\t')
if line[1] in contig_set:
gcid = str(line[0])
contig = line[1]
start = line[2]
stop = line[3]
direction = line[4]
gene_dict[contig][gcid] = {}
gene_dict[contig][gcid]['start'] = start
gene_dict[contig][gcid]['stop'] = stop
gene_dict[contig][gcid]['direction'] = direction
line = g_in.readline()
g_in.close
#Part 4b
#I need to read in the splits file I just generated to get a set of just the contigs we're reporting on here
#rather than integrating the below code into the many cases in part 3 above.
#I'll only export functions for contigs/regions declared interesting in the command line.
#open additional data file, ead in first column splitting on 'split', add to set
if 'anvio_gene_calls' in arg_dict:
contigs_set = set()
vir_contigs = open(arg_dict['phage_collection'],"r")
line = vir_contigs.readline()
while line != "":
line = line.strip().split('\t')
contig = line[0].split('_split_')[0]
contigs_set.add(contig)
line = vir_contigs.readline()
vir_contigs.close()
#write output functions file
func_out = open(arg_dict['output_functions'],"w")
func_out.write("gene_callers_id\tsource\taccession\tfunction\te_value\n")
count = 0
matched = 0
for contig in hallmark_dict:
#if contig in additional data file:... tab the stuff below after turhing this on
if 'start_gene_pos' in global_dict[contig]:
prophage_start = global_dict[contig]['start_gene_pos']
prophage_end = global_dict[contig]['stop_gene_pos']
for gene in hallmark_dict[contig]:
h_gene_start = int(hallmark_dict[contig][gene]['start'])
h_gene_stop = int(hallmark_dict[contig][gene]['stop'])
if prophage_start <= h_gene_start <= prophage_end or \
prophage_start <= h_gene_stop <= prophage_end:
matched = 0
count += 1
#print(contig+"\t"+gene)
for gcid in gene_dict[contig]:
#if h_gene_start == int(gene_dict[contig][gcid]['start'])+1 or h_gene_stop == int(gene_dict[contig][gcid]['stop']):
if abs(h_gene_start-int(gene_dict[contig][gcid]['start']))<2 or abs(h_gene_stop-int(gene_dict[contig][gcid]['stop']))<2:
#print(contig+", anvi-gene = "+gcid+", start = "+str(h_gene_start)+", gcid start = "+str(gene_dict[contig][gcid]['start'])+", phage function = "+hallmark_dict[contig][gene]['hallmark_function'])
matched = 1
pc = hallmark_dict[contig][gene]['phage_cluster']
hf = hallmark_dict[contig][gene]['hallmark_function']
ev = hallmark_dict[contig][gene]['evalue']
#print(hallmark_dict[contig][gene]['cat'])
if hallmark_dict[contig][gene]['cat'] == str(3):
hf = hf+(" (non-Caudovirales)")
func_out.write("%s\t%s\t%s\t%s\t%s\n" % (
str(gcid), "VirSorter (db"+str(arg_dict['db'])+")", pc, hf, str(ev)))
break
if matched == 0:
print("No match for "+contig+"\t"+gene+"\t"+str(h_gene_start)+"\t"+str(h_gene_stop))
else:
for gene in hallmark_dict[contig]:
h_gene_start = int(hallmark_dict[contig][gene]['start'])
h_gene_stop = int(hallmark_dict[contig][gene]['stop'])
matched = 0
count += 1
#print(" hstart = "+str(h_gene_start)+", hstop = "+str(h_gene_stop))
for gcid in gene_dict[contig]:
# print(gene_dict[contig][gcid]['start']+", "+gene_dict[contig][gcid]['stop'])
if abs(h_gene_start-int(gene_dict[contig][gcid]['start']))<2 or abs(h_gene_stop-int(gene_dict[contig][gcid]['stop']))<2:
#if h_gene_start == int(gene_dict[contig][gcid]['start'])+1 or h_gene_stop == int(gene_dict[contig][gcid]['stop']):
#print("anvi-gene = "+gcid+", "+direction+" , start = "+str(h_gene_start)+", gcid start = "+str(gene_dict[contig][gcid]['start'])+", stop = "+str(h_gene_stop)+", gcid stop = "+str(gene_dict[contig][gcid]['stop'])+", phage function = "+hallmark_dict[contig][gene]['hallmark_function'])
matched += 1
pc = hallmark_dict[contig][gene]['phage_cluster']
#print(hallmark_dict[contig][gene]['cat'])
hf = hallmark_dict[contig][gene]['hallmark_function']
ev = hallmark_dict[contig][gene]['evalue']
if hallmark_dict[contig][gene]['cat'] == str(3):
hf = hf+(" (non-Caudovirales)")
func_out.write("%s\t%s\t%s\t%s\t%s\n" % (
str(gcid), "VirSorter (db"+str(arg_dict['db'])+")", pc, hf, str(ev)))
break
if matched == 0:
#print("No match for "+contig+"\t"+gene+"\t"+str(h_gene_start)+"\t"+str(h_gene_stop))
pass
func_out.close()