-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasta_search.py
80 lines (67 loc) · 4.14 KB
/
fasta_search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/python -tt
# Automated BLAST search of a FASTA file. Search defaults to blastx (Protein) search,
# blast type, database, e-value threshold and number of hits can be changed with
# commandline inputs.
# Any inputs with - or -- are optional and will default to certain values.
# Written by: Christopher R. Main, University of Delaware
# Last Updated: 09/08/2012
# Versions:
# 0.1 - Open Cluster file and begin searching internet blast
# 0.2 - Search and send output to screen of top result
# 0.3 - Output BLAST results with GI, Length, E-Value, Query Start, Subject Start, Score and Bits
# 0.4 - Setup of for loop to run multiple queries, and output to separate files
# 0.5 - Append data to the file with added sequence name to first column
# 0.6 - Change way of doing inputs
# Future versions:
# 0.7 - Write for local database search, for use on BioHen
# Allow manipulating of FASTA file
from Bio import SeqIO
# Allows for internet blast search
from Bio.Blast import NCBIWWW
# Parsing of BLAST results
from Bio.Blast import NCBIXML
# Ready arguments from the commandline
import argparse
# Read and parse the arguments from the command line
parser = argparse.ArgumentParser()
parser.add_argument("-v", "--version", action="version", version='Version 0.6')
parser.add_argument("filename", help="location of FASTA file")
parser.add_argument("out_file", help="filename for output of BLAST search results")
parser.add_argument("-b", "--blast", help="what type of blast to use (Defaults to blastx)", default='blastx')
parser.add_argument("-t", "--thres", help="e-value threshold, ignores any numbers above this number (Defaults to 0.005)", type=int, default=0.005)
parser.add_argument("-hl", "--hitlist", help="how many alignments do you want parsed into the file (Defaults to 10)", default=10)
parser.add_argument("-d", "--database", help="what database to search against (Defaults to nr)", default='nr')
parser.add_argument("-l", "--local", help="location of the local database to blast search against", default='')
args = parser.parse_args()
# Open file
handle = open(args.filename, "rU")
# BLAST first sequence of file
records = list(SeqIO.parse(handle, format="fasta"))
# Open file, this will append the file for each sequence, information is tab delimited and easily imported into Excel or other type software
wfile = open(args.out_file, "a")
# Write headers of columns: Sequence Name, GI #, Title, Length, E-Value, Query Start, Subject Start, Score, Bits
wfile.write("Sequence Name\tGI\tTitle\tLength\te-value\tQuery Start\tQuery End\tSubject Start\tSubject End\tGaps\tScore\tBits\n")
# Begin the loop to search each individual record. Each iteration of the loop will search a new sequence
for i in range(len(records)):
print "Blasting %s..." % (records[i].id)
# Blast type, database, name of sequence being run from fasta file, grab gi #s, limit number of hits to 10
result_handle = NCBIWWW.qblast(args.blast, args.database, records[i].seq, ncbi_gi="TRUE", hitlist_size=args.hitlist)
# Take Search and output to file
blast_records = NCBIXML.parse(result_handle)
#blast_records = NCBIXML.parse(open("blarg.xml"))
# Begin the loop, run as many times as there are records in the blast search
for blast_record in blast_records:
print "Writing search results for %s..." % (records[i].id)
# Run as many times as there are alignment sequences set by user input
for alignment in blast_record.alignments:
# Run as many times as there are information stored in the hsp (Has e-value, and other info)
for hsp in alignment.hsps:
# Ignore e-values above what was set by user
if hsp.expect < float(args.thres):
# Strip the ugliness out of the name file, so we can get what we want (ie GI # and Title
giFields = alignment.title.strip().split("|")
gi_num = giFields[1]
gi_title = giFields[4]
wfile.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (records[i].id, gi_num, gi_title, alignment.length, hsp.expect, hsp.query_start, hsp.query_end, hsp.sbjct_start, hsp.sbjct_end, hsp.gaps, hsp.score, hsp.bits))
wfile.close()
print "Writing of %s complete, closing file..." % (args.out_file)