-
Notifications
You must be signed in to change notification settings - Fork 1
/
bioplots.py
44 lines (35 loc) · 1.2 KB
/
bioplots.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/python
# by Aaron (ams@bio.aau.dk)
from Bio import SeqIO
from toolbox import biofileformat
import matplotlib.pyplot as plt
def get_sizes(fname, sub=False):
    """Parse a sequence file and return a list of sequence lengths.

    Args:
        fname: Path of the sequence file; it is opened through
            ``biofileformat.FileType('rb')``.
        sub: Optional positive integer. When truthy, reading stops once
            this many records have been collected. The default
            (``False``) reads every record in the file.

    Returns:
        A list holding ``len(rec)`` for each record parsed, in file order.
        ``None`` is returned (after printing a message) when the format
        reported by ``biofileformat.from_handle`` is neither "fasta" nor
        "fastq".
    """
    okformats = ("fasta", "fastq")
    with biofileformat.FileType('rb')(fname) as fh:
        seqformat = biofileformat.from_handle(fh)
        if seqformat not in okformats:
            # The parenthesised form runs on both Python 2 and Python 3;
            # a bare ``print "..."`` statement is a SyntaxError on Python 3.
            print("takes only fasta/fastq w/wo compression")
            return None
        sizes = []
        for n, rec in enumerate(SeqIO.parse(fh, seqformat)):
            sizes.append(len(rec))
            # A falsy ``sub`` means "no limit"; otherwise stop as soon as
            # ``sub`` records have been collected (``n`` is zero-based).
            if sub and n == (sub - 1):
                break
    return sizes
def plot_length(fname, sub=False):
    """Show a histogram of the sequence lengths found in a sequence file.

    Args:
        fname: Path of the sequence file; also used as the first line of
            the plot title.
        sub: Optional subset size, passed unchanged to ``get_sizes``.

    Returns:
        None. When there is something to plot, the figure is displayed
        with ``plt.show()``; otherwise a message is printed and nothing
        is drawn.
    """
    sizes = get_sizes(fname, sub)
    if not sizes:
        # get_sizes returns None for an unsupported format and an empty
        # list when no records were parsed; len()/min()/max() below would
        # raise on those, so bail out before touching matplotlib.
        print("no sequences to plot in %s" % fname)
        return
    plt.hist(sizes)
    plt.title("%s\n%i sequences, range %i to %i bp"
              % (fname, len(sizes), min(sizes), max(sizes)))
    plt.xlabel("Sequence length (bp)")
    plt.ylabel("Count")
    plt.show()
    return