forked from BioJulia/Automa.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasta.jl
90 lines (77 loc) · 3.29 KB
/
fasta.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# A simple and practical FASTA file parser
# ========================================
import Automa
import Automa.RegExp: @re_str
import Compat: take!
const re = Automa.RegExp
# Create a machine of FASTA.
fasta_machine = (function ()
# First, describe FASTA patterns in regular expression.
lf = re"\n"
newline = re"\r?" * lf
identifier = re"[!-~]*"
description = re"[!-~][ -~]*"
letters = re"[A-Za-z*-]*"
sequence = re.cat(letters, re.rep(newline * letters))
record = re.cat('>', identifier, re.opt(re" " * description), newline, sequence)
fasta = re.rep(record)
# Second, bind action names to each regular expression.
lf.actions[:enter] = [:count_line]
identifier.actions[:enter] = [:mark]
identifier.actions[:exit] = [:identifier]
description.actions[:enter] = [:mark]
description.actions[:exit] = [:description]
letters.actions[:enter] = [:mark]
letters.actions[:exit] = [:letters]
record.actions[:exit] = [:record]
# Finally, compile the final FASTA pattern into a state machine.
return Automa.compile(fasta)
end)()
# It is useful to visualize the state machine for debugging.
# write("fasta.dot", Automa.dfa2dot(fasta_machine.dfa))
# run(`dot -Tsvg -o fasta.svg fasta.dot`)
# Bind Julia code to each action name (see the `parse_fasta` function defined below).
fasta_actions = Dict(
:count_line => :(linenum += 1),
:mark => :(mark = p),
:identifier => :(identifier = mark == 0 ? "" : String(data[mark:p-1]); mark = 0),
:description => :(description = mark == 0 ? "" : String(data[mark:p-1]); mark = 0),
:letters => :(mark > 0 && unsafe_write(buffer, pointer(data, mark), p - mark); mark = 0),
:record => :(push!(records, FASTARecord(identifier, description, take!(buffer)))))
# Define a type to store a FASTA record.
type FASTARecord
identifier::String
description::String
sequence::Vector{UInt8}
end
# Generate a parser function from `fasta_machine` and `fasta_actions`.
@eval function parse_fasta(data::Union{String,Vector{UInt8}})
# Initialize variables you use in the action code.
records = FASTARecord[]
mark = 0
linenum = 1
identifier = description = ""
buffer = IOBuffer()
# Initialize variables used by the state machine.
$(Automa.generate_init_code(fasta_machine))
p_end = p_eof = endof(data)
# This is the main loop to iterate over the input data.
$(Automa.generate_exec_code(fasta_machine, actions=fasta_actions, code=:goto, check=false))
# Check the last state the machine reached.
if cs != 0
error("failed to parse on line ", linenum)
end
# Finally, return records accumulated in the action code.
return records
end
# Run the FASTA parser.
records = parse_fasta("""
>NP_003172.1 brachyury protein isoform 1 [Homo sapiens]
MSSPGTESAGKSLQYRVDHLLSAVENELQAGSEKGDPTERELRVGLEESELWLRFKELTNEMIVTKNGRR
MFPVLKVNVSGLDPNAMYSFLLDFVAADNHRWKYVNGEWVPGGKPEPQAPSCVYIHPDSPNFGAHWMKAP
VSFSKVKLTNKLNGGGQIMLNSLHKYEPRIHIVRVGGPQRMITSHCFPETQFIAVTAYQNEEITALKIKY
NPFAKAFLDAKERSDHKEMMEEPGDSQQPGYSQWGWLLPGTSTLCPPANPHPQFGGALSLPSTHSCDRYP
TLRSHRSSPYPSPYAHRNNSPTYSDNSPACLSMLQSHDNWSSLGMPAHPSMLPVSHNASPPTSSSQYPSL
WSVSNGAVTPGSQAAAVSNGLGAQFFRGSPAHYTPLTHPVSAPSSSGSPLYEGAAAATDIVDSQYDAAAQ
GRLIASWTPVSPPSM
""")