diff --git a/.gitmodules b/.gitmodules
index 59a2a63..c304f5f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "Prodigal"]
-	path = Prodigal
+	path = vendor/Prodigal
 	url = https://github.com/hyattpd/Prodigal
+[submodule "vendor/cpu_features"]
+	path = vendor/cpu_features
+	url = https://github.com/google/cpu_features/
diff --git a/MANIFEST.in b/MANIFEST.in
index b293965..c564538 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -6,6 +6,7 @@ include README.md
 recursive-include pyrodigal/tests      *.py *.txt
 recursive-include pyrodigal/tests/data *
 
-recursive-include Prodigal *.c *.h *.md VERSION LICENSE CHANGES Makefile
+recursive-include vendor/Prodigal *.c *.h *.md VERSION LICENSE CHANGES Makefile
+
 recursive-include pyrodigal *.py *.pyi *.pyx *.pxd *.h py.typed
-recursive-exclude pyrodigal *.c
+recursive-exclude pyrodigal *.c *.html
diff --git a/README.md b/README.md
index e87b2ed..50e2a1b 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # 🔥 Pyrodigal [![Stars](https://img.shields.io/github/stars/althonos/pyrodigal.svg?style=social&maxAge=3600&label=Star)](https://github.com/althonos/pyrodigal/stargazers)
 
 *Cython bindings and Python interface to [Prodigal](https://github.com/hyattpd/Prodigal/), an ORF
-finder for genomes and metagenomes.*
+finder for genomes and metagenomes. **Now with SIMD!***
 
 [![Actions](https://img.shields.io/github/workflow/status/althonos/pyrodigal/Test/master?logo=github&style=flat-square&maxAge=300)](https://github.com/althonos/pyrodigal/actions)
 [![Coverage](https://img.shields.io/codecov/c/gh/althonos/pyrodigal?style=flat-square&maxAge=3600)](https://codecov.io/gh/althonos/pyrodigal/)
@@ -27,14 +27,17 @@ internals, which has the following advantages:
 - **single dependency**: Pyrodigal is distributed as a Python package, so you
   can add it as a dependency to your project, and stop worrying about the
   Prodigal binary being present on the end-user machine.
-- **no intermediate files**: everything happens in memory, in a Python object
+- **no intermediate files**: Everything happens in memory, in a Python object
   you fully control, so you don't have to invoke the Prodigal CLI using a
-  sub-process and temporary files.
-- **no input formatting**: sequences are manipulated directly as strings, which
-  leverages the issue of formatting your input to FASTA for Prodigal.
+  sub-process and temporary files. Sequences can be passed directly as
+  strings or bytes, which avoids the issue of formatting your input to
+  FASTA for Prodigal.
 - **lower memory usage**: Pyrodigal is slightly more conservative when it comes
   to using memory, which can help process very large sequences. It also lets
   you save some more memory when running several *meta*-mode analyses
+- **better performance**: Pyrodigal uses *SIMD* instructions to compute which
+  dynamic programming nodes can be ignored when scoring connections. This can
+  save up to half the runtime on some sequences.
 
 ### 📋 Features
 
@@ -50,23 +53,28 @@ metagenomic mode. It is still missing some features of the CLI:
 
 ### 🐏 Memory
 
-Contrary to the Prodigal command line, Pyrodigal attempts to be more conservative
-about memory usage. This means that most of the allocations will be lazy, and
-that some functions will reallocate their results to exact-sized arrays when
-it's possible. This leads to Pyrodigal using about 30% less memory, but with
-a little bit more overhead to compute the size of buffers in advance.
+Pyrodigal makes two changes compared to the origina Prodigal command line:
+
+* Sequences are stored as raw bytes instead of compressed bitmaps. This means
+  that the sequence itself takes 3/8th more space, but since the memory used
+  for storing the sequence is often negligible compared to the memory used to
+  store dynamic programming nodes, this is an acceptable trade-off for better
+  performance when finding the start and stop nodes.
+* Node arrays are dynamically allocated and grow exponentially instead of
+  being pre-allocated with a large size. On small sequences, this leads to
+  Pyrodigal using about 30% less memory.
+
 
 ### 🧶 Thread-safety
 
-`pyrodigal.Pyrodigal` instances are thread-safe, and use an internal lock to
-prevent parallel calls to their methods from overwriting the internal buffers.
-However, a better solution to process sequences in parallel is to use a
-consumer/worker pattern, and have one `Pyrodigal` instance in each worker.
-Using a pool spawning `Pyrodigal` instances on the fly is also fine,
-but prevents recycling memory:
+`pyrodigal.Pyrodigal` instances are thread-safe. In addition, the `find_genes`
+method is re-entrant. This means you can train a `Pyrodigal` instance once,
+and then use a pool to process sequences in parallel:
 ```python
+p = Pyrodigal()
+p.train(training_sequence)
 with multiprocessing.pool.ThreadPool() as pool:
-    pool.map(lambda s: Pyrodigal(meta=True).find_genes(s), sequences)
+    predictions = pool.map(p.find_genes, sequences)
 ```
 
 ## 🔧 Installing
@@ -150,9 +158,10 @@ for more details.
 
 ## ⚖️ License
 
-This library is provided under the [GNU General Public License v3.0](https://choosealicense.com/licenses/gpl-3.0/). The Prodigal code was written by [Doug Hyatt](https://github.com/hyattpd)
-and is distributed under the terms of the GPLv3 as well. See `Prodigal/LICENSE`
-for more information.
+This library is provided under the [GNU General Public License v3.0](https://choosealicense.com/licenses/gpl-3.0/). 
+The Prodigal code was written by [Doug Hyatt](https://github.com/hyattpd) and is distributed under the 
+terms of the GPLv3 as well. See `vendor/Prodigal/LICENSE` for more information. The `cpu_features` library is 
+licensed under the terms of the Apache license. See `vendor/cpu_features/LICENSE` for more information.
 
 *This project is in no way not affiliated, sponsored, or otherwise endorsed
 by the [original Prodigal authors](https://github.com/hyattpd). It was developed
diff --git a/pyrodigal/__main__.py b/pyrodigal/__main__.py
new file mode 100644
index 0000000..ff2332d
--- /dev/null
+++ b/pyrodigal/__main__.py
@@ -0,0 +1,44 @@
+import argparse
+
+import coloredlogs
+coloredlogs.install(level='DEBUG')
+
+from . import __name__, __author__, __version__, Pyrodigal
+from .tests.fasta import parse
+
+parser = argparse.ArgumentParser(prog=__name__)
+# parser.add_argument("-a", required=False, help="Write protein translations to the selected file.")
+parser.add_argument("-c", required=False, action="store_true", help="Closed ends. Do not allow genes to run off edges.", default=False)
+# parser.add_argument("-d", required=False, help="Write nucleotide sequences of genes to the selected file.")
+# parser.add_argument("-f", required=False, help="Select output format.", choices={"gbk", "gff", "sco"}, default="gbk")
+# parser.add_argument("-g", required=False, type=int, choices=_TRANSLATION_TABLES, help="Specify a translation table to use.", default=11)
+parser.add_argument("-i", required=True, help="Specify FASTA input file.")
+parser.add_argument("-p", required=False, help="Select procedure.", choices={"single", "meta"}, default="single")
+
+try:
+    import argcomplete
+    argcomplete.autocomplete(parser)
+except ImportError:
+    pass
+
+args = parser.parse_args()
+
+pyrodigal = Pyrodigal(meta=args.p == "meta", closed=args.c)
+
+
+for i, seq in enumerate(parse(args.i)):
+    if args.p == "single" and i == 0:
+        pyrodigal.train(seq.seq)
+    for pred in predictions:
+        print(
+            seq.id,
+            "{}_v{}".format(__name__, __version__),
+            "CDS",
+            pred.begin,
+            pred.end,
+            "{:.1f}".format(pred.sscore + pred.cscore),
+            "+" if pred.strand > 0 else "-",
+            "0",
+            ";".join([pred._gene_data, pred._score_data]),
+            sep="\t",
+        )
diff --git a/pyrodigal/_pyrodigal.pxd b/pyrodigal/_pyrodigal.pxd
index 4a1876c..03ddc14 100644
--- a/pyrodigal/_pyrodigal.pxd
+++ b/pyrodigal/_pyrodigal.pxd
@@ -3,7 +3,7 @@
 
 # ----------------------------------------------------------------------------
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int8_t, uint8_t
 
 from pyrodigal.prodigal.bitmap cimport bitmap_t
 from pyrodigal.prodigal.gene cimport _gene
@@ -41,6 +41,29 @@ cdef class Sequence:
     cdef int _mer_ndx(self, int i, int length, int strand=*) nogil
     cdef char _amino(self, int i, int tt, int strand=*, bint is_init=*) nogil
 
+# --- Connection Scorer ------------------------------------------------------
+
+cdef class ConnectionScorer:
+    # which SIMD backend to use
+    cdef uint8_t  backend
+    # capacity of bypassing buffers
+    cdef size_t   capacity
+    # connection skip lookup table
+    cdef uint8_t* skip_connection
+    cdef uint8_t* skip_connection_raw
+    # aligned storage of node types
+    cdef uint8_t* node_types
+    cdef uint8_t* node_types_raw
+    # aligned storage of node strands
+    cdef int8_t*  node_strands
+    cdef int8_t*  node_strands_raw
+    # aligned storage of node frame
+    cdef uint8_t* node_frames
+    cdef uint8_t* node_frames_raw
+
+    cdef int _index(self, Nodes nodes) nogil except 1
+    cdef int _compute_skippable(self, int min, int i) nogil except 1
+    cdef int _score_connections(self, Nodes nodes, int min, int i, TrainingInfo tinf, bint final=*) nogil except 1
 
 # --- Nodes ------------------------------------------------------------------
 
@@ -53,6 +76,7 @@ cdef class Node:
     cdef _node* node
 
 cdef class Nodes:
+    # contiguous array of nodes, with capacity and length
     cdef _node* nodes
     cdef size_t capacity
     cdef size_t length
@@ -66,6 +90,7 @@ cdef class Nodes:
         const bint edge,
     ) nogil except NULL
 
+    cpdef Nodes copy(self)
     cdef int _clear(self) nogil except 1
     cdef int _sort(self) nogil except 1
 
@@ -143,6 +168,7 @@ cpdef int add_genes(Genes genes, Nodes nodes, int ipath) nogil except -1
 cpdef void calc_dicodon_gene(TrainingInfo tinf, Sequence sequence, Nodes nodes, int ipath) nogil
 cdef int* calc_most_gc_frame(Sequence seq) nogil except NULL
 cpdef int calc_orf_gc(Nodes nodes, Sequence seq, TrainingInfo tinf) nogil except -1
+cpdef int dynamic_programming(Nodes nodes, TrainingInfo tinf, ConnectionScorer score, bint final=*) nogil
 cpdef int find_best_upstream_motif(Nodes nodes, int ni, Sequence seq, TrainingInfo tinf, int stage) nogil except -1
 cpdef void raw_coding_score(Nodes nodes, Sequence seq, TrainingInfo tinf) nogil
 cpdef void rbs_score(Nodes nodes, Sequence seq, TrainingInfo tinf) nogil
@@ -151,17 +177,15 @@ cpdef void score_upstream_composition(Nodes nodes, int ni, Sequence seq, Trainin
 cpdef int shine_dalgarno_exact(Sequence seq, int pos, int start, TrainingInfo tinf, int strand=*) nogil
 cpdef int shine_dalgarno_mm(Sequence seq, int pos, int start, TrainingInfo tinf, int strand=*) nogil
 cpdef void train_starts_nonsd(Nodes nodes, Sequence sequence, TrainingInfo tinf) nogil
+cpdef void train_starts_sd(Nodes nodes, Sequence sequence, TrainingInfo tinf) nogil
 
 # --- Wrappers ---------------------------------------------------------------
 
 cpdef void reset_node_scores(Nodes nodes) nogil
 cpdef void record_overlapping_starts(Nodes nodes, TrainingInfo tinf, bint is_meta=*) nogil
-cpdef int  dynamic_programming(Nodes nodes, TrainingInfo tinf, bint final=*) nogil
 cpdef void eliminate_bad_genes(Nodes nodes, int ipath, TrainingInfo tinf) nogil
 cpdef void tweak_final_starts(Genes genes, Nodes nodes, TrainingInfo tinf) nogil
 cpdef void record_gene_data(Genes genes, Nodes nodes, TrainingInfo tinf, int sequence_index) nogil
-cpdef void calc_dicodon_gene(TrainingInfo tinf, Sequence sequence, Nodes nodes, int ipath) nogil
-cpdef void train_starts_sd(Nodes nodes, Sequence sequence, TrainingInfo tinf) nogil
 cpdef void determine_sd_usage(TrainingInfo tinf) nogil
 
 # --- Main functions ---------------------------------------------------------
diff --git a/pyrodigal/_pyrodigal.pyx b/pyrodigal/_pyrodigal.pyx
index 38172e0..394f85d 100644
--- a/pyrodigal/_pyrodigal.pyx
+++ b/pyrodigal/_pyrodigal.pyx
@@ -50,11 +50,12 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free
 from cpython.ref cimport Py_INCREF
 from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
 from libc.math cimport sqrt, log, pow, fmax, fmin, fabs
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int8_t, uint8_t, uintptr_t
 from libc.stdio cimport printf
-from libc.stdlib cimport malloc, free, qsort
-from libc.string cimport memchr, memset, strstr
+from libc.stdlib cimport malloc, calloc, free, qsort
+from libc.string cimport memcpy, memchr, memset, strstr
 
+from pyrodigal.impl.sse cimport skippable_sse
 from pyrodigal.prodigal cimport bitmap, dprog, gene, node, sequence
 from pyrodigal.prodigal.bitmap cimport bitmap_t
 from pyrodigal.prodigal.gene cimport _gene
@@ -65,6 +66,13 @@ from pyrodigal.prodigal.training cimport _training
 from pyrodigal._utils cimport _mini_training
 from pyrodigal._unicode cimport *
 
+IF TARGET_CPU == "x86":
+    from pyrodigal.cpu_features.x86 cimport GetX86Info, X86Info
+    IF SSE2_BUILD_SUPPORT:
+        from pyrodigal.impl.sse cimport skippable_sse
+    IF AVX2_BUILD_SUPPORT:
+        from pyrodigal.impl.avx cimport skippable_avx
+
 # ----------------------------------------------------------------------------
 
 import warnings
@@ -451,6 +459,148 @@ cdef class Sequence:
 
         return b'X'
 
+# --- Connection Scorer ------------------------------------------------------
+
+_TARGET_CPU = TARGET_CPU
+IF TARGET_CPU == "x86":
+    cdef X86Info cpu_info = GetX86Info()
+    _SSE2_RUNTIME_SUPPORT = cpu_info.features.sse2 != 0
+    _AVX2_RUNTIME_SUPPORT = cpu_info.features.avx2 != 0
+    _SSE2_BUILD_SUPPORT   = SSE2_BUILD_SUPPORT
+    _AVX2_BUILD_SUPPORT   = AVX2_BUILD_SUPPORT
+ELSE:
+    _SSE2_RUNTIME_SUPPORT = False
+    _AVX2_RUNTIME_SUPPORT = False
+    _SSE2_BUILD_SUPPORT   = False
+    _AVX2_BUILD_SUPPORT   = False
+
+cdef enum simd_backend:
+    NONE = 0
+    SSE2 = 1
+    AVX2 = 2
+    NEON = 3
+
+cdef class ConnectionScorer:
+
+    def __cinit__(self):
+        self.capacity = 0
+        self.skip_connection = self.skip_connection_raw = NULL
+        self.node_types      = self.node_types_raw      = NULL
+        self.node_strands    = self.node_strands_raw    = NULL
+        self.node_frames     = self.node_frames_raw     = NULL
+
+    def __init__(self, unicode backend="detect"):
+        IF TARGET_CPU == "x86":
+            if backend =="detect":
+                self.backend = simd_backend.NONE
+                IF SSE2_BUILD_SUPPORT:
+                    if _SSE2_RUNTIME_SUPPORT:
+                        self.backend = simd_backend.SSE2
+                IF AVX2_BUILD_SUPPORT:
+                    if _AVX2_RUNTIME_SUPPORT:
+                        self.backend = simd_backend.AVX2
+            elif backend == "sse":
+                IF not SSE2_BUILD_SUPPORT:
+                    raise RuntimeError("Extension was compiled without SSE2 support")
+                ELSE:
+                    if not _SSE2_RUNTIME_SUPPORT:
+                        raise RuntimeError("Cannot run SSE2 instructions on this machine")
+                    self.backend = simd_backend.SSE2
+            elif backend == "avx":
+                IF not AVX2_BUILD_SUPPORT:
+                    raise RuntimeError("Extension was compiled without AVX2 support")
+                ELSE:
+                    if not _AVX2_RUNTIME_SUPPORT:
+                        raise RuntimeError("Cannot run AVX2 instructions on this machine")
+                    self.backend = simd_backend.AVX2
+            elif backend is None:
+                self.backend = simd_backend.NONE
+            else:
+                raise ValueError(f"Unsupported backend on this architecture: {backend}")
+        ELSE:
+            self.backend = simd_backend.NONE
+
+    def __dealloc__(self):
+        PyMem_Free(self.node_types_raw)
+        PyMem_Free(self.node_strands_raw)
+        PyMem_Free(self.node_frames_raw)
+        PyMem_Free(self.skip_connection_raw)
+
+    cdef int _index(self, Nodes nodes) nogil except 1:
+        cdef size_t i
+        # reallocate if needed
+        if self.capacity < nodes.length:
+            with gil:
+                # reallocate new memory
+                self.skip_connection_raw = <uint8_t*> PyMem_Realloc(self.skip_connection_raw, nodes.length * sizeof(uint8_t) + 0x1F)
+                self.node_types_raw      = <uint8_t*> PyMem_Realloc(self.node_types_raw, nodes.length      * sizeof(uint8_t) + 0x1F)
+                self.node_strands_raw    = <int8_t*>  PyMem_Realloc(self.node_strands_raw, nodes.length    * sizeof(int8_t)  + 0x1F)
+                self.node_frames_raw     = <uint8_t*> PyMem_Realloc(self.node_frames_raw, nodes.length     * sizeof(uint8_t) + 0x1F)
+                # check that allocations were successful
+                if self.skip_connection_raw == NULL:
+                    raise MemoryError("Failed to allocate memory for scoring bypass index")
+                if self.node_types_raw == NULL:
+                    raise MemoryError("Failed to allocate memory for node type array")
+                if self.node_strands_raw == NULL:
+                    raise MemoryError("Failed to allocate memory for node strand array")
+                if self.node_frames_raw == NULL:
+                    raise MemoryError("Failed to allocate memory for node frame array")
+            # record new capacity
+            self.capacity = nodes.length
+            # compute pointers to aligned memory
+            self.skip_connection = <uint8_t*> ((<uintptr_t> self.skip_connection_raw + 0x1F) & (~0x1F))
+            self.node_types      = <uint8_t*> ((<uintptr_t> self.node_types_raw      + 0x1F) & (~0x1F))
+            self.node_strands    = <int8_t*>  ((<uintptr_t> self.node_strands_raw    + 0x1F) & (~0x1F))
+            self.node_frames     = <uint8_t*> ((<uintptr_t> self.node_frames_raw     + 0x1F) & (~0x1F))
+        # copy data from the array of nodes
+        for i in range(nodes.length):
+            self.node_types[i]      = nodes.nodes[i].type
+            self.node_strands[i]    = nodes.nodes[i].strand
+            self.node_frames[i]     = nodes.nodes[i].ndx % 3
+            self.skip_connection[i] = False
+        # return 0 if no exceptions were raised
+        return 0
+
+    def index(self, Nodes nodes not None):
+        with nogil:
+            self._index(nodes)
+
+    cdef int _compute_skippable(self, int min, int i) nogil except 1:
+        if self.backend != simd_backend.NONE:
+            memset(&self.skip_connection[min], 0, sizeof(uint8_t) * (i - min))
+        IF AVX2_BUILD_SUPPORT:
+            if self.backend == simd_backend.AVX2:
+                skippable_avx(self.node_strands, self.node_types, self.node_frames, min, i, self.skip_connection)
+        IF SSE2_BUILD_SUPPORT:
+            if self.backend == simd_backend.SSE2:
+                skippable_sse(self.node_strands, self.node_types, self.node_frames, min, i, self.skip_connection)
+        return 0
+
+    def compute_skippable(self, int min, int i):
+        assert self.skip_connection != NULL
+        assert i < self.capacity
+        assert min <= i
+        with nogil:
+            self._compute_skippable(min, i)
+
+    cdef int _score_connections(self, Nodes nodes, int min, int i, TrainingInfo tinf, bint final=False) nogil except 1:
+        cdef int j
+        cdef _node*     raw_nodes = nodes.nodes
+        cdef _training* raw_tinf  = tinf.tinf
+
+        for j in range(min, i):
+            if self.skip_connection[j] == 0:
+                dprog.score_connection(raw_nodes, j, i, raw_tinf, final)
+
+        return 0
+
+    def score_connections(self, Nodes nodes not None, int min, int i, TrainingInfo tinf not None, bint final=False):
+        assert self.skip_connection != NULL
+        assert i < nodes.length
+        assert min <= i
+        with nogil:
+            self._score_connections(nodes, min, i, tinf, final)
+
 # --- Nodes ------------------------------------------------------------------
 
 cdef class Node:
@@ -520,6 +670,9 @@ cdef class Nodes:
     def __dealloc__(self):
         PyMem_Free(self.nodes)
 
+    def __copy__(self):
+        return self.copy()
+
     def __len__(self):
         return self.length
 
@@ -581,6 +734,16 @@ cdef class Nodes:
         with nogil:
             self._clear()
 
+    cpdef Nodes copy(self):
+        cdef Nodes new = Nodes.__new__(Nodes)
+        new.capacity = self.capacity
+        new.length = self.length
+        new.nodes = <_node*> PyMem_Malloc(new.capacity * sizeof(_node))
+        if new.nodes == NULL:
+            raise MemoryError("Failed to reallocate node array")
+        memcpy(new.nodes, self.nodes, new.capacity * sizeof(_node))
+        return new
+
     cdef int _sort(self) nogil except 1:
         """Sort all nodes in the vector by their index and strand.
         """
@@ -1744,7 +1907,7 @@ cpdef void count_upstream_composition(Sequence seq, TrainingInfo tinf, int pos,
                 tinf.tinf.ups_comp[i][_translation[seq.digits[pos+j]] & 0b11] += 1
             i += 1
 
-cpdef int dynamic_programming(Nodes nodes, TrainingInfo tinf, bint final=False) nogil:
+cpdef int dynamic_programming(Nodes nodes, TrainingInfo tinf, ConnectionScorer scorer, bint final=False) nogil:
     cdef int    i
     cdef int    j
     cdef int    min
@@ -1773,8 +1936,10 @@ cpdef int dynamic_programming(Nodes nodes, TrainingInfo tinf, bint final=False)
             if nodes.nodes[i].ndx != nodes.nodes[i].stop_val:
                 min = 0
         min = 0 if min < dprog.MAX_NODE_DIST else min - dprog.MAX_NODE_DIST
-        for j in range(min, i):
-            dprog.score_connection(nodes.nodes, j, i, tinf.tinf, final)
+        # Check which nodes can be skipped
+        scorer._compute_skippable(min, i)
+        # Score connections
+        scorer._score_connections(nodes, min, i, tinf, final)
 
     for i in reversed(range(<int> nodes.length)):
         if nodes.nodes[i].strand == 1 and nodes.nodes[i].type != node_type.STOP:
@@ -2940,7 +3105,6 @@ cpdef inline void record_gene_data(Genes genes, Nodes nodes, TrainingInfo tinf,
 cpdef inline void determine_sd_usage(TrainingInfo tinf) nogil:
     node.determine_sd_usage(tinf.tinf)
 
-
 # --- Main functions ---------------------------------------------------------
 
 cpdef TrainingInfo train(Sequence sequence, bint closed=False, bint force_nonsd=False, double start_weight=4.35, int translation_table=11):
@@ -2948,11 +3112,13 @@ cpdef TrainingInfo train(Sequence sequence, bint closed=False, bint force_nonsd=
     cdef int*         gc_frame
     cdef Nodes        nodes    = Nodes()
     cdef TrainingInfo tinf     = TrainingInfo(sequence.gc, start_weight, translation_table)
+    cdef ConnectionScorer scorer = ConnectionScorer()
 
     with nogil:
         # find all the potential starts and stops
         add_nodes(nodes, sequence, tinf, closed=closed)
         nodes._sort()
+        scorer._index(nodes)
         # scan all the ORFs looking for a potential GC bias in a particular
         # codon position, in order to acquire a good initial set of genes
         gc_frame = calc_most_gc_frame(sequence)
@@ -2963,7 +3129,7 @@ cpdef TrainingInfo train(Sequence sequence, bint closed=False, bint force_nonsd=
         # do an initial dynamic programming routine with just the GC frame bias
         # used as a scoring function.
         record_overlapping_starts(nodes, tinf, is_meta=False)
-        ipath = dynamic_programming(nodes, tinf, final=False)
+        ipath = dynamic_programming(nodes, tinf, scorer, final=False)
         # gather dicodon statistics for the training set
         calc_dicodon_gene(tinf, sequence, nodes, ipath)
         raw_coding_score(nodes, sequence, tinf)
@@ -2984,17 +3150,19 @@ cpdef Predictions find_genes_single(Sequence sequence, TrainingInfo tinf, bint c
     cdef int   ipath
     cdef Genes genes = Genes()
     cdef Nodes nodes = Nodes()
+    cdef ConnectionScorer scorer = ConnectionScorer()
 
     with nogil:
         # find all the potential starts and stops, and sort them
         add_nodes(nodes, sequence, tinf, closed=closed)
         nodes._sort()
+        scorer._index(nodes)
         # second dynamic programming, using the dicodon statistics as the
         # scoring function
         reset_node_scores(nodes)
         score_nodes(nodes, sequence, tinf, closed=closed, is_meta=False)
         record_overlapping_starts(nodes, tinf, is_meta=True)
-        ipath = dynamic_programming(nodes, tinf, final=True)
+        ipath = dynamic_programming(nodes, tinf, scorer, final=True)
         # eliminate eventual bad genes in the nodes
         if nodes.length > 0:
             eliminate_bad_genes(nodes, ipath, tinf)
@@ -3016,6 +3184,7 @@ cpdef Predictions find_genes_meta(Sequence sequence, bint closed = False, int se
     cdef Genes        genes     = Genes()
     cdef Nodes        nodes     = Nodes()
     cdef TrainingInfo tinf      = TrainingInfo.__new__(TrainingInfo)
+    cdef ConnectionScorer scorer = ConnectionScorer()
     cdef int          max_phase = 0
     cdef double       max_score = -100.0
 
@@ -3045,11 +3214,12 @@ cpdef Predictions find_genes_meta(Sequence sequence, bint closed = False, int se
                 nodes._clear()
                 add_nodes(nodes, sequence, tinf, closed=closed)
                 nodes._sort()
+                scorer._index(nodes)
             # compute the score for the current bin
             reset_node_scores(nodes)
             score_nodes(nodes, sequence, tinf, closed=closed, is_meta=True)
             record_overlapping_starts(nodes, tinf, is_meta=True)
-            ipath = dynamic_programming(nodes, tinf, final=True)
+            ipath = dynamic_programming(nodes, tinf, scorer, final=True)
             # update genes if the current bin had a better score
             if nodes.length > 0 and nodes.nodes[ipath].score > max_score:
                 # record best phase and score
diff --git a/pyrodigal/cpu_features/__init__.pxd b/pyrodigal/cpu_features/__init__.pxd
new file mode 100644
index 0000000..e69de29
diff --git a/pyrodigal/cpu_features/x86.pxd b/pyrodigal/cpu_features/x86.pxd
new file mode 100644
index 0000000..ed02363
--- /dev/null
+++ b/pyrodigal/cpu_features/x86.pxd
@@ -0,0 +1,25 @@
+cdef extern from "cpuinfo_x86.h" nogil:
+
+    ctypedef struct X86Features:
+        int sse
+        int sse2
+        int sse3
+        int ssse3
+        int sse4_1
+        int sse4_2
+        int sse4a
+
+        int avx
+        int avx2
+
+        int avx512f
+        int avx512cd
+
+    ctypedef struct X86Info:
+        X86Features features
+        int         family
+        int         model
+        int         stepping
+        char        vendor[13]
+
+    cdef X86Info GetX86Info()
diff --git a/pyrodigal/impl/__init__.pxd b/pyrodigal/impl/__init__.pxd
new file mode 100644
index 0000000..e69de29
diff --git a/pyrodigal/impl/avx.c b/pyrodigal/impl/avx.c
new file mode 100644
index 0000000..0ecb436
--- /dev/null
+++ b/pyrodigal/impl/avx.c
@@ -0,0 +1,83 @@
+#include <immintrin.h>
+
+#include "training.h"
+#include "node.h"
+#include "dprog.h"
+#include "avx.h"
+
+#ifdef __AVX2__
+void skippable_avx(
+    const int8_t* strands,
+    const uint8_t* types,
+    const uint8_t* frames,
+
+    const int min,
+    const int i,
+    uint8_t* skip
+) {
+
+    const __m256i all_stops  = _mm256_set1_epi8(STOP);
+    const __m256i all_fwd    = _mm256_set1_epi8(1);
+    const __m256i all_bwd    = _mm256_set1_epi8(-1);
+
+    int j;
+    __m256i x;
+    __m256i s;
+    __m256i n1_strands;
+    __m256i n1_types;
+    __m256i n1_frames;
+    __m256i n2_strands = _mm256_set1_epi8(strands[i]);
+    __m256i n2_types   = _mm256_set1_epi8(types[i]);
+    __m256i n2_frames  = _mm256_set1_epi8(frames[i]);
+
+    for (j = (min + 0x1F) & (~0x1F); j + 31 < i; j += 32) {
+        n1_strands = _mm256_load_si256((__m256i*) &strands[j]);
+        n1_types =   _mm256_load_si256((__m256i*) &types[j]);
+        n1_frames =  _mm256_load_si256((__m256i*) &frames[j]);
+        s = _mm256_set1_epi8(0);
+        // 5'fwd->5'fwd
+        // n1->strand == n2->strand && n2->type != STOP && n1->type != STOP
+        x =                     _mm256_cmpeq_epi8(n1_strands, n2_strands);
+        x = _mm256_andnot_si256(_mm256_cmpeq_epi8(n2_types, all_stops),    x);
+        x = _mm256_andnot_si256(_mm256_cmpeq_epi8(n1_types, all_stops),    x);
+        s = _mm256_or_si256(                                         s, x);
+        // 5'fwd->5'ref, 5'fwd->3'rev
+        // n2->strand == -1 && n1->strand == 1 && n1->type != STOP
+        x =                     _mm256_cmpeq_epi8(n2_strands, all_bwd);
+        x = _mm256_and_si256(   _mm256_cmpeq_epi8(n1_strands, all_fwd), x);
+        x = _mm256_andnot_si256(_mm256_cmpeq_epi8(n1_types, all_stops), x);
+        s = _mm256_or_si256(                                         s, x);
+        // 5'fwd
+        // n1->type == STOP && n1->strand == -1 && n2->strand == -1
+        x =                  _mm256_cmpeq_epi8(n1_types, all_stops);
+        x = _mm256_and_si256(_mm256_cmpeq_epi8(n1_strands, all_bwd), x);
+        x = _mm256_and_si256(_mm256_cmpeq_epi8(n2_strands, all_fwd), x);
+        s = _mm256_or_si256(                                      s, x);
+        // 5'rev->3'fwd
+        // n2->type == STOP && n1->strand == -1 && n2->strand == 1 && n1->type != STOP
+        x =                     _mm256_cmpeq_epi8(n2_types, all_stops);
+        x = _mm256_and_si256(   _mm256_cmpeq_epi8(n1_strands, all_bwd), x);
+        x = _mm256_and_si256(   _mm256_cmpeq_epi8(n2_strands, all_fwd), x);
+        x = _mm256_andnot_si256(_mm256_cmpeq_epi8(n1_types, all_stops), x);
+        s = _mm256_or_si256(                                         s, x);
+        // 5'fwd->3'fwd
+        // n1->strand == n2->strand && n1->strand == 1 && n1->type != STOP && n2->type == STOP && n1->ndx%3 != n2->ndx%3
+        x =                     _mm256_cmpeq_epi8(n1_strands, n2_strands);
+        x = _mm256_and_si256(   _mm256_cmpeq_epi8(n1_strands, all_fwd),    x);
+        x = _mm256_andnot_si256(_mm256_cmpeq_epi8(n1_types,   all_stops),  x);
+        x = _mm256_and_si256(   _mm256_cmpeq_epi8(n2_types,   all_stops),  x);
+        x = _mm256_andnot_si256(_mm256_cmpeq_epi8(n1_frames,  n2_frames),  x);
+        s = _mm256_or_si256(                                            s, x);
+        // 3'rev->5'rev
+        // n1->strand == n2->strand && n1->strand == -1 && n1->type == STOP && n2->type != STOP && n1->ndx%3 != n2->ndx%3
+        x =                     _mm256_cmpeq_epi8(n1_strands, n2_strands);
+        x = _mm256_and_si256(   _mm256_cmpeq_epi8(n1_strands, all_bwd),    x);
+        x = _mm256_and_si256(   _mm256_cmpeq_epi8(n1_types,   all_stops),  x);
+        x = _mm256_andnot_si256(_mm256_cmpeq_epi8(n2_types,   all_stops),  x);
+        x = _mm256_andnot_si256(_mm256_cmpeq_epi8(n1_frames,  n2_frames),  x);
+        s = _mm256_or_si256(                                            s, x);
+        // store result mask
+        _mm256_store_si256((__m256i*) &skip[j], s);
+    }
+}
+#endif
diff --git a/pyrodigal/impl/avx.h b/pyrodigal/impl/avx.h
new file mode 100644
index 0000000..00a1690
--- /dev/null
+++ b/pyrodigal/impl/avx.h
@@ -0,0 +1,8 @@
+#ifndef _PYRODIGAL_IMPL_AVX_H
+#define _PYRODIGAL_IMPL_AVX_H
+
+#include <stdint.h>
+
+void skippable_avx(const int8_t*, const uint8_t*, const uint8_t*, const int, const int, uint8_t*);
+
+#endif
diff --git a/pyrodigal/impl/avx.pxd b/pyrodigal/impl/avx.pxd
new file mode 100644
index 0000000..9da5b6b
--- /dev/null
+++ b/pyrodigal/impl/avx.pxd
@@ -0,0 +1,4 @@
+from libc.stdint cimport int8_t, uint8_t
+
+cdef extern from "impl/avx.h" nogil:
+    void skippable_avx(const int8_t*, const uint8_t*, const uint8_t*, const int, const int, uint8_t*);
diff --git a/pyrodigal/impl/sse.c b/pyrodigal/impl/sse.c
new file mode 100644
index 0000000..804d90b
--- /dev/null
+++ b/pyrodigal/impl/sse.c
@@ -0,0 +1,84 @@
+#include <emmintrin.h>
+
+#include "training.h"
+#include "node.h"
+#include "dprog.h"
+#include "sse.h"
+
+#ifdef __SSE2__
+void skippable_sse(
+    const int8_t* strands,
+    const uint8_t* types,
+    const uint8_t* frames,
+
+    const int min,
+    const int i,
+    uint8_t* skip
+) {
+
+  const __m128i all_stops  = _mm_set1_epi8(STOP);
+  const __m128i all_fwd    = _mm_set1_epi8(1);
+  const __m128i all_bwd    = _mm_set1_epi8(-1);
+
+  int j;
+  __m128i x;
+  __m128i s;
+  __m128i n1_strands;
+  __m128i n1_types;
+  __m128i n1_frames;
+  __m128i n2_strands = _mm_set1_epi8(strands[i]);
+  __m128i n2_types   = _mm_set1_epi8(types[i]);
+  __m128i n2_frames  = _mm_set1_epi8(frames[i]);
+
+  for (j = (min + 0xF) & (~0xF); j + 15 < i; j += 16) {
+      n1_strands = _mm_load_si128((__m128i*) &strands[j]);
+      n1_types =   _mm_load_si128((__m128i*) &types[j]);
+      n1_frames =  _mm_load_si128((__m128i*) &frames[j]);
+      s = _mm_xor_si128(s, s);
+      // 5'fwd->5'fwd
+      // n1->strand == n2->strand && n2->type != STOP && n1->type != STOP
+      x =                  _mm_cmpeq_epi8(n1_strands, n2_strands);
+      x = _mm_andnot_si128(_mm_cmpeq_epi8(n2_types, all_stops),    x);
+      x = _mm_andnot_si128(_mm_cmpeq_epi8(n1_types, all_stops),    x);
+      s = _mm_or_si128(                                         s, x);
+      // 5'fwd->5'ref, 5'fwd->3'rev
+      // n2->strand == -1 && n1->strand == 1 && n1->type != STOP
+      x =                  _mm_cmpeq_epi8(n2_strands, all_bwd);
+      x = _mm_and_si128(   _mm_cmpeq_epi8(n1_strands, all_fwd), x);
+      x = _mm_andnot_si128(_mm_cmpeq_epi8(n1_types, all_stops), x);
+      s = _mm_or_si128(                                      s, x);
+      // 5'fwd
+      // n1->type == STOP && n1->strand == -1 && n2->strand == -1
+      x =               _mm_cmpeq_epi8(n1_types, all_stops);
+      x = _mm_and_si128(_mm_cmpeq_epi8(n1_strands, all_bwd), x);
+      x = _mm_and_si128(_mm_cmpeq_epi8(n2_strands, all_fwd), x);
+      s = _mm_or_si128(                                   s, x);
+      // 5'rev->3'fwd
+      // n2->type == STOP && n1->strand == -1 && n2->strand == 1 && n1->type != STOP
+      x =                  _mm_cmpeq_epi8(n2_types, all_stops);
+      x = _mm_and_si128(   _mm_cmpeq_epi8(n1_strands, all_bwd), x);
+      x = _mm_and_si128(   _mm_cmpeq_epi8(n2_strands, all_fwd), x);
+      x = _mm_andnot_si128(_mm_cmpeq_epi8(n1_types, all_stops), x);
+      s = _mm_or_si128(                                      s, x);
+      // 5'fwd->3'fwd
+      // n1->strand == n2->strand && n1->strand == 1 && n1->type != STOP && n2->type == STOP && n1->ndx%3 != n2->ndx%3
+      x =                  _mm_cmpeq_epi8(n1_strands, n2_strands);
+      x = _mm_and_si128(   _mm_cmpeq_epi8(n1_strands, all_fwd),    x);
+      x = _mm_andnot_si128(_mm_cmpeq_epi8(n1_types,   all_stops),  x);
+      x = _mm_and_si128(   _mm_cmpeq_epi8(n2_types,   all_stops),  x);
+      x = _mm_andnot_si128(_mm_cmpeq_epi8(n1_frames,  n2_frames),  x);
+      s = _mm_or_si128(                                        s,  x);
+      // 3'rev->5'rev
+      // n1->strand == n2->strand && n1->strand == -1 && n1->type == STOP && n2->type != STOP && n1->ndx%3 != n2->ndx%3
+      x =                  _mm_cmpeq_epi8(n1_strands, n2_strands);
+      x = _mm_and_si128(   _mm_cmpeq_epi8(n1_strands, all_bwd),    x);
+      x = _mm_and_si128(   _mm_cmpeq_epi8(n1_types,   all_stops),  x);
+      x = _mm_andnot_si128(_mm_cmpeq_epi8(n2_types,   all_stops),  x);
+      x = _mm_andnot_si128(_mm_cmpeq_epi8(n1_frames,  n2_frames),  x);
+      s = _mm_or_si128(                                        s,  x);
+
+      // store result mask
+      _mm_store_si128((__m128i*) &skip[j], s);
+  }
+}
+#endif
diff --git a/pyrodigal/impl/sse.h b/pyrodigal/impl/sse.h
new file mode 100644
index 0000000..34a4973
--- /dev/null
+++ b/pyrodigal/impl/sse.h
@@ -0,0 +1,8 @@
+#ifndef _PYRODIGAL_IMPL_SSE_H
+#define _PYRODIGAL_IMPL_SSE_H
+
+#include <stdint.h>
+
+void skippable_sse(const int8_t*, const uint8_t*, const uint8_t*, const int, const int, uint8_t*);
+
+#endif
diff --git a/pyrodigal/impl/sse.pxd b/pyrodigal/impl/sse.pxd
new file mode 100644
index 0000000..9010944
--- /dev/null
+++ b/pyrodigal/impl/sse.pxd
@@ -0,0 +1,4 @@
+from libc.stdint cimport int8_t, uint8_t
+
+cdef extern from "impl/sse.h" nogil:
+    void skippable_sse(const int8_t*, const uint8_t*, const uint8_t*, const int, const int, uint8_t*);
diff --git a/pyrodigal/prodigal/node.pxd b/pyrodigal/prodigal/node.pxd
index d47afc6..60e5b7c 100644
--- a/pyrodigal/prodigal/node.pxd
+++ b/pyrodigal/prodigal/node.pxd
@@ -65,6 +65,8 @@ cdef extern from "node.h" nogil:
 
     void determine_sd_usage(_training *tinf)
 
+    double intergenic_mod(_node*, _node*, _training*)
+
     void train_starts_sd(bitmap_t seq, bitmap_t rseq, int slen, _node *nodes, int nn, _training *tinf)
     void train_starts_nonsd(bitmap_t seq, bitmap_t rseq, int slen, _node *nodes, int nn, _training *tinf)
 
diff --git a/pyrodigal/tests/__init__.py b/pyrodigal/tests/__init__.py
index 2bd309d..2a770ce 100644
--- a/pyrodigal/tests/__init__.py
+++ b/pyrodigal/tests/__init__.py
@@ -1,8 +1,9 @@
-from . import test_gene, test_genes, test_nodes, test_pyrodigal
+from . import test_gene, test_genes, test_nodes, test_pyrodigal, test_connection_scorer
 
 def load_tests(loader, suite, pattern):
     suite.addTests(loader.loadTestsFromModule(test_gene))
     suite.addTests(loader.loadTestsFromModule(test_genes))
     suite.addTests(loader.loadTestsFromModule(test_nodes))
     suite.addTests(loader.loadTestsFromModule(test_pyrodigal))
+    suite.addTests(loader.loadTestsFromModule(test_connection_scorer))
     return suite
diff --git a/pyrodigal/tests/test_connection_scorer.py b/pyrodigal/tests/test_connection_scorer.py
new file mode 100644
index 0000000..933186c
--- /dev/null
+++ b/pyrodigal/tests/test_connection_scorer.py
@@ -0,0 +1,84 @@
+import collections.abc
+import gzip
+import os
+import sys
+import unittest
+
+from .. import Nodes, Sequence, _pyrodigal
+from .._pyrodigal import METAGENOMIC_BINS, add_nodes, ConnectionScorer
+
+from .fasta import parse
+
+
+class TestConnectionScorer(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        data = os.path.realpath(os.path.join(__file__, "..", "data"))
+        fna = os.path.join(data, "MIIJ01000039.fna.gz")
+        with gzip.open(fna, "rt") as f:
+            cls.record = next(parse(f))
+
+    @unittest.skipUnless(_pyrodigal._TARGET_CPU == "x86", "requires x86 CPU")
+    @unittest.skipUnless(_pyrodigal._SSE2_BUILD_SUPPORT, "requires extension compiled with SSE2 support")
+    @unittest.skipUnless(_pyrodigal._SSE2_RUNTIME_SUPPORT, "requires machine with SSE2 support")
+    def test_score_connections_sse(self):
+        # setup
+        seq = Sequence.from_string(self.record.seq)
+        tinf = METAGENOMIC_BINS[0].training_info
+        scorer_sse = ConnectionScorer(backend="sse")
+        scorer_generic = ConnectionScorer(backend=None)
+        # add nodes from the sequence
+        nodes = Nodes()
+        add_nodes(nodes, seq, tinf)
+        nodes.sort()
+        # index nodes for the scorers
+        scorer_sse.index(nodes)
+        scorer_generic.index(nodes)
+        # use copies to compute both scores
+        nodes_sse = nodes.copy()
+        nodes_generic = nodes.copy()
+        for i in range(500, len(nodes)):
+            # compute boundary
+            j = 0 if i < 500 else i - 500
+            # score connections without fast-indexing skippable nodes
+            scorer_generic.score_connections(nodes_generic, j, i, tinf, final=True)
+            scorer_generic.compute_skippable(j, i)
+            # compute skippable nodes with SSE and score connections with
+            scorer_sse.compute_skippable(j, i)
+            scorer_sse.score_connections(nodes_sse, j, i, tinf, final=True)
+        # check that both methods scored the same
+        for n_sse, n_generic in zip(nodes_sse, nodes_generic):
+            self.assertEqual(n_sse.score, n_generic.score)
+
+    @unittest.skipUnless(_pyrodigal._TARGET_CPU == "x86", "requires x86 CPU")
+    @unittest.skipUnless(_pyrodigal._AVX2_BUILD_SUPPORT, "requires extension compiled with AVX2 support")
+    @unittest.skipUnless(_pyrodigal._AVX2_RUNTIME_SUPPORT, "requires machine with AVX2 support")
+    def test_score_connections_avx(self):
+        # setup
+        seq = Sequence.from_string(self.record.seq)
+        tinf = METAGENOMIC_BINS[0].training_info
+        scorer_avx = ConnectionScorer(backend="avx")
+        scorer_generic = ConnectionScorer(backend=None)
+        # add nodes from the sequence
+        nodes = Nodes()
+        add_nodes(nodes, seq, tinf)
+        nodes.sort()
+        # index nodes for the scorers
+        scorer_avx.index(nodes)
+        scorer_generic.index(nodes)
+        # use copies to compute both scores
+        nodes_avx = nodes.copy()
+        nodes_generic = nodes.copy()
+        for i in range(500, len(nodes)):
+            # compute boundary
+            j = 0 if i < 500 else i - 500
+            # score connections without fast-indexing skippable nodes
+            scorer_generic.score_connections(nodes_generic, j, i, tinf, final=True)
+            scorer_generic.compute_skippable(j, i)
+            # compute skippable nodes with SSE and score connections with
+            scorer_avx.compute_skippable(j, i)
+            scorer_avx.score_connections(nodes_avx, j, i, tinf, final=True)
+        # check that both methods scored the same
+        for n_avx, n_generic in zip(nodes_avx, nodes_generic):
+            self.assertEqual(n_avx.score, n_generic.score)
diff --git a/pyrodigal/tests/test_nodes.py b/pyrodigal/tests/test_nodes.py
index d3a498e..5d4d075 100644
--- a/pyrodigal/tests/test_nodes.py
+++ b/pyrodigal/tests/test_nodes.py
@@ -22,21 +22,27 @@ def setUpClass(cls):
     def test_add_nodes_metagenomic_bins(self):
         seq = Sequence.from_string(self.record.seq)
         nodes = Nodes()
-
+        # nodes should start empty
         self.assertEqual(len(nodes), 0)
-
+        # numbers below obtained directly in Prodigal by `printf`-ing the
+        # node numbers on a normal run
         self.assertEqual(add_nodes(nodes, seq, METAGENOMIC_BINS[0].training_info), 2970)
         self.assertEqual(len(nodes), 2970)
         nodes.clear()
-
         self.assertEqual(add_nodes(nodes, seq, METAGENOMIC_BINS[2].training_info), 2970)
         self.assertEqual(len(nodes), 2970)
         nodes.clear()
-
         self.assertEqual(add_nodes(nodes, seq, METAGENOMIC_BINS[11].training_info), 2293)
         self.assertEqual(len(nodes), 2293)
         nodes.clear()
-
         self.assertEqual(add_nodes(nodes, seq, METAGENOMIC_BINS[24].training_info), 2293)
         self.assertEqual(len(nodes), 2293)
         nodes.clear()
+
+    def test_copy(self):
+        seq = Sequence.from_string(self.record.seq)
+        nodes1 = Nodes()
+        add_nodes(nodes1, seq, METAGENOMIC_BINS[0].training_info)
+        nodes2 = nodes1.copy()
+        for n1, n2 in zip(nodes1, nodes2):
+            self.assertEqual(n1.type, n2.type)
diff --git a/setup.py b/setup.py
index a1d494c..d5f7569 100644
--- a/setup.py
+++ b/setup.py
@@ -1,10 +1,15 @@
 import configparser
 import glob
 import os
+import platform
+import re
+import subprocess
 import sys
+from unittest import mock
 
 import setuptools
 from distutils import log
+from distutils.errors import CompileError
 from distutils.command.clean import clean as _clean
 from setuptools.command.build_ext import build_ext as _build_ext
 from setuptools.command.build_clib import build_clib as _build_clib
@@ -16,6 +21,27 @@
 except ImportError as err:
     cythonize = err
 
+# --- Constants -----------------------------------------------------------------
+
+SYSTEM  = platform.system()
+MACHINE = platform.machine()
+if re.match("^mips", MACHINE):
+    TARGET_CPU = "mips"
+elif re.match("^arm", MACHINE):
+    TARGET_CPU = "arm"
+elif re.match("^aarch64", MACHINE):
+    TARGET_CPU = "aarch64"
+elif re.match("(x86_64)|(AMD64|amd64)|(^i.86$)", MACHINE):
+    TARGET_CPU = "x86"
+elif re.match("^(powerpc|ppc)", MACHINE):
+    TARGET_CPU = "ppc"
+
+# --- Utils ------------------------------------------------------------------
+
+def _eprint(*args, **kwargs):
+    print(*args, **kwargs, file=sys.stderr)
+
+# --- Commands ------------------------------------------------------------------
 
 class sdist(_sdist):
     """A `sdist` that generates a `pyproject.toml` on the fly.
@@ -42,12 +68,11 @@ def finalize_options(self):
         self._clib_cmd = self.get_finalized_command("build_clib")
         self._clib_cmd.debug = self.debug
         self._clib_cmd.force = self.force
+        self._clib_cmd.verbose = self.verbose
 
     def build_extension(self, ext):
         # show the compiler being used
-        log.info("building {} with {} compiler".format(
-            ext.name, self.compiler.compiler_type
-        ))
+        _eprint("building", ext.name, "with", self.compiler.compiler_type, "compiler")
 
         # add debug flags if we are building in debug mode
         if self.debug:
@@ -86,6 +111,9 @@ def run(self):
                 "SYS_VERSION_INFO_MAJOR": sys.version_info.major,
                 "SYS_VERSION_INFO_MINOR": sys.version_info.minor,
                 "SYS_VERSION_INFO_MICRO": sys.version_info.micro,
+                "TARGET_CPU": TARGET_CPU,
+                "AVX2_BUILD_SUPPORT": False,
+                "SSE2_BUILD_SUPPORT": False,
             }
         }
         if self.force:
@@ -104,6 +132,22 @@ def run(self):
             cython_args["compiler_directives"]["wraparound"] = False
             cython_args["compiler_directives"]["cdivision"] = True
 
+        # compile the C library
+        if not self.distribution.have_run.get("build_clib", False):
+            self._clib_cmd.run()
+
+        # check if we can build platform-specific code
+        if self._clib_cmd._avx2_supported:
+            cython_args["compile_time_env"]["AVX2_BUILD_SUPPORT"] = True
+            for ext in self.extensions:
+                ext.extra_compile_args.append(self._clib_cmd._avx2_flag())
+                ext.define_macros.append(("__AVX2__", 1))
+        if self._clib_cmd._sse2_supported:
+            cython_args["compile_time_env"]["SSE2_BUILD_SUPPORT"] = True
+            for ext in self.extensions:
+                ext.extra_compile_args.append(self._clib_cmd._sse2_flag())
+                ext.define_macros.append(("__SSE2__", 1))
+
         # cythonize the extensions
         self.extensions = cythonize(self.extensions, **cython_args)
 
@@ -111,10 +155,6 @@ def run(self):
         for ext in self.extensions:
             ext._needs_stub = False
 
-        # compile the C library
-        if not self.distribution.have_run.get("build_clib", False):
-            self._clib_cmd.run()
-
         # build the extensions as normal
         _build_ext.run(self)
 
@@ -123,6 +163,107 @@ class build_clib(_build_clib):
     """A custom `build_clib` that splits the `training.c` file from Prodigal.
     """
 
+    # --- Autotools-like helpers ---
+
+    def _check_simd_generic(self, name, flag, header, vector, set, extract):
+        _eprint('checking whether compiler can build', name, 'code', end="... ")
+
+        base = "have_{}".format(name)
+        testfile = os.path.join(self.build_temp, "{}.c".format(base))
+        binfile = self.compiler.executable_filename(base, output_dir=self.build_temp)
+        objects = []
+
+        with open(testfile, "w") as f:
+            f.write("""
+                #include <{}>
+                int main() {{
+                    {}      a = {}(1);
+                    short   x = {}(a, 1);
+                    return (x == 1) ? 0 : 1;
+                }}
+            """.format(header, vector, set, extract))
+        try:
+            objects = self.compiler.compile([testfile], debug=self.debug, extra_preargs=[flag])
+            self.compiler.link_executable(objects, base, output_dir=self.build_temp)
+            subprocess.run([binfile], check=True)
+        except CompileError:
+            _eprint("no")
+            return False
+        except subprocess.CalledProcessError:
+            _eprint("yes, but cannot run code")
+            return True  # assume we are cross-compiling, and still build
+        else:
+            _eprint("yes, with {}".format(flag))
+            return True
+        finally:
+            os.remove(testfile)
+            for obj in filter(os.path.isfile, objects):
+                os.remove(obj)
+            if os.path.isfile(binfile):
+                os.remove(binfile)
+
+    def _check_function(self, funcname, header, args="()"):
+        _eprint('checking whether function', repr(funcname), 'is available', end="... ")
+
+        base = "have_{}".format(funcname)
+        testfile = os.path.join(self.build_temp, "{}.c".format(base))
+        binfile = self.compiler.executable_filename(base, output_dir=self.build_temp)
+        objects = []
+
+        with open(testfile, "w") as f:
+            f.write("""
+                #include <{}>
+                int main() {{
+                    {}{};
+                    return 0;
+                }}
+            """.format(header, funcname, args))
+        try:
+            objects = self.compiler.compile([testfile], debug=self.debug)
+            self.compiler.link_executable(objects, base, output_dir=self.build_temp)
+        except CompileError:
+            _eprint("no")
+            return False
+        else:
+            _eprint("yes")
+            return True
+        finally:
+            os.remove(testfile)
+            for obj in filter(os.path.isfile, objects):
+                os.remove(obj)
+            if os.path.isfile(binfile):
+                os.remove(binfile)
+
+    def _avx2_flag(self):
+        if self.compiler.compiler_type == "msvc":
+            return "/arch:AVX2"
+        return "-mavx2"
+
+    def _check_avx2(self):
+        return self._check_simd_generic(
+            "AVX2",
+            self._avx2_flag(),
+            header="immintrin.h",
+            vector="__m256i",
+            set="_mm256_set1_epi16",
+            extract="_mm256_extract_epi16",
+        )
+
+    def _sse2_flag(self):
+        if self.compiler.compiler_type == "msvc":
+            return "/arch:SSE2"
+        return "-msse2"
+
+    def _check_sse2(self):
+        return self._check_simd_generic(
+            "SSE2",
+            self._sse2_flag(),
+            header="emmintrin.h",
+            vector="__m128i",
+            set="_mm_set1_epi16",
+            extract="_mm_extract_epi16",
+        )
+
     # --- Compatibility with base `build_clib` command ---
 
     def check_library_list(self, libraries):
@@ -139,6 +280,11 @@ def get_library(self, name):
 
     # --- Compatibility with `setuptools.Command`
 
+    def initialize_options(self):
+        _build_clib.initialize_options(self)
+        self._avx2_supported = False
+        self._sse2_supported = False
+
     def finalize_options(self):
         _build_clib.finalize_options(self)
         # extract the training file and the temporary folder where to split
@@ -182,6 +328,10 @@ def run(self):
         )
         # build the library as normal
         _build_clib.run(self)
+        # check which CPU features are supported, now that
+        # `_build_clib.run` has properly initialized the C compiler
+        self._avx2_supported = self._check_avx2()
+        self._sse2_supported = self._check_sse2()
 
     def build_libraries(self, libraries):
         self.mkpath(self.build_clib)
@@ -194,6 +344,9 @@ def build_libraries(self, libraries):
             )
 
     def build_library(self, library):
+        # show the compiler being used
+        _eprint("building", library.name, "with", self.compiler.compiler_type, "compiler")
+
         # add debug flags if we are building in debug mode
         if self.debug:
             if self.compiler.compiler_type in {"unix", "cygwin", "mingw32"}:
@@ -201,6 +354,11 @@ def build_library(self, library):
             elif self.compiler.compiler_type == "msvc":
                 library.extra_compile_args.append("/Od")
 
+        # check for functions required for libcpu_features
+        if library.name == "cpu_features" and SYSTEM == "Darwin":
+            if self._check_function("sysctlbyname", "sys/sysctl.h", args="(NULL, NULL, 0, NULL, 0)"):
+                library.define_macros.append(("HAVE_SYSCTLBYNAME", 1))
+
         # store compile args
         compile_args = (
             library.define_macros,
@@ -210,15 +368,17 @@ def build_library(self, library):
             None,
             library.depends,
         )
-
-        # compile Prodigal
-        sources_lib = library.sources.copy()
-        sources_lib.remove(self.training_file)
-        objects_lib = [
+        # manually prepare sources and get the names of object files
+        sources = library.sources.copy()
+        if library.name == "prodigal":
+            sources.remove(self.training_file)
+            sources.extend(sorted(glob.iglob(os.path.join(self.training_temp, "*.c"))))
+        objects = [
             os.path.join(self.build_temp, s.replace(".c", self.compiler.obj_extension))
-            for s in sources_lib
+            for s in sources
         ]
-        for source, object in zip(sources_lib, objects_lib):
+        # only compile outdated files
+        for source, object in zip(sources, objects):
             self.make_file(
                 [source],
                 object,
@@ -226,27 +386,16 @@ def build_library(self, library):
                 ([source], self.build_temp, *compile_args)
             )
 
-        # compile `training.c` source files
-        sources_training = sorted(glob.iglob(os.path.join(self.training_temp, "*.c")))
-        objects_training = [s.replace(".c", self.compiler.obj_extension) for s in sources_training]
-        for source, object in zip(sources_training, objects_training):
-            self.make_file(
-                [source],
-                object,
-                self.compiler.compile,
-                ([source], None, *compile_args)
-            )
-
         # link into a static library
         libfile = self.compiler.library_filename(
             library.name,
             output_dir=self.build_clib,
         )
         self.make_file(
-            objects_lib + objects_training,
+            objects,
             libfile,
             self.compiler.create_static_lib,
-            (objects_lib + objects_training, library.name, self.build_clib, None, self.debug)
+            (objects, library.name, self.build_clib, None, self.debug)
         )
 
 
@@ -269,32 +418,58 @@ def run(self):
 
         _clean.run(self)
 
+# --- Setup ---------------------------------------------------------------------
 
 setuptools.setup(
     libraries=[
         Library(
             "prodigal",
             sources=[
-                os.path.join("Prodigal", base)
+                os.path.join("vendor", "Prodigal", "{}.c".format(base))
                 for base in [
-                    "bitmap.c",
-                    "dprog.c",
-                    "gene.c",
-                    "metagenomic.c",
-                    "node.c",
-                    "sequence.c",
-                    "training.c"
+                    "bitmap",
+                    "dprog",
+                    "gene",
+                    "metagenomic",
+                    "node",
+                    "sequence",
+                    "training"
                 ]
             ],
-            include_dirs=["Prodigal"],
+            include_dirs=[os.path.join("vendor", "Prodigal")]
+        ),
+        Library(
+            "cpu_features",
+            sources=[
+                os.path.join("vendor", "cpu_features", "src", "{}.c".format(base))
+                for base in [
+                    "cpuinfo_{}".format(TARGET_CPU),
+                    "filesystem",
+                    "stack_line_reader",
+                    "string_view",
+                ]
+            ],
+            include_dirs=[os.path.join("vendor", "cpu_features", "include")],
+            define_macros=[("STACK_LINE_READER_BUFFER_SIZE", 1024)]
         )
     ],
     ext_modules=[
         Extension(
             "pyrodigal._pyrodigal",
-            sources=["pyrodigal/_pyrodigal.pyx"],
-            include_dirs=["Prodigal"],
-            libraries=["prodigal"],
+            sources=[
+                "pyrodigal/impl/sse.c",
+                "pyrodigal/impl/avx.c",
+                "pyrodigal/_pyrodigal.pyx"
+            ],
+            include_dirs=[
+                "pyrodigal",
+                os.path.join("vendor", "Prodigal"),
+                os.path.join("vendor", "cpu_features", "include"),
+            ],
+            libraries=[
+                "prodigal",
+                "cpu_features",
+            ],
         ),
     ],
     cmdclass={
diff --git a/Prodigal b/vendor/Prodigal
similarity index 100%
rename from Prodigal
rename to vendor/Prodigal
diff --git a/vendor/cpu_features b/vendor/cpu_features
new file mode 160000
index 0000000..a8397ba
--- /dev/null
+++ b/vendor/cpu_features
@@ -0,0 +1 @@
+Subproject commit a8397ba4591237c17d18e4acc091f5f3ebe7391e