Address paper review comments (#169)
Made changes in response to JOSS review comments: openjournals/joss-reviews#5759 (comment)
eliotwrobson authored Sep 3, 2023
1 parent 07f3840 commit d3fb992
Showing 2 changed files with 124 additions and 76 deletions.
97 changes: 75 additions & 22 deletions joss/paper.bib
@@ -17,7 +17,7 @@ @Misc{brics
author = {Anders M\o{}ller},
title = {dk.brics.automaton -- Finite-State Automata
and Regular Expressions for {Java}},
note = {\texttt{http://www.brics.dk/automaton/}},
url = {http://www.brics.dk/automaton/},
year = 2021
}

@@ -26,7 +26,7 @@ @article{AlmeidaMR10
Nelma Moreira and
Rog{\'{e}}rio Reis},
title = {Testing the Equivalence of Regular Languages},
journal = {J. Autom. Lang. Comb.},
journal = {Journal of Automata, Languages and Combinatorics},
volume = {15},
number = {1/2},
pages = {7--25},
@@ -51,28 +51,30 @@ @inbook{mihov_schulz_2019
collection={Cambridge Tracts in Theoretical Computer Science}
}

@inbook{AhoSU86,
author = {Alfred V. Aho and
Ravi Sethi and
Jeffrey D. Ullman},
title = {Compilers: Principles, Techniques, and Tools},
series = {Addison-Wesley series in computer science / World student series edition},
publisher = {Addison-Wesley},
year = {1986},
url = {https://www.worldcat.org/oclc/12285707},
isbn = {0-201-10088-6},
timestamp = {Fri, 17 Jul 2020 16:12:45 +0200},
biburl = {https://dblp.org/rec/books/aw/AhoSU86.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
pages = {159–163}
@book{AhoSU86,
author = {Aho, Alfred V. and Lam, Monica S. and Sethi, Ravi and Ullman, Jeffrey D.},
title = {Compilers: Principles, Techniques, and Tools (2nd Edition)},
year = {2006},
isbn = {0321486811},
publisher = {Addison-Wesley Longman Publishing Co., Inc.},
address = {USA},
pages = {152--155}
}

@inproceedings{Yingjie09,
title={Describing an n log n algorithm for minimizing states in deterministic finite automaton},
author={Xu Yingjie},
year={2009}
@incollection{Hopcroft71,
title = {An $n \log n$ algorithm for minimizing states in a finite automaton},
editor = {Zvi Kohavi and Azaria Paz},
booktitle = {Theory of Machines and Computations},
publisher = {Academic Press},
pages = {189--196},
year = {1971},
isbn = {978-0-12-417750-5},
doi = {10.1016/B978-0-12-417750-5.50022-1},
url = {https://www.sciencedirect.com/science/article/pii/B9780124177505500221},
author = {John Hopcroft},
}


@INPROCEEDINGS{Erickson23,
author = {Jeff Erickson and Jason Xia and Eliot Wong Robson and Tue Do and Aidan Tzur Glickman and Zhuofan Jia and Eric Jin and Jiwon Lee and Patrick Lin and Steven Pan and Samuel Ruggerio and Tomoko Sakurayama and Andrew Yin and Yael Gertner and Brad Solomon},
title = {Auto-graded Scaffolding Exercises For Theoretical Computer Science},
@@ -81,13 +83,12 @@ @INPROCEEDINGS{Erickson23
month = {June},
address = {Baltimore, Maryland},
publisher = {ASEE Conferences},
note = {https://peer.asee.org/42347}
url = {https://peer.asee.org/42347}
}

@misc{Johnson_2010,
title={Nick’s Blog},
url={http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata},
journal={Damn Cool Algorithms: Levenshtein Automata - Nick’s Blog},
author={Johnson, Nick},
year={2010},
month={Jul}
@@ -114,3 +115,55 @@ @book{Hopcroft06
publisher = {Addison-Wesley Longman Publishing Co., Inc.},
address = {USA}
}

@book{Sipser12,
series = {Introduction to the {Theory} of {Computation}},
title = {Introduction to the {Theory} of {Computation}},
isbn = {978-1-133-18781-3},
publisher = {Cengage Learning},
author = {Sipser, M.},
year = {2012},
lccn = {2012938665},
pages = {45--47}
}

@article{Marschall11,
title = {Construction of minimal deterministic finite automata from biological motifs},
volume = {412},
issn = {0304-3975},
url = {https://www.sciencedirect.com/science/article/pii/S0304397510006948},
doi = {10.1016/j.tcs.2010.12.003},
abstract = {Deterministic finite automata (DFAs) are constructed for various purposes in computational biology. Little attention, however, has been given to the efficient construction of minimal DFAs. In this article, we define simple non-deterministic finite automata (NFAs) and prove that the standard subset construction transforms NFAs of this type into minimal DFAs. Furthermore, we show how simple NFAs can be constructed from two types of pattern popular in bioinformatics, namely (sets of) generalized strings and (generalized) strings with a Hamming neighborhood.},
number = {8},
journal = {Theoretical Computer Science},
author = {Marschall, Tobias},
year = {2011},
keywords = {Consensus string, Deterministic finite automaton, Generalized string, Minimization, Motif},
pages = {922--930},
}

@article{Knuutila01,
title = {Re-describing an algorithm by {Hopcroft}},
volume = {250},
issn = {0304-3975},
url = {https://www.sciencedirect.com/science/article/pii/S0304397599001504},
doi = {10.1016/S0304-3975(99)00150-4},
abstract = {J. Hopcroft introduced already in 1970 an O(nlogn)-time algorithm for minimizing a finite deterministic automaton of n states. Although the existence of the algorithm is widely known, its theoretical justification, correctness and running time analysis are not. We give here a tutorial reconstruction of Hopcroft's algorithm focusing on a firm theoretical basis, clear correctness proofs and a well-founded computational analysis. Our analysis reveals that if the size of the input alphabet m is not fixed, then Hopcroft's original algorithm does not run in time O(mnlogn) as is commonly believed in the literature. The O(mnlogn) holds, however, for the variation presented later by D. Gries and for a new variant given in this article. We also propose a new efficient routine for refining the equivalence classes constructed in the algorithm and suggest a computationally sound heuristics as an enhancement.},
number = {1},
journal = {Theoretical Computer Science},
author = {Knuutila, Timo},
year = {2001},
keywords = {Algorithms, Finite automata, Minimization},
pages = {333--363},
}

@ARTICLE{Xu16,
author={Xu, Chengcheng and Chen, Shuhui and Su, Jinshu and Yiu, S. M. and Hui, Lucas C. K.},
journal={IEEE Communications Surveys \& Tutorials},
title={A Survey on Regular Expression Matching for Deep Packet Inspection: Applications, Algorithms, and Hardware Platforms},
year={2016},
volume={18},
number={4},
pages={2991--3029},
doi={10.1109/COMST.2016.2566669}
}
103 changes: 49 additions & 54 deletions joss/paper.md
@@ -24,117 +24,112 @@ bibliography: paper.bib

Automata are abstract machines used to represent models of computation, and are a central object of study in theoretical computer science
[@Hopcroft06]. Given an input string of characters over a fixed alphabet, these machines either accept or reject the string. A language corresponding to an automaton is
the set of all strings it accepts. Three important families of automata in increasing order of generality are as follows:
the set of all strings it accepts. Three important families of automata in increasing order of generality are the following:

1. Finite-state automata
2. Pushdown automata
3. Turing machines

These models are a core component of both computer science education and research, seeing applications in a wide variety of areas. In particular, the ability to manipulate finite-state automata within the context of a software package has seen attention from researchers in the past [@Sutner03]. Similar software has also included
functionality for parsing regular expressions into their corresponding finite-state automata [@brics].
The `automata` package facilitates working with these families by allowing simulation of reading input and higher-level manipulation
of the corresponding languages using specialized algorithms.

# Statement of need

Although there are other packages in the Python software ecosystem that allow for working with
various kinds of automata, they are often niche and lack features such as a comprehensive test suite that
would allow for more rapid development. This leads to these packages being unable to adopt features that
would be useful to researchers and students alike, such as sophisticated construction and manipulation
algorithms. Moreover, Python is a popular tool for students and researchers, meaning the availability
of a high-quality software package is likely to encourage the further exploration of these structures
in the academic community.
These models are a core component of both computer science education and research, seeing further theoretical work
and applications in a wide variety of areas such as computational biology [@Marschall11] and networking [@Xu16].
In particular, the ability to manipulate finite-state automata within the context of a software package has seen attention from
researchers in the past [@Sutner03]. Similar software has also included
functionality for parsing regular expressions into their corresponding finite-state automata [@brics].

`automata` serves the demand for such a package in the Python software ecosystem. As a popular high-level language, Python enables
significant flexibility and ease of use that directly benefits many users. The package includes a comprehensive test suite,
support for modern language features (including type annotations), and a large number of different automata,
meeting the demands of users across a wide variety of use cases. In particular, the target audience
comprises both researchers who wish to manipulate automata and those in educational contexts seeking to
reinforce understanding of how these models of computation function.


# The `automata` package

`automata` is a Python package for the manipulation and simulation of automata from the families listed above.
The API is designed to mimic the formal mathematical description of each automaton using built-in Python data structures. As a popular high-level language, Python enables greater flexibility and ease of use that is difficult
to achieve with a low-level language (e.g., Rust). Algorithms in the package have been optimized for
performance against benchmarks from tasks arising in research. In addition, Python allows for
greater optimization by the integration of lower-level technologies (e.g., Cython), while still
retaining the same high-level API, allowing for integration of more performant features as-needed by
the user base. The package also has native display integration with Jupyter notebooks, enabling
easy visualization.
The API of the package is designed to mimic the formal mathematical description of each automaton using built-in Python data structures
(such as sets and dicts). This is for ease of use by those who are unfamiliar with these models of computation, while also providing performance
suitable for tasks arising in research. In particular, algorithms in the package have been written to perform
well on large inputs, incorporating optimizations such as only exploring the reachable set of states
in the construction of a new finite-state automaton. The package also has native display integration with Jupyter
notebooks, enabling easy visualization that allows students to interact with `automata` in an exploratory manner.
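
To make this design concrete, the short sketch below (not an excerpt from the paper) defines a DFA over the alphabet `{'0', '1'}` accepting binary strings that end in `'1'`, written directly with Python sets and dicts to mirror the formal five-tuple definition; the `DFA` constructor and `accepts_input` method are assumed to behave as in the package documentation.

```python
from automata.fa.dfa import DFA

# States, alphabet, transition function, start state, and accepting states
# are given directly as Python sets and dicts, mirroring the formal definition.
ends_in_one = DFA(
    states={'q0', 'q1'},
    input_symbols={'0', '1'},
    transitions={
        'q0': {'0': 'q0', '1': 'q1'},
        'q1': {'0': 'q0', '1': 'q1'},
    },
    initial_state='q0',
    final_states={'q1'},
)

print(ends_in_one.accepts_input('0101'))  # True
print(ends_in_one.accepts_input('1100'))  # False
```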

Of note are some sophisticated and useful algorithms implemented in the package for finite-state automata:
Of note are some commonly used and technical algorithms implemented in the package for finite-state automata:

- An optimized version of the Hopcroft-Karp algorithm to determine whether two deterministic finite automata (DFA) are equivalent [@AlmeidaMR10].

- The product construction algorithm for binary set operations (union, intersection, etc.) on the languages corresponding to two input DFAs [@Sipser12].

- Thompson's algorithm for converting regular expressions to equivalent nondeterministic finite automata (NFA) [@AhoSU86].

- Hopcroft's algorithm for DFA minimization [@Yingjie09].
- Hopcroft's algorithm for DFA minimization [@Hopcroft71; @Knuutila01].

- A specialized algorithm for directly constructing a state-minimal DFA accepting a given
finite language [@mihov_schulz_2019].
- A specialized algorithm for directly constructing a state-minimal DFA accepting a given finite language [@mihov_schulz_2019].

- A specialized algorithm for directly constructing a minimal DFA recognizing strings containing
a given substring [@Knuth77].

To the author's knowledge, this is the only Python package implementing a number of the algorithms stated above.
To the authors' knowledge, this is the only Python package implementing all of the automata manipulation algorithms stated above.
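
The sketch below briefly illustrates how some of these operations are exposed to users. It is a sketch, not an excerpt from the paper, and assumes that `NFA.from_regex`, `DFA.from_nfa`, `DFA.minify`, the package's regular-expression syntax, and language-based DFA equality behave as described in the package documentation.

```python
from automata.fa.dfa import DFA
from automata.fa.nfa import NFA

# Regular expression -> NFA (Thompson-style construction, per the list above),
# then NFA -> DFA via the subset construction.
nfa = NFA.from_regex('(ab|ba)*')
dfa = DFA.from_nfa(nfa)

# Hopcroft's algorithm: compute a state-minimal equivalent DFA.
minimal_dfa = dfa.minify()
print(len(dfa.states), len(minimal_dfa.states))

# Equivalence check (assumed to compare languages, per the documentation):
# minimization must preserve the accepted language.
print(dfa == minimal_dfa)  # expected: True
```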

`automata` was designed around existing theoretical models of automata, for use by both
mathematically-oriented researchers and in educational contexts. The
included functionality for parsing regular expressions and manipulating finite-state
machines enables fast and accessible exploration of these structures by researchers.
On the educational side, the package includes visualization logic that allows students to
interact with these structures in an exploratory manner, and has already seen usage in
undergraduate courses. `automata` has already been cited in publications [@Erickson23], with more
to come as the package becomes more popular.

`automata` has seen a large number of contributions by external contributors and wide adoption,
demonstrating the demand for a high-quality Python package providing these features. The code is
well-maintained, including a comprehensive test suite and type annotations, meaning new features
can be incorporated from requests by the community at a rapid pace.
`automata` has already been cited in publications [@Erickson23], and has seen use in multiple large undergraduate courses in introductory
theoretical computer science at the University of Illinois Urbana-Champaign (roughly 2000 students since Fall 2021). In these courses, the package is used
both as part of an autograder utility for finite-state automata created by students and as an exploratory tool used by students directly.

# Example usage

![A visualization of `target_words_dfa`. Transitions on characters leading to immediate rejections are omitted.\label{fig:target_words_dfa}](finite_language_dfa.png){ width=100% }

The following example is inspired by the use case described in [@Johnson_2010].
We wish to determine which strings in a given set are within the target edit distance
to a reference string. We will do this with utilities provided by `automata`,
first by initializing a DFA corresponding to a set of target words.
to a reference string. We will first initialize a DFA corresponding to a fixed set of target words
over the alphabet of all lowercase ASCII characters.

```python
from automata.fa.dfa import DFA
from automata.fa.nfa import NFA
import string

input_symbols = set(string.ascii_lowercase)

target_words = {'these', 'are', 'target', 'words', 'them', 'those'}

target_words_dfa = DFA.from_finite_language(
input_symbols,
target_words,
input_symbols=set(string.ascii_lowercase),
language={'these', 'are', 'target', 'words', 'them', 'those'},
)
```
A visualization of `target_words_dfa`, generated by the package in a Jupyter notebook,
is depicted in \autoref{fig:target_words_dfa}.
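
Outside of a notebook, the same diagram can also be saved to an image file. The one-line sketch below assumes that the `show_diagram` method accepts a `path` keyword as described in the package documentation; the exact signature may differ between versions.

```python
# Save the diagram of target_words_dfa to a file (the 'path' keyword is an
# assumption based on the package documentation).
target_words_dfa.show_diagram(path='finite_language_dfa.png')
```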

Next, we construct an NFA recognizing all strings within the given edit distance of a
reference string. This construction can again be done with functions provided by the library.
We need to perform an NFA to DFA conversion for later.
Next, we construct an NFA recognizing all strings within a target edit distance of a fixed
reference string, and then immediately convert this to an equivalent DFA. The package provides
built-in functions to make this construction easy, and we use the same alphabet as the DFA that was just created.

```python
reference_string = 'they'
edit_distance = 2

words_within_edit_distance_dfa = DFA.from_nfa(
NFA.edit_distance(
input_symbols,
reference_string,
edit_distance,
input_symbols=set(string.ascii_lowercase),
reference_str='they',
max_edit_distance=2,
)
)
```

Finally, we take the intersection of the two DFAs we have constructed and read all of
the words in the result into a list. The library makes this straightforward and idiomatic.
the words in the output DFA into a list. The library makes this straightforward and idiomatic.

```python
found_words_dfa = target_words_dfa & words_within_edit_distance_dfa
found_words = list(found_words_dfa)
```

The DFA `found_words_dfa` accepts strings in the intersection of the languages of the
DFAs given as input, and `found_words` is a list containing this language. Note that the power of this
technique comes from the fact that the DFA `words_within_edit_distance_dfa`
has an infinite language, meaning we could not perform this same computation directly with Python's
built-in sets (as they always represent a finite collection), although the syntax used by `automata`
is deliberately similar in order to promote intuition.

# Acknowledgements

Thanks (in no particular order) to GitHub users
