Address paper review comments (#169)
Made changes in response to JOSS review comments: openjournals/joss-reviews#5759 (comment)
eliotwrobson authored Sep 3, 2023
1 parent 07f3840 commit d3fb992
Showing 2 changed files with 124 additions and 76 deletions.
97 changes: 75 additions & 22 deletions joss/paper.bib
@@ -17,7 +17,7 @@ @Misc{brics
author = {Anders M\o{}ller},
title = {dk.brics.automaton -- Finite-State Automata
and Regular Expressions for {Java}},
note = {\texttt{http://www.brics.dk/automaton/}},
url = {http://www.brics.dk/automaton/},
year = 2021
}

@@ -26,7 +26,7 @@ @article{AlmeidaMR10
Nelma Moreira and
Rog{\'{e}}rio Reis},
title = {Testing the Equivalence of Regular Languages},
journal = {J. Autom. Lang. Comb.},
journal = {Journal of Automata, Languages and Combinatorics},
volume = {15},
number = {1/2},
pages = {7--25},
@@ -51,28 +51,30 @@ @inbook{mihov_schulz_2019
collection={Cambridge Tracts in Theoretical Computer Science}
}

@inbook{AhoSU86,
author = {Alfred V. Aho and
Ravi Sethi and
Jeffrey D. Ullman},
title = {Compilers: Principles, Techniques, and Tools},
series = {Addison-Wesley series in computer science / World student series edition},
publisher = {Addison-Wesley},
year = {1986},
url = {https://www.worldcat.org/oclc/12285707},
isbn = {0-201-10088-6},
timestamp = {Fri, 17 Jul 2020 16:12:45 +0200},
biburl = {https://dblp.org/rec/books/aw/AhoSU86.bib},
bibsource = {dblp computer science bibliography, https://dblp.org},
pages = {159–163}
@book{AhoSU86,
author = {Aho, Alfred V. and Lam, Monica S. and Sethi, Ravi and Ullman, Jeffrey D.},
title = {Compilers: Principles, Techniques, and Tools (2nd Edition)},
year = {2006},
isbn = {0321486811},
publisher = {Addison-Wesley Longman Publishing Co., Inc.},
address = {USA},
pages = {152--155}
}

@inproceedings{Yingjie09,
title={Describing an n log n algorithm for minimizing states in deterministic finite automaton},
author={Xu Yingjie},
year={2009}
@incollection{Hopcroft71,
title = {An $n \log n$ algorithm for minimizing states in a finite automaton},
editor = {Zvi Kohavi and Azaria Paz},
booktitle = {Theory of Machines and Computations},
publisher = {Academic Press},
pages = {189--196},
year = {1971},
isbn = {978-0-12-417750-5},
doi = {10.1016/B978-0-12-417750-5.50022-1},
url = {https://www.sciencedirect.com/science/article/pii/B9780124177505500221},
author = {John Hopcroft},
}


@INPROCEEDINGS{Erickson23,
author = {Jeff Erickson and Jason Xia and Eliot Wong Robson and Tue Do and Aidan Tzur Glickman and Zhuofan Jia and Eric Jin and Jiwon Lee and Patrick Lin and Steven Pan and Samuel Ruggerio and Tomoko Sakurayama and Andrew Yin and Yael Gertner and Brad Solomon},
title = {Auto-graded Scaffolding Exercises For Theoretical Computer Science},
@@ -81,13 +83,12 @@ @INPROCEEDINGS{Erickson23
month = {June},
address = {Baltimore, Maryland},
publisher = {ASEE Conferences},
note = {https://peer.asee.org/42347}
url = {https://peer.asee.org/42347}
}

@misc{Johnson_2010,
title={Nick’s Blog},
url={http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata},
journal={Damn Cool Algorithms: Levenshtein Automata - Nick’s Blog},
author={Johnson, Nick},
year={2010},
month={Jul}
@@ -114,3 +115,55 @@ @book{Hopcroft06
publisher = {Addison-Wesley Longman Publishing Co., Inc.},
address = {USA}
}

@book{Sipser12,
series = {Introduction to the {Theory} of {Computation}},
title = {Introduction to the {Theory} of {Computation}},
isbn = {978-1-133-18781-3},
publisher = {Cengage Learning},
author = {Sipser, M.},
year = {2012},
lccn = {2012938665},
pages = {45--47}
}

@article{Marschall11,
title = {Construction of minimal deterministic finite automata from biological motifs},
volume = {412},
issn = {0304-3975},
url = {https://www.sciencedirect.com/science/article/pii/S0304397510006948},
doi = {10.1016/j.tcs.2010.12.003},
abstract = {Deterministic finite automata (DFAs) are constructed for various purposes in computational biology. Little attention, however, has been given to the efficient construction of minimal DFAs. In this article, we define simple non-deterministic finite automata (NFAs) and prove that the standard subset construction transforms NFAs of this type into minimal DFAs. Furthermore, we show how simple NFAs can be constructed from two types of pattern popular in bioinformatics, namely (sets of) generalized strings and (generalized) strings with a Hamming neighborhood.},
number = {8},
journal = {Theoretical Computer Science},
author = {Marschall, Tobias},
year = {2011},
keywords = {Consensus string, Deterministic finite automaton, Generalized string, Minimization, Motif},
pages = {922--930},
}

@article{Knuutila01,
title = {Re-describing an algorithm by {Hopcroft}},
volume = {250},
issn = {0304-3975},
url = {https://www.sciencedirect.com/science/article/pii/S0304397599001504},
doi = {10.1016/S0304-3975(99)00150-4},
abstract = {J. Hopcroft introduced already in 1970 an O(nlogn)-time algorithm for minimizing a finite deterministic automaton of n states. Although the existence of the algorithm is widely known, its theoretical justification, correctness and running time analysis are not. We give here a tutorial reconstruction of Hopcroft's algorithm focusing on a firm theoretical basis, clear correctness proofs and a well-founded computational analysis. Our analysis reveals that if the size of the input alphabet m is not fixed, then Hopcroft's original algorithm does not run in time O(mnlogn) as is commonly believed in the literature. The O(mnlogn) holds, however, for the variation presented later by D. Gries and for a new variant given in this article. We also propose a new efficient routine for refining the equivalence classes constructed in the algorithm and suggest a computationally sound heuristics as an enhancement.},
number = {1},
journal = {Theoretical Computer Science},
author = {Knuutila, Timo},
year = {2001},
keywords = {Algorithms, Finite automata, Minimization},
pages = {333--363},
}

@ARTICLE{Xu16,
author={Xu, Chengcheng and Chen, Shuhui and Su, Jinshu and Yiu, S. M. and Hui, Lucas C. K.},
journal={IEEE Communications Surveys \& Tutorials},
title={A Survey on Regular Expression Matching for Deep Packet Inspection: Applications, Algorithms, and Hardware Platforms},
year={2016},
volume={18},
number={4},
pages={2991--3029},
doi={10.1109/COMST.2016.2566669}
}
103 changes: 49 additions & 54 deletions joss/paper.md
@@ -24,117 +24,112 @@ bibliography: paper.bib

Automata are abstract machines used to represent models of computation, and are a central object of study in theoretical computer science
[@Hopcroft06]. Given an input string of characters over a fixed alphabet, these machines either accept or reject the string. A language corresponding to an automaton is
the set of all strings it accepts. Three important families of automata in increasing order of generality are as follows:
the set of all strings it accepts. Three important families of automata in increasing order of generality are the following:

1. Finite-state automata
2. Pushdown automata
3. Turing machines

These models are a core component of both computer science education and research, seeing applications in a wide variety of areas. In particular, the ability to manipulate finite-state automata within the context of a software package has seen attention from researchers in the past [@Sutner03]. Similar software has also included
functionality for parsing regular expressions into their corresponding finite-state automata [@brics].
The `automata` package facilitates working with these families by allowing simulation of reading input and higher-level manipulation
of the corresponding languages using specialized algorithms.

# Statement of need

Although there are other packages in the Python software ecosystem that allow for working with
various kinds of automata, they are often niche and lack features such as a comprehensive test suite that
would allow for more rapid development. This leads to these packages being unable to adopt features that
would be useful to researchers and students alike, such as sophisticated construction and manipulation
algorithms. Moreover, Python is a popular tool for students and researchers, meaning the availability
of a high-quality software package is likely to encourage the further exploration of these structures
in the academic community.
These models are a core component of both computer science education and research, seeing further theoretical work
and applications in a wide variety of areas such as computational biology [@Marschall11] and networking [@Xu16].
In particular, the ability to manipulate finite-state automata within the context of a software package has seen attention from
researchers in the past [@Sutner03]. Similar software has also included
functionality for parsing regular expressions into their corresponding finite-state automata [@brics].

`automata` serves the demand for such a package in the Python software ecosystem. As a popular high-level language, Python enables
significant flexibility and ease of use that directly benefits many users. The package includes a comprehensive test suite,
support for modern language features (including type annotations), and a large number of different automata,
meeting the demands of users across a wide variety of use cases. In particular, the target audience
comprises both researchers who wish to manipulate automata and those in educational contexts seeking to
reinforce understanding of how these models of computation function.


# The `automata` package

`automata` is a Python package for the manipulation and simulation of automata from the families listed above.
The API is designed to mimic the formal mathematical description of each automaton using built-in Python data structures. As a popular high-level language, Python enables greater flexibility and ease of use that is difficult
to achieve with a low-level language (e.g., Rust). Algorithms in the package have been optimized for
performance against benchmarks from tasks arising in research. In addition, Python allows for
greater optimization by the integration of lower-level technologies (e.g., Cython), while still
retaining the same high-level API, allowing for integration of more performant features as-needed by
the user base. The package also has native display integration with Jupyter notebooks, enabling
easy visualization.
The API of the package is designed to mimic the formal mathematical description of each automaton using built-in Python data structures
(such as sets and dicts). This is for ease of use by those who are unfamiliar with these models of computation, while also providing performance
suitable for tasks arising in research. In particular, algorithms in the package have been written to perform
well on large inputs, incorporating optimizations such as only exploring the reachable set of states
in the construction of a new finite-state automaton. The package also has native display integration with Jupyter
notebooks, enabling easy visualization that allows students to interact with `automata` in an exploratory manner.
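
To make this design concrete, the short sketch below (not an excerpt from the paper) defines a DFA over the alphabet `{'0', '1'}` accepting binary strings that end in `'1'`, written directly with Python sets and dicts to mirror the formal five-tuple definition; the `DFA` constructor and `accepts_input` method are assumed to behave as in the package documentation.

```python
from automata.fa.dfa import DFA

# States, alphabet, transition function, start state, and accepting states
# are given directly as Python sets and dicts, mirroring the formal definition.
ends_in_one = DFA(
    states={'q0', 'q1'},
    input_symbols={'0', '1'},
    transitions={
        'q0': {'0': 'q0', '1': 'q1'},
        'q1': {'0': 'q0', '1': 'q1'},
    },
    initial_state='q0',
    final_states={'q1'},
)

print(ends_in_one.accepts_input('0101'))  # True
print(ends_in_one.accepts_input('1100'))  # False
```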

Of note are some sophisticated and useful algorithms implemented in the package for finite-state automata:
Of note are some commonly used and technical algorithms implemented in the package for finite-state automata:

- An optimized version of the Hopcroft-Karp algorithm to determine whether two deterministic finite automata (DFA) are equivalent [@AlmeidaMR10].

- The product construction algorithm for binary set operations (union, intersection, etc.) on the languages corresponding to two input DFAs [@Sipser12].

- Thompson's algorithm for converting regular expressions to equivalent nondeterministic finite automata (NFA) [@AhoSU86].

- Hopcroft's algorithm for DFA minimization [@Yingjie09].
- Hopcroft's algorithm for DFA minimization [@Hopcroft71; @Knuutila01].

- A specialized algorithm for directly constructing a state-minimal DFA accepting a given
finite language [@mihov_schulz_2019].
- A specialized algorithm for directly constructing a state-minimal DFA accepting a given finite language [@mihov_schulz_2019].

- A specialized algorithm for directly constructing a minimal DFA recognizing strings containing
a given substring [@Knuth77].

To the author's knowledge, this is the only Python package implementing a number of the algorithms stated above.
To the authors' knowledge, this is the only Python package implementing all of the automata manipulation algorithms stated above.
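
The sketch below briefly illustrates how some of these operations are exposed to users. It is a sketch, not an excerpt from the paper, and assumes that `NFA.from_regex`, `DFA.from_nfa`, `DFA.minify`, the package's regular-expression syntax, and language-based DFA equality behave as described in the package documentation.

```python
from automata.fa.dfa import DFA
from automata.fa.nfa import NFA

# Regular expression -> NFA (Thompson-style construction, per the list above),
# then NFA -> DFA via the subset construction.
nfa = NFA.from_regex('(ab|ba)*')
dfa = DFA.from_nfa(nfa)

# Hopcroft's algorithm: compute a state-minimal equivalent DFA.
minimal_dfa = dfa.minify()
print(len(dfa.states), len(minimal_dfa.states))

# Equivalence check (assumed to compare languages, per the documentation):
# minimization must preserve the accepted language.
print(dfa == minimal_dfa)  # expected: True
```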

`automata` was designed around existing theoretical models of automata, for use by both
mathematically-oriented researchers and in educational contexts. The
included functionality for parsing regular expressions and manipulating finite-state
machines enables fast and accessible exploration of these structures by researchers.
On the educational side, the package includes visualization logic that allows students to
interact with these structures in an exploratory manner, and has already seen usage in
undergraduate courses. `automata` has already been cited in publications [@Erickson23], with more
to come as the package becomes more popular.

`automata` has seen a large number of contributions by external contributors and wide adoption,
demonstrating the demand for a high-quality Python package providing these features. The code is
well-maintained, including a comprehensive test suite and type annotations, meaning new features
can be incorporated from requests by the community at a rapid pace.
`automata` has already been cited in publications [@Erickson23], and has seen use in multiple large undergraduate courses in introductory
theoretical computer science at the University of Illinois Urbana-Champaign (roughly 2000 students since Fall 2021). In these courses, the package is used
both as part of an autograder utility for finite-state automata created by students and as an exploratory tool used by students directly.

# Example usage

![A visualization of `target_words_dfa`. Transitions on characters leading to immediate rejections are omitted.\label{fig:target_words_dfa}](finite_language_dfa.png){ width=100% }

The following example is inspired by the use case described in [@Johnson_2010].
We wish to determine which strings in a given set are within the target edit distance
to a reference string. We will do this with utilities provided by `automata`,
first by initializing a DFA corresponding to a set of target words.
to a reference string. We will first initialize a DFA corresponding to a fixed set of target words
over the alphabet of all lowercase ASCII characters.

```python
from automata.fa.dfa import DFA
from automata.fa.nfa import NFA
import string

input_symbols = set(string.ascii_lowercase)

target_words = {'these', 'are', 'target', 'words', 'them', 'those'}

target_words_dfa = DFA.from_finite_language(
input_symbols,
target_words,
input_symbols=set(string.ascii_lowercase),
language={'these', 'are', 'target', 'words', 'them', 'those'},
)
```
A visualization of `target_words_dfa`, generated by the package in a Jupyter notebook,
is depicted in \autoref{fig:target_words_dfa}.
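
Outside of a notebook, the same diagram can also be saved to an image file. The one-line sketch below assumes that the `show_diagram` method accepts a `path` keyword as described in the package documentation; the exact signature may differ between versions.

```python
# Save the diagram of target_words_dfa to a file (the 'path' keyword is an
# assumption based on the package documentation).
target_words_dfa.show_diagram(path='finite_language_dfa.png')
```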

Next, we construct an NFA recognizing all strings within the given edit distance of a
reference string. This construction can again be done with functions provided by the library.
We need to perform an NFA to DFA conversion for later.
Next, we construct an NFA recognizing all strings within a target edit distance of a fixed
reference string, and then immediately convert this to an equivalent DFA. The package provides
built-in functions to make this construction easy, and we use the same alphabet as the DFA that was just created.

```python
reference_string = 'they'
edit_distance = 2

words_within_edit_distance_dfa = DFA.from_nfa(
NFA.edit_distance(
input_symbols,
reference_string,
edit_distance,
input_symbols=set(string.ascii_lowercase),
reference_str='they',
max_edit_distance=2,
)
)
```

Finally, we take the intersection of the two DFAs we have constructed and read all of
the words in the result into a list. The library makes this straightforward and idiomatic.
the words in the output DFA into a list. The library makes this straightforward and idiomatic.

```python
found_words_dfa = target_words_dfa & words_within_edit_distance_dfa
found_words = list(found_words_dfa)
```

The DFA `found_words_dfa` accepts strings in the intersection of the languages of the
DFAs given as input, and `found_words` is a list containing this language. Note that the power of this
technique comes from the fact that the DFA `words_within_edit_distance_dfa`
has an infinite language, meaning we could not perform this same computation directly with Python's
built-in sets (as they always represent a finite collection), although the syntax used by `automata`
is deliberately similar in order to promote intuition.

# Acknowledgements

Thanks (in no particular order) to GitHub users
