Merge pull request #24 from carocad/antlr

Migrated parcera to Antlr4 for utmost performance
carocad · Nov 19, 2019 · 3d33c54 · 3d33c54
2 parents d6b28b1 + c798963
commit 3d33c54
Show file tree

Hide file tree

Showing 16 changed files with 843 additions and 535 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,12 @@ pom.xml.asc
 /.idea/
 /nashorn_code_cache
 /.cljs_nashorn_repl
+/build/
+/yarn-error.log
+/node_modules/
+/out/
+/src/java/
+/src/javascript
+/figwheel_server.log
+package*.json
+/.eastwood
diff --git a/.travis.yml b/.travis.yml
@@ -14,16 +14,17 @@ jobs:
   include:
     - stage: Tests
       script:
+        - curl -O https://www.antlr.org/download/antlr-4.7.1-complete.jar
+        # generate java
+        - java -jar antlr-4.7.1-complete.jar -Xexact-output-dir -o src/java/parcera/antlr -package parcera.antlr -Dlanguage=Java -no-listener -no-visitor src/Clojure.g4
+        # now we can actually proceed with clojure code
         - lein do clean, compile, check, eastwood
         - lein trampoline test
-        - nvm install 10.10 && nvm use 10.10 && lein trampoline cljsbuild test
-
-    # only run the benchmark if we are trying to merge to master
-    # otherwise the build takes too long
-    - stage: Benchmark
-      if: branch = master
-      script:
         - lein trampoline test :benchmark
+        # todo - re-enable js
+        # generate javascript - todo
+        #- java -jar antlr-4.7.1-complete.jar -Xexact-output-dir -o src/javascript/parcera/antlr -package parcera.antlr -Dlanguage=JavaScript -no-listener -no-visitor src/Clojure.g4
+        #- nvm install 10.10 && nvm use 10.10 && lein trampoline cljsbuild test
 
     - stage: Release
       if: tag IS present

diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Build Status](https://travis-ci.com/carocad/parcera.svg?branch=master)](https://travis-ci.com/carocad/parcera)
 [![Clojars Project](https://img.shields.io/clojars/v/carocad/parcera.svg)](https://clojars.org/carocad/parcera)
 
-Grammar-based Clojure(script) parser.
+Grammar-based Clojure parser.
 
 Parcera can safely read any Clojure file without any code evaluation.
 
@@ -19,42 +19,33 @@ full explanation of the options available for a parser please visit Instaparse w
             [instaparse.core :as instaparse]))
 
 ;;parse clojure code from a string
-(parcera/clojure (str '(ns parcera.core
-                         (:require [instaparse.core :as instaparse]
-                                   [clojure.data :as data]
-                                   [clojure.string :as str]))))
+(parcera/ast (str '(ns parcera.core
+                     (:require [instaparse.core :as instaparse]
+                               [clojure.data :as data]
+                               [clojure.string :as str]))))
 
 ;; => returns a data structure with the result from the parser
-[:code
- [:list
-  [:symbol "ns"]
-  [:whitespace " "]
-  [:symbol "parcera.core"]
-  [:whitespace " "]
-  [:list
-   [:simple-keyword ":require"]
-   [:whitespace " "]
-   [:vector
-    [:symbol "instaparse.core"]
-    [:whitespace " "]
-    [:simple-keyword ":as"]
-    [:whitespace " "]
-    [:symbol "instaparse"]]
-   [:whitespace " "]
-   [:vector [:symbol "clojure.data"] [:whitespace " "] [:simple-keyword ":as"] [:whitespace " "] [:symbol "data"]]
-   [:whitespace " "]
-   [:vector [:symbol "clojure.string"] [:whitespace " "] [:simple-keyword ":as"] [:whitespace " "] [:symbol "str"]]]]]
+(:code
+ (:list
+  (:symbol "ns")
+  (:whitespace " ")
+  (:symbol "parcera.core")
+  (:whitespace " ")
+  (:list
+   (:simple_keyword "require")
+   (:whitespace " ")
+   (:vector
+    (:symbol "instaparse.core")
+    (:whitespace " ")
+    (:simple_keyword "as")
+    (:whitespace " ")
+    (:symbol "instaparse"))
+   (:whitespace " ")
+   (:vector (:symbol "clojure.data") (:whitespace " ") (:simple_keyword "as") (:whitespace " ") (:symbol "data"))
+   (:whitespace " ")
+   (:vector (:symbol "clojure.string") (:whitespace " ") (:simple_keyword "as") (:whitespace " ") (:symbol "str")))))
 
 ;; convert an AST back into a string
 (parcera/code [:symbol "ns"])
 ;; "ns"
 ```
-
-### notes
-There are some restrictions as to how much can a parser do. In my experience, these restrictions
-are related to some [semantic context-sensitivity](http://blog.reverberate.org/2013/09/ll-and-lr-in-context-why-parsing-tools.html).
-which the Clojure reader has embedded into itself. In general I have found the following ones:
-- `parcera` doesnt check that a map contains an even number of elements. This is specially difficult
-  to do since Clojure supports the discard macro `#_ form` which is a valid element but "doesnt count as one"
-- `parcera` doesnt check if a map has repeated keys
-- `parcera` doesnt check if a set has repeated elements
diff --git a/pom.xml b/pom.xml
@@ -3,9 +3,9 @@
   <groupId>carocad</groupId>
   <artifactId>parcera</artifactId>
   <packaging>jar</packaging>
-  <version>0.3.1</version>
+  <version>0.4.0</version>
   <name>parcera</name>
-  <description>Grammar-based Clojure(script) parser</description>
+  <description>Grammar-based Clojure parser</description>
   <url>https://github.com/carocad/parcera</url>
   <licenses>
     <license>
@@ -17,24 +17,48 @@
     <url>https://github.com/carocad/parcera</url>
     <connection>scm:git:git://github.com/carocad/parcera.git</connection>
     <developerConnection>scm:git:ssh://git@github.com/carocad/parcera.git</developerConnection>
-    <tag>b4ca5c659e55f00781e37bee1dc6bb400460e307</tag>
+    <tag>4ff04f242eddc4791cfdf2df572f91890c202e6c</tag>
   </scm>
   <build>
-    <sourceDirectory>src</sourceDirectory>
+    <sourceDirectory>src/clojure</sourceDirectory>
     <testSourceDirectory>test</testSourceDirectory>
     <resources>
       <resource>
         <directory>resources</directory>
       </resource>
     </resources>
     <testResources>
+      <testResource>
+        <directory>target</directory>
+      </testResource>
       <testResource>
         <directory>resources</directory>
       </testResource>
     </testResources>
     <directory>target</directory>
     <outputDirectory>target/classes</outputDirectory>
-    <plugins/>
+    <plugins>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>build-helper-maven-plugin</artifactId>
+        <version>1.7</version>
+        <executions>
+          <execution>
+            <id>add-source</id>
+            <phase>generate-sources</phase>
+            <goals>
+              <goal>add-source</goal>
+            </goals>
+            <configuration>
+              <sources>
+                <source>src/javascript</source>
+                <source>src/java</source>
+              </sources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
   </build>
   <repositories>
     <repository>
@@ -68,14 +92,9 @@
       <version>1.10.1</version>
     </dependency>
     <dependency>
-      <groupId>instaparse</groupId>
-      <artifactId>instaparse</artifactId>
-      <version>1.4.10</version>
-    </dependency>
-    <dependency>
-      <groupId>org.clojure</groupId>
-      <artifactId>clojurescript</artifactId>
-      <version>1.10.520</version>
+      <groupId>org.antlr</groupId>
+      <artifactId>antlr4-runtime</artifactId>
+      <version>4.7.1</version>
       <scope>provided</scope>
     </dependency>
     <dependency>

diff --git a/project.clj b/project.clj
@@ -1,26 +1,23 @@
-(defproject carocad/parcera "0.3.1"
-  :description "Grammar-based Clojure(script) parser"
+(defproject carocad/parcera "0.4.0"
+  :description "Grammar-based Clojure parser"
   :url "https://github.com/carocad/parcera"
   :license {:name "LGPLv3"
             :url  "https://github.com/carocad/parcera/blob/master/LICENSE.md"}
-  :dependencies [[org.clojure/clojure "1.10.1"]
-                 [instaparse/instaparse "1.4.10"]]
-  :profiles {:dev {:dependencies [[criterium/criterium "0.4.5"] ;; benchmark
-                                  [org.clojure/test.check "0.10.0"]]
-                   :plugins      [[jonase/eastwood "0.3.5"]
-                                  [lein-cljsbuild "1.1.7"]]
-                   :cljsbuild {:builds
-                               [{:id "dev"
-                                 :source-paths ["src" "test"]
-                                 :compiler {:main parcera.test-runner
-                                            :output-to "target/out/tests.js"
-                                            :target :nodejs
-                                            :optimizations :none}}]
-                               :test-commands
-                               {"test" ["node" "target/out/tests.js"]}}}
-             :provided {:dependencies [[org.clojure/clojurescript "1.10.520"]]}}
+  :source-paths ["src/clojure"]
+  :java-source-paths ["src/java"]
+  :dependencies [[org.clojure/clojure "1.10.1"]]
+  :profiles {:dev      {:dependencies   [[criterium/criterium "0.4.5"] ;; benchmark
+                                         [org.clojure/test.check "0.10.0"]] ;; generative testing
+                        :plugins        [[jonase/eastwood "0.3.5"]] ;; linter
+                        :resource-paths ["target"]
+                        :clean-targets  ^{:protect false} ["target"]}
+             ;; java reloader
+             ;[lein-virgil "0.1.9"]]
+             :provided {:dependencies [[org.antlr/antlr4-runtime "4.7.1"]]}}
+
   :test-selectors {:default     (fn [m] (not (some #{:benchmark} (keys m))))
                    :benchmark   :benchmark}
+
   :deploy-repositories [["clojars" {:url "https://clojars.org/repo"
                                     :username :env/clojars_username
                                     :password :env/clojars_password

diff --git a/scripts/figwheel.clj b/scripts/figwheel.clj
@@ -0,0 +1,2 @@
+(require '[figwheel.main.api :as fig])
+(fig/start "dev")
diff --git a/src/Clojure.g4 b/src/Clojure.g4
@@ -0,0 +1,152 @@
+
+grammar Clojure;
+
+/*
+ * NOTES to myself and to other developers:
+ *
+ * - You have to remember that the parser cannot check for semantics
+ * - You have to find the right balance of dividing enforcement between the
+ *   grammar and your own code.
+ *
+ * The parser should only check the syntax. So the rule of thumb is that when
+ * in doubt you let the parser pass the content up to your program. Then, in
+ * your program, you check the semantics and make sure that the rules actually
+ * have a proper meaning
+ *
+ * https://tomassetti.me/antlr-mega-tutorial/#lexers-and-parser
+*/
+
+code: form*;
+
+form: whitespace | literal | collection | reader_macro;
+
+// sets and namespaced maps are not considered collections from the grammar's
+// perspective since they start with # -> dispatch macro
+collection: list | vector | map;
+
+list: '(' form* ')';
+
+vector: '[' form* ']';
+
+map: '{' form* '}';
+
+literal: keyword | string | number | character | symbol;
+
+keyword: simple_keyword | macro_keyword;
+
+// basing symbols, simple keywords and macro keywords on NAME allows us to
+// conform them all in the same way (see the `conform` function)
+simple_keyword: ':' NAME;
+
+macro_keyword: '::' NAME;
+
+string: STRING;
+
+number: NUMBER;
+
+character: CHARACTER;
+
+symbol: NAME;
+
+reader_macro: ( unquote
+              | metadata
+              | backtick
+              | quote
+              | dispatch
+              | unquote_splicing
+              | deref
+              );
+
+unquote: '~' form;
+
+metadata: (metadata_entry whitespace?)+ ( symbol
+                                        | collection
+                                        | tag
+                                        | unquote
+                                        | unquote_splicing
+                                        );
+
+metadata_entry: '^' ( map | symbol | string | keyword );
+
+backtick: '`' form;
+
+quote: '\'' form;
+
+unquote_splicing: '~@' form;
+
+deref: '@' form;
+
+dispatch: function
+          | regex
+          | set
+          | conditional
+          | conditional_splicing
+          | namespaced_map
+          | var_quote
+          | discard
+          | tag
+          | symbolic;
+
+function: '#(' form* ')';
+
+regex: '#' STRING;
+
+set: '#{' form* '}';
+
+namespaced_map: '#' ( keyword |  auto_resolve) map;
+
+auto_resolve: '::';
+
+var_quote: '#\'' symbol;
+
+discard: '#_' form;
+
+tag: '#' symbol whitespace? (literal | collection);
+
+conditional: '#?(' form* ')';
+
+conditional_splicing: '#?@(' form* ')';
+
+symbolic: '##' ('Inf' | '-Inf' | 'NaN');
+
+// whitespace or comment
+whitespace: WHITESPACE;
+
+NUMBER: [+-]? DIGIT+ (DOUBLE_SUFFIX | LONG_SUFFIX | RATIO_SUFFIX);
+
+STRING: '"' ~["\\]* ('\\' . ~["\\]*)* '"';
+
+WHITESPACE: (SPACE | COMMENT)+;
+
+COMMENT: ';' ~[\r\n]*;
+
+SPACE: [\r\n\t\f, ]+;
+
+CHARACTER: '\\' (UNICODE_CHAR | NAMED_CHAR | UNICODE);
+
+NAME: NAME_HEAD NAME_BODY*;
+
+fragment UNICODE_CHAR: ~[\u0300-\u036F\u1DC0-\u1DFF\u20D0-\u20FF];
+
+fragment NAMED_CHAR: 'newline' | 'return' | 'space' | 'tab' | 'formfeed' | 'backspace';
+
+fragment UNICODE: 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]; // exactly four hex digits, e.g. \u00e9
+
+// re-allow :#' as valid characters inside the name itself
+fragment NAME_BODY: NAME_HEAD | [:#'0-9];
+
+// this is the set of characters that are allowed by all symbols and keywords;
+// however, this is stricter than necessary so that we can re-use it for both
+fragment NAME_HEAD: ~[\r\n\t\f ()[\]{}"@~^;`\\,:#'0-9];
+
+fragment DOUBLE_SUFFIX: ((('.' DIGIT*)? ([eE][-+]?DIGIT+)?) 'M'?);
+
+// DIGIT+ in NUMBER has already consumed the leading '0' (hex) or the base
+// (radix), so only the remainder is matched here; octal and plain decimal
+// literals are fully covered by DIGIT+ itself
+fragment LONG_SUFFIX: ( [xX] [0-9A-Fa-f]+
+                      | [rR] [0-9a-zA-Z]+ )? 'N'?;
+
+fragment RATIO_SUFFIX: '/' DIGIT+;
+
+fragment DIGIT: [0-9];