Permalink
Browse files

Introducing state machine with a stack to replace plain recursive des…

…cent. ALMOST working.

git-svn-id: https://tinyap.googlecode.com/svn/trunk@112 26adf5cf-fd31-0410-b5e7-cb6bf36da140
  • Loading branch information...
damien.leroux
damien.leroux committed Jul 19, 2010
1 parent cce49c0 commit f542d977c2bc6320175ec9e1ba3e9a843b541c27
Showing with 2,550 additions and 481 deletions.
  1. +4 −0 CMakeLists.txt
  2. +2 −1 config.h.cmake.in
  3. +6 −2 src/CMakeLists.txt
  4. +43 −0 src/ast.c
  5. +8 −4 src/ast.h
  6. +35 −46 src/bootstrap.c
  7. +2 −2 src/list.h
  8. +28 −11 src/main.c
  9. +4 −2 src/node_cache.c
  10. +1 −0 src/node_cache.h
  11. +248 −0 src/pda.c
  12. +50 −0 src/pda.h
  13. +771 −0 src/pda_impl.c
  14. +185 −0 src/pda_impl.h
  15. +1 −1 src/pilot_manager.c
  16. +28 −9 src/serialize.c
  17. +3 −0 src/serialize.h
  18. +11 −4 src/stack.h
  19. +25 −15 src/string_registry.c
  20. +3 −0 src/string_registry.h
  21. +65 −31 src/tinyap.c
  22. +6 −0 src/tinyap.h
  23. +13 −161 src/tinyap_alloc.c
  24. +28 −14 src/tinyap_alloc.h
  25. +276 −0 src/token_utils.c
  26. +48 −0 src/token_utils.h
  27. +447 −127 src/tokenizer.c
  28. +14 −5 src/tokenizer.h
  29. +94 −11 src/trie.c
  30. +37 −0 src/trie.h
  31. +63 −34 src/unparser.c
  32. +1 −1 src/walkableast.c
View
@@ -3,6 +3,9 @@ PROJECT(tinyap C)
SET(TINYAP_VERSION_MAJOR 2)
SET(TINYAP_VERSION_MINOR 0)
+SET(TINYAP_VERSION_PATCH beta)
+
+SET(TINYAP_VERSION "${TINYAP_VERSION_MAJOR}.${TINYAP_VERSION_MINOR}-${TINYAP_VERSION_PATCH}")
INCLUDE(CheckFunctionExists)
INCLUDE(CheckLibraryExists)
@@ -23,6 +26,7 @@ INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/src)
OPTION(TINYAP_SAFEMODE "Safe mode : check type before accessing any node in the AST" ON)
OPTION(NODE_CACHE_STATS "Compute and output statistics about node cache usage." OFF)
+OPTION(USE_FOREST "Compute and output all possible parse trees, not just first one. Slower algorithm but useful for ambiguous grammars." ON)
ADD_SUBDIRECTORY(src)
#INCLUDE(CMakeCPack.cmake)
View
@@ -43,7 +43,8 @@
/* Define to 1 if you have the ANSI C header files. */
#cmakedefine STDC_HEADERS
-#define PACKAGE_VERSION "${CMAKE_PACKAGE_VERSION}"
+#define PACKAGE_VERSION "${TINYAP_VERSION}"
#cmakedefine TINYAP_SAFEMODE
#cmakedefine NODE_CACHE_STATS
+#cmakedefine USE_FOREST
View
@@ -10,10 +10,14 @@ set(libtinyap_src
serialize.c
stack.c
tinyap.c
+ trie.c
tinyap_alloc.c
- tokenizer.c
+ #tokenizer.c
+ pda.c
+ pda_impl.c
+ token_utils.c
unparser.c
- unrepl.c
+ #unrepl.c
walkableast.c
walker.c
ape_prettyprint.c
View
@@ -33,6 +33,10 @@ volatile ast_node_t node_pool = NULL;
static tinyap_stack_t node_stack;
+
+ast_node_t PRODUCTION_OK_BUT_EMPTY = (union _ast_node_t[]){{ {0, 0, 0, 0, 0} }};
+
+
void node_pool_init() {
node_stack = new_stack();
}
@@ -190,3 +194,42 @@ void delete_node(ast_node_t n) {
+ast_node_t copy_node(ast_node_t);
+
+
+ast_node_t forest_append(ast_node_t prefix, ast_node_t suffix) {
+ ast_node_t ret=NULL, sufbak=suffix, p, s;
+ if(prefix->node_flags&IS_FOREST) {
+ if(suffix->node_flags&IS_FOREST) {
+ while(prefix) {
+ suffix=sufbak;
+ while(suffix) {
+ p = copy_node(Car(prefix));
+ s = copy_node(Car(suffix));
+ ret = newPair(Append(p, s), ret, suffix->pos.col, suffix->pos.col);
+ ret->node_flags|=IS_FOREST;
+ suffix = Cdr(suffix);
+ }
+ prefix = Cdr(prefix);
+ }
+ } else {
+ ret=prefix;
+ while(prefix) {
+ Car(prefix) = Append(Car(prefix), copy_node(suffix));
+ }
+ }
+ } else {
+ if(suffix->node_flags&IS_FOREST) {
+ ret = suffix;
+ while(suffix) {
+ Car(suffix) = Append(copy_node(prefix), Car(suffix));
+ suffix = Cdr(suffix);
+ }
+ } else {
+ ret = Append(copy_node(prefix), copy_node(suffix));
+ }
+ }
+ return ret;
+}
+
+
View
@@ -80,10 +80,11 @@ union _ast_node_t {
#define RULE_IS_LEFTREC 1
#define RULE_IS_LEFTREC_COMPUTED 2
#define ATOM_IS_NOT_STRING 4
+#define IS_FOREST 0x8000
-#define _atom(__s,_r,_c) (union _ast_node_t[]){{{ast_Atom,__s,NULL,_r,_c}}}
-#define _pair(__a,__d,_r,_c) (union _ast_node_t[]){{{ast_Pair,__a,__d,_r,_c}}}
+#define _atom(__s,_r,_c, _f) (union _ast_node_t[]){{{ast_Atom,__s,NULL,_r,_c, _f}}}
+#define _pair(__a,__d,_r,_c, _f) (union _ast_node_t[]){{{ast_Pair,__a,__d,_r,_c, _f}}}
/*! \par __n an AST node
@@ -181,16 +182,19 @@ ast_node_t newAtom(const char*data,int row,int col);
ast_node_t newPair(const ast_node_t a,const ast_node_t d,const int row,const int col);
//void delete_node(node_cache_t cache, ast_node_t n);
+ast_node_t forest_append(ast_node_t prefix_forest, ast_node_t suffix_forest);
+
void print_pair(ast_node_t n);
+extern ast_node_t PRODUCTION_OK_BUT_EMPTY;
static inline ast_node_t Append(const ast_node_t a,const ast_node_t b) {
ast_node_t ptr;
- if(!b) {
+ if(b==PRODUCTION_OK_BUT_EMPTY||!b) {
return (ast_node_t )a;
}
- if(!a) {
+ if(a==PRODUCTION_OK_BUT_EMPTY||!a) {
return (ast_node_t )b;
}
assert(isPair(b));
View
@@ -19,7 +19,6 @@
#include "config.h"
#include "ast.h"
-#include "tokenizer.h"
#include "bootstrap.h"
@@ -35,7 +34,6 @@ ast_node_t ast_unserialize(const char*input);
*/
const char*short_rules = "((Grammar\n"
-"(OperatorRule Toto (T pouet))\n"
"(Comment \\ TinyaP\\ :\\ this\\ is\\ not\\ yet\\ another\\ parser.)\n"
"(Comment \\ Copyright\\ \\(C\\)\\ 2007\\ Damien\\ Leroux)\n"
"(Comment \\ Grammar\\ for\\ 'short'\\ dialect.)\n"
@@ -53,72 +51,63 @@ const char*short_rules = "((Grammar\n"
"(Comment )\n"
"(Comment \\ Production\\ Atoms)\n"
"(Comment )\n"
-"(TransientRule elem (RE [_a-zA-Z][0-9a-zA-Z_]*))\n"
-"(OperatorRule STR (RawSeq (T ~) (RE [^~,]?) (T ,) (RE [^~,]?) (T ~)))\n"
-/*"(OperatorRule T (RawSeq (T \") (RE \\([^\"]|\\(?<=\\\\\\\\\\)\"\\)*) (T \")))\n"*/
-"(OperatorRule T (STR \" \"))\n"
-"(OperatorRule NT (NT elem))\n"
-/* ([^\\/]|\\.)+ */
-/*"(TransientRule re_re (RE \\([^\\\\/]|[\\\\][\\]\\[\\\\/\\ <>trnb\"]\\)*))\n"*/
-/*
- ~","~
- ~/,/~
- ~begin,end~
- ~:,~
- ~,~
-*/
-/*"(TransientRule re_re (RE \\([^/]|\\(?<=\\\\\\\\\\)/\\)+))\n"*/
-/*"(OperatorRule RE (RawSeq (T /) (NT re_re) (T /)))\n"*/
-"(OperatorRule RE (STR / /))\n"
-/*"(OperatorRule RPL (RawSeq (T //) (NT re_re) (T /) (NT re_re) (T /)))\n"*/
+"(TransientRule elem (RE [_a-zA-Z][0-9a-zA-Z_]*))\n"
+"(OperatorRule STR (RawSeq (T ~) (RE [^~,]?) (T ,) (RE [^~,]?) (T ~)))\n"
+"(OperatorRule BOW (RawSeq (T ~) (RE [_a-zA-Z][_a-zA-Z0-9]*) (NT optBK) (T ~)))\n"
+"(OperatorRule AddToBag (RawSeq (NT RE) (T :) (RE [_a-zA-Z][_a-zA-Z0-9]*) (NT optBK)))\n"
+"(TransientRule optBK (Rep01 (Alt (Seq (T !) (NT BKeep)))))\n"
+"(OperatorRule BKeep (Epsilon))\n"
+"(OperatorRule T (STR \" \"))\n"
+"(OperatorRule NT (NT elem))\n"
+"(OperatorRule RE (STR / /))\n"
"(Comment )\n"
"(Comment \\ Rules)\n"
"(Comment )\n"
-"(TransientRule rule (Alt (NT OperatorRule) (NT TransientRule)))\n"
+"(TransientRule rule (Alt (NT OperatorRule) (NT TransientRule)))\n"
"(OperatorRule OperatorRule (Seq (NT elem) (NT Space) (T ::=) (NT Space) (NT rule_expr) (T .) (NT NewLine)))\n"
"(OperatorRule TransientRule (Seq (NT elem) (NT Space) (T =) (NT Space) (NT rule_expr) (T .) (NT NewLine)))\n"
"(Comment )\n"
"(Comment \\ Expressions)\n"
"(Comment )\n"
-"(TransientRule rule_expr (Alt (NT Alt) (NT RawSeq) (NT Seq) (NT rule_elem)))\n"
-"(OperatorRule Prefix (Seq (T [) (NT rule_expr) (T ]) (NT rule_elem_atom)))\n"
-"(OperatorRule Postfix (Seq (T {) (NT rule_expr) (T }) (NT rule_elem_atom)))\n"
-"(OperatorRule RawSeq (Seq (T .raw) (NT Space) (NT rule_elem) (NT seq_expr)))\n"
-"(OperatorRule Seq (Seq (NT rule_elem) (NT seq_expr)))\n"
-"(OperatorRule Alt (Seq (T \\() (NT Space) (NT alt_expr) (NT Space) (T \\))))\n"
+"(TransientRule rule_expr (Alt (NT Alt) (NT RawSeq) (NT Seq) (NT rule_elem)))\n"
+"(OperatorRule Prefix (Seq (T [) (NT rule_expr) (T ]) (NT rule_elem_atom)))\n"
+"(OperatorRule Postfix (Seq (T {) (NT rule_expr) (T }) (NT rule_elem_atom)))\n"
+"(OperatorRule RawSeq (Seq (T .raw) (NT Space) (NT rule_elem) (Rep1N (NT Space) (NT rule_elem))))\n"
+"(OperatorRule Seq (Seq (NT rule_elem) (Rep1N (Seq (NT Space) (NT rule_elem)))))\n"
+"(OperatorRule Alt (Seq (T \\() (NT Space) (NT alt_expr) (NT Space) (T \\))))\n"
"(Comment )\n"
"(Comment \\ Helpers)\n"
"(Comment )\n"
-"(TransientRule seq_expr (Alt (Seq (NT seq_expr) (NT rule_elem)) (NT rule_elem)))\n"
-"(TransientRule alt_elem (Alt (NT RawSeq) (NT RawSeq) (NT Seq) (NT rule_elem)))\n"
+/*"(TransientRule seq_expr (Alt (Seq (NT seq_expr) (NT rule_elem)) (NT rule_elem)))\n"*/
+"(TransientRule alt_elem (Alt (NT RawSeq) (NT Seq) (NT rule_elem)))\n"
//"(TransientRule alt_expr (Alt (Seq (NT alt_expr) (T |) (NT Space) (NT alt_elem)) (NT alt_elem)))\n"
-"(TransientRule alt_expr (Seq (NT alt_elem) (Rep0N (Alt (Seq (T |) (NT Space) (NT alt_elem))))))\n"
-"(TransientRule rule_elem (Alt (Seq (Alt (NT EOF) (NT Comment) (NT Rep) (NT rule_elem_atom)) (NT Space))))\n"
-"(TransientRule rule_elem_atom (Alt (NT Epsilon) (NT T) (NT STR) (NT RE) (NT NT) (NT Alt) (NT Prefix) (NT Postfix)))\n"
+"(TransientRule alt_expr (Seq (NT alt_elem) (Rep0N (Alt (Seq (T |) (NT Space) (NT alt_elem))))))\n"
+"(TransientRule rule_elem (Alt (NT EOF) (NT Comment) (NT Rep) (NT rule_elem_atom)))\n"
+"(TransientRule rule_elem_atom (Alt (NT Epsilon) (NT T) (NT STR) (NT BOW) (NT AddToBag) (NT RE) (NT NT) (NT Alt) (NT Prefix) (NT Postfix)))\n"
"(Comment )\n"
"(Comment \\ Entry\\ point)\n"
"(Comment )\n"
-"(TransientRule _start (NT Grammar))\n"
-"(OperatorRule Grammar (NT _loop))\n"
-"(TransientRule _loop (Alt (EOF) (Seq (Alt (NT Comment) (NT rule)) (NT _loop))))\n"
+"(TransientRule _start (NT Grammar))\n"
+"(OperatorRule Grammar (NT _loop))\n"
+"(TransientRule _loop (Alt (EOF) (Seq (Alt (NT Comment) (NT rule)) (NT _loop))))\n"
"(Comment )\n"
"(Comment \\ Builtins)\n"
"(Comment )\n"
-"(OperatorRule EOF (T _EOF))\n"
-"(OperatorRule Epsilon (T _epsilon))\n"
-"(OperatorRule Space (T Space))\n"
-"(OperatorRule NewLine (T NewLine))\n"
-"(OperatorRule Indent (T Indent))\n"
-"(OperatorRule Dedent (T Dedent))\n"
+"(OperatorRule EOF (T _EOF))\n"
+"(OperatorRule Epsilon (T _epsilon))\n"
+"(OperatorRule Space (T Space))\n"
+"(OperatorRule NewLine (T NewLine))\n"
+"(OperatorRule Indent (T Indent))\n"
+"(OperatorRule Dedent (T Dedent))\n"
/*"(TransientRule _whitespace (RE \\([\\ \\r\\n\\t]|#[^\\r\\n]*[\\r\\n]+\\)+))\n"*/
-"(TransientRule _whitespace (RE \\([\\ \\r\\n\\t]\\)+))\n"
+"(TransientRule _whitespace (RE \\([\\ \\r\\n\\t]\\)+))\n"
"(Comment )\n"
"(Comment \\ Repetitions)\n"
"(Comment )\n"
-"(OperatorRule Rep1N (Seq (NT rule_elem_atom) (T +)))\n"
-"(OperatorRule Rep0N (Seq (NT rule_elem_atom) (T *)))\n"
-"(OperatorRule Rep01 (Seq (NT rule_elem_atom) (T ?)))\n"
-"(TransientRule Rep (Alt (NT Rep1N) (NT Rep0N) (NT Rep01)))\n"
+"(OperatorRule Rep1N (Seq (NT rule_elem_atom) (T +)))\n"
+"(OperatorRule Rep0N (Seq (NT rule_elem_atom) (T *)))\n"
+"(OperatorRule Rep01 (Seq (NT rule_elem_atom) (T ?)))\n"
+"(TransientRule Rep (Alt (NT Rep1N) (NT Rep0N) (NT Rep01)))\n"
"(Comment )\n"
"(Comment \\ Comments)\n"
"(Comment )\n"
View
@@ -50,11 +50,11 @@
#define TypeDefList(__id,__type)\
typedef LIST(__type) __id;\
- typedef NODE(__type) __id##Node;
+ typedef NODE(__type) __id##Node
#define TypeDefPList(__id,__type)\
typedef LIST(__type) __id;\
- typedef NODE_P(__type) __id##Node;
+ typedef NODE_P(__type) __id##Node
TypeDefPList(GenericList,void);
View
@@ -17,10 +17,7 @@
*/
#include "config.h"
#include "tinyap.h"
-#include "tokenizer.h"
#include "tinyape.h"
-//#include "bootstrap.h"
-//#include "tokenizer.h"
#include <stdio.h>
#include <string.h>
@@ -58,6 +55,7 @@ void tinyap_set_output(const tinyap_t t, ast_node_t o);
void node_pool_flush();
extern int max_rec_level;
+extern int tinyap_verbose;
@@ -87,23 +85,31 @@ int do_args(int argc,char*argv[]) {
}
/*} else if(cmp_param(0,"--relations","-R")) {*/
/*tinyap_set_output(parser, relations_from_tree(tinyap_get_output(parser)));*/
+ } else if(cmp_param(0,"--full-parse=on","-fp")) {
+ tinyap_set_full_parse(parser, 1);
+ } else if(cmp_param(0,"--simple-parse=on","-sp")) {
+ tinyap_set_full_parse(parser, 1);
} else if(cmp_param(0,"--parse","-p")) {
tinyap_parse(parser);
if(tinyap_parsed_ok(parser)&&tinyap_get_output(parser)) {
/*tinyap_serialize_to_file(tinyap_get_output(parser),argv[i]);*/
- fprintf(stderr, "parsed %u bytes in %.3f seconds (%.3f kBps)\n",
- tinyap_get_source_buffer_length(parser),
- tinyap_get_parse_time(parser),
- tinyap_get_source_buffer_length(parser)/tinyap_get_parse_time(parser)*(1./1024));
+ if(tinyap_verbose) {
+ fprintf(stderr, "parsed %u bytes in %.3f seconds (%.3f kBps)\n",
+ tinyap_get_source_buffer_length(parser),
+ tinyap_get_parse_time(parser),
+ tinyap_get_source_buffer_length(parser)/tinyap_get_parse_time(parser)*(1./1024));
+ }
} else {
fprintf(stderr,"parse error at line %i, column %i\n%s\n",tinyap_get_error_row(parser),tinyap_get_error_col(parser),tinyap_get_error(parser));
+ /*fprintf(stderr,"parse error at line %i, column %i\n%s\n", -1, -1, "TODO");*/
}
} else if(cmp_param(0,"--parse-as-grammar","-pag")) {
tinyap_parse_as_grammar(parser);
if(!(tinyap_parsed_ok(parser)&&tinyap_get_grammar_ast(parser))) {
/*tinyap_serialize_to_file(tinyap_get_output(parser),argv[i]);*/
/*} else {*/
fprintf(stderr,"parse error at line %i, column %i\n%s\n",tinyap_get_error_row(parser),tinyap_get_error_col(parser),tinyap_get_error(parser));
+ /*fprintf(stderr,"parse error at line %i, column %i\n%s\n", -1, -1, "TODO");*/
}
} else if(cmp_param(0,"--print-grammar","-pg")) {
/*print_rules(tinyap_get_grammar_ast(parser));*/
@@ -123,15 +129,22 @@ int do_args(int argc,char*argv[]) {
free((char*)up);
wa_del(grammar);
+ } else if(cmp_param(0,"--verbose","-V")) {
+ tinyap_set_verbose(1);
+ } else if(cmp_param(0,"--quiet","-q")) {
+ tinyap_set_verbose(0);
} else if(cmp_param(0,"--version","-v")) {
fprintf(stderr, TINYAP_ABOUT);
fprintf(stderr, "version " TINYAP_VERSION "\n" );
} else if(cmp_param(0,"--help","-h")) {
+
fprintf(stderr, TINYAP_ABOUT);
- fprintf(stderr, "Usage : %s [--input,-i [inputFile]] [--output,-o [outputFile]] [--grammar,-g [grammarFile]] [--parse,-p] [--parse-as-grammar,-pag] [--walk, -w [pilotName]] [--help,-h]\n",argv[0]);
+ fprintf(stderr, "Usage : %s [--input,-i [inputFile]] [--output,-o [outputFile]] [--grammar,-g [grammarFile]] [--parse,-p] [--parse-as-grammar,-pag] [--walk, -w [pilotName]] [--help,-h] [--version, -v] [--verbose,-V] [--quiet,-q]\n",argv[0]);
+ fprintf(stderr, "\n\t--version,-v\tdisplay version\n");
+ fprintf(stderr, "\n\t--verbose,-V\toutput messages during parse\n");
+ fprintf(stderr, "\n\t--quiet,-q\tdon't output messages during parse (default)\n");
fprintf(stderr, "\n\t--grammar,-g name\tuse this grammar to parse input\n");
- fprintf(stderr, "\t\t\"" GRAMMAR_EXPLICIT "\"\t(default) selects explicit variant\n");
- fprintf(stderr, "\t\t\"" GRAMMAR_CAMELCASING "\"\tselects CamelCasing variant\n");
+ fprintf(stderr, "\t\t\"" GRAMMAR_SHORT "\"\t(default) selects default meta-grammar\n");
fprintf(stderr, "\t\tany other string is a filename to read grammar from\n");
fprintf(stderr, "\n\t--print-grammar,-pg\toutput the current grammar in `explicit' dialect\n");
fprintf(stderr, "\t\targument is the same as above\n");
@@ -143,6 +156,8 @@ int do_args(int argc,char*argv[]) {
fprintf(stderr, "\t\tany other string is a filename to write to\n");
fprintf(stderr, "\n\t--parse,-p\t\tparse input text\n");
fprintf(stderr, "\n\t--parse-as-grammar,-pag\tparse input text and use output AST as new grammar\n");
+ fprintf(stderr, "\n\t--full-parse,-fp\t\tfind all possible parse trees\n");
+ fprintf(stderr, "\n\t--simple-parse,-p\t\tfind first parse tree\n");
fprintf(stderr, "\n\t--walk,-w name\t\twalk the current output tree using named ape\n\t\t\t\t(try prettyprint !)\n");
fprintf(stderr, "\n\t--help,-h\t\tdisplay this text\n\n");
exit(0);
@@ -156,7 +171,9 @@ int do_args(int argc,char*argv[]) {
tinyap_delete(parser);
- fprintf(stderr, "maximum recursion level : %i\n", max_rec_level);
+ /*if(tinyap_verbose) {*/
+ /*fprintf(stderr, "maximum recursion level : %i\n", max_rec_level);*/
+ /*}*/
return 0;
}
View
@@ -17,7 +17,9 @@
*/
#include "config.h"
-#include "tokenizer.h"
+#include "ast.h"
+#include "node_cache.h"
+
#include "tinyap_alloc.h"
#include "string_registry.h"
@@ -201,7 +203,7 @@ static inline size_t cache_hash(int l, int c, const char*n) {
/*return ret%NODE_CACHE_MOD;*/
/*return ret&NODE_CACHE_MASK;*/
/*size_t ret = hashlittle(n, strlen(n), (0xdeadbeef*l*c)^(0xdeadbeef+l+c));*/
- size_t ret = hash_bytes(n, strlen(n),0xdeadb33f);
+ size_t ret = hash_bytes((char*)n, strlen(n),0xdeadb33f);
/*size_t ret = hashlittle(n, strlen(n),0xdeadb33f);*/
/*size_t ret = FNV_HASH(n, strlen(n))^l^c;*/
/*ret *= (ret>>16);*/
Oops, something went wrong.

0 comments on commit f542d97

Please sign in to comment.