Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Added Strings-Text-Processing

  • Loading branch information...
commit 33aedd1f96eebeeb6511e869dfcc224ef8021498 0 parents
@daniel-w daniel-w authored
BIN  LewiLL1.lhs_rhs.zip
Binary file not shown
BIN  calc_plain_tag_union_tok.zip
Binary file not shown
BIN  cfg_lookahead_extends.zip
Binary file not shown
BIN  expk_test.zip
Binary file not shown
BIN  fixed_string.zip
Binary file not shown
BIN  gram_lookahead.zip
Binary file not shown
BIN  imm_string_and_builder.zip
Binary file not shown
242 integral2str.hpp
@@ -0,0 +1,242 @@
+// Copyright Alexander Nasonov, 2006
+//
+#ifndef FILE_integral2str_hpp_INCLUDED_F406F3KP6
+#define FILE_integral2str_hpp_INCLUDED_F406F3KP6
+
+#include <boost/array.hpp>
+#include <boost/cstdint.hpp>
+#include <boost/mpl/deref.hpp>
+#include <boost/mpl/find_if.hpp>
+#include <boost/mpl/greater.hpp>
+#include <boost/mpl/vector_c.hpp>
+#include <boost/static_assert.hpp>
+#include <limits>
+
+// Returns the most significant bit of un, i.e. 0 or 1.
+template<class T> // T is unsigned type
+inline int hibit(T un)
+{
+ BOOST_STATIC_ASSERT(!std::numeric_limits<T>::is_signed);
+
+ int const top_bit = std::numeric_limits<T>::digits - 1;
+ return un >> top_bit;
+}
+
+// Number of decimal digits of un; the caller guarantees un <= 999.
+template<class T> // T is unsigned type
+inline int digits_3(T un)
+{
+ BOOST_STATIC_ASSERT(!std::numeric_limits<T>::is_signed);
+
+ if(!(un < 100u))
+ return 3;
+
+ // For un < 100, 9u - un wraps around (top bit set) exactly when un > 9.
+ return 1 + hibit(9u - un);
+}
+
+// Number of decimal digits of un; the caller guarantees un <= 99999.
+template<class T> // T is unsigned type
+inline int digits_5(T un)
+{
+ BOOST_STATIC_ASSERT(!std::numeric_limits<T>::is_signed);
+
+ if(un < 100u)
+ return 1 + hibit(9u - un); // 1 or 2 digits
+
+ if(un < 1000u)
+ return 3;
+
+ // The subtraction wraps (top bit set) exactly when un > 9999.
+ return 4 + hibit(9999u - un);
+}
+
+// TODO: measure performance of nlz-based
+// (number of leading zeros) implementations.
+#if 0 // disabled experiment: relies on x86 inline assembly and is never compiled
+template<class T> // T is unsigned type
+inline int digits_10(T un) // un in [9999999999, -9999999999]
+{
+ // TODO: read about bsrl. IIRC, its result is UB when un == 0
+ int msb_pos; // position of most significant bit
+ __asm__ ("bsrl %1, %0" : "=r" (msb_pos) : "r" (un));
+
+ // The algorithm is copied from http://www.hackersdelight.org
+
+ static unsigned int table[11] = { 0, 9, 99, 999, 9999, 99999,
+ 999999, 9999999, 99999999, 999999999, 0xffffffff
+ };
+
+ int x = (19 * msb_pos) >> 6; // 19/64 approximates log10(2), so x estimates the digit count minus one
+ return 1 + x + ((table[x + 1] - un) >> 31); // adds 1 when un exceeds table[x + 1] (the subtraction wraps)
+}
+#endif
+
+template<class T> // T is unsigned type
+inline char* unsigned2str_5(T un, char* str, int digits) // writes `digits` (1..5) decimal digits of un at str, zero-padded on the left, no NUL; returns one past the last char written
+{
+ BOOST_STATIC_ASSERT(!std::numeric_limits<T>::is_signed);
+ // Any other value of `digits` writes nothing and returns str unchanged.
+ unsigned int digit;
+ // Divisions by 1000, 100 and 10 below are done as multiply-and-shift.
+ boost::uint_least32_t const multiplier4 = 8389; // 8389 / 2^23 ~= 1/1000
+
+ // TODO: better parallelism
+ switch(digits) // each case emits one digit and deliberately falls through to the next
+ {
+ case 5:
+ digit = un / 10000u; // 32bits is not enough for (un*0x1a36f)>>30
+ *str++ = '0' + digit;
+ un -= 10000u * digit;
+ case 4: // here un is expected to be < 10000
+ digit = (un * multiplier4) >> 23;
+ *str++ = '0' + digit;
+ un -= digit * 1000u;
+ case 3: // here un is expected to be < 1000
+ digit = (un * 41u) >> 12; // 41 / 2^12 ~= 1/100
+ *str++ = '0' + digit;
+ un -= digit * 100u;
+ case 2: // here un is expected to be < 100
+ digit = (un * 103u) >> 10; // 103 / 2^10 ~= 1/10
+ *str++ = '0' + digit;
+ un -= digit * 10u;
+ case 1: // here un is expected to be a single digit
+ *str++ = '0' + un;
+ }
+
+ return str;
+}
+
+
+// Writes the decimal digits of un at str (no terminating NUL) and
+// returns the position just past the last digit written.
+template<class T> // T is unsigned type
+inline char* unsigned2str_10(T un, char* str)
+{
+ // TODO: switch(digits) { case 10: ... case 9: ... }
+
+ BOOST_STATIC_ASSERT(!std::numeric_limits<T>::is_signed);
+
+ T const upper = un / 100000u;
+
+ // Everything fits into one group of at most five digits.
+ if(upper == 0)
+ return unsigned2str_5(un, str, digits_5(un));
+
+ // Leading group without padding, then exactly five low-order digits.
+ char* const mid = unsigned2str_5(upper, str, digits_5(upper));
+ return unsigned2str_5(un % 100000u, mid, 5);
+}
+
+#define INTEGRAL2STR_DEFINE(T) \
+inline bool is_negative(T n) { return n < 0; } \
+inline bool is_negative(unsigned T n) { return false; } \
+inline unsigned T correct_negative(unsigned T un) \
+{ return un; } \
+inline unsigned T correct_negative(T n) \
+{ unsigned T un = n; return n < 0 ? -un : un; }
+// For each T the macro defines is_negative() and correct_negative() for both T and unsigned T; correct_negative() returns the magnitude as unsigned T, negating in unsigned arithmetic.
+// No definitions for types shorter than int
+INTEGRAL2STR_DEFINE(int)
+INTEGRAL2STR_DEFINE(long)
+// TODO: non-standard types
+
+#undef INTEGRAL2STR_DEFINE
+
+
+template<int Digits10> struct integral2str_switch;
+
+template<>
+struct integral2str_switch<777> // generic impl
+{
+ // Produces the digits of un least-significant first into a scratch
+ // buffer, then copies them to str in reading order. Returns one past
+ // the last digit written; no NUL is appended and size is not consulted.
+ template<class T>
+ inline static char* doit(T un, char* str, std::size_t size)
+ {
+ BOOST_STATIC_ASSERT(!std::numeric_limits<T>::is_signed);
+
+ char buf[std::numeric_limits<T>::digits10 + 2];
+
+ char* const stop = buf + sizeof(buf) / sizeof(buf[0]);
+ char* cur = stop;
+
+ // At least one digit is always produced, so 0 is rendered as "0".
+ for(;;)
+ {
+ --cur;
+ *cur = '0' + un % 10;
+ un /= 10;
+ if(un == 0)
+ break;
+ }
+
+ for(; cur != stop; ++cur, ++str)
+ *str = *cur;
+
+ return str;
+ }
+};
+
+template<>
+struct integral2str_switch<10> // selected by integral2str_impl when 10 is the first candidate above digits10
+{
+ template<class T>
+ inline static char* doit(T un, char* str, std::size_t) // the size argument is ignored
+ {
+ return unsigned2str_10(un, str);
+ }
+};
+
+template<>
+struct integral2str_switch<5> // selected by integral2str_impl when digits10 is below 5
+{
+ template<class T>
+ inline static char* doit(T un, char* str, std::size_t) // the size argument is ignored
+ {
+ return unsigned2str_5(un, str, digits_5(un));
+ }
+};
+
+
+// Not inline. Writes the decimal text of n, NUL-terminated, at str. size is forwarded to the digit writer; the writers visible in this file do not check it.
+template<class T>
+void integral2str_impl(T n, char* str, std::size_t size)
+{
+ using namespace boost;
+ using mpl::_;
+
+ typedef mpl::int_<std::numeric_limits<T>::digits10> digits10;
+ // Pick the first of {5, 10, 777} that is greater than digits10; 777 selects the generic implementation.
+ typedef typename mpl::deref<
+ typename mpl::find_if< mpl::vector_c<int,5,10,777>
+ , mpl::greater<_,digits10>
+ >::type
+ >::type nearest;
+
+ typedef integral2str_switch<(nearest::value)> impl;
+
+ if(is_negative(n))
+ *str++ = '-';
+ // The digits are produced from the magnitude of n, passed as an unsigned value.
+ str = impl::doit(correct_negative(n), str, size);
+ // Note that correct_negative also promotes n
+ *str = '\0';
+}
+
+template<class T>
+struct resultof_integral2str // names the fixed-size char array type returned by integral2str(T)
+{
+ BOOST_STATIC_ASSERT(std::numeric_limits<T>::is_specialized);
+
+ typedef boost::array< char
+ , std::numeric_limits<T>::is_signed + // one char for a possible '-'
+ std::numeric_limits<T>::digits10 + 2 // digits10 + 1 digits plus the terminating NUL
+ > type;
+};
+
+#define DEFINE_INTEGRAL2STR(T) \
+inline resultof_integral2str<T>::type integral2str(T n) \
+{ resultof_integral2str<T>::type result; \
+ char* str = result.c_array(); \
+ integral2str_impl(n, str, result.size()); return result; }
+// Each expansion defines integral2str(T), which returns the NUL-terminated decimal text of n in a boost::array by value.
+// TODO: ??? DEFINE_INTEGRAL2STR(char)
+DEFINE_INTEGRAL2STR(signed char)
+DEFINE_INTEGRAL2STR(unsigned char)
+DEFINE_INTEGRAL2STR(signed short int)
+DEFINE_INTEGRAL2STR(unsigned short int)
+DEFINE_INTEGRAL2STR(signed int)
+DEFINE_INTEGRAL2STR(unsigned int)
+DEFINE_INTEGRAL2STR(signed long int)
+DEFINE_INTEGRAL2STR(unsigned long int)
+// TODO: non-standard types
+
+#undef DEFINE_INTEGRAL2STR
+
+#endif // #ifndef FILE_integral2str_hpp_INCLUDED_F406F3KP6
+
BIN  multi_pass~store_deref~functor.zip
Binary file not shown
BIN  particle.zip
Binary file not shown
BIN  price_parsing.7z
Binary file not shown
596 proto_static_disp.cpp
@@ -0,0 +1,596 @@
+//Purpose:
+// Prototype a way to eliminate all dynamic dispatching in grammar.
+// IOW, instead of having rule<...>, have some way to create a type
+// that represents a specific grammar.
+#include <boost/mpl/integral_c.hpp>
+#include <boost/mpl/map.hpp>
+#include <iostream>
+
+ enum
+vocab_id
+{ terminal
+, nonterminal
+};
+
+ template
+ < vocab_id VocabId
+ >
+ struct
+vocabulary
+;
+
+ template
+ <
+ >
+ struct
+vocabulary
+ < terminal
+ >
+{
+
+ enum
+ word_ids
+ { ident //identifier
+ , number //integer number literal
+ , op_add //add operator
+ , op_mult //multiplication operator
+ , par_left //left parenthesis
+ , par_right //right parenthesis
+ , end_words //end-of-words sentinel
+ };
+ template
+ < word_ids Word
+ >
+ struct
+ variable
+ {
+ variable(void)
+ {}
+ variable(variable const&)
+ {}
+ template
+ < class Rhs //def of this variable
+ >
+ boost::mpl::pair<variable,Rhs>
+ operator=(Rhs const&)
+ {
+ typedef boost::mpl::pair<variable,Rhs> result_type;
+ result_type a_result;
+ return a_result;
+ }
+ };
+
+};
+
+ template
+ < typename VocabularyVariant
+ >
+ struct
+word_iterator
+/**@brief
+ * Specializations should be iterators over elements from VocabularyVariant.
+ */
+;
+
+#include <boost/mpl/vector.hpp>
+
+ typedef
+ boost::mpl::vector
+ < vocabulary<terminal>::variable<vocabulary<terminal>::ident>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::number>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::op_add>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::op_mult>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::par_left>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::par_right>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::end_words>
+ >::type
+vec_term_type
+;
+
+#include <boost/variant.hpp>
+
+ typedef
+ boost::variant
+ < vocabulary<terminal>::variable<vocabulary<terminal>::ident>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::number>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::op_add>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::op_mult>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::par_left>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::par_right>
+ , vocabulary<terminal>::variable<vocabulary<terminal>::end_words>
+ >
+terminal_sum_type
+;
+
+#include <vector>
+#include "boost/range/iterator_range.hpp"
+
+ template
+ <
+ >
+ struct
+word_iterator
+ < terminal_sum_type
+ > // cursor over a std::vector<terminal_sum_type>; the inherited range is the part not yet consumed
+ : public boost::iterator_range<std::vector<terminal_sum_type>::const_iterator>
+{
+ typedef
+ std::vector<terminal_sum_type>
+ container_type
+ ;
+ typedef
+ container_type::const_iterator
+ iter_type
+ ;
+ typedef
+ boost::iterator_range<iter_type>
+ super_type
+ ;
+ word_iterator(void)
+ {}
+ word_iterator(container_type const& a_container) // refers to a_container's elements; does not copy them
+ : super_type(a_container.begin(), a_container.end())
+ {}
+ void
+ operator++(void) // drops the first remaining word
+ {
+ this->advance_begin(1);
+ }
+ terminal_sum_type
+ operator*(void)const // returns a copy of the first remaining word
+ {
+ return this->front();
+ }
+};
+
+ template
+ <
+ >
+ struct
+vocabulary
+ < nonterminal
+ >
+{
+
+ enum
+ word_ids
+ { expression
+ , term
+ , factor
+ };
+ enum
+ { nout=expression+1
+ };
+ template
+ < word_ids Word
+ >
+ struct
+ variable
+ {
+ variable(void)
+ {}
+ variable(variable const&)
+ {}
+ template
+ < class Rhs //def of this variable
+ >
+ boost::mpl::pair<variable,Rhs>
+ operator=(Rhs const&)
+ {
+ typedef boost::mpl::pair<variable,Rhs> result_type;
+ result_type a_result;
+ return a_result;
+ }
+ };
+
+};
+
+ enum
+gram_ops_0
+/**@brief 'tag' for nullary grammar expressions
+ */
+{ gram_op_one //identity for sequence operator, i.e. epsilon
+};
+
+ template
+ < gram_ops_0 GramOp
+ >
+ struct
+gram_expr_0
+/**@brief Grammar Expression. IOW
+ * what can occur in any part of the rhs of a production.
+ */
+{};
+
+ enum
+gram_ops_1
+/**@brief 'tag' for unary grammar expressions
+ */
+{ gram_op_repeat //repeat operator, e.g. *x in spirit
+};
+
+ template
+ < gram_ops_1 GramOp
+ , class Body
+ >
+ struct
+gram_expr_1
+/**@brief Grammar Expression. IOW
+ * what can occur in any part of the rhs of a production
+ * *and* which is composed of 1 immediate subexpression.
+ */
+{};
+
+ enum
+gram_ops_2
+/**@brief 'tag' for binary grammar expressions
+ */
+{ gram_op_seq //sequence operator, e.g. x >> y in spirit
+, gram_op_alt //alternative operator, e.g. x | y in spirit
+};
+
+ template
+ < gram_ops_2 GramOp
+ , class Left
+ , class Right
+ >
+ struct
+gram_expr_2
+/**@brief Grammar Expression. IOW
+ * what can occur in any part of the rhs of a production
+ * *and* which is composed of 2 immediate subexpressions.
+ */
+{
+ gram_expr_2(void)
+ {}
+};
+
+//Define operators corresponding to gram_expr_* templates:
+
+ template
+ < class Body
+ >
+ gram_expr_1
+ < gram_op_repeat
+ , Body
+ >
+operator*
+ ( Body const&
+ )
+{
+ typedef gram_expr_1<gram_op_repeat,Body> result_type;
+ result_type a_result;
+ return a_result;
+}
+
+ template
+ < class Left
+ , class Right
+ >
+ gram_expr_2
+ < gram_op_seq
+ , Left
+ , Right
+ >
+operator&
+ ( Left const&
+ , Right const&
+ )
+{
+ typedef gram_expr_2<gram_op_seq,Left,Right> result_type;
+ result_type a_result;
+ return a_result;
+}
+
+ template
+ < class Left
+ , class Right
+ >
+ gram_expr_2
+ < gram_op_alt
+ , Left
+ , Right
+ >
+operator|
+ ( Left const&
+ , Right const&
+ )
+{
+ typedef gram_expr_2<gram_op_alt,Left,Right> result_type;
+ result_type a_result;
+ return a_result;
+}
+
+using namespace boost::mpl;
+
+ typedef
+ map
+ < typeof
+ ( vocabulary<nonterminal>::variable<vocabulary<nonterminal>::factor>()
+ = vocabulary<terminal>::variable<vocabulary<terminal>::ident>()
+ | vocabulary<terminal>::variable<vocabulary<terminal>::number>()
+ | vocabulary<terminal>::variable<vocabulary<terminal>::par_left>()
+ & vocabulary<nonterminal>::variable<vocabulary<nonterminal>::expression>()
+ & vocabulary<terminal>::variable<vocabulary<terminal>::par_right>()
+ )//end production factor = ident | number | '(' expression ')'
+ , typeof
+ ( vocabulary<nonterminal>::variable<vocabulary<nonterminal>::term>()
+ = vocabulary<nonterminal>::variable<vocabulary<nonterminal>::factor>()
+ & *
+ ( vocabulary<terminal>::variable<vocabulary<terminal>::op_mult>()
+ & vocabulary<nonterminal>::variable<vocabulary<nonterminal>::factor>()
+ )
+ )//end: production term = factor ( '*' factor )^*
+ , typeof
+ ( vocabulary<nonterminal>::variable<vocabulary<nonterminal>::expression>()
+ = vocabulary<nonterminal>::variable<vocabulary<nonterminal>::term>()
+ & *
+ ( vocabulary<terminal>::variable<vocabulary<terminal>::op_add>()
+ & vocabulary<nonterminal>::variable<vocabulary<nonterminal>::term>()
+ )
+ )//end production expression = term ( '+' term )^*
+ >
+arith_expr_gram_type
+/**@brief
+ * Grammar for arithmetic expressions
+ */
+;
+
+ struct
+parser_lhs
+/**@brief
+ * Top class in inheritance hierarchy of parser_production's.
+ */
+{
+ typedef
+ word_iterator<terminal_sum_type>
+ word_iterator_type
+ ; // iterator type that the parser_rhs specializations take in parse_rhs
+};
+
+ template
+ < typename Rhs
+ , typename Grammar
+ >
+ struct
+parser_rhs
+;
+
+ template
+ < vocabulary<terminal>::word_ids WordId
+ , typename Grammar
+ >
+ struct
+parser_rhs
+ < vocabulary<terminal>::variable<WordId>
+ , Grammar
+ >
+/**@brief
+ * Parser for a single terminal word.
+ */
+{
+ typedef
+ typename parser_lhs::word_iterator_type
+ iterator_type
+ ;
+ /**@brief
+ * Consumes one word and returns true when the next word's variant
+ * index equals WordId; otherwise leaves a_iter untouched and
+ * returns false (also when no words remain).
+ */
+ bool
+ parse_rhs(iterator_type& a_iter)
+ {
+ if(!a_iter.empty())
+ {
+ terminal_sum_type const head=*a_iter;
+ if(head.which() == WordId)
+ {
+ ++a_iter;
+ return true;
+ }
+ }
+ return false;
+ }
+};
+
+ template
+ < class Body
+ , typename Grammar
+ >
+ struct
+parser_rhs
+ < gram_expr_1<gram_op_repeat,Body>
+ , Grammar
+ >
+/**@brief
+ * Parser for zero or more repetitions of Body.
+ */
+{
+ typedef
+ typename parser_lhs::word_iterator_type
+ iterator_type
+ ;
+ /**@brief
+ * Applies Body repeatedly until it fails.
+ * Returns false only when Body succeeds without consuming a word
+ * (which would otherwise loop forever); returns true in all other
+ * cases, including zero repetitions.
+ * On return a_iter sits just after the last successful repetition:
+ * anything a failing final attempt consumed is given back.
+ */
+ bool
+ parse_rhs(iterator_type& a_iter)
+ {
+ bool match=true;
+ unsigned pre_tail=a_iter.size();
+ //position after the last repetition that succeeded:
+ iterator_type saved(a_iter);
+ while(match && my_body.parse_rhs(a_iter))
+ {
+ unsigned post_tail=a_iter.size();
+ match=post_tail<pre_tail;
+ pre_tail=post_tail;
+ saved=a_iter;
+ }
+ //a failed Body may have advanced a_iter part-way; undo that.
+ //(no-op when the loop stopped because of a zero-length match.)
+ a_iter=saved;
+ return match;
+ }
+ parser_rhs<Body,Grammar>
+ my_body
+ ;
+};
+
+ template
+ < class Left
+ , class Right
+ , typename Grammar
+ >
+ struct
+parser_rhs
+ < gram_expr_2<gram_op_seq,Left,Right>
+ , Grammar
+ >
+/**@brief
+ * Parser for a sequence, Left followed by Right.
+ */
+{
+ typedef
+ typename parser_lhs::word_iterator_type
+ iterator_type
+ ;
+ /**@brief
+ * Returns true when Left and then Right both match.
+ * Right is not attempted when Left fails.
+ * On failure a_iter is restored to where it was on entry, so words
+ * consumed by Left are not lost when Right fails.
+ */
+ bool
+ parse_rhs(iterator_type& a_iter)
+ {
+ iterator_type const saved(a_iter);
+ bool match=my_left.parse_rhs(a_iter);
+ if(match)
+ {
+ match=my_right.parse_rhs(a_iter);
+ }
+ if(!match)
+ {
+ //a failed sequence must not consume any input:
+ a_iter=saved;
+ }
+ return match;
+ }
+ parser_rhs<Left,Grammar>
+ my_left
+ ;
+ parser_rhs<Right,Grammar>
+ my_right
+ ;
+};
+
+ template
+ < class Left
+ , class Right
+ , typename Grammar
+ >
+ struct
+parser_rhs
+ < gram_expr_2<gram_op_alt,Left,Right>
+ , Grammar
+ >
+/**@brief
+ * Parser for an alternative, Left or else Right.
+ */
+{
+ typedef
+ typename parser_lhs::word_iterator_type
+ iterator_type
+ ;
+ /**@brief
+ * Tries Left; when it fails, tries Right from the same starting
+ * position. Returns true when either matches.
+ * When both fail, a_iter is restored to where it was on entry.
+ */
+ bool
+ parse_rhs(iterator_type& a_iter)
+ {
+ iterator_type const saved(a_iter);
+ bool match=my_left.parse_rhs(a_iter);
+ if(!match)
+ {
+ //Left may have advanced a_iter before failing; rewind first:
+ a_iter=saved;
+ match=my_right.parse_rhs(a_iter);
+ if(!match)
+ {
+ a_iter=saved;
+ }
+ }
+ return match;
+ }
+
+ parser_rhs<Left,Grammar>
+ my_left
+ ;
+ parser_rhs<Right,Grammar>
+ my_right
+ ;
+};
+
+ template
+ < vocabulary<nonterminal>::word_ids WordId
+ , typename Grammar
+ >
+ struct
+parser_rhs
+ < vocabulary<nonterminal>::variable<WordId>
+ , Grammar
+ >
+{
+ typedef
+ typename parser_lhs::word_iterator_type
+ iterator_type
+ ;
+ bool
+ parse_rhs(iterator_type& a_iter)
+ {
+ //retrieve the rhs_parser_type for WordId from Grammar:
+ typedef vocabulary<nonterminal>::variable<WordId> var_type;
+ typedef typename Grammar::template production_rhs<var_type>::type rhs_parser_type;
+ //create instance of parser for that nonterminal:
+ rhs_parser_type rhs_parser;
+ //do parse & return result:
+ return rhs_parser.parse_rhs(a_iter);
+ }
+};
+
+#include <boost/mpl/at.hpp>
+
+ template
+ < class GramMap
+ >
+ struct
+grammar
+{
+ typedef
+ grammar<GramMap>
+ my_type
+ ;
+ typedef
+ parser_lhs::word_iterator_type
+ word_iterator_type
+ ;
+ template
+ < class NonTermVar
+ >
+ struct
+ production_rhs
+ /**@brief
+ * returns parser_rhs for rhs of NonTermVar production.
+ */
+ {
+ private:
+ typedef
+ typename at<GramMap,NonTermVar>::type
+ rhs_type
+ ;
+ public:
+ typedef
+ parser_rhs<rhs_type,my_type>
+ type
+ ;
+
+ };
+ grammar(void)
+ {
+ std::cout<<"grammar::CTOR\n";
+ }
+};
+
+ int
+main(void)
+{
+ enum
+ terms
+ { ident =vocabulary<terminal>::ident
+ , number =vocabulary<terminal>::number
+ , op_add =vocabulary<terminal>::op_add
+ , op_mult =vocabulary<terminal>::op_mult
+ , par_left =vocabulary<terminal>::par_left
+ , par_right=vocabulary<terminal>::par_right
+ };
+ typedef grammar<arith_expr_gram_type> gram_type;
+ typedef parser_lhs::word_iterator_type witer_type;
+ { std::cout<<"test factor with input=ident\n";
+ witer_type::container_type wordc;
+ at_c<vec_term_type,ident>::type a_ident;
+ terminal_sum_type a_sum(a_ident);
+ wordc.push_back(a_sum);
+ witer_type wordi(wordc);
+ std::cout<<"words="<<wordi.size()<<"\n";
+ typedef vocabulary<nonterminal>::variable<vocabulary<nonterminal>::factor> var;
+ gram_type::production_rhs<var>::type var_rhs;
+ bool result=var_rhs.parse_rhs(wordi);
+ std::cout<<"words="<<wordi.size()<<"\n";
+ std::cout<<"result="<<result<<"\n";
+ }
+ { std::cout<<"test term with input=ident * ident\n";
+ witer_type::container_type wordc;
+ at_c<vec_term_type,ident>::type a_ident;
+ terminal_sum_type a_sum(a_ident);
+ wordc.push_back(a_sum);
+ wordc.push_back(terminal_sum_type(at_c<vec_term_type,op_mult>::type()));
+ wordc.push_back(terminal_sum_type(at_c<vec_term_type,ident>::type()));
+ witer_type wordi(wordc);
+ std::cout<<"words="<<wordi.size()<<"\n";
+ typedef vocabulary<nonterminal>::variable<vocabulary<nonterminal>::term> var;
+ gram_type::production_rhs<var>::type var_rhs;
+ bool result=var_rhs.parse_rhs(wordi);
+ std::cout<<"words="<<wordi.size()<<"\n";
+ std::cout<<"result="<<result<<"\n";
+ }
+ return 0;
+}
BIN  string_convert.zip
Binary file not shown
BIN  subrule_simple.map_parse.1.50.zip
Binary file not shown
BIN  super_string_v1.zip
Binary file not shown
BIN  super_string_v2.zip
Binary file not shown
BIN  test_inputs.gz
Binary file not shown
BIN  text_encoding.zip
Binary file not shown
813 thompson-nfa-perl-regex.cpp
@@ -0,0 +1,813 @@
+// Regular expression implementation.
+// Supports traditional egrep syntax, plus non-greedy operators.
+// Tracks submatches a la traditional backtracking.
+//
+// Finds leftmost-biased (traditional backtracking) match;
+//
+// Executes repetitions like Perl.
+//
+// Requires Boost C++ Libraries, see http://boost.org
+//
+// g++ -I $BOOST_ROOT nfa-perl.cpp
+// a.out '(a*)+' aaa # (0,3)(3,3)
+// a.out '(a|aa)(a|aa)' aaa # (0,2)(0,1)(1,2)
+//
+// Copyright (c) 2007 Russ Cox.
+// Copyright (c) 2007 Eric Niebler.
+// Can be distributed under the Boost Software License 1.0, see bottom of file.
+
+//#define BOOST_SPIRIT_DEBUG
+//#define _CRTDBG_MAP_ALLOC
+#include <iostream>
+#include <iomanip>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <deque>
+#include <list>
+#include <utility>
+#include <boost/array.hpp>
+#include <boost/next_prior.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/spirit.hpp>
+#include <boost/spirit/phoenix.hpp>
+//#include <crtdbg.h>
+//using namespace boost;
+
+//enum
+//{
+// LeftmostBiased = 0,
+// LeftmostLongest = 1,
+//};
+//
+//enum
+//{
+// RepeatMinimal = 0,
+// RepeatLikePerl = 1,
+//};
+
+int debug;
+//int matchtype = LeftmostBiased;
+//int reptype = RepeatMinimal;
+
+enum SubState
+{
+ Unmatched = 0,
+ Incomplete = 1,
+ Matched = 2
+};
+
+enum
+{
+ NSUB = 10
+};
+
+template<typename Iter>
+struct Sub // one submatch: a [first, second) iterator pair plus its match status
+ : std::pair<Iter, Iter>
+{
+ Sub(Iter first_=Iter(), Iter second_=Iter())
+ : std::pair<Iter, Iter>(first_, second_)
+ , matched(Unmatched)
+ {}
+
+ SubState matched; // Unmatched, Incomplete or Matched
+};
+
+enum
+{
+ Char = 1,
+ Any = 2,
+ Split = 3,
+ LParen = 4,
+ RParen = 5,
+ Match = 6,
+};
+
+struct State // one NFA node; instances are created only through REImpl::state
+{
+ int op; // one of Char, Any, Split, LParen, RParen, Match
+ int data; // the character for Char, the group number for LParen/RParen, 0 otherwise
+ State const *out; // successor state
+ State const *out1; // second successor, set only for Split states
+ std::size_t id; // number assigned by REImpl::state
+
+private:
+ friend struct REImpl;
+ explicit State(int op_, int data_, std::size_t id_, State const *out_, State const *out1_)
+ : op(op_), data(data_), out(out_), out1(out1_), id(id_)
+ {}
+};
+
+template<typename Iter>
+struct Thread;
+
+template<typename Iter>
+struct StateEx // per-state bookkeeping the matcher keeps outside the immutable State
+{
+ StateEx()
+ : lastlist(0), visits(0), lastthread(0)
+ {}
+
+ int lastlist; // compared against Extras::listid by Matcher::addstate
+ int visits; // visit counter maintained by Matcher::addstate
+ Thread<Iter> *lastthread; // thread slot that Matcher::addstate assigned to the state
+};
+
+template<typename Iter>
+struct Extras // matcher-side bookkeeping, indexed by State::id
+{
+ Extras(std::size_t nstates)
+ : listid(0), stateex(nstates + 1, StateEx<Iter>()) // one extra slot: REImpl::state numbers states from 1
+ {}
+
+ int listid; // incremented by Matcher::step on every call
+ std::vector<StateEx<Iter> > stateex;
+};
+
+template<typename Iter>
+struct Thread
+{
+ State const *state;
+ boost::array<Sub<Iter>, NSUB> match;
+};
+
+template<typename Iter>
+struct List // preallocated thread list; only the first n entries of t are in use
+{
+ explicit List(std::size_t nstates)
+ : t(nstates, Thread<Iter>()), n(0)
+ {}
+
+ std::vector<Thread<Iter> > t; // storage for nstates threads
+ int n; // number of threads currently in the list
+};
+
+// Compiled regular expression: holds every NFA State, the start state and
+// the number of capturing groups. The states live behind a shared_ptr, so
+// copies of an REImpl share the same state storage.
+struct REImpl
+{
+ REImpl()
+ : start(0), nparen(0), states(new std::deque<State>)
+ {}
+
+ // Appends a new state and returns a pointer to it. Ids are handed out
+ // sequentially starting at 1. The pointer stays valid as further states
+ // are added because std::deque::push_back does not invalidate
+ // references to existing elements.
+ State *state(int op, int data, State const *out=0, State const *out1=0)
+ {
+ states->push_back(State(op, data, states->size()+1, out, out1));
+ return &states->back();
+ }
+
+ // Prints every state reachable from start to stdout, one per line.
+ void dump()
+ {
+ std::vector<bool> seen(states->size() + 1);
+ dump(start, seen);
+ }
+
+ // Recursive helper for dump(): prints s unless it is null or already
+ // seen, then visits s->out and s->out1.
+ // State::id is a std::size_t; it is converted to unsigned long and
+ // printed with %lu, because passing a size_t to %d is undefined
+ // behaviour where size_t and int differ in width.
+ void dump(State const *s, std::vector<bool> &seen)
+ {
+ if(s == 0 || seen[s->id])
+ return;
+ seen[s->id] = true;
+ std::printf("%lu| ", id_of(s));
+
+ switch(s->op)
+ {
+ case Char:
+ std::printf("'%c' -> %lu\n", s->data, id_of(s->out));
+ break;
+
+ case Any:
+ std::printf(". -> %lu\n", id_of(s->out));
+ break;
+
+ case Split:
+ std::printf("| -> %lu, %lu\n", id_of(s->out), id_of(s->out1));
+ break;
+
+ case LParen:
+ std::printf("( %d -> %lu\n", s->data, id_of(s->out));
+ break;
+
+ case RParen:
+ std::printf(") %d -> %lu\n", s->data, id_of(s->out));
+ break;
+
+ case Match:
+ std::printf("match\n");
+ break;
+
+ default:
+ std::printf("??? %d\n", s->op);
+ break;
+ }
+
+ dump(s->out, seen);
+ dump(s->out1, seen);
+ }
+
+ State const *start;
+ int nparen;
+ boost::shared_ptr<std::deque<State> > states;
+
+private:
+ // Id of *s in a type that matches the %lu conversion used by dump().
+ static unsigned long id_of(State const *s)
+ {
+ return static_cast<unsigned long>(s->id);
+ }
+};
+
+// Since the out pointers in the list are always
+// uninitialized, we use the pointers themselves
+// as storage for the Ptrlists.
+union Ptrlist
+{
+ Ptrlist *next;
+ State const *s;
+};
+
+struct Frag
+{
+ explicit Frag(State const *start_=0, Ptrlist *out_=0)
+ : start(start_), out(out_)
+ {}
+
+ State const *start;
+ Ptrlist *out;
+};
+
+// Build a one-element Ptrlist in place: the storage of the dangling
+// out pointer *outp itself becomes the list cell.
+Ptrlist *list1(State const **outp)
+{
+ Ptrlist *const cell = reinterpret_cast<Ptrlist*>(outp);
+ cell->next = 0;
+ return cell;
+}
+
+// Point every dangling out pointer recorded in list l at state s.
+void patch(Ptrlist *l, State const *s)
+{
+ while(l != 0)
+ {
+ // next and s share storage in the union, so read the link
+ // before overwriting the cell.
+ Ptrlist *const following = l->next;
+ l->s = s;
+ l = following;
+ }
+}
+
+// Concatenate l2 onto the end of l1 and return the head of the result.
+// l1 must be non-null.
+Ptrlist *append(Ptrlist *l1, Ptrlist *l2)
+{
+ Ptrlist *tail = l1;
+ for(; tail->next; tail = tail->next)
+ {
+ }
+ tail->next = l2;
+ return l1;
+}
+
+struct frag1_result
+{
+ template<typename>
+ struct result
+ {
+ typedef Frag type;
+ };
+};
+
+struct frag2_result
+{
+ template<typename, typename>
+ struct result
+ {
+ typedef Frag type;
+ };
+};
+
+struct frag3_result
+{
+ template<typename, typename, typename>
+ struct result
+ {
+ typedef Frag type;
+ };
+};
+
+// Fragment for '.': a single Any state whose out pointer is the
+// fragment's only dangling exit.
+struct any_char_impl : frag1_result
+{
+ Frag operator()(REImpl &impl) const
+ {
+ State *const node = impl.state(Any, 0);
+ return Frag(node, list1(&node->out));
+ }
+};
+
+// Fragment for a literal character: a single Char state carrying ch,
+// whose out pointer is the fragment's only dangling exit.
+struct single_char_impl : frag2_result
+{
+ Frag operator()(REImpl &impl, char ch) const
+ {
+ State *const node = impl.state(Char, ch);
+ return Frag(node, list1(&node->out));
+ }
+};
+
+// Wraps fragment f in LParen/RParen states for capture group n.
+// Groups numbered NSUB or higher are not recorded: f is returned as is.
+struct paren_impl : frag3_result
+{
+ Frag operator()(REImpl &impl, Frag f, int n) const
+ {
+ if(n < NSUB)
+ {
+ State *const open = impl.state(LParen, n, f.start, 0);
+ State *const close = impl.state(RParen, n, 0, 0);
+ patch(f.out, close);
+ f = Frag(open, list1(&close->out));
+ }
+ return f;
+ }
+};
+
+// f*: a Split whose out enters f and whose out1 is left dangling as the
+// exit; f's exits loop back to the Split, which is the fragment's start.
+struct greedy_star_impl : frag2_result
+{
+ Frag operator()(REImpl &impl, Frag f) const
+ {
+ State *const fork = impl.state(Split, 0, f.start, 0);
+ patch(f.out, fork);
+ return Frag(fork, list1(&fork->out1));
+ }
+};
+
+// f*?: like greedy_star_impl with the Split's branches swapped, so out
+// is the dangling exit and out1 enters f.
+struct non_greedy_star_impl : frag2_result
+{
+ Frag operator()(REImpl &impl, Frag f) const
+ {
+ State *const fork = impl.state(Split, 0, 0, f.start);
+ patch(f.out, fork);
+ return Frag(fork, list1(&fork->out));
+ }
+};
+
+// f+: f's exits lead to a Split whose out re-enters f and whose out1 is
+// the dangling exit; the fragment starts at f itself.
+struct greedy_plus_impl : frag2_result
+{
+ Frag operator()(REImpl &impl, Frag f) const
+ {
+ State *const fork = impl.state(Split, 0, f.start, 0);
+ patch(f.out, fork);
+ return Frag(f.start, list1(&fork->out1));
+ }
+};
+
+// f+?: like greedy_plus_impl with the Split's branches swapped, so out
+// is the dangling exit and out1 re-enters f.
+struct non_greedy_plus_impl : frag2_result
+{
+ Frag operator()(REImpl &impl, Frag f) const
+ {
+ State *const fork = impl.state(Split, 0, 0, f.start);
+ patch(f.out, fork);
+ return Frag(f.start, list1(&fork->out));
+ }
+};
+
+// f?: a Split whose out enters f and whose out1 skips it; the dangling
+// exits are f's exits followed by the Split's out1.
+struct greedy_opt_impl : frag2_result
+{
+ Frag operator()(REImpl &impl, Frag f) const
+ {
+ State *const fork = impl.state(Split, 0, f.start, 0);
+ Ptrlist *const skip = list1(&fork->out1);
+ return Frag(fork, append(f.out, skip));
+ }
+};
+
+// f??: like greedy_opt_impl with the Split's branches swapped, so out
+// skips f and out1 enters it.
+struct non_greedy_opt_impl : frag2_result
+{
+ Frag operator()(REImpl &impl, Frag f) const
+ {
+ State *const fork = impl.state(Split, 0, 0, f.start);
+ Ptrlist *const skip = list1(&fork->out);
+ return Frag(fork, append(f.out, skip));
+ }
+};
+
+// f1 f2: connects every dangling exit of f1 to the start of f2; the
+// result starts where f1 starts and exits where f2 exits.
+struct do_concat_impl : frag2_result
+{
+ Frag operator()(Frag f1, Frag f2) const
+ {
+ patch(f1.out, f2.start);
+ Frag const joined(f1.start, f2.out);
+ return joined;
+ }
+};
+
+// f1|f2: a Split whose out enters f1 and whose out1 enters f2; the
+// dangling exits are those of f1 followed by those of f2.
+struct do_alt_impl : frag3_result
+{
+ Frag operator()(REImpl &impl, Frag f1, Frag f2) const
+ {
+ State *const fork = impl.state(Split, 0, f1.start, f2.start);
+ Ptrlist *const exits = append(f1.out, f2.out);
+ return Frag(fork, exits);
+ }
+};
+
+// Allocates the next capture-group number: increments impl.nparen and
+// returns the new value.
+struct next_paren_impl
+{
+ template<typename> struct result { typedef int type; };
+
+ int operator()(REImpl &impl) const
+ {
+ impl.nparen += 1;
+ return impl.nparen;
+ }
+};
+
+// Finishes compilation: wraps the whole expression as capture group 0,
+// routes its exits to a Match state and records the start state in impl.
+struct do_regex_impl
+{
+ template<typename, typename>
+ struct result
+ {
+ typedef REImpl type;
+ };
+
+ REImpl &operator()(REImpl &impl, Frag f) const
+ {
+ Frag const whole = paren_impl()(impl, f, 0);
+ State *const accept = impl.state(Match, 0, 0, 0);
+ patch(whole.out, accept);
+ impl.start = whole.start;
+ return impl;
+ }
+};
+
+phoenix::function<any_char_impl> const any_char = any_char_impl();
+phoenix::function<single_char_impl> const single_char = single_char_impl();
+phoenix::function<paren_impl> const paren = paren_impl();
+phoenix::function<greedy_star_impl> const greedy_star = greedy_star_impl();
+phoenix::function<non_greedy_star_impl> const non_greedy_star = non_greedy_star_impl();
+phoenix::function<greedy_plus_impl> const greedy_plus = greedy_plus_impl();
+phoenix::function<non_greedy_plus_impl> const non_greedy_plus = non_greedy_plus_impl();
+phoenix::function<greedy_opt_impl> const greedy_opt = greedy_opt_impl();
+phoenix::function<non_greedy_opt_impl> const non_greedy_opt = non_greedy_opt_impl();
+phoenix::function<do_concat_impl> const do_concat = do_concat_impl();
+phoenix::function<do_alt_impl> const do_alt = do_alt_impl();
+phoenix::function<do_regex_impl> const do_regex = do_regex_impl();
+phoenix::function<next_paren_impl> const next_paren = next_paren_impl();
+
+struct frag_closure
+ : boost::spirit::closure<frag_closure, Frag>
+{
+ member1 frag;
+};
+
+struct count_closure
+ : boost::spirit::closure<count_closure, int>
+{
+ member1 count;
+};
+
+struct single_closure
+ : boost::spirit::closure<single_closure, Frag, int>
+{
+ member1 frag;
+ member2 count;
+};
+
+struct regex_closure
+ : boost::spirit::closure<regex_closure, REImpl>
+{
+ member1 impl;
+};
+
+struct regex_grammar
+ : boost::spirit::grammar< regex_grammar, regex_closure::context_t >
+{
+ template<typename Scan>
+ struct definition
+ {
+ definition(regex_grammar const &self)
+ {
+ using namespace boost::spirit;
+ using namespace phoenix;
+ BOOST_SPIRIT_DEBUG_NODE(regex);
+ BOOST_SPIRIT_DEBUG_NODE(alt);
+ BOOST_SPIRIT_DEBUG_NODE(concat);
+ BOOST_SPIRIT_DEBUG_NODE(repeat);
+ BOOST_SPIRIT_DEBUG_NODE(single);
+
+ regex = alt[ self.impl = do_regex(self.impl, arg1) ];
+
+ alt = concat[ alt.frag = arg1 ]
+ >> *('|' >> concat[ alt.frag = do_alt(self.impl, alt.frag, arg1) ]);
+
+ concat = repeat[ concat.frag = arg1 ]
+ >> *(repeat[ concat.frag = do_concat(concat.frag, arg1) ]);
+
+ repeat = single[ repeat.frag = arg1 ]
+ >> !( (ch_p('*') >> '?')[ repeat.frag = non_greedy_star(self.impl, repeat.frag) ]
+ | (ch_p('+') >> '?')[ repeat.frag = non_greedy_plus(self.impl, repeat.frag) ]
+ | (ch_p('?') >> '?')[ repeat.frag = non_greedy_opt(self.impl, repeat.frag) ]
+ | (ch_p('*'))[ repeat.frag = greedy_star(self.impl, repeat.frag) ]
+ | (ch_p('+'))[ repeat.frag = greedy_plus(self.impl, repeat.frag) ]
+ | (ch_p('?'))[ repeat.frag = greedy_opt(self.impl, repeat.frag) ])
+ ;
+
+ count = eps_p[ count.count = next_paren(self.impl) ];
+
+ single = ch_p('(') >> '?' >> ':' >> alt[ single.frag = arg1 ] >> ')'
+ | (ch_p('(') >> count[ single.count = arg1 ] >> alt[ single.frag = arg1 ] >> ')')
+ [
+ single.frag = paren(self.impl, single.frag, single.count)
+ ]
+ | ch_p('.') [ single.frag = any_char(self.impl) ]
+ | (~chset_p("|*+?():."))[ single.frag = single_char(self.impl, arg1)]
+ ;
+
+ }
+
+ boost::spirit::rule<Scan> const &start() const
+ {
+ return regex;
+ }
+
+ private:
+ boost::spirit::rule<Scan> regex;
+ boost::spirit::rule<Scan, frag_closure::context_t> alt, concat, repeat;
+ boost::spirit::rule<Scan, count_closure::context_t> count;
+ boost::spirit::rule<Scan, single_closure::context_t> single;
+ };
+};
+
+//// Is match a longer than match b?
+//// If so, return 1; if not, 0.
+//int longer(Subs const &a, Subs const &b)
+//{
+// if(a[0].first == 0)
+// return 0;
+// if(b[0].first == 0 || a[0].first < b[0].first)
+// return 1;
+// if(a[0].first == b[0].first && a[0].second > b[0].second)
+// return 1;
+// return 0;
+//}
+
+// Runs the NFA held in a REImpl over an iterator range, keeping one thread
+// per reachable state together with the submatch positions that thread has
+// recorded so far.  Two thread lists (current / next) are swapped after each
+// input character.
+//
+// Only the leftmost-biased strategy is active; the leftmost-longest
+// alternative is retained below as commented-out code.
+template<typename Iter>
+struct Matcher
+{
+    // impl_ is stored by reference, so it must outlive *this.
+    // The state count is handed to both thread lists and to the per-state
+    // bookkeeping (presumably as their capacity -- confirm against List and
+    // Extras).
+    Matcher(REImpl const &impl_)
+        : impl(impl_), begin(), end(), subs(), dummy()
+        , l1(impl.states->size()), l2(impl.states->size())
+        , extras(impl.states->size())
+    {}
+
+    // Add s to l, following unlabeled arrows.
+    // icur is the position of the next character to read; it is the position
+    // recorded for every parenthesis state crossed on the way.
+    // m holds the submatches of the thread being extended.  It is modified
+    // while recursing through LParen/RParen states and restored afterwards.
+    void addstate(List<Iter> *l, State const *s, boost::array<Sub<Iter>, NSUB> &m, Iter icur)
+    {
+        if(s == 0)
+            return;
+
+        StateEx<Iter> &ss = extras.stateex[s->id];
+        if(ss.lastlist == extras.listid)
+        {
+            // s is already on the list being built: walk through it one
+            // more time at most, without overwriting the recorded thread.
+            if(++ss.visits > 2)
+                return;
+
+            //switch(matchtype)
+            //{
+            //case LeftmostBiased:
+            //    if(reptype == RepeatMinimal || ++ss.visits > 2)
+            //        return;
+            //    break;
+            //case LeftmostLongest:
+            //    if(!longer(m, ss.lastthread->match))
+            //        return;
+            //    break;
+            //}
+        }
+        else
+        {
+            // first time s is seen for this list: claim the next thread slot
+            ss.lastlist = extras.listid;
+            ss.lastthread = &l->t[l->n++];
+            ss.visits = 1;
+        }
+
+        if(ss.visits == 1)
+        {
+            ss.lastthread->state = s;
+            ss.lastthread->match = m;
+        }
+
+        switch(s->op)
+        {
+        case Split:
+            // follow unlabeled arrows
+            addstate(l, s->out, m, icur);
+            addstate(l, s->out1, m, icur);
+            break;
+
+        case LParen:
+            { // record left paren location and keep going
+                Sub<Iter> save = m[s->data];
+                m[s->data].first = icur;
+                m[s->data].matched = Incomplete;
+                addstate(l, s->out, m, icur);
+                // restore old information before returning.
+                m[s->data] = save;
+            } break;
+
+        case RParen:
+            { // record right paren location and keep going
+                Sub<Iter> save = m[s->data];
+                m[s->data].second = icur;
+                m[s->data].matched = Matched;
+                addstate(l, s->out, m, icur);
+                // restore old information before returning.
+                m[s->data] = save;
+            } break;
+
+        default:
+            break;
+        }
+    }
+
+    // Step the NFA from the states in clist
+    // past the character c,
+    // to create next NFA state set nlist.
+    // icur is the position just past c; it is what addstate records for
+    // parentheses reached through this step.
+    // Record best match so far in *this (member subs).
+    void
+    step(List<Iter> *clist, int c, Iter icur, List<Iter> *nlist)
+    {
+        if(debug)
+        {
+            dumplist(clist, impl.nparen);
+            std::printf("%c (%d)\n", c, c);
+        }
+
+        // a fresh list id makes every StateEx look "not yet on this list"
+        ++extras.listid;
+        nlist->n = 0;
+
+        for(int i=0; i<clist->n; ++i)
+        {
+            Thread<Iter> *t = &clist->t[i];
+
+            //if(matchtype == LeftmostLongest)
+            //{
+            //    // stop any threads that are worse than the
+            //    // leftmost longest found so far.  the threads
+            //    // will end up ordered on the list by start point,
+            //    // so if this one is too far right, all the rest are too.
+            //    if(subs[0].first && subs[0].first < t->match[0].first)
+            //    {
+            //        break;
+            //    }
+            //}
+
+            switch(t->state->op)
+            {
+            case Char:
+                if(c == t->state->data)
+                {
+                    addstate(nlist, t->state->out, t->match, icur);
+                }
+                break;
+
+            case Any:
+                addstate(nlist, t->state->out, t->match, icur);
+                break;
+
+            case Match:
+                //switch(matchtype)
+                //{
+                //case LeftmostBiased:
+                    // best so far ...
+                    subs = t->match;
+                    // ... because we cut off the worse ones right now!
+                    return;
+                //case LeftmostLongest:
+                //    if(longer(t->match, subs))
+                //    {
+                //        subs = t->match;
+                //    }
+                //    break;
+                //default:
+                //    break;
+                //}
+                break;
+            default:
+                break;
+            }
+        }
+
+        // start a new thread if no match yet
+        if(subs[0].matched == Unmatched)
+        {
+            addstate(nlist, impl.start, dummy, icur);
+        }
+    }
+
+    // Compute initial thread list: reset the recorded submatches, then step
+    // from an empty list so that only the start state's closure is added to l.
+    List<Iter> *startlist(Iter icur, List<Iter> *l)
+    {
+        List<Iter> empty(0);
+        std::fill(subs.begin(), subs.end(), Sub<Iter>());
+        step(&empty, 0, icur, l);
+        return l;
+    }
+
+    // Runs the NFA over [icur, iend).
+    // Returns true when submatch 0 ended up Matched; the submatches of the
+    // winning thread are then available in the member subs.
+    bool match(Iter icur, Iter iend)
+    {
+        begin = icur; end = iend;
+        List<Iter> *clist = startlist(icur, &l1);
+        List<Iter> *nlist = &l2;
+
+        // stop early once no thread is alive any more
+        for(; icur != end && clist->n > 0; ++icur)
+        {
+            int c = *icur & 0xFF;
+            step(clist, c, boost::next(icur), nlist);
+            std::swap(clist, nlist);
+        }
+
+        // one extra step with c == 0 so that threads sitting on a Match
+        // state after the last character are picked up
+        step(clist, 0, icur, nlist);
+        return subs[0].matched == Matched;
+    }
+
+    // Prints the first n submatches of m as "(first,second)" offsets relative
+    // to the start of the most recently matched range; "?" stands for a
+    // position that has not been recorded.
+    void printmatch(boost::array<Sub<Iter>, NSUB> const &m, int n) const
+    {
+        for(int i=0; i<n; ++i)
+        {
+            // std::distance yields the iterator's difference_type, which is
+            // not int in general; convert explicitly to match "%d".
+            if(m[i].matched == Matched)
+                std::printf("(%d,%d)",
+                    static_cast<int>(std::distance(begin, m[i].first)),
+                    static_cast<int>(std::distance(begin, m[i].second)));
+            else if(m[i].matched == Incomplete)
+                std::printf("(%d,?)",
+                    static_cast<int>(std::distance(begin, m[i].first)));
+            else
+                std::printf("(?,?)");
+        }
+    }
+
+    // Debug dump of l: one line per thread whose state is Char, Any or Match,
+    // showing the state id and the thread's first nparen+1 submatches.
+    void dumplist(List<Iter> const *l, int nparen) const
+    {
+        for(int i=0; i<l->n; ++i)
+        {
+            Thread<Iter> const *t = &l->t[i];
+            if(t->state->op != Char && t->state->op != Any && t->state->op != Match)
+            {
+                continue;
+            }
+            std::printf(" ");
+            std::printf("%d ", t->state->id);
+            printmatch(t->match, nparen+1);
+            std::printf("\n");
+        }
+    }
+
+    REImpl const &impl;                         // compiled regexp being run
+    Iter begin, end;                            // range given to the last match() call
+    boost::array<Sub<Iter>, NSUB> subs, dummy;  // best match so far / scratch for new threads
+    List<Iter> l1, l2;                          // the two thread lists swapped by match()
+    Extras<Iter> extras;                        // per-state bookkeeping and current list id
+};
+
+
+
+// Command-line driver.
+//
+//   usage: prog [-d]... regexp string...
+//
+// Each -d increments the global debug level.  The regexp is compiled once and
+// then run against every remaining argument; for each string that matches,
+// the string and its submatch offsets are printed on one line.
+// Returns 1 on a usage error or an unparsable regexp, 0 otherwise.
+int main(int argc, char *argv[])
+{
+    //_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
+
+    for(;;)
+    {
+        if(argc > 1 && strcmp(argv[1], "-d") == 0)
+        {
+            debug++;
+            // drop the option while keeping the program name in argv[0]
+            argv[1] = argv[0]; --argc; ++argv;
+        }
+        //if(argc > 1 && strcmp(argv[1], "-l") == 0)
+        //{
+        //    matchtype = LeftmostLongest;
+        //    argv[1] = argv[0]; argc--; argv++;
+        //}
+        //else if(argc > 1 && strcmp(argv[1], "-p") == 0)
+        //{
+        //    reptype = RepeatLikePerl;
+        //    argv[1] = argv[0]; argc--; argv++;
+        //}
+        else
+        {
+            break;
+        }
+    }
+
+    if(argc < 3)
+    {
+        std::fprintf(stderr, "usage: %s regexp string...\n", argv[0]);
+        return 1;
+    }
+
+    REImpl impl;
+    boost::spirit::parse_info<char const *> const info =
+        boost::spirit::parse(argv[1], regex_grammar()(impl)[ phoenix::var(impl) = phoenix::arg1 ]);
+    if(!info.full)
+    {
+        // Either the grammar rejected the regexp or it stopped before the
+        // end of the argument.  Do not build a Matcher from whatever impl
+        // holds at this point.
+        std::fprintf(stderr, "%s: bad regexp '%s' (parsing stopped at '%s')\n",
+            argv[0], argv[1], info.stop);
+        return 1;
+    }
+    if(debug)
+    {
+        impl.dump();
+    }
+
+    Matcher<std::string::const_iterator> m(impl);
+    for(int i=2; i<argc; ++i)
+    {
+        // str must stay alive while m.subs is printed: the submatches are
+        // iterators into it
+        std::string const str = argv[i];
+        if(m.match(str.begin(), str.end()))
+        {
+            std::printf("%s: ", argv[i]);
+            m.printmatch(m.subs, impl.nparen + 1);
+            std::printf("\n");
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Distributed under the Boost Software License, Version 1.0. (See
+ * accompanying file LICENSE_1_0.txt or copy at
+ * http://www.boost.org/LICENSE_1_0.txt)
+ *
+ */
BIN  toy_attract.pair.zip
Binary file not shown
BIN  unicode.tar.bz2
Binary file not shown
BIN  utf8.zip
Binary file not shown
BIN  xpressive.zip
Binary file not shown
BIN  xpressive_multicapture_support.zip
Binary file not shown
Please sign in to comment.
Something went wrong with that request. Please try again.