From 495895b580d3053113e19b18358c506b50044af0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tade=C3=A1=C5=A1=20Ku=C4=8Dera?= Date: Mon, 17 Feb 2020 16:06:25 +0100 Subject: [PATCH 1/3] Fix handle of nested escaped regexp classes --- src/parser/parser_driver.cpp | 1 + tests/cpp/parser_tests.cpp | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/src/parser/parser_driver.cpp b/src/parser/parser_driver.cpp index fe6bcdbe..82a44c4c 100644 --- a/src/parser/parser_driver.cpp +++ b/src/parser/parser_driver.cpp @@ -391,6 +391,7 @@ void ParserDriver::defineTokens() _parser.token(R"(\\D)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\D"; return {};}); _parser.token(R"(\\b)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\b"; return {};}); _parser.token(R"(\\B)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\B"; return {};}); + _parser.token(R"(\\])").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\]"; return {};}); _parser.token(R"([^]])").states("$regexp_class").action([&](std::string_view str) -> Value { _regexpClass += std::string{str}[0]; return {}; }); // $regexp end diff --git a/tests/cpp/parser_tests.cpp b/tests/cpp/parser_tests.cpp index 8f7c3d6a..e4a959b7 100644 --- a/tests/cpp/parser_tests.cpp +++ b/tests/cpp/parser_tests.cpp @@ -1110,6 +1110,37 @@ rule regexp_with_custom_negative_class EXPECT_EQ(input_text, driver.getParsedFile().getTextFormatted()); } +TEST_F(ParserTests, +RegexpWithEscapedSquareBracketsInsideClassWorks) { + prepareInput( +R"( +rule regexp_with_square_brackets_inside_class +{ + strings: + $1 = /[\[\]++]/ + condition: + $1 +} +)"); + + EXPECT_TRUE(driver.parse(input)); + ASSERT_EQ(1u, driver.getParsedFile().getRules().size()); + + const auto& rule = driver.getParsedFile().getRules()[0]; + EXPECT_EQ("regexp_with_square_brackets_inside_class", rule->getName()); + 
EXPECT_EQ(Rule::Modifier::None, rule->getModifier()); + + auto strings = rule->getStrings(); + ASSERT_EQ(1u, strings.size()); + + auto regexp = strings[0]; + EXPECT_TRUE(regexp->isRegexp()); + EXPECT_EQ("$1", regexp->getIdentifier()); + EXPECT_EQ(R"(/[\[\]++]/)", regexp->getText()); + + EXPECT_EQ(input_text, driver.getParsedFile().getTextFormatted()); +} + TEST_F(ParserTests, RegexpWithIterationWorks) { prepareInput( From e5f348092cc5840a7914b177560722fabcd05b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tade=C3=A1=C5=A1=20Ku=C4=8Dera?= Date: Mon, 17 Feb 2020 17:33:12 +0100 Subject: [PATCH 2/3] Fix handle of nested non-escaped classes --- include/yaramod/parser/parser_driver.h | 2 + include/yaramod/parser/value.h | 9 ++++- src/parser/parser_driver.cpp | 37 ++++++++++++++---- tests/cpp/parser_tests.cpp | 53 +++++++++++++++++++++++++- 4 files changed, 91 insertions(+), 10 deletions(-) diff --git a/include/yaramod/parser/parser_driver.h b/include/yaramod/parser/parser_driver.h index 5b57b34a..03dad9e8 100644 --- a/include/yaramod/parser/parser_driver.h +++ b/include/yaramod/parser/parser_driver.h @@ -180,7 +180,9 @@ class ParserDriver std::string _indent; ///< Variable storing current indentation std::string _comment; ///< For incremental construction of parsed comments std::string _regexpClass; ///< Currently processed regular expression class. 
+ size_t _regexpClassDepth = 0; ///< The rectangular brackets depth of nesting pog::Parser _parser; ///< used pog parser + bool _sectionStrings = false; ///< flag used to determine if we parse section after 'strings:' bool _escapedContent = false; ///< flag used to determine if a currently parsed literal contains hexadecimal byte (such byte must be unescaped in getPureText()) diff --git a/include/yaramod/parser/value.h b/include/yaramod/parser/value.h index 66b08e3d..0113e7a5 100644 --- a/include/yaramod/parser/value.h +++ b/include/yaramod/parser/value.h @@ -21,6 +21,7 @@ namespace yaramod { using RegexpRangePair = std::pair, std::optional>; using StringModifiers = std::vector>; +using RegexpClassRecord = std::pair; /** * Value is the type of all tokens produced by POG parser. Both token and rule actions return Value. The rule action parameters are also Values. @@ -49,7 +50,8 @@ class Value std::shared_ptr, std::vector>, //18 TokenIt, - RegexpRangePair //20 + RegexpRangePair, //20 + RegexpClassRecord >; /// @name Constructors @@ -168,6 +170,11 @@ class Value { return std::move(moveValue()); } + + RegexpClassRecord&& getRegexpClassRecord() + { + return std::move(moveValue()); + } /// @} protected: diff --git a/src/parser/parser_driver.cpp b/src/parser/parser_driver.cpp index 82a44c4c..ed054e10 100644 --- a/src/parser/parser_driver.cpp +++ b/src/parser/parser_driver.cpp @@ -365,23 +365,37 @@ void ParserDriver::defineTokens() return std::string{str}; }); _parser.token(R"(\[\^\])").states("$regexp").enter_state("$regexp_class").action([&](std::string_view) -> Value { + _regexpClassDepth = 1; _regexpClass = "^]"; return {}; }); _parser.token(R"(\[\])").states("$regexp").enter_state("$regexp_class").action([&](std::string_view) -> Value { + _regexpClassDepth = 1; _regexpClass = "]"; return {}; }); _parser.token(R"(\[\^)").states("$regexp").enter_state("$regexp_class").action([&](std::string_view) -> Value { + _regexpClassDepth = 1; _regexpClass = "^"; return {}; } ); 
_parser.token(R"(\[)").states("$regexp").enter_state("$regexp_class").action([&](std::string_view) -> Value { + _regexpClassDepth = 1; _regexpClass.clear(); return {}; } ); - _parser.token(R"(\])").states("$regexp_class").symbol("REGEXP_CLASS").description("regexp class").enter_state("$regexp").action([&](std::string_view) -> Value { - return _regexpClass; + _parser.token(R"(\])").states("$regexp_class").symbol("REGEXP_CLASS").description("regexp class").action([&](std::string_view) -> Value { + --_regexpClassDepth; + if(_regexpClassDepth == 0) + { + enter_state("$regexp"); + return std::make_pair(true, _regexpClass); + } + else + { + _regexpClass += "]"; + return std::make_pair(false, ""); + } }); _parser.token(R"(\\w)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\w"; return {};}); _parser.token(R"(\\W)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\W"; return {};}); @@ -391,8 +405,14 @@ void ParserDriver::defineTokens() _parser.token(R"(\\D)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\D"; return {};}); _parser.token(R"(\\b)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\b"; return {};}); _parser.token(R"(\\B)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\B"; return {};}); - _parser.token(R"(\\])").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\]"; return {};}); - _parser.token(R"([^]])").states("$regexp_class").action([&](std::string_view str) -> Value { _regexpClass += std::string{str}[0]; return {}; }); + _parser.token(R"(\\\])").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\]"; --_regexpClassDepth; return {};}); + _parser.token(R"(\\\[)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\["; ++_regexpClassDepth; return {};}); + 
_parser.token(R"(\[)").states("$regexp_class").action([&](std::string_view) -> Value { + ++_regexpClassDepth; + _regexpClass += "["; + return {}; + }); + _parser.token(R"([^\]\[])").states("$regexp_class").action([&](std::string_view str) -> Value { _regexpClass += std::string{str}[0]; return {}; }); // $regexp end _parser.end_token().states("@default", "$str", "$include", "$hexstr", "hexstr_jump", "$regexp", "$regexp_class").action([&](std::string_view) -> Value { @@ -871,9 +891,12 @@ void ParserDriver::defineGrammar() .production("REGEXP_NON_SPACE", [](auto&&) -> Value { return Value(std::make_shared()); }) .production("REGEXP_DIGIT", [](auto&&) -> Value { return Value(std::make_shared()); }) .production("REGEXP_NON_DIGIT", [](auto&&) -> Value { return Value(std::make_shared()); }) - .production("REGEXP_CLASS", [](auto&& args) -> Value { - std::string c = std::move(args[0].getString()); - if (c[0] == '^') + .production("REGEXP_CLASS", [&](auto&& args) -> Value { + auto record = std::move(args[0].getRegexpClassRecord()); + if (!record.first) + return Value(std::make_shared("")); + auto c = record.second; + if (!c.empty() && c[0] == '^') return std::make_shared(c.substr(1, c.length() - 1), true); else return std::make_shared(std::move(c), false); diff --git a/tests/cpp/parser_tests.cpp b/tests/cpp/parser_tests.cpp index e4a959b7..1a8bd8b6 100644 --- a/tests/cpp/parser_tests.cpp +++ b/tests/cpp/parser_tests.cpp @@ -1114,7 +1114,7 @@ TEST_F(ParserTests, RegexpWithEscapedSquareBracketsInsideClassWorks) { prepareInput( R"( -rule regexp_with_square_brackets_inside_class +rule regexp_with_escaped_square_brackets_inside_class { strings: $1 = /[\[\]++]/ @@ -1127,7 +1127,7 @@ rule regexp_with_square_brackets_inside_class ASSERT_EQ(1u, driver.getParsedFile().getRules().size()); const auto& rule = driver.getParsedFile().getRules()[0]; - EXPECT_EQ("regexp_with_square_brackets_inside_class", rule->getName()); + EXPECT_EQ("regexp_with_escaped_square_brackets_inside_class", 
rule->getName()); EXPECT_EQ(Rule::Modifier::None, rule->getModifier()); auto strings = rule->getStrings(); @@ -1141,6 +1141,55 @@ rule regexp_with_square_brackets_inside_class EXPECT_EQ(input_text, driver.getParsedFile().getTextFormatted()); } +TEST_F(ParserTests, +RegexpWithUnescapedSquareBracketsInsideClassWorks) { + prepareInput( +R"( +rule regexp_with_unescaped_square_brackets_inside_class +{ + strings: + $1 = /[[d][]***[abc]**][[]**]/ + $2 = /[ !#()[\]{}*][ !#[\]+_]/ + $3 = /[[\]*+]/ + $4 = /[\[\]*+]/ + condition: + all of them +} +)"); + + EXPECT_TRUE(driver.parse(input)); + ASSERT_EQ(1u, driver.getParsedFile().getRules().size()); + + const auto& rule = driver.getParsedFile().getRules()[0]; + EXPECT_EQ("regexp_with_unescaped_square_brackets_inside_class", rule->getName()); + EXPECT_EQ(Rule::Modifier::None, rule->getModifier()); + + auto strings = rule->getStrings(); + ASSERT_EQ(4u, strings.size()); + + auto regexp1 = strings[0]; + EXPECT_TRUE(regexp1->isRegexp()); + EXPECT_EQ("$1", regexp1->getIdentifier()); + EXPECT_EQ(R"(/[[d][]***[abc]**][[]**]/)", regexp1->getText()); + + auto regexp2 = strings[1]; + EXPECT_TRUE(regexp2->isRegexp()); + EXPECT_EQ("$2", regexp2->getIdentifier()); + EXPECT_EQ(R"(/[ !#()[\]{}*][ !#[\]+_]/)", regexp2->getText()); + + auto regexp3 = strings[2]; + EXPECT_TRUE(regexp3->isRegexp()); + EXPECT_EQ("$3", regexp3->getIdentifier()); + EXPECT_EQ(R"(/[[\]*+]/)", regexp3->getText()); + + auto regexp = strings[3]; + EXPECT_TRUE(regexp->isRegexp()); + EXPECT_EQ("$4", regexp->getIdentifier()); + EXPECT_EQ(R"(/[\[\]*+]/)", regexp->getText()); + + EXPECT_EQ(input_text, driver.getParsedFile().getTextFormatted()); +} + TEST_F(ParserTests, RegexpWithIterationWorks) { prepareInput( From 0c380b880d305b5e9d04300a2fce4ab1bbba6659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tade=C3=A1=C5=A1=20Ku=C4=8Dera?= Date: Mon, 24 Feb 2020 09:58:59 +0100 Subject: [PATCH 3/3] Fix small code issues --- include/yaramod/parser/parser_driver.h | 2 +- 
src/parser/parser_driver.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/yaramod/parser/parser_driver.h b/include/yaramod/parser/parser_driver.h index 03dad9e8..92d54c14 100644 --- a/include/yaramod/parser/parser_driver.h +++ b/include/yaramod/parser/parser_driver.h @@ -180,7 +180,7 @@ class ParserDriver std::string _indent; ///< Variable storing current indentation std::string _comment; ///< For incremental construction of parsed comments std::string _regexpClass; ///< Currently processed regular expression class. - size_t _regexpClassDepth = 0; ///< The rectangular brackets depth of nesting + std::size_t _regexpClassDepth = 0; ///< Nesting depth of square brackets inside the current regexp class pog::Parser _parser; ///< used pog parser bool _sectionStrings = false; ///< flag used to determine if we parse section after 'strings:' diff --git a/src/parser/parser_driver.cpp b/src/parser/parser_driver.cpp index ed054e10..fbb38019 100644 --- a/src/parser/parser_driver.cpp +++ b/src/parser/parser_driver.cpp @@ -394,7 +394,7 @@ void ParserDriver::defineTokens() else { _regexpClass += "]"; - return std::make_pair(false, ""); + return std::make_pair(false, std::string{}); } }); _parser.token(R"(\\w)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\w"; return {};}); @@ -894,7 +894,7 @@ void ParserDriver::defineGrammar() .production("REGEXP_CLASS", [&](auto&& args) -> Value { auto record = std::move(args[0].getRegexpClassRecord()); if (!record.first) - return Value(std::make_shared("")); + return Value(std::make_shared(std::string{})); auto c = record.second; if (!c.empty() && c[0] == '^') return std::make_shared(c.substr(1, c.length() - 1), true);