Skip to content

Commit

Permalink
Implement parsing of nested regexp classes (issue #67) (#69)
Browse files Browse the repository at this point in the history
* Fix handling of nested escaped regexp classes

* Fix handling of nested non-escaped classes

* Fix small code issues
  • Loading branch information
TadeasKucera committed Feb 25, 2020
1 parent 4951968 commit 6866be6
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 7 deletions.
2 changes: 2 additions & 0 deletions include/yaramod/parser/parser_driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,9 @@ class ParserDriver
std::string _indent; ///< Variable storing current indentation
std::string _comment; ///< For incremental construction of parsed comments
std::string _regexpClass; ///< Currently processed regular expression class.
std::size_t _regexpClassDepth = 0; ///< Nesting depth of square brackets inside the regular expression class that is currently being processed
pog::Parser<Value> _parser; ///< used pog parser

bool _sectionStrings = false; ///< flag used to determine if we parse section after 'strings:'
bool _escapedContent = false; ///< flag used to determine if a currently parsed literal contains hexadecimal byte (such byte must be unescaped in getPureText())

Expand Down
9 changes: 8 additions & 1 deletion include/yaramod/parser/value.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ namespace yaramod {

using RegexpRangePair = std::pair<std::optional<std::uint64_t>, std::optional<std::uint64_t>>;
using StringModifiers = std::vector<std::shared_ptr<StringModifier>>;
/// Value produced by the REGEXP_CLASS token: `first` is true when the closing bracket finished the whole class, `second` then holds the accumulated class text (it is empty when `first` is false).
using RegexpClassRecord = std::pair<bool, std::string>;

/**
* Value is the type of all tokens produced by POG parser. Both token and rule actions return Value. The rule action parameters are also Values.
Expand Down Expand Up @@ -49,7 +50,8 @@ class Value
std::shared_ptr<RegexpUnit>,
std::vector<std::shared_ptr<RegexpUnit>>, //18
TokenIt,
RegexpRangePair //20
RegexpRangePair, //20
RegexpClassRecord
>;

/// @name Constructors
Expand Down Expand Up @@ -168,6 +170,11 @@ class Value
{
return std::move(moveValue<RegexpRangePair>());
}

/**
 * Hands out the stored RegexpClassRecord as an rvalue reference so that the caller can move it out of this Value.
 * NOTE(review): relies on moveValue<T>(), which is defined outside this view - presumably it requires the Value to currently hold a RegexpClassRecord; confirm.
 */
RegexpClassRecord&& getRegexpClassRecord()
{
return std::move(moveValue<RegexpClassRecord>());
}
/// @}

protected:
Expand Down
36 changes: 30 additions & 6 deletions src/parser/parser_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,23 +365,37 @@ void ParserDriver::defineTokens()
return std::string{str};
});
// Tokens that open a regexp class while in the "$regexp" state. Each of them is registered to enter
// the "$regexp_class" state, resets the bracket depth to 1 and seeds _regexpClass with the text
// that follows the opening bracket.

// "[^]" - the class text starts with '^' followed by a literal ']'.
_parser.token(R"(\[\^\])").states("$regexp").enter_state("$regexp_class").action([&](std::string_view) -> Value {
_regexpClassDepth = 1;
_regexpClass = "^]";
return {};
});
// "[]" - the class text starts with a literal ']'.
_parser.token(R"(\[\])").states("$regexp").enter_state("$regexp_class").action([&](std::string_view) -> Value {
_regexpClassDepth = 1;
_regexpClass = "]";
return {};
});
// "[^" - the class text starts with '^'.
_parser.token(R"(\[\^)").states("$regexp").enter_state("$regexp_class").action([&](std::string_view) -> Value {
_regexpClassDepth = 1;
_regexpClass = "^";
return {};
} );
// "[" - plain class, the class text starts out empty.
_parser.token(R"(\[)").states("$regexp").enter_state("$regexp_class").action([&](std::string_view) -> Value {
_regexpClassDepth = 1;
_regexpClass.clear();
return {};
} );
_parser.token(R"(\])").states("$regexp_class").symbol("REGEXP_CLASS").description("regexp class").enter_state("$regexp").action([&](std::string_view) -> Value {
return _regexpClass;
// An unescaped ']' inside a class closes one nesting level. When the outermost level is closed the
// tokenizer returns to the "$regexp" state and the token carries {true, whole class text};
// otherwise the bracket is appended to the class text and the token carries {false, ""}.
_parser.token(R"(\])").states("$regexp_class").symbol("REGEXP_CLASS").description("regexp class").action([&](std::string_view) -> Value {
	// The depth is unsigned and an escaped ']' may already have brought it down to zero
	// (e.g. "[\]]"). Decrementing zero would wrap around and the class would never be closed.
	if (_regexpClassDepth > 0)
		--_regexpClassDepth;

	if (_regexpClassDepth == 0)
	{
		enter_state("$regexp");
		return std::make_pair(true, _regexpClass);
	}

	_regexpClass += "]";
	return std::make_pair(false, std::string{});
});
// Escape sequences met inside a class are appended to the class text verbatim and produce no symbol of their own.
_parser.token(R"(\\w)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\w"; return {};});
_parser.token(R"(\\W)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\W"; return {};});
Expand All @@ -391,7 +405,14 @@ void ParserDriver::defineTokens()
// Escape sequences met inside a class are appended to the class text verbatim and produce no symbol of their own.
_parser.token(R"(\\D)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\D"; return {};});
_parser.token(R"(\\b)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\b"; return {};});
_parser.token(R"(\\B)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\B"; return {};});
_parser.token(R"([^]])").states("$regexp_class").action([&](std::string_view str) -> Value { _regexpClass += std::string{str}[0]; return {}; });
// Escaped brackets are kept verbatim in the class text and also take part in the depth counting,
// so an escaped ']' can balance an earlier '[' (as in "[[\]*+]").
_parser.token(R"(\\\])").states("$regexp_class").action([&](std::string_view) -> Value {
	_regexpClass += "\\]";
	// Only the unescaped "]" token leaves the "$regexp_class" state, so an escaped ']' must never
	// take the depth down to zero; going further would also wrap the unsigned counter around.
	if (_regexpClassDepth > 1)
		--_regexpClassDepth;
	return {};
});
// NOTE(review): an escaped '[' still opens a nesting level, so a class with an unbalanced "\[" (e.g. "[\[]") stays open - confirm whether that is intended.
_parser.token(R"(\\\[)").states("$regexp_class").action([&](std::string_view) -> Value { _regexpClass += "\\["; ++_regexpClassDepth; return {};});
// An unescaped '[' inside a class opens one more nesting level and is kept as part of the class text.
_parser.token(R"(\[)").states("$regexp_class").action([&](std::string_view) -> Value {
++_regexpClassDepth;
_regexpClass += "[";
return {};
});
// Any other single character (anything except '[' and ']') is appended to the class text unchanged.
_parser.token(R"([^\]\[])").states("$regexp_class").action([&](std::string_view str) -> Value { _regexpClass += std::string{str}[0]; return {}; });
// $regexp end

_parser.end_token().states("@default", "$str", "$include", "$hexstr", "hexstr_jump", "$regexp", "$regexp_class").action([&](std::string_view) -> Value {
Expand Down Expand Up @@ -870,9 +891,12 @@ void ParserDriver::defineGrammar()
.production("REGEXP_NON_SPACE", [](auto&&) -> Value { return Value(std::make_shared<RegexpNonSpace>()); })
.production("REGEXP_DIGIT", [](auto&&) -> Value { return Value(std::make_shared<RegexpDigit>()); })
.production("REGEXP_NON_DIGIT", [](auto&&) -> Value { return Value(std::make_shared<RegexpNonDigit>()); })
.production("REGEXP_CLASS", [](auto&& args) -> Value {
std::string c = std::move(args[0].getString());
if (c[0] == '^')
.production("REGEXP_CLASS", [&](auto&& args) -> Value {
auto record = std::move(args[0].getRegexpClassRecord());
if (!record.first)
return Value(std::make_shared<RegexpText>(std::string{}));
auto c = record.second;
if (!c.empty() && c[0] == '^')
return std::make_shared<RegexpClass>(c.substr(1, c.length() - 1), true);
else
return std::make_shared<RegexpClass>(std::move(c), false);
Expand Down
80 changes: 80 additions & 0 deletions tests/cpp/parser_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1110,6 +1110,86 @@ rule regexp_with_custom_negative_class
EXPECT_EQ(input_text, driver.getParsedFile().getTextFormatted());
}

TEST_F(ParserTests, RegexpWithEscapedSquareBracketsInsideClassWorks)
{
	// A class made of escaped square brackets and plus signs has to be parsed as a single
	// regexp string and printed back exactly as it was written.
	prepareInput(
R"(
rule regexp_with_escaped_square_brackets_inside_class
{
strings:
$1 = /[\[\]++]/
condition:
$1
}
)");

	EXPECT_TRUE(driver.parse(input));
	ASSERT_EQ(1u, driver.getParsedFile().getRules().size());

	// Rule header
	const auto& parsedRule = driver.getParsedFile().getRules()[0];
	EXPECT_EQ("regexp_with_escaped_square_brackets_inside_class", parsedRule->getName());
	EXPECT_EQ(Rule::Modifier::None, parsedRule->getModifier());

	// The strings section holds exactly the one regexp
	auto ruleStrings = parsedRule->getStrings();
	ASSERT_EQ(1u, ruleStrings.size());

	auto classRegexp = ruleStrings[0];
	EXPECT_TRUE(classRegexp->isRegexp());
	EXPECT_EQ("$1", classRegexp->getIdentifier());
	EXPECT_EQ(R"(/[\[\]++]/)", classRegexp->getText());

	// Round trip of the whole file
	EXPECT_EQ(input_text, driver.getParsedFile().getTextFormatted());
}

TEST_F(ParserTests, RegexpWithUnescapedSquareBracketsInsideClassWorks)
{
	// Classes that contain unescaped (and mixed escaped/unescaped) square brackets have to be
	// parsed as regexp strings and printed back exactly as they were written.
	prepareInput(
R"(
rule regexp_with_unescaped_square_brackets_inside_class
{
strings:
$1 = /[[d][]***[abc]**][[]**]/
$2 = /[ !#()[\]{}*][ !#[\]+_]/
$3 = /[[\]*+]/
$4 = /[\[\]*+]/
condition:
all of them
}
)");

	EXPECT_TRUE(driver.parse(input));
	ASSERT_EQ(1u, driver.getParsedFile().getRules().size());

	// Rule header
	const auto& parsedRule = driver.getParsedFile().getRules()[0];
	EXPECT_EQ("regexp_with_unescaped_square_brackets_inside_class", parsedRule->getName());
	EXPECT_EQ(Rule::Modifier::None, parsedRule->getModifier());

	// All four regexps must be present, in declaration order
	auto ruleStrings = parsedRule->getStrings();
	ASSERT_EQ(4u, ruleStrings.size());

	// $1: nested unescaped classes
	auto classRegexp1 = ruleStrings[0];
	EXPECT_TRUE(classRegexp1->isRegexp());
	EXPECT_EQ("$1", classRegexp1->getIdentifier());
	EXPECT_EQ(R"(/[[d][]***[abc]**][[]**]/)", classRegexp1->getText());

	// $2: unescaped '[' balanced by an escaped ']', two classes in a row
	auto classRegexp2 = ruleStrings[1];
	EXPECT_TRUE(classRegexp2->isRegexp());
	EXPECT_EQ("$2", classRegexp2->getIdentifier());
	EXPECT_EQ(R"(/[ !#()[\]{}*][ !#[\]+_]/)", classRegexp2->getText());

	// $3: unescaped '[' followed by an escaped ']'
	auto classRegexp3 = ruleStrings[2];
	EXPECT_TRUE(classRegexp3->isRegexp());
	EXPECT_EQ("$3", classRegexp3->getIdentifier());
	EXPECT_EQ(R"(/[[\]*+]/)", classRegexp3->getText());

	// $4: both brackets escaped
	auto classRegexp4 = ruleStrings[3];
	EXPECT_TRUE(classRegexp4->isRegexp());
	EXPECT_EQ("$4", classRegexp4->getIdentifier());
	EXPECT_EQ(R"(/[\[\]*+]/)", classRegexp4->getText());

	// Round trip of the whole file
	EXPECT_EQ(input_text, driver.getParsedFile().getTextFormatted());
}

TEST_F(ParserTests,
RegexpWithIterationWorks) {
prepareInput(
Expand Down

0 comments on commit 6866be6

Please sign in to comment.