Skip to content

Commit

Permalink
Correct parsing of literal numbers and booleans without quotes (#543)
Browse files Browse the repository at this point in the history
For example, 2.70 and true (without quotes) are now parsed as "2.70"^^xsd:decimal and "true"^^xsd:boolean, as they should be.
  • Loading branch information
joka921 committed Jan 20, 2022
1 parent 6879a21 commit b16a83c
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 29 deletions.
2 changes: 2 additions & 0 deletions src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ static const char XSD_FLOAT_TYPE[] = "http://www.w3.org/2001/XMLSchema#float";
static const char XSD_DOUBLE_TYPE[] = "http://www.w3.org/2001/XMLSchema#double";
static const char XSD_DECIMAL_TYPE[] =
"http://www.w3.org/2001/XMLSchema#decimal";
// IRI of the XML Schema `boolean` datatype.
static const char XSD_BOOLEAN_TYPE[] =
"http://www.w3.org/2001/XMLSchema#boolean";
static const char VALUE_DATE_TIME_SEPARATOR[] = "T";
static const int DEFAULT_NOF_VALUE_INTEGER_DIGITS = 50;
static const int DEFAULT_NOF_VALUE_EXPONENT_DIGITS = 20;
Expand Down
3 changes: 3 additions & 0 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1312,6 +1312,9 @@ LangtagAndTriple Index::tripleToInternalRepresentation(Triple&& tripleIn) {
if (ad_utility::isXsdValue(spo[2])) {
spo[2] = ad_utility::convertValueLiteralToIndexWord(spo[2]);
upperBound = 2;
} else if (ad_utility::isNumeric(spo[2])) {
spo[2] = ad_utility::convertNumericToIndexWord(spo[2]);
upperBound = 2;
} else if (isLiteral(spo[2])) {
res._langtag = decltype(_vocab)::getLanguage(spo[2]);
}
Expand Down
12 changes: 9 additions & 3 deletions src/parser/TurtleParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ bool TurtleParser<T>::collection() {
// ______________________________________________________________________
// Succeeds if one of the three numeric rules matches. `||` short-circuits, so
// the rules are tried from left to right and the first one that succeeds wins.
template <class T>
bool TurtleParser<T>::numericLiteral() {
return integer() || decimal() || doubleParse();
// NOTE(review): presumably the double rule is tried first so that a shorter
// rule cannot match just a prefix of an input such as `2.70` -- confirm
// against the tokenizer.
return doubleParse() || decimal() || integer();
}

// ______________________________________________________________________
Expand Down Expand Up @@ -263,8 +263,14 @@ bool TurtleParser<T>::rdfLiteral() {
// ______________________________________________________________________
// Tries to parse a boolean keyword via the `True` and `False` terminals and
// returns whether one of them matched.
template <class T>
bool TurtleParser<T>::booleanLiteral() {
return parseTerminal<TurtleTokenId::True>() ||
parseTerminal<TurtleTokenId::False>();
if (parseTerminal<TurtleTokenId::True>() ||
parseTerminal<TurtleTokenId::False>()) {
// Rewrite the result into the typed-literal form
// `"<previous result>"^^<XSD_BOOLEAN_TYPE>`.
// NOTE(review): assumes parseTerminal stored the matched keyword in
// _lastParseResult -- confirm.
_lastParseResult =
'"' + _lastParseResult + "\"^^<" + XSD_BOOLEAN_TYPE + '>';
return true;
} else {
return false;
}
}

// ______________________________________________________________________
Expand Down
2 changes: 2 additions & 0 deletions src/parser/TurtleParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ class TurtleParser {
FRIEND_TEST(TurtleParserTest, object);
FRIEND_TEST(TurtleParserTest, blankNode);
FRIEND_TEST(TurtleParserTest, blankNodePropertyList);
FRIEND_TEST(TurtleParserTest, numericLiteral);
FRIEND_TEST(TurtleParserTest, booleanLiteral);
};

/**
Expand Down
20 changes: 11 additions & 9 deletions src/util/Conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <vector>

#include "../global/Constants.h"
#include "../parser/TokenizerCtre.h"
#include "./Exception.h"
#include "./StringUtils.h"

Expand Down Expand Up @@ -641,18 +642,19 @@ bool isXsdValue(const string& val) {
}

// _____________________________________________________________________________
bool isNumeric(const string& val) {
if (val.empty()) {
return false;
bool isNumeric(const string& value) {
// A string matching the `Double` token is not reported as "not numeric";
// instead an exception is thrown so that the unsupported input is noticed.
if (ctre::match<TurtleTokenCtre::Double>(value)) {
throw std::out_of_range{
"Decimal numbers with an explicit exponent are currently not supported "
"by QLever, but the following number was encountered: " +
value};
}
size_t start = (val[0] == '-' || val[0] == '+') ? 1 : 0;
size_t posNonDigit = val.find_first_not_of("0123456789", start);
if (posNonDigit == string::npos) {

// A string matching the `Integer` token counts as numeric.
if (ctre::match<TurtleTokenCtre::Integer>(value)) {
return true;
}
if (val[posNonDigit] == '.') {
return posNonDigit + 1 < val.size() &&
val.find_first_not_of("0123456789", posNonDigit + 1) == string::npos;
// So does a string matching the `Decimal` token.
if (ctre::match<TurtleTokenCtre::Decimal>(value)) {
return true;
}
return false;
}
Expand Down
76 changes: 59 additions & 17 deletions test/TurtleParserTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <string>

#include "../src/parser/TurtleParser.h"
#include "../src/util/Conversions.h"

using std::string;
TEST(TurtleParserTest, prefixedName) {
Expand Down Expand Up @@ -238,36 +239,77 @@ TEST(TurtleParserTest, object) {
}

// objectList() must turn a comma-separated list of objects into one triple per
// object, each combined with the active subject and predicate, and must
// consume the complete input.
TEST(TurtleParserTest, objectList) {
TurtleStringParser<Tokenizer> p;
p._activeSubject = "<s>";
p._activePredicate = "<p>";
TurtleStringParser<Tokenizer> parser;
parser._activeSubject = "<s>";
parser._activePredicate = "<p>";
string objectL = " <ob1>, <ob2>, <ob3>";
std::vector<std::array<string, 3>> exp;
exp.push_back({"<s>", "<p>", "<ob1>"});
exp.push_back({"<s>", "<p>", "<ob2>"});
exp.push_back({"<s>", "<p>", "<ob3>"});
p.setInputStream(objectL);
ASSERT_TRUE(p.objectList());
ASSERT_EQ(p._triples, exp);
ASSERT_EQ(p.getPosition(), objectL.size());
parser.setInputStream(objectL);
ASSERT_TRUE(parser.objectList());
ASSERT_EQ(parser._triples, exp);
ASSERT_EQ(parser.getPosition(), objectL.size());

// Input that does not begin with an object is reported as "no match".
p.setInputStream("@noObject");
ASSERT_FALSE(p.objectList());
parser.setInputStream("@noObject");
ASSERT_FALSE(parser.objectList());

// A valid object followed by a comma and an ill-formed continuation must
// raise a ParseException instead of returning false.
p.setInputStream("<obj1>, @illFormed");
ASSERT_THROW(p.objectList(), TurtleParser<Tokenizer>::ParseException);
parser.setInputStream("<obj1>, @illFormed");
ASSERT_THROW(parser.objectList(), TurtleParser<Tokenizer>::ParseException);
}

// predicateObjectList() must handle several `;`-separated predicates, each
// with its own `,`-separated object list, and emit one triple per object with
// the active subject.
TEST(TurtleParserTest, predicateObjectList) {
TurtleStringParser<Tokenizer> p;
p._activeSubject = "<s>";
TurtleStringParser<Tokenizer> parser;
parser._activeSubject = "<s>";
// The input deliberately contains newlines and irregular spacing.
string predL = "\n <p1> <ob1>;<p2> \"ob2\",\n <ob3>";
std::vector<std::array<string, 3>> exp;
exp.push_back({"<s>", "<p1>", "<ob1>"});
exp.push_back({"<s>", "<p2>", "\"ob2\""});
exp.push_back({"<s>", "<p2>", "<ob3>"});
p.setInputStream(predL);
ASSERT_TRUE(p.predicateObjectList());
ASSERT_EQ(p._triples, exp);
ASSERT_EQ(p.getPosition(), predL.size());
parser.setInputStream(predL);
ASSERT_TRUE(parser.predicateObjectList());
ASSERT_EQ(parser._triples, exp);
ASSERT_EQ(parser.getPosition(), predL.size());
}

// Checks the parser's numericLiteral() rule together with the numeric
// conversion utilities from ad_utility.
TEST(TurtleParserTest, numericLiteral) {
  TurtleStringParser<Tokenizer> parser;

  // Integers and decimals: the parser must hand back the input unchanged, the
  // input must be classified as numeric, and converting it to an index word
  // and back must yield the same value as std::strtod.
  const std::vector<std::string> supported{"2", "-2", "42.209", "-42.239",
                                           ".74"};
  for (const std::string& number : supported) {
    parser.setInputStream(number);
    ASSERT_TRUE(parser.numericLiteral());
    ASSERT_EQ(parser._lastParseResult, number);
    LOG(INFO) << number << std::endl;
    ASSERT_TRUE(ad_utility::isNumeric(number));
    const auto roundTripped = ad_utility::convertIndexWordToFloat(
        ad_utility::convertNumericToIndexWord(number));
    const double expected = std::strtod(number.c_str(), nullptr);
    ASSERT_FLOAT_EQ(roundTripped, expected);
  }

  // Numbers with an explicit exponent: the parser still accepts them, but
  // isNumeric() must throw std::out_of_range.
  const std::vector<std::string> withExponent{"2.3e12", "2.34e-14", "-0.3e2"};
  for (const std::string& number : withExponent) {
    parser.setInputStream(number);
    ASSERT_TRUE(parser.numericLiteral());
    ASSERT_EQ(parser._lastParseResult, number);
    ASSERT_THROW(ad_utility::isNumeric(number), std::out_of_range);
  }
}

// Checks that the bare keywords `true` and `false` are parsed into quoted
// literals typed as xsd:boolean, and that other words are rejected.
TEST(TurtleParserTest, booleanLiteral) {
  // Pairs of (input keyword, expected parse result).
  const std::vector<std::pair<std::string, std::string>> inputAndExpected{
      {"true", "\"true\"^^<http://www.w3.org/2001/XMLSchema#boolean>"},
      {"false", "\"false\"^^<http://www.w3.org/2001/XMLSchema#boolean>"}};

  TurtleStringParser<Tokenizer> parser;
  for (const auto& [input, expected] : inputAndExpected) {
    parser.setInputStream(input);
    ASSERT_TRUE(parser.booleanLiteral());
    ASSERT_EQ(expected, parser._lastParseResult);
  }

  // Anything that is not a boolean keyword must not match.
  parser.setInputStream("maybe");
  ASSERT_FALSE(parser.booleanLiteral());
}

0 comments on commit b16a83c

Please sign in to comment.