Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added splitting at multiple whitespace characters #36

Merged
merged 4 commits into from
Feb 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/parser/SparqlParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ void SparqlParser::parsePrologue(string str, ParsedQuery& query) {

// _____________________________________________________________________________
void SparqlParser::addPrefix(const string& str, ParsedQuery& query) {
auto parts = ad_utility::split(ad_utility::strip(str, ' '), ' ');
auto parts = ad_utility::splitWs(str);
if (parts.size() != 3) {
throw ParseException(string("Invalid PREFIX statement: ") + str);
}
Expand All @@ -73,12 +73,12 @@ void SparqlParser::addPrefix(const string& str, ParsedQuery& query) {
}
SparqlPrefix p{ad_utility::strip(parts[1], " :\t\n"), uri};
query._prefixes.emplace_back(p);
};
}

// _____________________________________________________________________________
void SparqlParser::parseSelect(const string& str, ParsedQuery& query) {
assert(ad_utility::startsWith(str, "SELECT"));
auto vars = ad_utility::split(str, ' ');
auto vars = ad_utility::splitWs(str);
size_t i = 1;
if (vars.size() > i && vars[i] == "DISTINCT") {
++i;
Expand Down
33 changes: 33 additions & 0 deletions src/util/StringUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <ctype.h>
#include <iostream>
#include <clocale>
#include <cstring>
Expand Down Expand Up @@ -108,6 +109,9 @@ inline string rstrip(const string& text, const char *s);
//! Splits a string at the separator, kinda like python.
inline vector<string> split(const string& orig, const char sep);

//! Splits a string at any maximum length sequence of whitespace
inline vector<string> splitWs(const string& orig);

//! Splits a string a any character inside the seps string.
inline vector<string> splitAny(const string& orig, const char *seps);

Expand Down Expand Up @@ -381,6 +385,35 @@ vector<string> split(const string& orig, const char sep) {
return result;
}

// _____________________________________________________________________________
vector<string> splitWs(const string& orig) {
vector<string> result;
if (orig.size() > 0) {
size_t start = 0;
size_t pos = 0;
while (pos < orig.size()) {
if (::isspace(static_cast<unsigned char>(orig[pos]))) {
if (start != pos) {
result.emplace_back(orig.substr(start, pos - start));
}
// skip any whitespace
while (pos < orig.size()
&& ::isspace(static_cast<unsigned char>(orig[pos]))) {
pos++;
}
start = pos;
}
pos++;
}
// avoid adding whitespace at the back of the string
// if (!::isspace(static_cast<unsigned char>(orig[orig.size() - 1]))) {
if (start != orig.size()) {
result.emplace_back(orig.substr(start));
}
}
return result;
}

// _____________________________________________________________________________
vector<string> splitAny(const string& orig, const char *seps) {
return splitAny(orig, string(seps));
Expand Down
47 changes: 47 additions & 0 deletions test/StringUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,53 @@ TEST(StringUtilsTest, strip) {
ASSERT_EQ(u8"äö", strip(u8"xxaxaxaxaäöaaaxxx", "xa"));
ASSERT_EQ(u8"xxaxaxaxa♥", rstrip(u8"xxaxaxaxa♥aaaxxx", u8"xa"));
}


TEST(StringUtilsTest, splitWs) {
setlocale(LC_CTYPE, "en_US.utf8");
string s1 = " this\nis\t \nit ";
string s2 = "\n \t \n \t";
string s3 = "thisisit";
string s4 = "this is\nit";
string s5 = "a";
auto v1 = splitWs(s1);
ASSERT_EQ(size_t(3), v1.size());
ASSERT_EQ("this", v1[0]);
ASSERT_EQ("is", v1[1]);
ASSERT_EQ("it", v1[2]);

auto v2 = splitWs(s2);
ASSERT_EQ(size_t(0), v2.size());

auto v3 = splitWs(s3);
ASSERT_EQ(size_t(1), v3.size());
ASSERT_EQ("thisisit", v3[0]);

auto v4 = splitWs(s4);
ASSERT_EQ(size_t(3), v4.size());
ASSERT_EQ("this", v4[0]);
ASSERT_EQ("is", v4[1]);
ASSERT_EQ("it", v4[2]);

auto v5 = splitWs(s5);
ASSERT_EQ(size_t(1), v5.size());
ASSERT_EQ("a", v5[0]);

// and with unicode
string s6 = u8"Spaß \t ❤ \n漢字 ";
auto v6 = splitWs(s6);
ASSERT_EQ(u8"Spaß", v6[0]);
ASSERT_EQ(u8"❤", v6[1]);
ASSERT_EQ(u8"漢字", v6[2]);

// unicode code point 224 has a second byte (160), that equals the space
// character if the first bit is ignored
// (which may happen when casting char to int).
string s7 = u8"Test\u00e0test";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great find 🥇

auto v7 = splitWs(s7);
ASSERT_EQ(1u, v7.size());
ASSERT_EQ(s7, v7[0]);
}
} // namespace


Expand Down