-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added splitting at multiple whitespace characters #36
Changes from 3 commits
6f1fee9
3d52d0f
7ddcfe9
7b8f6a0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
#include <stdint.h> | ||
#include <stdlib.h> | ||
#include <assert.h> | ||
#include <ctype.h> | ||
#include <iostream> | ||
#include <clocale> | ||
#include <cstring> | ||
|
@@ -108,6 +109,9 @@ inline string rstrip(const string& text, const char *s); | |
//! Splits a string at the separator, kinda like python. | ||
inline vector<string> split(const string& orig, const char sep); | ||
|
||
//! Splits a string at any maximum length sequence of whitespace | ||
inline vector<string> splitWs(const string &orig); | ||
|
||
//! Splits a string at any character inside the seps string. | ||
inline vector<string> splitAny(const string& orig, const char *seps); | ||
|
||
|
@@ -381,6 +385,33 @@ vector<string> split(const string& orig, const char sep) { | |
return result; | ||
} | ||
|
||
// _____________________________________________________________________________ | ||
// Splits orig at every maximal run of ASCII whitespace and returns the
// non-empty pieces in order. Leading and trailing whitespace never produces
// an empty token; an empty or whitespace-only input yields an empty vector.
//
// Only bytes below 0x80 are ever classified as whitespace, so the bytes of
// multi-byte UTF-8 sequences always stay inside their token.
vector<string> splitWs(const string &orig) {
  // Classify through unsigned char with an explicit ASCII range check.
  // A test of the form `c >= 0` only filters non-ASCII bytes where plain char
  // is signed; where char is unsigned it is always true and bytes >= 0x80
  // would reach isspace(), whose answer for them depends on the locale.
  const auto isAsciiWs = [](char c) {
    const unsigned char uc = static_cast<unsigned char>(c);
    return uc < 0x80 && ::isspace(uc) != 0;
  };

  vector<string> result;
  const size_t len = orig.size();
  size_t pos = 0;
  while (pos < len) {
    // Skip the whitespace run in front of the next token (if any).
    while (pos < len && isAsciiWs(orig[pos])) {
      ++pos;
    }
    // Scan to the end of the token.
    const size_t start = pos;
    while (pos < len && !isAsciiWs(orig[pos])) {
      ++pos;
    }
    // start == pos only when the input ended with whitespace.
    if (pos > start) {
      result.emplace_back(orig, start, pos - start);
    }
  }
  return result;
}
|
||
// _____________________________________________________________________________ | ||
vector<string> splitAny(const string& orig, const char *seps) { | ||
return splitAny(orig, string(seps)); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -211,6 +211,53 @@ TEST(StringUtilsTest, strip) { | |
ASSERT_EQ(u8"äö", strip(u8"xxaxaxaxaäöaaaxxx", "xa")); | ||
ASSERT_EQ(u8"xxaxaxaxa♥", rstrip(u8"xxaxaxaxa♥aaaxxx", u8"xa")); | ||
} | ||
|
||
|
||
// Exercises splitWs on mixed ASCII whitespace, whitespace-only input, input
// without whitespace, a single character, and UTF-8 encoded text.
TEST(StringUtilsTest, splitWs) {
  // Select a UTF-8 character classification locale for the checks below.
  // NOTE(review): the return value is not checked, so the test silently runs
  // in the current locale when "en_US.utf8" is not installed.
  setlocale(LC_CTYPE, "en_US.utf8");
  string s1 = " this\nis\t \nit ";
  string s2 = "\n \t \n \t";
  string s3 = "thisisit";
  string s4 = "this is\nit";
  string s5 = "a";

  // Leading, trailing and repeated whitespace must not yield empty tokens.
  auto v1 = splitWs(s1);
  ASSERT_EQ(size_t(3), v1.size());
  ASSERT_EQ("this", v1[0]);
  ASSERT_EQ("is", v1[1]);
  ASSERT_EQ("it", v1[2]);

  // Whitespace-only input has no tokens at all.
  auto v2 = splitWs(s2);
  ASSERT_EQ(size_t(0), v2.size());

  // Input without whitespace comes back as one token.
  auto v3 = splitWs(s3);
  ASSERT_EQ(size_t(1), v3.size());
  ASSERT_EQ("thisisit", v3[0]);

  auto v4 = splitWs(s4);
  ASSERT_EQ(size_t(3), v4.size());
  ASSERT_EQ("this", v4[0]);
  ASSERT_EQ("is", v4[1]);
  ASSERT_EQ("it", v4[2]);

  auto v5 = splitWs(s5);
  ASSERT_EQ(size_t(1), v5.size());
  ASSERT_EQ("a", v5[0]);

  // and with unicode
  string s6 = u8"Spaß \t ❤ \n漢字 ";
  auto v6 = splitWs(s6);
  // Check the size before indexing so a regression fails the test cleanly
  // instead of reading past the end of the vector.
  ASSERT_EQ(size_t(3), v6.size());
  ASSERT_EQ(u8"Spaß", v6[0]);
  ASSERT_EQ(u8"❤", v6[1]);
  ASSERT_EQ(u8"漢字", v6[2]);

  // Unicode code point 224 (U+00E0) is encoded in UTF-8 with a second byte of
  // 160 (0xA0), which equals the space character (0x20) if the highest bit is
  // ignored (which may happen when casting char to int). It must not be
  // treated as a separator.
  string s7 = u8"Test\u00e0test";
  auto v7 = splitWs(s7);
  ASSERT_EQ(1u, v7.size());
  ASSERT_EQ(s7, v7[0]);
}
} // namespace | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have a weird mix of `type& name` and `type &name` in this file. I strongly prefer `type& name` because to me being a reference is part of the type. Google allows both, giving no preference, while Stroustrup prefers the former. I propose we keep it this way and I'll do a clang-format over the whole file after the merge.