Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

common: add for_each_substr() for cheap string split #18798

Merged
merged 4 commits into from Nov 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
90 changes: 34 additions & 56 deletions src/common/str_list.cc
Expand Up @@ -18,89 +18,67 @@ using std::string;
using std::vector;
using std::set;
using std::list;

static bool get_next_token(const string &s, size_t& pos, const char *delims, string& token)
{
int start = s.find_first_not_of(delims, pos);
int end;

if (start < 0){
pos = s.size();
return false;
}

end = s.find_first_of(delims, start);
if (end >= 0)
pos = end + 1;
else {
pos = end = s.size();
}

token = s.substr(start, end - start);
return true;
}
using ceph::for_each_substr;

void get_str_list(const string& str, const char *delims, list<string>& str_list)
{
size_t pos = 0;
string token;

str_list.clear();

while (pos < str.size()) {
if (get_next_token(str, pos, delims, token)) {
if (token.size() > 0) {
str_list.push_back(token);
}
}
}
for_each_substr(str, delims, [&str_list] (boost::string_view token) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we pass token by reference? like:

for_each_substr(str, delims, [&str_list] (const boost::string_view& token) {
 ...
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

string_view is just a pointer/len pair, so it's trivial to copy and can be passed in registers. some advice about this from the cpp core guidelines:

F.16: For “in” parameters, pass cheaply-copied types by value and others by reference to const
...
What is “cheap to copy” depends on the machine architecture, but two or three words (doubles, pointers, references) are usually best passed by value. When copying is cheap, nothing beats the simplicity and safety of copying, and for small objects (up to two or three words) it is also faster than passing by reference because it does not require an extra indirection to access from the function.

str_list.emplace_back(token.begin(), token.end());
});
}

void get_str_list(const string& str, list<string>& str_list)
{
const char *delims = ";,= \t";
return get_str_list(str, delims, str_list);
get_str_list(str, delims, str_list);
}

list<string> get_str_list(const string& str, const char *delims)
{
list<string> result;
get_str_list(str, delims, result);
return result;
}

void get_str_vec(const string& str, const char *delims, vector<string>& str_vec)
{
size_t pos = 0;
string token;
str_vec.clear();

while (pos < str.size()) {
if (get_next_token(str, pos, delims, token)) {
if (token.size() > 0) {
str_vec.push_back(token);
}
}
}
for_each_substr(str, delims, [&str_vec] (boost::string_view token) {
str_vec.emplace_back(token.begin(), token.end());
});
}

void get_str_vec(const string& str, vector<string>& str_vec)
{
const char *delims = ";,= \t";
return get_str_vec(str, delims, str_vec);
get_str_vec(str, delims, str_vec);
}

void get_str_set(const string& str, const char *delims, set<string>& str_set)
vector<string> get_str_vec(const string& str, const char *delims)
{
size_t pos = 0;
string token;
vector<string> result;
get_str_vec(str, delims, result);
return result;
}

void get_str_set(const string& str, const char *delims, set<string>& str_set)
{
str_set.clear();

while (pos < str.size()) {
if (get_next_token(str, pos, delims, token)) {
if (token.size() > 0) {
str_set.insert(token);
}
}
}
for_each_substr(str, delims, [&str_set] (boost::string_view token) {
str_set.emplace(token.begin(), token.end());
});
}

void get_str_set(const string& str, set<string>& str_set)
{
const char *delims = ";,= \t";
return get_str_set(str, delims, str_set);
get_str_set(str, delims, str_set);
}

set<string> get_str_set(const string& str, const char *delims)
{
set<string> result;
get_str_set(str, delims, result);
return result;
}
36 changes: 28 additions & 8 deletions src/include/str_list.h
Expand Up @@ -5,6 +5,26 @@
#include <set>
#include <string>
#include <vector>
#include <boost/utility/string_view.hpp>


namespace ceph {

/// Split a string using the given delimiters, passing each piece as a
/// (non-null-terminated) boost::string_view to the callback.
template <typename Func> // where Func(boost::string_view) is a valid call
void for_each_substr(boost::string_view s, const char *delims, Func&& f)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I approve.

{
auto pos = s.find_first_not_of(delims);
while (pos != s.npos) {
s.remove_prefix(pos); // trim delims from the front
auto end = s.find_first_of(delims);
f(s.substr(0, end));
pos = s.find_first_not_of(delims, end);
}
}

} // namespace ceph

/**
* Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
Expand All @@ -26,6 +46,9 @@ extern void get_str_list(const std::string& str,
const char *delims,
std::list<std::string>& str_list);

std::list<std::string> get_str_list(const std::string& str,
const char *delims = ";,= \t");

/**
* Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_vec**.
*
Expand All @@ -46,6 +69,8 @@ extern void get_str_vec(const std::string& str,
const char *delims,
std::vector<std::string>& str_vec);

std::vector<std::string> get_str_vec(const std::string& str,
const char *delims = ";,= \t");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you feel about delims being a flat set and/or just taking any sequence of characters?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i'm guessing the vast majority of cases will pass them a string literal, so i'd prefer not to copy them into something like a flat_set. but taking them as a string_view would be an easy option

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i decided to leave it as const char* here, because changing it to string_view would require changing the existing signatures too - and i don't think there's any real benefit to doing that

/**
* Split **str** into a list of strings, using the ";,= \t" delimiters and output the result in **str_list**.
*
Expand All @@ -66,6 +91,9 @@ extern void get_str_set(const std::string& str,
const char *delims,
std::set<std::string>& str_list);

std::set<std::string> get_str_set(const std::string& str,
const char *delims = ";,= \t");

/**
* Return a String containing the vector **v** joined with **sep**
*
Expand All @@ -90,12 +118,4 @@ inline std::string str_join(const std::vector<std::string>& v, const std::string
return r;
}

static inline std::vector<std::string> get_str_vec(const std::string& str)
{
std::vector<std::string> str_vec;
const char *delims = ";,= \t";
get_str_vec(str, delims, str_vec);
return str_vec;
}

#endif
82 changes: 39 additions & 43 deletions src/test/test_str_list.cc
@@ -1,54 +1,50 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "include/types.h"
#include "include/str_list.h"

#include <list>
#include <vector>
#include <string>

#include "gtest/gtest.h"

// SplitTest is parameterized for list/vector/set
using Types = ::testing::Types<std::list<std::string>,
std::vector<std::string>,
std::set<std::string>>;

const char *tests[][10] = {
{ "foo,bar", "foo", "bar", 0 },
{ "foo", "foo", 0 },
{ "foo;bar", "foo", "bar", 0 },
{ "foo bar", "foo", "bar", 0 },
{ " foo bar", "foo", "bar", 0 },
{ " foo bar ", "foo", "bar", 0 },
{ "a,b,c", "a", "b", "c", 0 },
{ " a\tb\tc\t", "a", "b", "c", 0 },
{ "a, b, c", "a", "b", "c", 0 },
{ "a b c", "a", "b", "c", 0 },
{ "a=b=c", "a", "b", "c", 0 },
{ 0 },
template <typename T>
struct SplitTest : ::testing::Test {
void test(const char* input, const char *delim,
const std::list<std::string>& expected) {
EXPECT_EQ(expected, get_str_list(input, delim));
}
void test(const char* input, const char *delim,
const std::vector<std::string>& expected) {
EXPECT_EQ(expected, get_str_vec(input, delim));
}
void test(const char* input, const char *delim,
const std::set<std::string>& expected) {
EXPECT_EQ(expected, get_str_set(input, delim));
}
};

TEST(StrList, get_str_list)
{
for (unsigned i=0; tests[i][0]; ++i) {
std::string src = tests[i][0];
std::list<std::string> expected;
for (unsigned j=1; tests[i][j]; ++j)
expected.push_back(tests[i][j]);
std::list<std::string> actual;
get_str_list(src, actual);
std::cout << "'" << src << "' -> " << actual << std::endl;
ASSERT_EQ(actual, expected);
}
}
TYPED_TEST_CASE(SplitTest, Types);

TEST(StrList, get_str_vec)
TYPED_TEST(SplitTest, Get)
{
for (unsigned i=0; tests[i][0]; ++i) {
std::string src = tests[i][0];
std::vector<std::string> expected;
for (unsigned j=1; tests[i][j]; ++j)
expected.push_back(tests[i][j]);
std::vector<std::string> actual;
get_str_vec (src, actual);
std::cout << "'" << src << "' -> " << actual << std::endl;
ASSERT_EQ(actual, expected);
}

this->test("", " ", TypeParam{});
this->test(" ", " ", TypeParam{});
this->test("foo", " ", TypeParam{"foo"});
this->test("foo bar", " ", TypeParam{"foo","bar"});
this->test(" foo bar", " ", TypeParam{"foo","bar"});
this->test("foo bar ", " ", TypeParam{"foo","bar"});
this->test("foo bar ", " ", TypeParam{"foo","bar"});

// default delimiter
const char *delims = ";,= \t";
this->test(" ; , = \t ", delims, TypeParam{});
this->test(" ; foo = \t ", delims, TypeParam{"foo"});
this->test("a,b,c", delims, TypeParam{"a","b","c"});
this->test("a\tb\tc\t", delims, TypeParam{"a","b","c"});
this->test("a, b, c", delims, TypeParam{"a","b","c"});
this->test("a b c", delims, TypeParam{"a","b","c"});
this->test("a=b=c", delims, TypeParam{"a","b","c"});
}