Skip to content

Commit

Permalink
erasure-code: implement alignment on chunk sizes
Browse files Browse the repository at this point in the history
jerasure expects chunk sizes that are aligned on the largest possible
vector size that could be used by SSE instructions, when available (
LARGEST_VECTOR_WORDSIZE == 16 bytes ).

For techniques derived from Cauchy, encoding and decoding is done by
subdividing the chunk into packets of packetsize bytes. The operations
are done w * packetsize bytes at a time. It follows that each chunk must
have a size that is a multiple of w * packetsize bytes.

For techniques derived from Vandermonde, it is enough for a chunk to be
a multiple of w * LARGEST_VECTOR_WORDSIZE.

ErasureCodeJerasure::get_alignment returns a size alignment constraint:
the object size is padded up to a multiple of it. The resulting
object size then has to match the chunk constraints described above
although they have no relationship with K. For Cauchy, it leads to
excessive padding, making it impossible to set sensible parameters
when the object size is small.

When the per_chunk_alignment data member is true, the semantics of
ErasureCodeJerasure::get_alignment change: it returns a size alignment
constraint that the chunk size, rather than the object size, must be a
multiple of. The ErasureCodeJerasure::get_chunk_size method is modified
to use the new semantics when appropriate.

The jerasure-per-chunk-alignment parameter is parsed to set
per_chunk_alignment for the Vandermonde and Cauchy techniques.

The memory address of a chunk is implicitly aligned to a page boundary
because it is allocated with buffer::create_page_aligned.

http://tracker.ceph.com/issues/8475 Fixes: #8475

Signed-off-by: Loic Dachary <loic@dachary.org>
  • Loading branch information
Loic Dachary committed Aug 4, 2014
1 parent 3987ac2 commit c7daaaf
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 88 deletions.
81 changes: 63 additions & 18 deletions src/erasure-code/jerasure/ErasureCodeJerasure.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ extern "C" {
#include "liberation.h"
}

// FIXME(loic) this may be too conservative, check back with feedback from Andreas
#define LARGEST_VECTOR_WORDSIZE 16

#define dout_subsys ceph_subsys_osd
Expand Down Expand Up @@ -65,10 +64,26 @@ void ErasureCodeJerasure::init(const map<string,string> &parameters)
unsigned int ErasureCodeJerasure::get_chunk_size(unsigned int object_size) const
{
unsigned alignment = get_alignment();
unsigned tail = object_size % alignment;
unsigned padded_length = object_size + ( tail ? ( alignment - tail ) : 0 );
assert(padded_length % k == 0);
return padded_length / k;
if (per_chunk_alignment) {
unsigned chunk_size = object_size / k;
if (object_size % k)
chunk_size++;
dout(20) << "get_chunk_size: chunk_size " << chunk_size
<< " must be modulo " << alignment << dendl;
assert(alignment <= chunk_size);
unsigned modulo = chunk_size % alignment;
if (modulo) {
dout(10) << "get_chunk_size: " << chunk_size
<< " padded to " << chunk_size + alignment - modulo << dendl;
chunk_size += alignment - modulo;
}
return chunk_size;
} else {
unsigned tail = object_size % alignment;
unsigned padded_length = object_size + ( tail ? ( alignment - tail ) : 0 );
assert(padded_length % k == 0);
return padded_length / k;
}
}

int ErasureCodeJerasure::minimum_to_decode(const set<int> &want_to_read,
Expand Down Expand Up @@ -205,6 +220,19 @@ int ErasureCodeJerasure::to_int(const std::string &name,
return r;
}

bool ErasureCodeJerasure::to_bool(const std::string &name,
const map<std::string,std::string> &parameters,
bool default_value)
{
if (parameters.find(name) == parameters.end() ||
parameters.find(name)->second.size() == 0) {
dout(10) << name << " defaults to " << default_value << dendl;
return default_value;
}
const std::string value = parameters.find(name)->second;
return (value == "yes") || (value == "1") || (value == "true");
}

bool ErasureCodeJerasure::is_prime(int value)
{
int prime55[] = {
Expand Down Expand Up @@ -241,11 +269,14 @@ int ErasureCodeJerasureReedSolomonVandermonde::jerasure_decode(int *erasures,

unsigned ErasureCodeJerasureReedSolomonVandermonde::get_alignment() const
{
unsigned alignment = k*w*sizeof(int);
if ( ((w*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
alignment = k*w*LARGEST_VECTOR_WORDSIZE;
return alignment;

if (per_chunk_alignment) {
return w * LARGEST_VECTOR_WORDSIZE;
} else {
unsigned alignment = k*w*sizeof(int);
if ( ((w*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
alignment = k*w*LARGEST_VECTOR_WORDSIZE;
return alignment;
}
}

void ErasureCodeJerasureReedSolomonVandermonde::parse(const map<std::string,std::string> &parameters)
Expand All @@ -258,6 +289,7 @@ void ErasureCodeJerasureReedSolomonVandermonde::parse(const map<std::string,std:
<< " must be one of {8, 16, 32} : revert to " << DEFAULT_W << dendl;
w = DEFAULT_W;
}
per_chunk_alignment = to_bool("jerasure-per-chunk-alignment", parameters, false);
}

void ErasureCodeJerasureReedSolomonVandermonde::prepare()
Expand Down Expand Up @@ -285,10 +317,14 @@ int ErasureCodeJerasureReedSolomonRAID6::jerasure_decode(int *erasures,

unsigned ErasureCodeJerasureReedSolomonRAID6::get_alignment() const
{
unsigned alignment = k*w*sizeof(int);
if ( ((w*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
alignment = k*w*LARGEST_VECTOR_WORDSIZE;
return alignment;
if (per_chunk_alignment) {
return w * LARGEST_VECTOR_WORDSIZE;
} else {
unsigned alignment = k*w*sizeof(int);
if ( ((w*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
alignment = k*w*LARGEST_VECTOR_WORDSIZE;
return alignment;
}
}

void ErasureCodeJerasureReedSolomonRAID6::parse(const map<std::string,std::string> &parameters)
Expand Down Expand Up @@ -330,10 +366,18 @@ int ErasureCodeJerasureCauchy::jerasure_decode(int *erasures,

unsigned ErasureCodeJerasureCauchy::get_alignment() const
{
unsigned alignment = k*w*packetsize*sizeof(int);
if ( ((w*packetsize*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
alignment = k*w*packetsize*LARGEST_VECTOR_WORDSIZE;
return alignment;
if (per_chunk_alignment) {
unsigned alignment = w * packetsize;
unsigned modulo = alignment % LARGEST_VECTOR_WORDSIZE;
if (modulo)
alignment += LARGEST_VECTOR_WORDSIZE - modulo;
return alignment;
} else {
unsigned alignment = k*w*packetsize*sizeof(int);
if ( ((w*packetsize*sizeof(int))%LARGEST_VECTOR_WORDSIZE) )
alignment = k*w*packetsize*LARGEST_VECTOR_WORDSIZE;
return alignment;
}
}

void ErasureCodeJerasureCauchy::parse(const map<std::string,std::string> &parameters)
Expand All @@ -348,6 +392,7 @@ void ErasureCodeJerasureCauchy::parse(const map<std::string,std::string> &parame
w = DEFAULT_W;
}
packetsize = to_int("packetsize", parameters, DEFAULT_PACKETSIZE);
per_chunk_alignment = to_bool("jerasure-per-chunk-alignment", parameters, false);
}

void ErasureCodeJerasureCauchy::prepare_schedule(int *matrix)
Expand Down
7 changes: 6 additions & 1 deletion src/erasure-code/jerasure/ErasureCodeJerasure.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ class ErasureCodeJerasure : public ErasureCodeInterface {
const char *technique;
string ruleset_root;
string ruleset_failure_domain;
bool per_chunk_alignment;

ErasureCodeJerasure(const char *_technique) :
technique(_technique),
ruleset_root("default"),
ruleset_failure_domain("host")
ruleset_failure_domain("host"),
per_chunk_alignment(false)
{}

virtual ~ErasureCodeJerasure() {}
Expand Down Expand Up @@ -80,6 +82,9 @@ class ErasureCodeJerasure : public ErasureCodeInterface {
static int to_int(const std::string &name,
const map<std::string,std::string> &parameters,
int default_value);
static bool to_bool(const std::string &name,
const map<std::string,std::string> &parameters,
bool default_value);
static bool is_prime(int value);
};

Expand Down
143 changes: 74 additions & 69 deletions src/test/erasure-code/TestErasureCodeJerasure.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,70 +42,77 @@ TYPED_TEST_CASE(ErasureCodeTest, JerasureTypes);

TYPED_TEST(ErasureCodeTest, encode_decode)
{
TypeParam jerasure;
map<std::string,std::string> parameters;
parameters["k"] = "2";
parameters["m"] = "2";
parameters["w"] = "7";
parameters["packetsize"] = "8";
jerasure.init(parameters);
const char *per_chunk_alignments[] = { "false", "true" };
for (int per_chunk_alignment = 0 ;
per_chunk_alignment < 2;
per_chunk_alignment++) {
TypeParam jerasure;
map<std::string,std::string> parameters;
parameters["k"] = "2";
parameters["m"] = "2";
parameters["w"] = "7";
parameters["packetsize"] = "8";
parameters["jerasure-per-chunk-alignment"] =
per_chunk_alignments[per_chunk_alignment];
jerasure.init(parameters);

#define LARGE_ENOUGH 2048
bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH));
in_ptr.zero();
in_ptr.set_length(0);
const char *payload =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
in_ptr.append(payload, strlen(payload));
bufferlist in;
in.push_front(in_ptr);
int want_to_encode[] = { 0, 1, 2, 3 };
map<int, bufferlist> encoded;
EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
in,
&encoded));
EXPECT_EQ(4u, encoded.size());
unsigned length = encoded[0].length();
EXPECT_EQ(0, strncmp(encoded[0].c_str(), in.c_str(), length));
EXPECT_EQ(0, strncmp(encoded[1].c_str(), in.c_str() + length,
in.length() - length));


// all chunks are available
{
int want_to_decode[] = { 0, 1 };
map<int, bufferlist> decoded;
EXPECT_EQ(0, jerasure.decode(set<int>(want_to_decode, want_to_decode+2),
encoded,
&decoded));
EXPECT_EQ(2u, decoded.size());
EXPECT_EQ(length, decoded[0].length());
EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
bufferptr in_ptr(buffer::create_page_aligned(LARGE_ENOUGH));
in_ptr.zero();
in_ptr.set_length(0);
const char *payload =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
in_ptr.append(payload, strlen(payload));
bufferlist in;
in.push_front(in_ptr);
int want_to_encode[] = { 0, 1, 2, 3 };
map<int, bufferlist> encoded;
EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
in,
&encoded));
EXPECT_EQ(4u, encoded.size());
unsigned length = encoded[0].length();
EXPECT_EQ(0, strncmp(encoded[0].c_str(), in.c_str(), length));
EXPECT_EQ(0, strncmp(encoded[1].c_str(), in.c_str() + length,
in.length() - length));
}

// two chunks are missing
{
map<int, bufferlist> degraded = encoded;
degraded.erase(0);
degraded.erase(1);
EXPECT_EQ(2u, degraded.size());
int want_to_decode[] = { 0, 1 };
map<int, bufferlist> decoded;
EXPECT_EQ(0, jerasure.decode(set<int>(want_to_decode, want_to_decode+2),
degraded,
&decoded));
// always decode all, regardless of want_to_decode
EXPECT_EQ(4u, decoded.size());
EXPECT_EQ(length, decoded[0].length());
EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
in.length() - length));

// all chunks are available
{
int want_to_decode[] = { 0, 1 };
map<int, bufferlist> decoded;
EXPECT_EQ(0, jerasure.decode(set<int>(want_to_decode, want_to_decode+2),
encoded,
&decoded));
EXPECT_EQ(2u, decoded.size());
EXPECT_EQ(length, decoded[0].length());
EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
in.length() - length));
}

// two chunks are missing
{
map<int, bufferlist> degraded = encoded;
degraded.erase(0);
degraded.erase(1);
EXPECT_EQ(2u, degraded.size());
int want_to_decode[] = { 0, 1 };
map<int, bufferlist> decoded;
EXPECT_EQ(0, jerasure.decode(set<int>(want_to_decode, want_to_decode+2),
degraded,
&decoded));
// always decode all, regardless of want_to_decode
EXPECT_EQ(4u, decoded.size());
EXPECT_EQ(length, decoded[0].length());
EXPECT_EQ(0, strncmp(decoded[0].c_str(), in.c_str(), length));
EXPECT_EQ(0, strncmp(decoded[1].c_str(), in.c_str() + length,
in.length() - length));
}
}
}

Expand Down Expand Up @@ -216,7 +223,7 @@ TEST(ErasureCodeTest, encode)
parameters["w"] = "8";
jerasure.init(parameters);

unsigned alignment = jerasure.get_alignment();
unsigned aligned_object_size = jerasure.get_alignment() * 2;
{
//
// When the input bufferlist needs to be padded because
Expand All @@ -225,17 +232,16 @@ TEST(ErasureCodeTest, encode)
bufferlist in;
map<int,bufferlist> encoded;
int want_to_encode[] = { 0, 1, 2, 3 };
int trail_length = 10;
in.append(string(alignment + trail_length, 'X'));
int trail_length = 1;
in.append(string(aligned_object_size + trail_length, 'X'));
EXPECT_EQ(0, jerasure.encode(set<int>(want_to_encode, want_to_encode+4),
in,
&encoded));
EXPECT_EQ(4u, encoded.size());
for(int i = 0; i < 4; i++)
EXPECT_EQ(alignment, encoded[i].length());
char *last_chunk = encoded[1].c_str();
int length =encoded[1].length();
EXPECT_EQ('X', last_chunk[0]);
EXPECT_EQ('\0', last_chunk[trail_length]);
EXPECT_EQ('\0', last_chunk[length - trail_length]);
}

{
Expand All @@ -251,11 +257,10 @@ TEST(ErasureCodeTest, encode)
map<int,bufferlist> encoded;
set<int> want_to_encode;
want_to_encode.insert(0);
int trail_length = 10;
in.append(string(alignment + trail_length, 'X'));
int trail_length = 1;
in.append(string(aligned_object_size + trail_length, 'X'));
EXPECT_EQ(0, jerasure.encode(want_to_encode, in, &encoded));
EXPECT_EQ(1u, encoded.size());
EXPECT_EQ(alignment, encoded[0].length());
}
}

Expand Down

0 comments on commit c7daaaf

Please sign in to comment.