Merge pull request #17920 from dzafman/wip-21382
Erasure code recovery should send additional reads if necessary

Fixes: http://tracker.ceph.com/issues/21382

Reviewed-by: Kefu Chai <kchai@redhat.com>
dzafman committed Sep 29, 2017
2 parents fd704ab + 390d12f commit 2f466f8
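
The change in a nutshell: an erasure-coded object with k data and m coding shards can (for the jerasure profile used by these tests) be read back from any k of its k+m shards, so when one of the shards a read targeted comes back with an error, the backend can ask for additional shards instead of failing. With this commit, reads issued for recovery use that fallback too. Below is a minimal standalone sketch of the shard-selection idea only; it is illustrative, uses plain integer shard ids, and is not the ECBackend code.

#include <iostream>
#include <set>

// Decide which extra shards to request so that at least k shards are in hand,
// given the shards already read successfully, the shards that errored, and
// the shards still available on other OSDs.
std::set<int> extra_shards_to_read(unsigned k,
                                   const std::set<int>& good_so_far,
                                   const std::set<int>& errored,
                                   const std::set<int>& available) {
  std::set<int> extra;
  for (int shard : available) {
    if (good_so_far.size() + extra.size() >= k)
      break;                          // enough shards in hand to decode
    if (good_so_far.count(shard) || errored.count(shard))
      continue;                       // already read, or known bad
    extra.insert(shard);
  }
  return extra;
}

int main() {
  // k=2, m=1 layout as in the existing tests: shards 0, 1, 2 exist.
  // Shard 0 returned EIO, shard 1 read fine, shard 2 has not been asked yet.
  std::set<int> good = {1}, bad = {0}, avail = {0, 1, 2};
  for (int s : extra_shards_to_read(2, good, bad, avail))
    std::cout << "send additional read for shard " << s << "\n";  // shard 2
  return 0;
}
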
Showing 3 changed files with 95 additions and 57 deletions.
80 changes: 58 additions & 22 deletions qa/standalone/erasure-code/test-erasure-eio.sh
@@ -43,7 +43,10 @@ function run() {
 }
 
 function setup_osds() {
-    for id in $(seq 0 3) ; do
+    local count=$1
+    shift
+
+    for id in $(seq 0 $(expr $count - 1)) ; do
         run_osd $dir $id || return 1
     done
     wait_for_clean || return 1
@@ -55,10 +58,15 @@ function setup_osds() {
 
 function create_erasure_coded_pool() {
     local poolname=$1
+    shift
+    local k=$1
+    shift
+    local m=$1
+    shift
 
     ceph osd erasure-code-profile set myprofile \
         plugin=jerasure \
-        k=2 m=1 \
+        k=$k m=$m \
         crush-failure-domain=osd || return 1
     create_pool $poolname 1 1 erasure myprofile \
         || return 1
@@ -129,13 +137,13 @@ function rados_put_get() {
         # recovery didn't crash the primary.
         #
         local -a initial_osds=($(get_osds $poolname $objname))
-        local last=$((${#initial_osds[@]} - 1))
+        local last_osd=${initial_osds[-1]}
         # Kill OSD
-        kill_daemons $dir TERM osd.${initial_osds[$last]} >&2 < /dev/null || return 1
-        ceph osd out ${initial_osds[$last]} || return 1
-        ! get_osds $poolname $objname | grep '\<'${initial_osds[$last]}'\>' || return 1
-        ceph osd in ${initial_osds[$last]} || return 1
-        run_osd $dir ${initial_osds[$last]} || return 1
+        kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1
+        ceph osd out ${last_osd} || return 1
+        ! get_osds $poolname $objname | grep '\<'${last_osd}'\>' || return 1
+        ceph osd in ${last_osd} || return 1
+        run_osd $dir ${last_osd} || return 1
         wait_for_clean || return 1
     fi
 
@@ -269,10 +277,10 @@ function rados_get_data_bad_size() {
 #
 function TEST_rados_get_subread_eio_shard_0() {
     local dir=$1
-    setup_osds || return 1
+    setup_osds 4 || return 1
 
     local poolname=pool-jerasure
-    create_erasure_coded_pool $poolname || return 1
+    create_erasure_coded_pool $poolname 2 1 || return 1
     # inject eio on primary OSD (0) and replica OSD (1)
     local shard_id=0
     rados_get_data eio $dir $shard_id || return 1
@@ -281,10 +289,10 @@ function TEST_rados_get_subread_eio_shard_0() {
 
 function TEST_rados_get_subread_eio_shard_1() {
     local dir=$1
-    setup_osds || return 1
+    setup_osds 4 || return 1
 
     local poolname=pool-jerasure
-    create_erasure_coded_pool $poolname || return 1
+    create_erasure_coded_pool $poolname 2 1 || return 1
     # inject eio into replicas OSD (1) and OSD (2)
     local shard_id=1
     rados_get_data eio $dir $shard_id || return 1
@@ -296,10 +304,10 @@ function TEST_rados_get_subread_eio_shard_1() {
 
 function TEST_rados_get_subread_missing() {
     local dir=$1
-    setup_osds || return 1
+    setup_osds 4 || return 1
 
     local poolname=pool-jerasure
-    create_erasure_coded_pool $poolname || return 1
+    create_erasure_coded_pool $poolname 2 1 || return 1
     # inject remove into replicas OSD (1) and OSD (2)
     local shard_id=1
     rados_get_data remove $dir $shard_id || return 1
@@ -316,10 +324,10 @@ function TEST_rados_get_subread_missing() {
 #
 function TEST_rados_get_bad_size_shard_0() {
     local dir=$1
-    setup_osds || return 1
+    setup_osds 4 || return 1
 
     local poolname=pool-jerasure
-    create_erasure_coded_pool $poolname || return 1
+    create_erasure_coded_pool $poolname 2 1 || return 1
     # Set incorrect size into primary OSD (0) and replica OSD (1)
     local shard_id=0
     rados_get_data_bad_size $dir $shard_id 10 || return 1
@@ -330,10 +338,10 @@ function TEST_rados_get_bad_size_shard_0() {
 
 function TEST_rados_get_bad_size_shard_1() {
     local dir=$1
-    setup_osds || return 1
+    setup_osds 4 || return 1
 
     local poolname=pool-jerasure
-    create_erasure_coded_pool $poolname || return 1
+    create_erasure_coded_pool $poolname 2 1 || return 1
     # Set incorrect size into replicas OSD (1) and OSD (2)
     local shard_id=1
     rados_get_data_bad_size $dir $shard_id 10 || return 1
@@ -346,10 +354,10 @@ function TEST_rados_get_with_subreadall_eio_shard_0() {
     local dir=$1
     local shard_id=0
 
-    setup_osds || return 1
+    setup_osds 4 || return 1
 
     local poolname=pool-jerasure
-    create_erasure_coded_pool $poolname || return 1
+    create_erasure_coded_pool $poolname 2 1 || return 1
     # inject eio on primary OSD (0)
     local shard_id=0
     rados_get_data_recovery eio $dir $shard_id || return 1
@@ -361,17 +369,45 @@ function TEST_rados_get_with_subreadall_eio_shard_1() {
     local dir=$1
     local shard_id=0
 
-    setup_osds || return 1
+    setup_osds 4 || return 1
 
     local poolname=pool-jerasure
-    create_erasure_coded_pool $poolname || return 1
+    create_erasure_coded_pool $poolname 2 1 || return 1
     # inject eio on replica OSD (1)
     local shard_id=1
     rados_get_data_recovery eio $dir $shard_id || return 1
 
     delete_pool $poolname
 }
 
+# Test recovery when the first k copies aren't all available
+function TEST_ec_recovery_errors() {
+    local dir=$1
+    local objname=myobject
+
+    setup_osds 7 || return 1
+
+    local poolname=pool-jerasure
+    create_erasure_coded_pool $poolname 3 2 || return 1
+
+    rados_put $dir $poolname $objname || return 1
+    inject_eio ec data $poolname $objname $dir 0 || return 1
+
+    local -a initial_osds=($(get_osds $poolname $objname))
+    local last_osd=${initial_osds[-1]}
+    # Kill OSD
+    kill_daemons $dir TERM osd.${last_osd} >&2 < /dev/null || return 1
+    ceph osd down ${last_osd} || return 1
+    ceph osd out ${last_osd} || return 1
+
+    # Cluster should recover this object
+    wait_for_clean || return 1
+
+    #rados_get_data_recovery eio $dir $shard_id || return 1
+
+    delete_pool $poolname
+}
+
 main test-erasure-eio "$@"
 
 # Local Variables:
64 changes: 30 additions & 34 deletions src/osd/ECBackend.cc
@@ -1187,8 +1187,7 @@ void ECBackend::handle_sub_read_reply(
   unsigned is_complete = 0;
   // For redundant reads check for completion as each shard comes in,
   // or in a non-recovery read check for completion once all the shards read.
-  // TODO: It would be nice if recovery could send more reads too
-  if (rop.do_redundant_reads || (!rop.for_recovery && rop.in_progress.empty())) {
+  if (rop.do_redundant_reads || rop.in_progress.empty()) {
     for (map<hobject_t, read_result_t>::const_iterator iter =
            rop.complete.begin();
          iter != rop.complete.end();
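
The hunk above drops !rop.for_recovery from the completion check; per the removed TODO, reads issued for recovery previously never reached the path that can send more reads. A condensed, hypothetical restatement of just that predicate follows (the struct is a simplified stand-in, not the real ReadOp):

#include <iostream>

struct ReadOpState {         // simplified stand-in for the read-op state
  bool do_redundant_reads;   // all shards were requested up front
  bool for_recovery;         // read was issued on behalf of recovery
  bool in_progress_empty;    // no sub-reads are still outstanding
};

// Old predicate: recovery reads were excluded from the completion check.
bool old_check(const ReadOpState& rop) {
  return rop.do_redundant_reads || (!rop.for_recovery && rop.in_progress_empty);
}

// New predicate: recovery reads take the same path as client reads.
bool new_check(const ReadOpState& rop) {
  return rop.do_redundant_reads || rop.in_progress_empty;
}

int main() {
  ReadOpState recovery_read{false, true, true};   // recovery read, all replies in
  std::cout << std::boolalpha
            << "old: " << old_check(recovery_read)    // false
            << "  new: " << new_check(recovery_read)  // true
            << "\n";
}
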
@@ -1485,19 +1484,12 @@ void ECBackend::call_write_ordered(std::function<void(void)> &&cb) {
   }
 }
 
-int ECBackend::get_min_avail_to_read_shards(
+void ECBackend::get_all_avail_shards(
   const hobject_t &hoid,
-  const set<int> &want,
-  bool for_recovery,
-  bool do_redundant_reads,
-  set<pg_shard_t> *to_read)
+  set<int> &have,
+  map<shard_id_t, pg_shard_t> &shards,
+  bool for_recovery)
 {
-  // Make sure we don't do redundant reads for recovery
-  assert(!for_recovery || !do_redundant_reads);
-
-  set<int> have;
-  map<shard_id_t, pg_shard_t> shards;
-
   for (set<pg_shard_t>::const_iterator i =
          get_parent()->get_acting_shards().begin();
        i != get_parent()->get_acting_shards().end();
@@ -1548,6 +1540,22 @@ int ECBackend::get_min_avail_to_read_shards(
       }
     }
   }
+}
+
+int ECBackend::get_min_avail_to_read_shards(
+  const hobject_t &hoid,
+  const set<int> &want,
+  bool for_recovery,
+  bool do_redundant_reads,
+  set<pg_shard_t> *to_read)
+{
+  // Make sure we don't do redundant reads for recovery
+  assert(!for_recovery || !do_redundant_reads);
+
+  set<int> have;
+  map<shard_id_t, pg_shard_t> shards;
+
+  get_all_avail_shards(hoid, have, shards, for_recovery);
 
   set<int> need;
   int r = ec_impl->minimum_to_decode(want, have, &need);
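
For context, minimum_to_decode() (called above with the shards we want and the shards we have) asks the EC plugin which shards must actually be read to reconstruct the wanted ones. A loose illustration of that contract, under the assumption of a Reed-Solomon style code where any k distinct shards can decode; the helper name and logic here are hypothetical, not the plugin's:

#include <iostream>
#include <optional>
#include <set>

std::optional<std::set<int>> minimum_to_decode_sketch(unsigned k,
                                                      const std::set<int>& want,
                                                      const std::set<int>& have) {
  std::set<int> need;
  for (int s : want)                      // prefer wanted shards we already have
    if (have.count(s))
      need.insert(s);
  for (int s : have) {                    // top up with any other available shards
    if (need.size() >= k)
      break;
    need.insert(s);
  }
  if (need.size() < k)
    return std::nullopt;                  // not enough shards left to decode
  return need;
}

int main() {
  // k=3: shards 0-2 are wanted but shard 1 is unreadable; 0, 2, 3, 4 are on hand.
  auto need = minimum_to_decode_sketch(3, {0, 1, 2}, {0, 2, 3, 4});
  if (need)
    for (int s : *need)
      std::cout << "read shard " << s << "\n";  // shards 0, 2, 3
}
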
@@ -1573,30 +1581,18 @@ int ECBackend::get_min_avail_to_read_shards(
 int ECBackend::get_remaining_shards(
   const hobject_t &hoid,
   const set<int> &avail,
-  set<pg_shard_t> *to_read)
+  set<pg_shard_t> *to_read,
+  bool for_recovery)
 {
-  set<int> need;
-  map<shard_id_t, pg_shard_t> shards;
+  assert(to_read);
 
-  for (set<pg_shard_t>::const_iterator i =
-         get_parent()->get_acting_shards().begin();
-       i != get_parent()->get_acting_shards().end();
-       ++i) {
-    dout(10) << __func__ << ": checking acting " << *i << dendl;
-    const pg_missing_t &missing = get_parent()->get_shard_missing(*i);
-    if (!missing.is_missing(hoid)) {
-      assert(!need.count(i->shard));
-      need.insert(i->shard);
-      assert(!shards.count(i->shard));
-      shards.insert(make_pair(i->shard, *i));
-    }
-  }
+  set<int> have;
+  map<shard_id_t, pg_shard_t> shards;
 
-  if (!to_read)
-    return 0;
+  get_all_avail_shards(hoid, have, shards, for_recovery);
 
-  for (set<int>::iterator i = need.begin();
-       i != need.end();
+  for (set<int>::iterator i = have.begin();
+       i != have.end();
        ++i) {
     assert(shards.count(shard_id_t(*i)));
     if (avail.find(*i) == avail.end())
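
The rewritten get_remaining_shards() now derives its candidate set from get_all_avail_shards(), honouring the new for_recovery flag when collecting candidates. Stripped of the pg_shard_t bookkeeping, the remaining-shard computation is essentially a set difference; here is a rough standalone sketch with plain integer ids and a hypothetical helper name, not the ECBackend code:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>

std::set<int> remaining_shards(const std::set<int>& have,      // readable shards
                               const std::set<int>& already) { // read or errored
  std::set<int> remaining;
  std::set_difference(have.begin(), have.end(),
                      already.begin(), already.end(),
                      std::inserter(remaining, remaining.end()));
  return remaining;
}

int main() {
  // k=3, m=2 layout: shards 0..4 readable, shards 0 and 1 already attempted.
  std::set<int> have = {0, 1, 2, 3, 4}, already = {0, 1};
  for (int s : remaining_shards(have, already))
    std::cout << "remaining shard " << s << "\n";   // prints 2, 3, 4
}
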
@@ -2317,7 +2313,7 @@ int ECBackend::send_all_remaining_reads(
     already_read.insert(i->shard);
   dout(10) << __func__ << " have/error shards=" << already_read << dendl;
   set<pg_shard_t> shards;
-  int r = get_remaining_shards(hoid, already_read, &shards);
+  int r = get_remaining_shards(hoid, already_read, &shards, rop.for_recovery);
   if (r)
     return r;
   if (shards.empty())
8 changes: 7 additions & 1 deletion src/osd/ECBackend.h
@@ -320,6 +320,11 @@ class ECBackend : public PGBackend {
     const PushReplyOp &op,
     pg_shard_t from,
     RecoveryMessages *m);
+  void get_all_avail_shards(
+    const hobject_t &hoid,
+    set<int> &have,
+    map<shard_id_t, pg_shard_t> &shards,
+    bool for_recovery);
 
 public:
   /**
@@ -650,7 +655,8 @@ class ECBackend : public PGBackend {
   int get_remaining_shards(
     const hobject_t &hoid,
     const set<int> &avail,
-    set<pg_shard_t> *to_read);
+    set<pg_shard_t> *to_read,
+    bool for_recovery);
 
   int objects_get_attrs(
     const hobject_t &hoid,