Skip to content

Commit

Permalink
Merge pull request #11090 from dillaman/wip-16974
Browse files Browse the repository at this point in the history
rbd-mirror: force-promoted image will remain R/O until rbd-mirror daemon restarted

Reviewed-by: Mykola Golub <mgolub@mirantis.com>
  • Loading branch information
Mykola Golub committed Sep 24, 2016
2 parents 1c81ea2 + f1cd613 commit 9aab326
Show file tree
Hide file tree
Showing 16 changed files with 549 additions and 244 deletions.
67 changes: 42 additions & 25 deletions qa/workunits/rbd/rbd_mirror.sh
Expand Up @@ -21,9 +21,9 @@ create_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'down+unknown'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'down+unknown'
fi
compare_images ${POOL} ${image}

Expand All @@ -35,16 +35,16 @@ write_image ${CLUSTER2} ${POOL} ${image1} 100
start_mirror ${CLUSTER1}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image1}
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image1}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image1} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image1} 'up+replaying' 'master_position'
if [ -z "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image1} 'down+unknown'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image1} 'down+unknown'
fi
compare_images ${POOL} ${image1}

testlog "TEST: test the first image is replaying after restart"
write_image ${CLUSTER2} ${POOL} ${image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
compare_images ${POOL} ${image}

testlog "TEST: stop/start/restart mirror via admin socket"
Expand Down Expand Up @@ -102,41 +102,58 @@ start_mirror ${CLUSTER2}
# demote and promote same cluster
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
compare_images ${POOL} ${image}

# failover
demote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
promote_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image}
write_image ${CLUSTER1} ${POOL} ${image} 100
wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+replaying' 'master_position'
compare_images ${POOL} ${image}

# failback
demote_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
promote_image ${CLUSTER2} ${POOL} ${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
write_image ${CLUSTER2} ${POOL} ${image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
test_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped'
compare_images ${POOL} ${image}

# force promote
force_promote_image=test_force_promote
create_image ${CLUSTER2} ${POOL} ${force_promote_image}
write_image ${CLUSTER2} ${POOL} ${force_promote_image} 100
wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${force_promote_image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${force_promote_image}
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${force_promote_image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${force_promote_image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${force_promote_image} 'up+stopped'
promote_image ${CLUSTER1} ${POOL} ${force_promote_image} '--force'
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${force_promote_image}
wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${force_promote_image}
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${force_promote_image} 'up+stopped'
wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${force_promote_image} 'up+stopped'
write_image ${CLUSTER1} ${POOL} ${force_promote_image} 100
write_image ${CLUSTER2} ${POOL} ${force_promote_image} 100

testlog "TEST: cloned images"
parent_image=test_parent
parent_snap=snap
Expand All @@ -152,12 +169,12 @@ write_image ${CLUSTER2} ${POOL} ${clone_image} 100
enable_mirror ${CLUSTER2} ${PARENT_POOL} ${parent_image}
wait_for_image_replay_started ${CLUSTER1} ${PARENT_POOL} ${parent_image}
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${PARENT_POOL} ${parent_image}
test_status_in_pool_dir ${CLUSTER1} ${PARENT_POOL} ${parent_image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${PARENT_POOL} ${parent_image} 'up+replaying' 'master_position'
compare_images ${PARENT_POOL} ${parent_image}

wait_for_image_replay_started ${CLUSTER1} ${POOL} ${clone_image}
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${clone_image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${clone_image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${clone_image} 'up+replaying' 'master_position'
compare_images ${POOL} ${clone_image}

expect_failure "is non-primary" clone_image ${CLUSTER1} ${PARENT_POOL} \
Expand Down Expand Up @@ -194,7 +211,7 @@ for i in ${image3} ${image5}; do
remove_snapshot ${CLUSTER2} ${POOL} ${i} 'snap2'
# workaround #16555: before removing make sure it is not still bootstrapped
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${i}
remove_image ${CLUSTER2} ${POOL} ${i}
remove_image_retry ${CLUSTER2} ${POOL} ${i}
done

for i in ${image2} ${image3} ${image4} ${image5}; do
Expand Down Expand Up @@ -234,7 +251,7 @@ testlog "TEST: simple image resync"
request_resync_image ${CLUSTER1} ${POOL} ${image}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted'
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
compare_images ${POOL} ${image}

testlog "TEST: image resync while replayer is stopped"
Expand All @@ -245,15 +262,15 @@ admin_daemon ${CLUSTER1} rbd mirror start ${POOL}/${image}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted'
admin_daemon ${CLUSTER1} rbd mirror start ${POOL}/${image}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
compare_images ${POOL} ${image}

testlog "TEST: request image resync while daemon is offline"
stop_mirror ${CLUSTER1}
request_resync_image ${CLUSTER1} ${POOL} ${image}
start_mirror ${CLUSTER1}
wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
compare_images ${POOL} ${image}

testlog "TEST: client disconnect"
Expand All @@ -268,7 +285,7 @@ test -n "$(get_mirror_position ${CLUSTER2} ${POOL} ${image})"
disconnect_image ${CLUSTER2} ${POOL} ${image}
test -z "$(get_mirror_position ${CLUSTER2} ${POOL} ${image})"
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'disconnected'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'disconnected'

testlog " - replay started after resync requested"
request_resync_image ${CLUSTER1} ${POOL} ${image}
Expand All @@ -293,7 +310,7 @@ set_image_meta ${CLUSTER2} ${POOL} ${image} \
testlog " - replay is still stopped (disconnected) after restart"
admin_daemon ${CLUSTER1} rbd mirror start ${POOL}/${image}
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'disconnected'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'disconnected'

testlog " - replay started after resync requested"
request_resync_image ${CLUSTER1} ${POOL} ${image}
Expand All @@ -317,6 +334,6 @@ set_image_meta ${CLUSTER1} ${POOL} ${image} \
disconnect_image ${CLUSTER2} ${POOL} ${image}
test -z "$(get_mirror_position ${CLUSTER2} ${POOL} ${image})"
wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image}
test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'disconnected'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+error' 'disconnected'

echo OK
19 changes: 18 additions & 1 deletion qa/workunits/rbd/rbd_mirror_helpers.sh
Expand Up @@ -487,6 +487,21 @@ test_status_in_pool_dir()
grep "description: .*${description_pattern}" ${status_log}
}

# Poll an image's mirror status until test_status_in_pool_dir accepts it.
#
# Arguments:
#   $1  cluster              cluster name, forwarded to test_status_in_pool_dir
#   $2  pool                 pool name
#   $3  image                image name
#   $4  state_pattern        expected state pattern (e.g. 'up+replaying')
#   $5  description_pattern  optional expected description pattern
#
# Returns 0 as soon as a check succeeds, or 1 once the retry schedule is
# exhausted (13 attempts, 103 seconds of sleeping in total).
wait_for_status_in_pool_dir()
{
    local cluster=$1
    local pool=$2
    local image=$3
    local state_pattern=$4
    local description_pattern=$5
    # Keep the loop variable private so it cannot clobber an 's' that belongs
    # to the script sourcing these helpers.
    local s

    for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
        # The sleep comes before every check, including the first one.
        # NOTE(review): presumably this gives an asynchronously updated
        # status time to change -- confirm before reordering.
        sleep "${s}"
        # Quote every argument so patterns containing whitespace or glob
        # characters reach the checker intact. The ':+' expansion passes the
        # description only when it is non-empty, so the checker sees the same
        # number of arguments as before when the caller omits it.
        test_status_in_pool_dir "${cluster}" "${pool}" "${image}" \
            "${state_pattern}" \
            ${description_pattern:+"${description_pattern}"} && return 0
    done
    return 1
}

create_image()
{
local cluster=$1 ; shift
Expand Down Expand Up @@ -520,6 +535,7 @@ remove_image()
local pool=$2
local image=$3

rbd --cluster=${cluster} -p ${pool} snap purge ${image}
rbd --cluster=${cluster} -p ${pool} rm ${image}
}

Expand Down Expand Up @@ -692,8 +708,9 @@ promote_image()
local cluster=$1
local pool=$2
local image=$3
local force=$4

rbd --cluster=${cluster} mirror image promote ${pool}/${image}
rbd --cluster=${cluster} mirror image promote ${pool}/${image} ${force}
}

set_pool_mirror_mode()
Expand Down
6 changes: 3 additions & 3 deletions qa/workunits/rbd/rbd_mirror_stress.sh
Expand Up @@ -100,7 +100,7 @@ for i in `seq 1 10`
do
stress_write_image ${CLUSTER2} ${POOL} ${image}

test_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'
wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'master_position'

snap_name="snap${i}"
create_snap ${CLUSTER2} ${POOL} ${image} ${snap_name}
Expand All @@ -114,7 +114,7 @@ do
remove_snapshot ${CLUSTER2} ${POOL} ${image} ${snap_name}
done

remove_image ${CLUSTER2} ${POOL} ${image}
remove_image_retry ${CLUSTER2} ${POOL} ${image}
wait_for_image_present ${CLUSTER1} ${POOL} ${image} 'deleted'

testlog "TEST: create many images"
Expand Down Expand Up @@ -152,7 +152,7 @@ for i in `seq 1 ${IMAGE_COUNT}`
do
image="image_${i}"
remove_snapshot ${CLUSTER2} ${POOL} ${image} ${snap_name}
remove_image ${CLUSTER2} ${POOL} ${image}
remove_image_retry ${CLUSTER2} ${POOL} ${image}
done

testlog "TEST: image deletions should propagate"
Expand Down
16 changes: 10 additions & 6 deletions src/journal/JournalMetadata.cc
Expand Up @@ -256,21 +256,23 @@ struct C_GetTags : public Context {
const std::string &oid;
const std::string &client_id;
AsyncOpTracker &async_op_tracker;
uint64_t start_after_tag_tid;
boost::optional<uint64_t> tag_class;
JournalMetadata::Tags *tags;
Context *on_finish;

const uint64_t MAX_RETURN = 64;
uint64_t start_after_tag_tid = 0;
bufferlist out_bl;

C_GetTags(CephContext *cct, librados::IoCtx &ioctx, const std::string &oid,
const std::string &client_id, AsyncOpTracker &async_op_tracker,
uint64_t start_after_tag_tid,
const boost::optional<uint64_t> &tag_class,
JournalMetadata::Tags *tags, Context *on_finish)
: cct(cct), ioctx(ioctx), oid(oid), client_id(client_id),
async_op_tracker(async_op_tracker), tag_class(tag_class), tags(tags),
on_finish(on_finish) {
async_op_tracker(async_op_tracker),
start_after_tag_tid(start_after_tag_tid), tag_class(tag_class),
tags(tags), on_finish(on_finish) {
async_op_tracker.start_op();
}
virtual ~C_GetTags() {
Expand Down Expand Up @@ -559,6 +561,7 @@ void JournalMetadata::unregister_client(Context *on_finish) {

void JournalMetadata::allocate_tag(uint64_t tag_class, const bufferlist &data,
Tag *tag, Context *on_finish) {
on_finish = new C_NotifyUpdate(this, on_finish);
C_AllocateTag *ctx = new C_AllocateTag(m_cct, m_ioctx, m_oid,
m_async_op_tracker, tag_class,
data, tag, on_finish);
Expand All @@ -579,11 +582,12 @@ void JournalMetadata::get_tag(uint64_t tag_tid, Tag *tag, Context *on_finish) {
ctx->send();
}

void JournalMetadata::get_tags(const boost::optional<uint64_t> &tag_class,
void JournalMetadata::get_tags(uint64_t start_after_tag_tid,
const boost::optional<uint64_t> &tag_class,
Tags *tags, Context *on_finish) {
C_GetTags *ctx = new C_GetTags(m_cct, m_ioctx, m_oid, m_client_id,
m_async_op_tracker, tag_class,
tags, on_finish);
m_async_op_tracker, start_after_tag_tid,
tag_class, tags, on_finish);
ctx->send();
}

Expand Down
3 changes: 2 additions & 1 deletion src/journal/JournalMetadata.h
Expand Up @@ -71,7 +71,8 @@ class JournalMetadata : public RefCountedObject, boost::noncopyable {
void allocate_tag(uint64_t tag_class, const bufferlist &data,
Tag *tag, Context *on_finish);
void get_tag(uint64_t tag_tid, Tag *tag, Context *on_finish);
void get_tags(const boost::optional<uint64_t> &tag_class, Tags *tags,
void get_tags(uint64_t start_after_tag_tid,
const boost::optional<uint64_t> &tag_class, Tags *tags,
Context *on_finish);

inline const Settings &get_settings() const {
Expand Down
7 changes: 6 additions & 1 deletion src/journal/Journaler.cc
Expand Up @@ -316,7 +316,12 @@ void Journaler::get_tag(uint64_t tag_tid, Tag *tag, Context *on_finish) {
}

void Journaler::get_tags(uint64_t tag_class, Tags *tags, Context *on_finish) {
m_metadata->get_tags(tag_class, tags, on_finish);
m_metadata->get_tags(0, tag_class, tags, on_finish);
}

// Overload of get_tags() that additionally takes a starting tag tid.
//
// This is a pure forwarder: all four arguments are handed unchanged to
// m_metadata->get_tags(). The three-argument overload forwards 0 for
// start_after_tag_tid.
//
// start_after_tag_tid: NOTE(review): presumably only tags with a tid greater
//                      than this value are returned -- confirm against the
//                      metadata implementation.
// tag_class:           always supplied here; the metadata layer accepts it as
//                      a boost::optional<uint64_t>, so the plain value is
//                      converted implicitly at the call below.
// tags:                output container, filled in by the metadata layer.
// on_finish:           completion context, passed through to the metadata
//                      layer.
void Journaler::get_tags(uint64_t start_after_tag_tid, uint64_t tag_class,
Tags *tags, Context *on_finish) {
m_metadata->get_tags(start_after_tag_tid, tag_class, tags, on_finish);
}

void Journaler::start_replay(ReplayHandler *replay_handler) {
Expand Down
2 changes: 2 additions & 0 deletions src/journal/Journaler.h
Expand Up @@ -96,6 +96,8 @@ class Journaler {
cls::journal::Tag *tag, Context *on_finish);
void get_tag(uint64_t tag_tid, Tag *tag, Context *on_finish);
void get_tags(uint64_t tag_class, Tags *tags, Context *on_finish);
void get_tags(uint64_t start_after_tag_tid, uint64_t tag_class, Tags *tags,
Context *on_finish);

void start_replay(ReplayHandler *replay_handler);
void start_live_replay(ReplayHandler *replay_handler, double interval);
Expand Down

0 comments on commit 9aab326

Please sign in to comment.