From 6511e7a9e35a14216c03cd6921ca4d232274f953 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 23 May 2017 21:46:54 +0800 Subject: [PATCH 1/2] osdc/Journaler: avoid executing on_safe contexts prematurely Journaler::_do_flush() can skip flushing some data when prezered journal space isn't enough. Before updating Journaler::next_safe_pos, we need to check if Journaler::_do_flush() has flushed enough data. Fixes: http://tracker.ceph.com/issues/20055 Signed-off-by: "Yan, Zheng" --- src/osdc/Journaler.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc index 5949ff3b4016c..bfecac212c3b3 100644 --- a/src/osdc/Journaler.cc +++ b/src/osdc/Journaler.cc @@ -568,8 +568,11 @@ uint64_t Journaler::append_entry(bufferlist& bl) ldout(cct, 10) << " flushing completed object(s) (su " << su << " wro " << write_obj << " flo " << flush_obj << ")" << dendl; _do_flush(write_buf.length() - write_off); - if (write_off) { - // current entry isn't being flushed, set next_safe_pos to the end of previous entry + + // if _do_flush() skips flushing some data, it does not update next_safe_pos. + if (write_buf.length() > 0 && + write_buf.length() <= wrote) { // the unflushed data are within this entry + // set next_safe_pos to end of previous entry next_safe_pos = write_pos - wrote; } } From 41618d8a2a0d381917b53e2a2cb988405f7ac6c9 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 24 May 2017 11:31:22 +0800 Subject: [PATCH 2/2] osdc/Journaler: use waitfor_safe map to find journal entry boundaries Signed-off-by: "Yan, Zheng" --- src/osdc/Journaler.cc | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc index bfecac212c3b3..ba18ff5534f7e 100644 --- a/src/osdc/Journaler.cc +++ b/src/osdc/Journaler.cc @@ -569,7 +569,8 @@ uint64_t Journaler::append_entry(bufferlist& bl) << write_obj << " flo " << flush_obj << ")" << dendl; _do_flush(write_buf.length() - write_off); - // if _do_flush() skips flushing some data, it does not update next_safe_pos. + // if _do_flush() skips flushing some data, it does do a best effort to + // update next_safe_pos. if (write_buf.length() > 0 && write_buf.length() <= wrote) { // the unflushed data are within this entry // set next_safe_pos to end of previous entry @@ -637,6 +638,17 @@ void Journaler::_do_flush(unsigned amount) next_safe_pos = write_pos; } else { write_buf.splice(0, len, &write_bl); + // Keys of waitfor_safe map are journal entry boundaries. + // Try finding a journal entry that we are actually flushing + // and set next_safe_pos to end of it. This is best effort. + // The one we found may not be the lastest flushing entry. + auto p = waitfor_safe.lower_bound(flush_pos + len); + if (p != waitfor_safe.end()) { + if (p->first > flush_pos + len && p != waitfor_safe.begin()) + --p; + if (p->first <= flush_pos + len && p->first > next_safe_pos) + next_safe_pos = p->first; + } } filer.write(ino, &layout, snapc, @@ -955,7 +967,16 @@ void Journaler::_issue_read(uint64_t len) if (pending_safe.empty()) { _flush(NULL); } - waitfor_safe[flush_pos].push_back(new C_RetryRead(this)); + + // Make sure keys of waitfor_safe map are journal entry boundaries. + // The key we used here is either next_safe_pos or old value of + // next_safe_pos. next_safe_pos is always set to journal entry + // boundary. + auto p = pending_safe.rbegin(); + if (p != pending_safe.rend()) + waitfor_safe[p->second].push_back(new C_RetryRead(this)); + else + waitfor_safe[next_safe_pos].push_back(new C_RetryRead(this)); return; } @@ -1065,6 +1086,7 @@ bool Journaler::_is_readable() // adjust write_pos prezeroing_pos = prezero_pos = write_pos = flush_pos = safe_pos = next_safe_pos = read_pos; assert(write_buf.length() == 0); + assert(waitfor_safe.empty()); // reset read state requested_pos = received_pos = read_pos;