osd: never send rados ack (only commit) #12451

Merged: 12 commits, Dec 29, 2016
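The client-visible effect of this series: the OSD no longer sends a separate "ack" (readable/applied) reply for writes, only the commit ("ondisk") reply, so the two completion events a librados client can wait on become equivalent. The sketch below is not part of the PR; it is a minimal client-side illustration assuming the librados C++ API and a reachable cluster with an "rbd" pool.

```cpp
// Minimal librados write. After this change, "complete" (ack) and "safe"
// (commit) fire together, so waiting on either observes the same durability.
#include <rados/librados.hpp>
#include <iostream>

int main() {
  librados::Rados cluster;
  cluster.init("admin");                      // assumes a client.admin keyring
  cluster.conf_read_file(nullptr);            // assumes a local ceph.conf
  if (cluster.connect() < 0) return 1;

  librados::IoCtx io;
  if (cluster.ioctx_create("rbd", io) < 0) return 1;  // assumes an "rbd" pool

  librados::bufferlist bl;
  bl.append("payload");
  librados::AioCompletion *c = librados::Rados::aio_create_completion();
  io.aio_write("example-object", c, bl, bl.length(), 0);

  c->wait_for_safe();                         // commit: now the only reply the OSD sends
  std::cout << "write rc=" << c->get_return_value() << std::endl;
  c->release();
  cluster.shutdown();
  return 0;
}
```

Accordingly, the diffs below drop the separate readable/applied latency counters (op_w_rlat, op_rw_rlat), the crash_replay_interval pool option, and the OSD replay-queue machinery that only existed to replay acknowledged-but-uncommitted ops.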
2 changes: 0 additions & 2 deletions doc/rados/operations/control.rst
@@ -255,8 +255,6 @@ Changes a pool setting. ::
Valid fields are:

* ``size``: Sets the number of copies of data in the pool.
* ``crash_replay_interval``: The number of seconds to allow
clients to replay acknowledged but uncommitted requests.
* ``pg_num``: The placement group number.
* ``pgp_num``: Effective number when calculating pg placement.
* ``crush_ruleset``: rule number for mapping placement.
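For reference, these fields are set with ``ceph osd pool set {pool-name} {field} {value}``; the same monitor command can also be issued from a client. A hypothetical librados C++ sketch (the "rbd" pool name and client id are assumptions, not taken from this PR):

```cpp
// Hypothetical: driving "osd pool set" from a librados client via mon_command.
#include <rados/librados.hpp>
#include <iostream>
#include <string>

int main() {
  librados::Rados cluster;
  cluster.init("admin");                // assumes a client.admin keyring
  cluster.conf_read_file(nullptr);      // assumes a local ceph.conf
  if (cluster.connect() < 0) return 1;

  librados::bufferlist inbl, outbl;
  std::string outs;
  // "var" must be one of the fields documented above; crash_replay_interval
  // is no longer among them after this change.
  std::string cmd =
      R"({"prefix": "osd pool set", "pool": "rbd", "var": "size", "val": "3"})";
  int r = cluster.mon_command(cmd, inbl, &outbl, &outs);
  std::cout << "r=" << r << " " << outs << std::endl;
  cluster.shutdown();
  return r == 0 ? 0 : 1;
}
```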
15 changes: 0 additions & 15 deletions doc/rados/operations/pools.rst
@@ -275,15 +275,6 @@ You may set values for the following keys:
:Type: Integer
:Version: ``0.54`` and above

.. _crash_replay_interval:

``crash_replay_interval``

:Description: The number of seconds to allow clients to replay acknowledged,
but uncommitted requests.

:Type: Integer

.. _pg_num:

``pg_num``
@@ -573,12 +564,6 @@ You may get values for the following keys:
:Type: Integer
:Version: ``0.54`` and above

``crash_replay_interval``

:Description: see crash_replay_interval_

:Type: Integer

``pg_num``

:Description: see pg_num_
7 changes: 1 addition & 6 deletions qa/workunits/cephtool/test.sh
@@ -801,11 +801,6 @@ function test_mon_mds()
# the "current_epoch + 1" checking below if they're generating updates
fail_all_mds $FS_NAME

# Check for default crash_replay_interval set automatically in 'fs new'
#This may vary based on ceph.conf (e.g., it's 5 in teuthology runs)
#ceph osd dump | grep fs_data > $TMPFILE
#check_response "crash_replay_interval 45 "

ceph mds compat show
expect_false ceph mds deactivate 2
ceph mds dump
@@ -1441,7 +1436,7 @@ function test_mon_osd_pool_set()
wait_for_clean
ceph osd pool get $TEST_POOL_GETSET all

for s in pg_num pgp_num size min_size crash_replay_interval crush_rule crush_ruleset; do
for s in pg_num pgp_num size min_size crush_rule crush_ruleset; do
ceph osd pool get $TEST_POOL_GETSET $s
done

2 changes: 1 addition & 1 deletion qa/workunits/rest/test.py
@@ -398,7 +398,7 @@ def expect_nofail(url, method, respcode, contenttype, extra_hdrs=None,
expect('osd/reweight?id=0&weight=-1', 'PUT', 400, '')
expect('osd/reweight?id=0&weight=1', 'PUT', 200, '')

for v in ['pg_num', 'pgp_num', 'size', 'min_size', 'crash_replay_interval',
for v in ['pg_num', 'pgp_num', 'size', 'min_size',
'crush_ruleset']:
r = expect('osd/pool/get.json?pool=rbd&var=' + v, 'GET', 200, 'json')
assert(v in r.myjson['output'])
16 changes: 0 additions & 16 deletions src/mon/MDSMonitor.cc
@@ -1643,22 +1643,6 @@ int MDSMonitor::management_command(
return r;
}

// Automatically set crash_replay_interval on data pool if it
// isn't already set.
if (data_pool->get_crash_replay_interval() == 0) {
// We will be changing osdmon's state and requesting the osdmon to
// propose. We thus need to make sure the osdmon is writeable before
// we do this, waiting if it's not.
if (!mon->osdmon()->is_writeable()) {
mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
return -EAGAIN;
}

r = mon->osdmon()->set_crash_replay_interval(data, g_conf->osd_default_data_pool_replay_window);
assert(r == 0); // We just did get_pg_pool so it must exist and be settable
request_proposal(mon->osdmon());
}

// All checks passed, go ahead and create.
create_new_fs(pending_fsmap, fs_name, metadata, data);
ss << "new fs with metadata pool " << metadata << " and data pool " << data;
28 changes: 0 additions & 28 deletions src/mon/OSDMonitor.cc
@@ -5125,34 +5125,6 @@ int OSDMonitor::parse_osd_id(const char *s, stringstream *pss)
}


/**
* Special setter for crash_replay_interval on a pool. Equivalent to
* using prepare_command_pool_set, but in a form convenient for use
* from MDSMonitor rather than from an administrative command.
*/
int OSDMonitor::set_crash_replay_interval(const int64_t pool_id, const uint32_t cri)
{
pg_pool_t p;
if (pending_inc.new_pools.count(pool_id)) {
p = pending_inc.new_pools[pool_id];
} else {
const pg_pool_t *p_ptr = osdmap.get_pg_pool(pool_id);
if (p_ptr == NULL) {
return -ENOENT;
} else {
p = *p_ptr;
}
}

dout(10) << "Set pool " << pool_id << " crash_replay_interval=" << cri << dendl;
p.crash_replay_interval = cri;
p.last_change = pending_inc.epoch;
pending_inc.new_pools[pool_id] = p;

return 0;
}


int OSDMonitor::prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
stringstream& ss)
{
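The deleted set_crash_replay_interval() followed the standard OSDMonitor staging pattern: take the pool from pending_inc.new_pools if it is already being modified in this proposal round, otherwise from the committed osdmap, mutate it, and write it back into pending_inc.new_pools for the next proposal. A self-contained toy model of that pattern, using hypothetical types rather than the Ceph ones:

```cpp
#include <cstdint>
#include <iostream>
#include <map>

// Toy stand-ins for pg_pool_t and the pending incremental map.
struct ToyPool {
  uint32_t crash_replay_interval = 0;
  uint32_t last_change = 0;
};

struct ToyPendingInc {
  uint32_t epoch = 0;
  std::map<int64_t, ToyPool> new_pools;   // staged, not yet committed
};

int set_interval(const std::map<int64_t, ToyPool>& committed,
                 ToyPendingInc& pending, int64_t pool_id, uint32_t cri) {
  ToyPool p;
  auto staged = pending.new_pools.find(pool_id);
  if (staged != pending.new_pools.end()) {
    p = staged->second;                   // already staged this round
  } else {
    auto it = committed.find(pool_id);
    if (it == committed.end())
      return -2;                          // -ENOENT in the real code
    p = it->second;
  }
  p.crash_replay_interval = cri;
  p.last_change = pending.epoch;          // epoch at which the change lands
  pending.new_pools[pool_id] = p;         // stage for the next proposal
  return 0;
}

int main() {
  std::map<int64_t, ToyPool> committed;
  committed[1] = ToyPool();
  ToyPendingInc pending;
  pending.epoch = 42;
  std::cout << set_interval(committed, pending, 1, 45) << std::endl;  // prints 0
  return 0;
}
```

As the removed comment notes, the setter was equivalent to prepare_command_pool_set(), which remains.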
1 change: 0 additions & 1 deletion src/mon/OSDMonitor.h
@@ -411,7 +411,6 @@ class OSDMonitor : public PaxosService {
bool prepare_command(MonOpRequestRef op);
bool prepare_command_impl(MonOpRequestRef op, map<string,cmd_vartype>& cmdmap);

int set_crash_replay_interval(const int64_t pool_id, const uint32_t cri);
int prepare_command_pool_set(map<string,cmd_vartype> &cmdmap,
stringstream& ss);

55 changes: 3 additions & 52 deletions src/osd/OSD.cc
@@ -1716,7 +1716,6 @@ OSD::OSD(CephContext *cct_, ObjectStore *store_,
cct->_conf->osd_command_thread_timeout,
cct->_conf->osd_command_thread_suicide_timeout,
&command_tp),
replay_queue_lock("OSD::replay_queue_lock"),
remove_wq(
cct,
store,
@@ -2521,8 +2520,6 @@ void OSD::create_logger()
"Client write operations"); // client writes
osd_plb.add_u64_counter(l_osd_op_w_inb, "op_w_in_bytes",
"Client data written"); // client write in bytes
osd_plb.add_time_avg(l_osd_op_w_rlat, "op_w_rlat",
"Client write operation readable/applied latency"); // client write readable/applied latency
osd_plb.add_time_avg(l_osd_op_w_lat, "op_w_latency",
"Latency of write operation (including queue time)"); // client write latency
osd_plb.add_time_avg(l_osd_op_w_process_lat, "op_w_process_latency",
@@ -2535,8 +2532,6 @@ void OSD::create_logger()
"Client read-modify-write operations write in"); // client rmw in bytes
osd_plb.add_u64_counter(l_osd_op_rw_outb,"op_rw_out_bytes",
"Client read-modify-write operations read out "); // client rmw out bytes
osd_plb.add_time_avg(l_osd_op_rw_rlat,"op_rw_rlat",
"Client read-modify-write operation readable/applied latency"); // client rmw readable/applied latency
osd_plb.add_time_avg(l_osd_op_rw_lat, "op_rw_latency",
"Latency of read-modify-write operation (including queue time)"); // client rmw latency
osd_plb.add_time_avg(l_osd_op_rw_process_lat, "op_rw_process_latency",
@@ -4389,10 +4384,6 @@ void OSD::tick()
start_boot();
}

if (is_active()) {
check_replay_queue();
}

do_waiters();

tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
@@ -8452,44 +8443,6 @@ void OSD::_remove_pg(PG *pg)
// =========================================================
// RECOVERY

/*
* caller holds osd_lock
*/
void OSD::check_replay_queue()
{
assert(osd_lock.is_locked());

utime_t now = ceph_clock_now();
list< pair<spg_t,utime_t> > pgids;
replay_queue_lock.Lock();
while (!replay_queue.empty() &&
replay_queue.front().second <= now) {
pgids.push_back(replay_queue.front());
replay_queue.pop_front();
}
replay_queue_lock.Unlock();

for (list< pair<spg_t,utime_t> >::iterator p = pgids.begin(); p != pgids.end(); ++p) {
spg_t pgid = p->first;
pg_map_lock.get_read();
if (pg_map.count(pgid)) {
PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
pg_map_lock.unlock();
dout(10) << "check_replay_queue " << *pg << dendl;
if ((pg->is_active() || pg->is_activating()) &&
pg->is_replay() &&
pg->is_primary() &&
pg->replay_until == p->second) {
pg->replay_queued_ops();
}
pg->unlock();
} else {
pg_map_lock.unlock();
dout(10) << "check_replay_queue pgid " << pgid << " (not found)" << dendl;
}
}
}

void OSDService::_maybe_queue_recovery() {
assert(recovery_lock.is_locked_by_me());
uint64_t available_pushes;
@@ -8864,10 +8817,8 @@ void OSD::handle_replica_op(OpRequestRef& op, OSDMapRef& osdmap)
bool OSD::op_is_discardable(MOSDOp *op)
{
// drop client request if they are not connected and can't get the
// reply anyway. unless this is a replayed op, in which case we
// want to do what we can to apply it.
if (!op->get_connection()->is_connected() &&
op->get_version().version == 0) {
// reply anyway.
if (!op->get_connection()->is_connected()) {
return true;
}
return false;
@@ -9337,7 +9288,7 @@ int OSD::init_op_flags(OpRequestRef& op)
iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
/* This is a bit odd. PING isn't actually a write. It can't
* result in an update to the object_info. PINGs also aren't
* replayed, so there's no reason to write out a log entry
* resent, so there's no reason to write out a log entry
*
* However, we pipeline them behind writes, so let's force
* the write_ordered flag.
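For context on check_replay_queue(), deleted above along with its call from tick(): it periodically drained a queue of (pgid, replay_until) entries and re-queued the buffered ops of any PG still in replay. A toy, self-contained model of that drain pattern, with hypothetical types rather than the Ceph ones:

```cpp
#include <chrono>
#include <iostream>
#include <list>
#include <mutex>
#include <string>
#include <utility>
#include <vector>

using Clock = std::chrono::steady_clock;
using Entry = std::pair<std::string, Clock::time_point>;  // (pgid, replay_until)

std::mutex replay_queue_lock;
std::list<Entry> replay_queue;

// Pop every entry whose deadline has passed while holding the queue lock;
// callers act on the popped entries after the lock is dropped, as the
// removed check_replay_queue() did before taking per-PG locks.
std::vector<Entry> drain_due(Clock::time_point now) {
  std::vector<Entry> due;
  std::lock_guard<std::mutex> l(replay_queue_lock);
  while (!replay_queue.empty() && replay_queue.front().second <= now) {
    due.push_back(replay_queue.front());
    replay_queue.pop_front();
  }
  return due;
}

int main() {
  auto now = Clock::now();
  {
    std::lock_guard<std::mutex> l(replay_queue_lock);
    replay_queue.push_back({"1.0", now - std::chrono::seconds(1)});   // overdue
    replay_queue.push_back({"1.1", now + std::chrono::seconds(30)});  // not yet due
  }
  for (const auto& e : drain_due(now))
    std::cout << "would replay queued ops for pg " << e.first << std::endl;
  return 0;
}
```

Since acks are never sent, there is nothing for a reconnecting client to replay, so the queue, its lock, and the replay check in op_is_discardable() all go.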
8 changes: 1 addition & 7 deletions src/osd/OSD.h
@@ -73,14 +73,12 @@ enum {
l_osd_op_r_prepare_lat,
l_osd_op_w,
l_osd_op_w_inb,
l_osd_op_w_rlat,
l_osd_op_w_lat,
l_osd_op_w_process_lat,
l_osd_op_w_prepare_lat,
l_osd_op_rw,
l_osd_op_rw_inb,
l_osd_op_rw_outb,
l_osd_op_rw_rlat,
l_osd_op_rw_lat,
l_osd_op_rw_process_lat,
l_osd_op_rw_prepare_lat,
@@ -975,7 +973,7 @@ class OSDService {
}
}
}
// replay / delayed pg activation
// delayed pg activation
void queue_for_recovery(PG *pg, bool front = false) {
Mutex::Locker l(recovery_lock);
if (front) {
@@ -2313,10 +2311,6 @@ class OSD : public Dispatcher,
void do_recovery(PG *pg, epoch_t epoch_queued, uint64_t pushes_reserved,
ThreadPool::TPHandle &handle);

Mutex replay_queue_lock;
list< pair<spg_t, utime_t > > replay_queue;

void check_replay_queue();

// -- scrubbing --
void sched_scrub();