Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

blk/aio: fix long batch (64+K entries) submission. #56352

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
50 changes: 30 additions & 20 deletions src/blk/aio/aio.cc
Expand Up @@ -16,7 +16,7 @@ std::ostream& operator<<(std::ostream& os, const aio_t& aio)
}

int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
uint16_t aios_size, void *priv,
void *priv,
int *retries)
{
// 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
Expand All @@ -25,33 +25,43 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
int r;

aio_iter cur = begin;
struct aio_t *piocb[aios_size];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem, as I understand it, is very simple: aios_size overflowed.
I think the solution should be to either:

  1. change uint16_t to uint32_t (preferred)
  2. count elements between begin and end; use alloca to allocate piocb.

The only benefit of the current solution is that it minimizes stack allocation, but at the cost of added complexity. I do not think it's worth it.
Note that for every entry in piocb we will be reading at least 4K from the drive. I think we can afford having the entire piocb on the stack.

Copy link
Contributor Author

@ifed01 ifed01 Apr 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@aclamk - you might be missing one more aspect - the number of chunks (and hence the number of IOs issued in a single batch) in an object is determined (to some degree) by the cluster's users. So it becomes possible for them to use "bad" objects which could DDoS OSDs or do something inappropriate.
And this patch provides a sort of protection against that - at the aio level, at least.

Given that I'd prefer not to [ab]use stack for this task.

int left = 0;
while (cur != end) {
cur->priv = priv;
*(piocb+left) = &(*cur);
++left;
++cur;
}
ceph_assert(aios_size >= left);
#if defined(HAVE_LIBAIO)
struct aio_t *piocb[max_iodepth];
#endif
int done = 0;
while (left > 0) {
int pushed = 0; //used for LIBAIO only
int pulled = 0;
while (cur != end || pushed < pulled) {
#if defined(HAVE_LIBAIO)
r = io_submit(ctx, std::min(left, max_iodepth), (struct iocb**)(piocb + done));
while (cur != end && pulled < max_iodepth) {
cur->priv = priv;
piocb[pulled] = &(*cur);
++pulled;
++cur;
}
int toSubmit = pulled - pushed;
r = io_submit(ctx, toSubmit, (struct iocb**)(piocb + pushed));
if (r >= 0 && r < toSubmit) {
pushed += r;
done += r;
r = -EAGAIN;
}
#elif defined(HAVE_POSIXAIO)
if (piocb[done]->n_aiocb == 1) {
cur->priv = priv;
if ((cur->n_aiocb == 1) {
// TODO: consider batching multiple reads together with lio_listio
piocb[done]->aio.aiocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
piocb[done]->aio.aiocb.aio_sigevent.sigev_notify_kqueue = ctx;
piocb[done]->aio.aiocb.aio_sigevent.sigev_value.sival_ptr = piocb[done];
r = aio_read(&piocb[done]->aio.aiocb);
cur->aio.aiocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
cur->aio.aiocb.aio_sigevent.sigev_notify_kqueue = ctx;
cur->aio.aiocb.aio_sigevent.sigev_value.sival_ptr = &(*cur);
r = aio_write(&cur->aio.aiocb);
} else {
struct sigevent sev;
sev.sigev_notify = SIGEV_KEVENT;
sev.sigev_notify_kqueue = ctx;
sev.sigev_value.sival_ptr = piocb[done];
r = lio_listio(LIO_NOWAIT, &piocb[done]->aio.aiocbp, piocb[done]->n_aiocb, &sev);
sev.sigev_value.sival_ptr = &(*cur);
r = lio_listio(LIO_NOWAIT, &cur->aio.aiocbp, cur->n_aiocb, &sev);
}
++cur;
#endif
if (r < 0) {
if (r == -EAGAIN && attempts-- > 0) {
Expand All @@ -64,9 +74,9 @@ int aio_queue_t::submit_batch(aio_iter begin, aio_iter end,
}
ceph_assert(r > 0);
done += r;
left -= r;
attempts = 16;
delay = 125;
pushed = pulled = 0;
}
return done;
}
Expand Down
4 changes: 2 additions & 2 deletions src/blk/aio/aio.h
Expand Up @@ -100,7 +100,7 @@ struct io_queue_t {

virtual int init(std::vector<int> &fds) = 0;
virtual void shutdown() = 0;
virtual int submit_batch(aio_iter begin, aio_iter end, uint16_t aios_size,
virtual int submit_batch(aio_iter begin, aio_iter end,
void *priv, int *retries) = 0;
virtual int get_next_completed(int timeout_ms, aio_t **paio, int max) = 0;
};
Expand Down Expand Up @@ -153,7 +153,7 @@ struct aio_queue_t final : public io_queue_t {
}
}

int submit_batch(aio_iter begin, aio_iter end, uint16_t aios_size,
int submit_batch(aio_iter begin, aio_iter end,
void *priv, int *retries) final;
int get_next_completed(int timeout_ms, aio_t **paio, int max) final;
};
8 changes: 3 additions & 5 deletions src/blk/kernel/KernelDevice.cc
Expand Up @@ -344,11 +344,11 @@ void KernelDevice::close()
extblkdev::release_device(ebd_impl);

for (int i = 0; i < WRITE_LIFE_MAX; i++) {
assert(fd_directs[i] >= 0);
ceph_assert(fd_directs[i] >= 0);
VOID_TEMP_FAILURE_RETRY(::close(fd_directs[i]));
fd_directs[i] = -1;

assert(fd_buffereds[i] >= 0);
ceph_assert(fd_buffereds[i] >= 0);
VOID_TEMP_FAILURE_RETRY(::close(fd_buffereds[i]));
fd_buffereds[i] = -1;
}
Expand Down Expand Up @@ -910,10 +910,8 @@ void KernelDevice::aio_submit(IOContext *ioc)

void *priv = static_cast<void*>(ioc);
int r, retries = 0;
// num of pending aios should not overflow when passed to submit_batch()
assert(pending <= std::numeric_limits<uint16_t>::max());
r = io_queue->submit_batch(ioc->running_aios.begin(), e,
pending, priv, &retries);
priv, &retries);

if (retries)
derr << __func__ << " retries " << retries << dendl;
Expand Down
3 changes: 1 addition & 2 deletions src/blk/kernel/io_uring.cc
Expand Up @@ -176,10 +176,9 @@ void ioring_queue_t::shutdown()
}

int ioring_queue_t::submit_batch(aio_iter beg, aio_iter end,
uint16_t aios_size, void *priv,
void *priv,
int *retries)
{
(void)aios_size;
(void)retries;

pthread_mutex_lock(&d->sq_mutex);
Expand Down
2 changes: 1 addition & 1 deletion src/blk/kernel/io_uring.h
Expand Up @@ -27,7 +27,7 @@ struct ioring_queue_t final : public io_queue_t {
int init(std::vector<int> &fds) final;
void shutdown() final;

int submit_batch(aio_iter begin, aio_iter end, uint16_t aios_size,
int submit_batch(aio_iter begin, aio_iter end,
void *priv, int *retries) final;
int get_next_completed(int timeout_ms, aio_t **paio, int max) final;
};
149 changes: 149 additions & 0 deletions src/test/objectstore/store_test.cc
Expand Up @@ -7734,6 +7734,155 @@ TEST_P(StoreTestSpecificAUSize, BlobReuseOnOverwrite) {
}
}

ifed01 marked this conversation as resolved.
Show resolved Hide resolved
TEST_P(StoreTestSpecificAUSize, ManyManyExtents) {

  if (string(GetParam()) != "bluestore")
    return;

  // Run the store with a 4K allocation unit.
  size_t block_size = 4096;
  StartDeferred(block_size);

  int ret;
  coll_t cid;
  ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));

  const PerfCounters* logger = store->get_perf_counters();

  auto ch = store->create_new_collection(cid);
  {
    ObjectStore::Transaction t;
    t.create_collection(cid, 0);
    ret = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(ret, 0);
  }

  // One 1-byte write per 4K block: 129 * 512 = 66048 sparse extents,
  // i.e. more than 2^16 logical extents on a single object.
  const size_t max_iterations = 129;
  const size_t max_txn_ops = 512;
  bufferlist bl;
  for (size_t i = 0; i < max_iterations; i++) {
    ObjectStore::Transaction t;
    for (size_t j = 0; j < max_txn_ops; j++) {
      bl.clear();
      bl.append(std::string(1, 'a' + j % 26));
      t.write(cid, hoid, (i * max_txn_ops + j) * 4096, bl.length(), bl,
              CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
    }
    ret = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(ret, 0);
    cerr << "iter " << i << "/" << max_iterations - 1 << std::endl;
  }

  // Remount so the object is read back through freshly loaded metadata.
  ch.reset();
  store->umount();
  store->mount();
  ch = store->open_collection(cid);
  {
    bl.clear();
    size_t len = (max_iterations * max_txn_ops) * 4096 - 4095;
    cerr << "reading in a single chunk, size =" << len << std::endl;
    ret = store->read(ch, hoid,
                      0, len,
                      bl, CEPH_OSD_OP_FLAG_FADVISE_DONTNEED);
    ASSERT_EQ(ret, len);
    ASSERT_EQ(ret, bl.length());
    // Check the payload byte at the start of every 4K block.
    size_t pos = 0;
    for (size_t i = 0; i < max_iterations; i++) {
      for (size_t j = 0; j < max_txn_ops; j++) {
        ASSERT_EQ(bl[pos], 'a' + j % 26);
        pos += 4096;
      }
    }
  }

  // Remount again and read back the same data extent-by-extent via readv().
  ch.reset();
  store->umount();
  store->mount();
  ch = store->open_collection(cid);
  {
    cerr << "reading in multiple chunks..." << std::endl;
    bl.clear();
    store->fiemap(ch, hoid, 0, 1ull << 31, bl);
    map<uint64_t, uint64_t> extent_map;
    auto it = bl.cbegin();
    decode(extent_map, it);

    bl.clear();
    interval_set<uint64_t> to_read(std::move(extent_map));
    ret = store->readv(ch, hoid, to_read, bl, 0);
    // readv returns just the written bytes, densely packed.
    ASSERT_EQ(ret, max_txn_ops * max_iterations);
    ASSERT_EQ(ret, bl.length());
    size_t pos = 0;
    for (size_t i = 0; i < max_iterations; i++) {
      for (size_t j = 0; j < max_txn_ops; j++) {
        ASSERT_EQ(bl[pos++], 'a' + j % 26);
      }
    }
  }

  store->refresh_perf_counters();
  cerr << "blobs = " << logger->get(l_bluestore_blobs)
       << " extents = " << logger->get(l_bluestore_extents)
       << std::endl;

  {
    ObjectStore::Transaction t;
    t.remove(cid, hoid);
    t.remove_collection(cid);
    cerr << "Cleaning" << std::endl;
    ret = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(ret, 0);
  }
}

TEST_P(StoreTestSpecificAUSize, ManyManyExtents2) {

  if (string(GetParam()) != "bluestore")
    return;

  // Run the store with a 4K allocation unit.
  size_t block_size = 4096;
  StartDeferred(block_size);

  int ret;
  coll_t cid;
  ghobject_t hoid(hobject_t("test", "", CEPH_NOSNAP, 0, -1, ""));

  auto ch = store->create_new_collection(cid);
  {
    ObjectStore::Transaction t;
    t.create_collection(cid, 0);
    ret = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(ret, 0);
  }
  // Lay down a single contiguous 1MB object.
  {
    ObjectStore::Transaction t;
    bufferlist bl;
    bl.append(std::string(1024 * 1024, 'a'));
    t.write(cid, hoid, 0, bl.length(), bl, 0);
    ret = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(ret, 0);
  }
  // Remount so the read below goes through freshly loaded metadata.
  ch.reset();
  store->umount();
  store->mount();
  ch = store->open_collection(cid);
  {
    cerr << "reading in multiple chunks..." << std::endl;
    bufferlist bl;
    // 100000 disjoint 1-byte intervals at every even offset: a single
    // readv request with far more than 2^16 entries.
    interval_set<uint64_t> to_read;
    for (int i = 0; i < 100000; i++) {
      to_read.insert(i * 2, 1);
    }
    ret = store->readv(ch, hoid, to_read, bl, 0);
    ASSERT_EQ(ret, 100000);
    ASSERT_EQ(ret, bl.length());
  }
  store->refresh_perf_counters();
  {
    ObjectStore::Transaction t;
    t.remove(cid, hoid);
    t.remove_collection(cid);
    cerr << "Cleaning" << std::endl;
    ret = queue_transaction(store, ch, std::move(t));
    ASSERT_EQ(ret, 0);
  }
}

TEST_P(StoreTestSpecificAUSize, ZeroBlockDetectionSmallAppend) {
CephContext *cct = (new CephContext(CEPH_ENTITY_TYPE_CLIENT))->get();
if (string(GetParam()) != "bluestore" || !cct->_conf->bluestore_zero_block_detection) {
Expand Down