-
Notifications
You must be signed in to change notification settings - Fork 223
OB_ERR_PRIMARY_KEY_DUPLICATE when doing compaction #1205
Description
Describe this problem
In our deployments that use OBKV as the WAL implementation, we found the following errors:
2023-09-11 10:24:03.322 ERRO [proxy/src/grpc/write.rs:33] Failed to handle write, err:Internal error, msg:Failed to execute interpreter, err:Failed to execute insert, err:Failed to write table, err:Failed to wri
te tables, table:adrtbcoreV2__DEFAULT__1__DEFAULT_goc_ctr_latency_us_bucket, err:Failed to flush table, table:adrtbcoreV2__DEFAULT__1__DEFAULT_goc_ctr_latency_us_bucket, err:Background flush failed, cannot write
more data, retry_count:20, err:Failed to run flush job, msg:Some("table:adrtbcoreV2__DEFAULT__1__DEFAULT_goc_ctr_latency_us_bucket, table_id:4449, request_id:607920880"), err:Failed to alloc file id, err:Failed
to alloc file id, err:Failed to write update to wal, err:Failed to write log entries, err:Failed to write table unit, namespace:manifest, wal location:WalLocation { region_id: 99, table_id: 4449 }, err:Failed t
o write log to table, region_id:99, err:Failed to write to table, table:wal_manifest_permanent_000099, err:Common error, code:ObException(OB_ERR_PRIMARY_KEY_DUPLICATE), err:OBKV server return exception in batch
response: ObTableOperationResult { base: BasePayLoad { channel_id: 1262076206, version: 1, timeout: 10000, flag: 7 }, header: ObTableResult { base: BasePayLoad { channel_id: 1262076207, version: 1, timeout: 1000
0, flag: 7 }, errorno: -5024, sql_state: [0], msg: [0] }, operation_type: Insert, entity: ObTableEntity { base: BasePayLoad { channel_id: 1262076208, version: 1, timeout: 10000, flag: 7 }, row_key: ObRowKey { ke
ys: [] }, properties: {} }, affected_rows: 0 }..
Backtrace:
0 <snafu::backtrace_shim::Backtrace as snafu::GenerateBacktrace>::generate::h02217cd51b8458df
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/snafu-0.6.10/src/backtrace_shim.rs:15
<table_kv::obkv::WriteTable<__T0> as snafu::IntoError<table_kv::obkv::Error>>::into_error::hb2712bb104fe5bf5
/home/db/ceresdb/components/table_kv/src/obkv.rs:35
<core::result::Result<T,E> as snafu::ResultExt<T,E>>::context::{{closure}}::ha945fa61a9d0fdc4
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/snafu-0.6.10/src/lib.rs:318
core::result::Result<T,E>::map_err::h2c67e337ccf5ca8b
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/core/src/result.rs:860
<core::result::Result<T,E> as snafu::ResultExt<T,E>>::context::h79ba9da8f5cc9087
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/snafu-0.6.10/src/lib.rs:318
1 <table_kv::obkv::ObkvImpl as table_kv::TableKv>::write::h4c7f29ac2567028a
/home/db/ceresdb/components/table_kv/src/obkv.rs:503
2 wal::table_kv_impl::table_unit::TableUnitWriter::write_log::{{closure}}::{{closure}}::h374597f853fb1ebd
/home/db/ceresdb/wal/src/table_kv_impl/table_unit.rs:972
<tokio::runtime::blocking::task::BlockingTask<T> as core::future::future::Future>::poll::hd732123b7ce277e0 /home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/blocking/task.rs:42
tokio::runtime::task::core::Core<T,S>::poll::{{closure}}::h7bc8dab5bff1edd8 /home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/core.rs:311 tokio::loom::std::unsafe_cell::UnsafeCell<T>::with_mut::h874e5b1272fa4233
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/loom/std/unsafe_cell.rs:14 tokio::runtime::task::core::Core<T,S>::poll::h004175d454d51de2 /home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/core.rs:300 tokio::runtime::task::harness::poll_future::{{closure}}::h214bee1116474500
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/harness.rs:476
<core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::h8283513884f3bd25
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/core/src/panic/unwind_safe.rs:271
std::panicking::try::do_call::h35e5c17abc1297f6
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/panicking.rs:483
std::panicking::try::h3476acde4fde0c11
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/panicking.rs:447
std::panic::catch_unwind::hc0a7aa6a07741dbc
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/panic.rs:140
tokio::runtime::task::harness::poll_future::h4d9ff312aa07f229
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/harness.rs:464
tokio::runtime::task::harness::Harness<T,S>::poll_inner::h4a3e33931b374525
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/harness.rs:198
tokio::runtime::task::harness::Harness<T,S>::poll::h33185797ff335d5d
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/harness.rs:152
tokio::runtime::task::raw::poll::hee93371fc98f2eb6
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/raw.rs:276
3 tokio::runtime::task::raw::RawTask::poll::heabf9c70e6fd77a4
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/raw.rs:200
tokio::runtime::task::UnownedTask<S>::run::h71abe9846161356c
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/task/mod.rs:437
tokio::runtime::blocking::pool::Task::run::h447d1827b63320cd
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/blocking/pool.rs:159
tokio::runtime::blocking::pool::Inner::run::h512f4d4d040e4562
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/blocking/pool.rs:513
tokio::runtime::blocking::pool::Spawner::spawn_thread::{{closure}}::h136aab7e70ce0684
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.29.1/src/runtime/blocking/pool.rs:471
std::sys_common::backtrace::__rust_begin_short_backtrace::h0fdc9691673882ce
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/sys_common/backtrace.rs:121
4 std::thread::Builder::spawn_unchecked_::{{closure}}::{{closure}}::hafa9124fedea5fb8
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/thread/mod.rs:558
<core::panic::unwind_safe::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once::hd98e2f10cf2e1f38
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/core/src/panic/unwind_safe.rs:271
std::panicking::try::do_call::h46b8f1ab94cefb1f
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/panicking.rs:483
std::panicking::try::h347f95a1f7845419
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/panicking.rs:447
std::panic::catch_unwind::h5dd7ee5b5feeb457
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/panic.rs:140
std::thread::Builder::spawn_unchecked_::{{closure}}::h2561cf64a40e49e6
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/thread/mod.rs:557
core::ops::function::FnOnce::call_once{{vtable.shim}}::h46fa823051c99d1e
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/core/src/ops/function.rs:250
5 <alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once::hc8beb91c5e39b692
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/alloc/src/boxed.rs:1988
<alloc::boxed::Box<F,A> as core::ops::function::FnOnce<Args>>::call_once::h20e58dce1054acc4
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/alloc/src/boxed.rs:1988
std::sys::unix::thread::Thread::new::thread_start::h848946a57aa81736
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/std/src/sys/unix/thread.rs:108
6 start_thread
7 __clone
Backtrace:
0 <snafu::backtrace_shim::Backtrace as snafu::GenerateBacktrace>::generate::h02217cd51b8458df
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/snafu-0.6.10/src/backtrace_shim.rs:15
<wal::manager::error::Write as snafu::IntoError<wal::manager::error::Error>>::into_error::hf5c49394fa68e61d
/home/db/ceresdb/wal/src/manager.rs:41
1 <core::result::Result<T,E> as snafu::ResultExt<T,E>>::context::{{closure}}::h557595a434999597
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/snafu-0.6.10/src/lib.rs:318
core::result::Result<T,E>::map_err::h10c8702abcd0713f
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/core/src/result.rs:860
<core::result::Result<T,E> as snafu::ResultExt<T,E>>::context::h2728c3a6b3d1a399
/home/db/.cargo/registry/src/github.com-1ecc6299db9ec823/snafu-0.6.10/src/lib.rs:318
<wal::table_kv_impl::wal::WalNamespaceImpl<T> as wal::manager::WalManager>::write::{{closure}}::hace34fa1b277d26d
/home/db/ceresdb/wal/src/table_kv_impl/wal.rs:175
2 <core::pin::Pin<P> as core::future::future::Future>::poll::haffa190cb0c71e7d
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/core/src/future/future.rs:125
<analytic_engine::manifest::details::WalBasedLogStore as analytic_engine::manifest::details::MetaUpdateLogStore>::append::{{closure}}::h4696a77b8bdfbd84
/home/db/ceresdb/analytic_engine/src/manifest/details.rs:723
3 <core::pin::Pin<P> as core::future::future::Future>::poll::h3b85d708da0a520b
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/core/src/future/future.rs:125
analytic_engine::manifest::details::ManifestImpl::store_update_to_wal::{{closure}}::h75d609256d00d067
/home/db/ceresdb/analytic_engine/src/manifest/details.rs:442
<analytic_engine::manifest::details::ManifestImpl as analytic_engine::manifest::Manifest>::apply_edit::{{closure}}::h0f6eb57b366fa8dc
/home/db/ceresdb/analytic_engine/src/manifest/details.rs:502
4 <core::pin::Pin<P> as core::future::future::Future>::poll::h6b95f43fd3e5a084
/rustc/11d96b59307b1702fffe871bfc2d0145d070881e/library/core/src/future/future.rs:125
analytic_engine::table::data::TableData::persist_max_file_id::{{closure}}::h3408c2f626b3cd19
/home/db/ceresdb/analytic_engine/src/table/data.rs:551
analytic_engine::table::data::TableData::alloc_file_id::{{closure}}::{{closure}}::{{closure}}::h02b6a214ddc98868
/home/db/ceresdb/analytic_engine/src/table/data.rs:520
id_allocator::Inner::alloc_id::{{closure}}::h1964701c5b592986
/home/db/ceresdb/components/id_allocator/src/lib.rs:52
id_allocator::IdAllocator::alloc_id::{{closure}}::h22881461833e70d2
/home/db/ceresdb/components/id_allocator/src/lib.rs:80
analytic_engine::table::data::TableData::alloc_file_id::{{closure}}::h1c839670dd38ad8d
/home/db/ceresdb/analytic_engine/src/table/data.rs:525
5 analytic_engine::instance::flush_compaction::FlushTask::dump_normal_memtable::{{closure}}::h31660e44e945a85b
/home/db/ceresdb/analytic_engine/src/instance/flush_compaction.rs:620
analytic_engine::instance::flush_compaction::FlushTask::dump_memtables::{{closure}}::h897337f24183c2ab
/home/db/ceresdb/analytic_engine/src/instance/flush_compaction.rs:384
analytic_engine::instance::flush_compaction::FlushTask::run::{{closure}}::h6d1391310def4ea6
/home/db/ceresdb/analytic_engine/src/instance/flush_compaction.rs:280
analytic_engine::instance::flush_compaction::Flusher::schedule_table_flush::{{closure}}::{{closure}}::h0ce1ae1039065eee
/home/db/ceresdb/analytic_engine/src/instance/flush_compaction.rs:252
analytic_engine::instance::serial_executor::TableFlushScheduler::flush_sequentially::{{closure}}::{{closure}}::h1aae6e2506874b14
/home/db/ceresdb/analytic_engine/src/instance/serial_executor.rs:219
Server version
$ ceresdb-server --version
CeresDB Server
Version: 1.2.6-alpha
Git commit: 7f8faff
Git branch: main
Opt level: 3
Rustc version: 1.69.0-nightly
Target: x86_64-unknown-linux-gnu
Build date: 2023-08-17T09:04:30.077953531Z
Steps to reproduce
After some discussion with @ShiKaiWi and @Rachelint: this problem arises when a shard is closed while compaction tasks for tables belonging to that shard are still running. As a result, two nodes (the node that is closing the shard and the node that opens it next) end up writing to the same manifest file.
Expected behavior
No error
Additional Information
In theory, when a close-shard request is received, the node should release all resources (such as WAL, manifest, and object_store) before it finishes closing the shard. Only after all of those resources have been released can the shard be opened on a new node.