From 2cd5139768858b579ee1091b03ef71f434d0f959 Mon Sep 17 00:00:00 2001
From: Filipe David Borba Manana
Date: Tue, 31 May 2011 10:04:20 +0000
Subject: [PATCH] More efficient term size calculation

Unlike byte_size(term_to_binary(Term)), the BIF erlang:external_size/1
does not perform the serialization step; it only calculates the maximum
size the term can occupy in the external format, which is more efficient
(faster and avoids generating garbage).

With the test couch_http_bulk_writes.sh at [1], using 20 writers and
batches of 100 1 KB documents, it is possible to write about 1 400 000
documents with this patch instead of about 1 300 000.

[1] https://github.com/fdmanana/basho_bench_couch

git-svn-id: https://svn.apache.org/repos/asf/couchdb/trunk@1129597 13f79535-47bb-0310-9956-ffa450edef68
---
 src/couchdb/couch_btree.erl          | 20 ++++++++++----------
 src/couchdb/couch_db.hrl             |  6 ++++++
 src/couchdb/couch_db_updater.erl     |  2 +-
 src/couchdb/couch_view_compactor.erl |  4 ++--
 src/couchdb/couch_work_queue.erl     |  2 +-
 5 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/couchdb/couch_btree.erl b/src/couchdb/couch_btree.erl
index e467c3fd..dbc20e73 100644
--- a/src/couchdb/couch_btree.erl
+++ b/src/couchdb/couch_btree.erl
@@ -276,26 +276,26 @@ complete_root(Bt, KPs) ->
 % written. Plus with the "case byte_size(term_to_binary(InList)) of" code
 % it's probably really inefficient.
-chunkify(#btree{compression = Comp} = Bt, InList) ->
-    case byte_size(couch_compress:compress(InList, Comp)) of
+chunkify(InList) ->
+    case ?term_size(InList) of
     Size when Size > ?CHUNK_THRESHOLD ->
         NumberOfChunksLikely = ((Size div ?CHUNK_THRESHOLD) + 1),
         ChunkThreshold = Size div NumberOfChunksLikely,
-        chunkify(Bt, InList, ChunkThreshold, [], 0, []);
+        chunkify(InList, ChunkThreshold, [], 0, []);
     _Else ->
         [InList]
     end.
 
-chunkify(_Bt, [], _ChunkThreshold, [], 0, OutputChunks) ->
+chunkify([], _ChunkThreshold, [], 0, OutputChunks) ->
     lists:reverse(OutputChunks);
-chunkify(_Bt, [], _ChunkThreshold, OutList, _OutListSize, OutputChunks) ->
+chunkify([], _ChunkThreshold, OutList, _OutListSize, OutputChunks) ->
     lists:reverse([lists:reverse(OutList) | OutputChunks]);
-chunkify(Bt, [InElement | RestInList], ChunkThreshold, OutList, OutListSize, OutputChunks) ->
-    case byte_size(couch_compress:compress(InElement, Bt#btree.compression)) of
+chunkify([InElement | RestInList], ChunkThreshold, OutList, OutListSize, OutputChunks) ->
+    case ?term_size(InElement) of
     Size when (Size + OutListSize) > ChunkThreshold andalso OutList /= [] ->
-        chunkify(Bt, RestInList, ChunkThreshold, [], 0, [lists:reverse([InElement | OutList]) | OutputChunks]);
+        chunkify(RestInList, ChunkThreshold, [], 0, [lists:reverse([InElement | OutList]) | OutputChunks]);
     Size ->
-        chunkify(Bt, RestInList, ChunkThreshold, [InElement | OutList], OutListSize + Size, OutputChunks)
+        chunkify(RestInList, ChunkThreshold, [InElement | OutList], OutListSize + Size, OutputChunks)
     end.
 
 modify_node(Bt, RootPointerInfo, Actions, QueryOutput) ->
@@ -350,7 +350,7 @@ get_node(#btree{fd = Fd}, NodePos) ->
 
 write_node(#btree{fd = Fd, compression = Comp} = Bt, NodeType, NodeList) ->
     % split up nodes into smaller sizes
-    NodeListList = chunkify(Bt, NodeList),
+    NodeListList = chunkify(NodeList),
     % now write out each chunk and return the KeyPointer pairs for those nodes
     ResultList = [
     begin
diff --git a/src/couchdb/couch_db.hrl b/src/couchdb/couch_db.hrl
index 7ff1f6ac..0696459d 100644
--- a/src/couchdb/couch_db.hrl
+++ b/src/couchdb/couch_db.hrl
@@ -27,6 +27,12 @@
 -define(b2l(V), binary_to_list(V)).
 -define(l2b(V), list_to_binary(V)).
 -define(term_to_bin(T), term_to_binary(T, [{minor_version, 1}])).
+-define(term_size(T),
+    try
+        erlang:external_size(T)
+    catch _:_ ->
+        byte_size(?term_to_bin(T))
+    end).
 
 -define(DEFAULT_ATTACHMENT_CONTENT_TYPE, <<"application/octet-stream">>).
 
diff --git a/src/couchdb/couch_db_updater.erl b/src/couchdb/couch_db_updater.erl
index 3be4344c..4ac3a168 100644
--- a/src/couchdb/couch_db_updater.erl
+++ b/src/couchdb/couch_db_updater.erl
@@ -888,7 +888,7 @@ copy_compact(Db, NewDb0, Retry) ->
     fun(#doc_info{high_seq=Seq}=DocInfo, _Offset,
             {AccNewDb, AccUncopied, AccUncopiedSize, AccCopiedSize, TotalCopied}) ->
-        AccUncopiedSize2 = AccUncopiedSize + byte_size(?term_to_bin(DocInfo)),
+        AccUncopiedSize2 = AccUncopiedSize + ?term_size(DocInfo),
         if AccUncopiedSize2 >= BufferSize ->
             NewDb2 = copy_docs(
                 Db, AccNewDb, lists:reverse([DocInfo | AccUncopied]), Retry),
diff --git a/src/couchdb/couch_view_compactor.erl b/src/couchdb/couch_view_compactor.erl
index 5edfa2b3..b34692a3 100644
--- a/src/couchdb/couch_view_compactor.erl
+++ b/src/couchdb/couch_view_compactor.erl
@@ -57,7 +57,7 @@ compact_group(Group, EmptyGroup) ->
             Msg = "Duplicates of ~s detected in ~s ~s - rebuild required",
             exit(io_lib:format(Msg, [DocId, DbName, GroupId]));
         true -> ok end,
-        AccSize2 = AccSize + byte_size(?term_to_bin(KV)),
+        AccSize2 = AccSize + ?term_size(KV),
         if AccSize2 >= BufferSize ->
             {ok, Bt2} = couch_btree:add(Bt, lists:reverse([KV|Acc])),
             couch_task_status:update("Copied ~p of ~p Ids (~p%)",
@@ -90,7 +90,7 @@ compact_view(View, EmptyView, BufferSize) ->
 
     %% Key is {Key,DocId}
     Fun = fun(KV, {Bt, Acc, AccSize, TotalCopied}) ->
-        AccSize2 = AccSize + byte_size(?term_to_bin(KV)),
+        AccSize2 = AccSize + ?term_size(KV),
         if AccSize2 >= BufferSize ->
             {ok, Bt2} = couch_btree:add(Bt, lists:reverse([KV|Acc])),
             couch_task_status:update("View #~p: copied ~p of ~p KVs (~p%)",
diff --git a/src/couchdb/couch_work_queue.erl b/src/couchdb/couch_work_queue.erl
index c21b8b5d..6a675861 100644
--- a/src/couchdb/couch_work_queue.erl
+++ b/src/couchdb/couch_work_queue.erl
@@ -42,7 +42,7 @@ new(Options) ->
 
 queue(Wq, Item) when is_binary(Item) ->
     gen_server:call(Wq, {queue, Item, byte_size(Item)}, infinity);
 queue(Wq, Item) ->
-    gen_server:call(Wq, {queue, Item, byte_size(?term_to_bin(Item))}, infinity).
+    gen_server:call(Wq, {queue, Item, ?term_size(Item)}, infinity).
 
 dequeue(Wq) ->
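
For reviewers who want to exercise the behaviour in isolation, below is a small
standalone sketch (not part of the patch; the module name term_size_demo and its
functions are made up for illustration). It mirrors the logic of the new
?term_size macro and compares the estimate from erlang:external_size/1 with the
exact size of the serialized term; the BIF never serializes, so its result is an
upper bound on the encoded size.

%% term_size_demo.erl - illustrative sketch only, not part of this patch.
-module(term_size_demo).
-export([term_size/1, compare/1]).

%% Same idea as the ?term_size macro added to couch_db.hrl: prefer the cheap
%% upper-bound estimate, and fall back to actually serializing the term on
%% runtimes that do not provide erlang:external_size/1.
term_size(Term) ->
    try
        erlang:external_size(Term)
    catch _:_ ->
        byte_size(term_to_binary(Term, [{minor_version, 1}]))
    end.

%% Compare the estimate with the exact encoded size. Per the OTP documentation,
%% external_size/1 returns a value at least as large as the real encoding, so
%% Estimate >= Exact holds for every term.
compare(Term) ->
    Estimate = term_size(Term),
    Exact = byte_size(term_to_binary(Term, [{minor_version, 1}])),
    {Estimate, Exact}.

In an Erlang shell, term_size_demo:compare({<<"some doc">>, lists:seq(1, 1000)})
returns {Estimate, Exact} with Estimate >= Exact, which is why the byte counts
accumulated via ?term_size remain safe (if slightly generous) estimates against
the buffer thresholds used in the compactors and the work queue.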