
copy in hashtree docs & eqc

these were originally in riak_kv. since hashtree.erl is now
in riak_core these should be here too
1 parent d957587 commit 97c6b77400c412de55e65bd897289b807f4b5703 @jrwest committed Sep 6, 2013
Showing with 301 additions and 0 deletions.
  1. +64 −0 docs/hashtree.md
  2. BIN docs/hashtree.png
  3. +237 −0 test/hashtree_eqc.erl
64 docs/hashtree.md
@@ -0,0 +1,64 @@
+`hashtree.erl` implements a fixed-size hash tree, avoiding any need
+for rebalancing. The tree consists of a fixed number of on-disk
+`segments` and a hash tree constructed over these `segments`. Each
+level of the tree is grouped into buckets based on a fixed `tree
+width`. Each hash at level `i` corresponds to the hash of a bucket of
+hashes at level `i+1`. The following figure depicts a tree with 16
+segments and a tree-width of 4:
+
+![image](https://github.com/basho/riak_kv/raw/jdb-hashtree/docs/hashtree.png)
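+
+The depth of such a tree follows from the segment count and the tree
+width. A small, purely illustrative helper (integer arithmetic, not
+taken from `hashtree.erl`) that captures the relationship:
+
+```erlang
+%% Smallest number of levels L such that Width^(L-1) covers all segments.
+%% 16 segments with a width of 4 give 3 levels, as in the figure above.
+num_levels(Segments, Width) ->
+    num_levels(Segments, Width, 1, 1).
+
+num_levels(Segments, _Width, Covered, Levels) when Covered >= Segments ->
+    Levels;
+num_levels(Segments, Width, Covered, Levels) ->
+    num_levels(Segments, Width, Covered * Width, Levels + 1).
+```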
+
+To insert a new `(key, hash)` pair, the key is hashed and mapped to
+one of the segments. The `(key, hash)` pair is then stored in the
+appropriate segment, which is an ordered `(key, hash)` dictionary. The
+given segment is then marked as dirty. Whenever `update_tree` is
+called, the hash for each dirty segment is re-computed, the
+corresponding leaf node in the hash tree is updated, and the hash
+tree is updated bottom-up as necessary. Only paths along which hashes
+have changed are re-computed.
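+
+A minimal usage sketch of this flow; the calls and the `{0,0}` tree id
+mirror those used by `test/hashtree_eqc.erl` below, and unspecified
+options are assumed to fall back to the defaults described later:
+
+```erlang
+%% Build a tree, insert a couple of (key, hash) pairs, then recompute
+%% the hashes for the dirty segments and the affected tree paths.
+T0 = hashtree:new({0,0}, [{segments, 1024*1024}, {width, 1024}]),
+T1 = hashtree:insert(<<"key1">>, crypto:hash(sha, <<"value 1">>), T0),
+T2 = hashtree:insert(<<"key2">>, crypto:hash(sha, <<"value 2">>), T1),
+T3 = hashtree:update_tree(T2).
+```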
+
+The current implementation uses LevelDB for the heavy lifting. Rather
+than reading/writing the on-disk segments as a unit, `(key, hash)`
+pairs are written to LevelDB as simple key-value pairs. The LevelDB
+key written is the binary `<<$s, SegmentId:64/integer,
+Key/binary>>`. Thus, inserting a new key-hash pair is nothing more
+than a single LevelDB write. Likewise, key-hash pairs for a segment
+are laid out sequentially on disk based on key sorting. An in-memory
+bitvector is used to track dirty segments (a `gb_sets` was formerly
+used for this).
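+
+A sketch of this key layout; the `<<$s, ...>>` format is the one given
+above, while the segment-mapping hash (`erlang:phash2/2`) is only an
+illustrative stand-in, not necessarily what `hashtree.erl` uses:
+
+```erlang
+%% Map a key to one of NumSegments segments (illustrative choice of hash).
+segment_for(Key, NumSegments) ->
+    erlang:phash2(Key, NumSegments).
+
+%% LevelDB key under which the (Key, Hash) pair is stored.
+leveldb_key(SegmentId, Key) when is_binary(Key) ->
+    <<$s, SegmentId:64/integer, Key/binary>>.
+```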
+
+When updating the segment hashes, a LevelDB iterator is used to access
+the segment keys in order. The iterator seeks to the beginning of the
+segment and then iterates through all of its key-hash pairs. As an
+optimization, the iteration process is designed to read in multiple
+segments when possible. For example, if the list of dirty segments is
+`[1, 2, 3, 5, 6, 10]`, the code seeks an iterator to the beginning of
+segment 1, iterates through all of its keys, computes the segment 1
+hash, then continues traversing the keys of segments 2 and 3, updating
+those hashes as well. After segment 3, a new iterator is created that
+seeks to the beginning of segment 5 and handles both 5 and 6; a final
+iterator is then used to access segment 10. This design works very
+well when constructing a new tree
+from scratch. There's a phase of inserting a bunch of key-hash pairs
+(all writes), followed by an in-order traversal of the LevelDB
+database (all reads).
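+
+The grouping behind that optimization can be sketched as splitting the
+sorted dirty-segment list into contiguous runs, one iterator sweep per
+run (illustrative only, not the actual `hashtree.erl` code):
+
+```erlang
+%% [1,2,3,5,6,10] becomes [[1,2,3],[5,6],[10]].
+runs([]) ->
+    [];
+runs([H | T]) ->
+    runs(T, [H], []).
+
+runs([N | T], [Prev | _] = Run, Acc) when N =:= Prev + 1 ->
+    runs(T, [N | Run], Acc);
+runs([N | T], Run, Acc) ->
+    runs(T, [N], [lists:reverse(Run) | Acc]);
+runs([], Run, Acc) ->
+    lists:reverse([lists:reverse(Run) | Acc]).
+```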
+
+Trees are compared using the standard hash tree approach: the hashes
+at each level are compared, recursing down to the next level whenever
+they differ. After reaching the leaf nodes, any differing hashes
+result in an exchange of the keys in the associated segments.
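+
+For two trees owned by the same process, this comparison is exposed as
+`local_compare/2`, as exercised by the EQC test below (`TreeA` and
+`TreeB` stand for two already-populated trees):
+
+```erlang
+%% Bring both trees up to date, then diff them. local_compare/2 returns
+%% {missing, Key}, {remote_missing, Key} and {different, Key} tuples.
+A = hashtree:update_tree(TreeA),
+B = hashtree:update_tree(TreeB),
+KeyDiff = hashtree:local_compare(A, B),
+Missing       = [K || {missing, K} <- KeyDiff],
+RemoteMissing = [K || {remote_missing, K} <- KeyDiff],
+Different     = [K || {different, K} <- KeyDiff].
+```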
+
+By default, the hash tree itself is kept entirely in memory. However,
+the code provides a `MEM_LEVEL` parameter that specifies that levels
+greater than the parameter should be stored on-disk instead. These
+buckets are simply stored on disk in the same LevelDB structure as
+`{$b, Level, Bucket} -> orddict(Key, Hash)` objects.
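+
+In the option list accepted by `hashtree:new/2`, this shows up as the
+`mem_levels` option (name taken from `test/hashtree_eqc.erl`); for
+example, to keep only the top two levels in memory:
+
+```erlang
+%% Per the description above, levels greater than mem_levels are stored
+%% in the backing LevelDB instance rather than in memory.
+Tree = hashtree:new({0,0}, [{segments, 1024*1024},
+                            {width, 1024},
+                            {mem_levels, 2}]).
+```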
+
+The default settings use `1024*1024` segments with a tree width of
+`1024`. Thus, the resulting tree is only 3 levels deep, and only
+`1+1024+1024*1024` hashes are stored in memory -- so, a few
+MB per hash tree. Given `1024*1024` on-disk segments, and assuming
+the code uniformly hashes keys across the segments, you end up with
+~1000 keys per segment for a 1-billion-key hash tree. Thus, a single
+key difference would require 3 hash exchanges and a key exchange of
+~1000 keys to determine the differing key.
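+
+A quick check of these numbers (illustrative arithmetic only):
+
+```erlang
+%% Hashes held in memory for the 3-level default tree.
+InMemoryHashes = 1 + 1024 + 1024 * 1024,        %% 1049601
+%% Keys per segment for a 1 billion key tree, assuming uniform hashing.
+KeysPerSegment = 1000000000 div (1024 * 1024).  %% ~953
+```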
BIN docs/hashtree.png
237 test/hashtree_eqc.erl
@@ -0,0 +1,237 @@
+-module(hashtree_eqc).
+-compile([export_all]).
+
+-ifdef(TEST).
+-ifdef(EQC).
+-include_lib("eqc/include/eqc.hrl").
+-include_lib("eqc/include/eqc_statem.hrl").
+-define(QC_OUT(P),
+ eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
+
+-include_lib("eunit/include/eunit.hrl").
+
+hashtree_test_() ->
+ {timeout, 30,
+ fun() ->
+ ?assert(eqc:quickcheck(?QC_OUT(eqc:testing_time(29,
+ hashtree_eqc:prop_correct()))))
+ end
+ }.
+
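+%% Model state: the two hashtrees under test, the (key, hash) pairs the
+%% model believes are present in each tree (only1/only2), and the tree
+%% configuration (segments/width/mem_levels) chosen for this run.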
+-record(state,
+ {
+ tree1,
+ tree2,
+ only1 = [],
+ only2 = [],
+ both = [],
+ segments,
+ width,
+ mem_levels
+ }).
+
+
+initial_state() ->
+ #state{
+ only1 = [],
+ only2 = [],
+ both = []
+ }.
+
+integer_to_binary(Int) ->
+ list_to_binary(integer_to_list(Int)).
+
+-ifndef(old_hash).
+sha(Bin) ->
+ crypto:hash(sha, Bin).
+-else.
+sha(Bin) ->
+ crypto:sha(Bin).
+-endif.
+
+object(_S) ->
+ {?LET(Key, int(), ?MODULE:integer_to_binary(Key)), sha(term_to_binary(make_ref()))}.
+
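+%% Command generator: each tree may be started once; writes, tree updates
+%% and reconciles are only generated for trees that have been started.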
+command(S) ->
+ oneof(
+ [{call, ?MODULE, start_1, [S]} || S#state.tree1 == undefined] ++
+ [{call, ?MODULE, start_2, [S]} || S#state.tree2 == undefined] ++
+ [{call, ?MODULE, write_1, [S#state.tree1, object(S)]} ||
+ S#state.tree1 /= undefined] ++
+ [{call, ?MODULE, write_2, [S#state.tree2, object(S)]} ||
+ S#state.tree2 /= undefined] ++
+ [{call, ?MODULE, write_both, [S#state.tree1, S#state.tree2, object(S)]} ||
+ S#state.tree1 /= undefined, S#state.tree2 /= undefined] ++
+ [{call, ?MODULE, update_tree_1, [S#state.tree1]} || S#state.tree1 /= undefined] ++
+ [{call, ?MODULE, update_tree_2, [S#state.tree2]} || S#state.tree2 /= undefined] ++
+ [{call, ?MODULE, reconcile, [S]} ||
+ S#state.tree1 /= undefined, S#state.tree2 /= undefined] ++
+ []
+ ).
+
+make_treevars() ->
+ Powers = [8, 16, 32, 64, 128, 256, 512, 1024],
+ Segments=oneof(Powers),
+ Width=oneof(Powers),
+ %NumLevels = erlang:trunc(math:log(Segments) / math:log(Width)) + 1,
+ %MemLevels = random:uniform(NumLevels+1)-1,
+ %MemLevels = oneof(lists:seq(0, NumLevels),
+ MemLevels=4,
+ {{call, erlang, '*', [Segments, Segments]}, Width, MemLevels}.
+ %{1024*1024, 1024, 4}.
+
+start_1(S) ->
+ hashtree:new({0,0}, [{segments, S#state.segments}, {width,
+ S#state.width}, {mem_levels, S#state.mem_levels}]).
+start_2(S) ->
+ hashtree:new({0,0}, [{segments, S#state.segments}, {width,
+ S#state.width}, {mem_levels, S#state.mem_levels}]).
+
+write_1(Tree, {Key, Hash}) ->
+ hashtree:insert(Key, Hash, Tree).
+
+write_2(Tree, {Key, Hash}) ->
+ hashtree:insert(Key, Hash, Tree).
+
+write_both(Tree1, Tree2, {Key, Hash}) ->
+ {hashtree:insert(Key, Hash, Tree1), hashtree:insert(Key, Hash, Tree2)}.
+
+update_tree_1(T1) ->
+ hashtree:update_tree(T1).
+
+update_tree_2(T2) ->
+ hashtree:update_tree(T2).
+
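+%% Compare the two trees and copy missing/differing keys across so that
+%% both sides converge, then return both trees fully updated.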
+reconcile(S) ->
+ A2 = hashtree:update_tree(S#state.tree1),
+ B2 = hashtree:update_tree(S#state.tree2),
+ KeyDiff = hashtree:local_compare(A2, B2),
+ Missing = [M || {missing, M} <- KeyDiff],
+ RemoteMissing = [M || {remote_missing, M} <- KeyDiff],
+ Different = [D || {different, D} <- KeyDiff],
+
+ Insert = fun(Tree, Vals) ->
+ lists:foldl(fun({Key, Hash}, Acc) ->
+ hashtree:insert(Key, Hash, Acc)
+ end, Tree, Vals)
+ end,
+
+ A3 = Insert(A2, [lists:keyfind(K, 1, S#state.only2) || K <- Missing, lists:keyfind(K, 1,
+ S#state.only2) /= false]),
+ B3 = Insert(B2, [lists:keyfind(K, 1, S#state.only1) || K <- RemoteMissing, lists:keyfind(K, 1,
+ S#state.only1) /= false]),
+ B4 = Insert(B3, [lists:keyfind(K, 1, S#state.only1) || K <- Different, lists:keyfind(K, 1,
+ S#state.only1) /= false]),
+ Res = {hashtree:update_tree(A3), hashtree:update_tree(B4)},
+ Res.
+
+
+write_differing(Tree1, Tree2, {Key, Hash1}, Hash2) ->
+ {{Key, Hash1}, {Key, Hash2}, hashtree:insert(Key, Hash1, Tree1),
+ hashtree:insert(Key, Hash2, Tree2)}.
+
+precondition(S,{call,_,start_1,_}) ->
+ S#state.tree1 == undefined;
+precondition(S,{call,_,start_2,_}) ->
+ S#state.tree2 == undefined;
+precondition(S,{call,_,write_1,_}) ->
+ S#state.tree1 /= undefined;
+precondition(S,{call,_,write_2,_}) ->
+ S#state.tree2 /= undefined;
+precondition(S,{call,_,write_both,_}) ->
+ S#state.tree1 /= undefined andalso S#state.tree2 /= undefined;
+precondition(S,{call,_,reconcile,_}) ->
+ S#state.tree1 /= undefined andalso S#state.tree2 /= undefined;
+precondition(S,{call,_,update_tree_1,_}) ->
+ S#state.tree1 /= undefined;
+precondition(S,{call,_,update_tree_2,_}) ->
+ S#state.tree2 /= undefined.
+
+postcondition(_S,{call,_,_,_},_R) ->
+ true.
+
+next_state(S,V,{call, _, start_1, [_]}) ->
+ S#state{tree1=V, only1=[], both=[]};
+next_state(S,V,{call, _, start_2, [_]}) ->
+ S#state{tree2=V, only2=[], both=[]};
+next_state(S,V,{call, _, write_1, [_, {Key, Val}]}) ->
+ S#state{tree1=V, only1=[{Key, Val}|lists:keydelete(Key, 1,
+ S#state.only1)]};
+next_state(S,V,{call, _, write_2, [_, {Key, Val}]}) ->
+ S#state{tree2=V, only2=[{Key, Val}|lists:keydelete(Key, 1,
+ S#state.only2)]};
+next_state(S,V,{call, _, update_tree_1, [_]}) ->
+ S#state{tree1=V};
+next_state(S,V,{call, _, update_tree_2, [_]}) ->
+ S#state{tree2=V};
+next_state(S,R,{call, _, write_both, [_, _, {Key, Val}]}) ->
+ S#state{tree1={call, erlang, element, [1, R]},
+ tree2={call, erlang, element, [2, R]},
+ only1=[{Key, Val}|lists:keydelete(Key, 1, S#state.only1)],
+ only2=[{Key, Val}|lists:keydelete(Key, 1, S#state.only2)]
+ };
+next_state(S,R,{call, _, reconcile, [_]}) ->
+ Keys = lists:ukeymerge(1, lists:ukeysort(1, S#state.only1),
+ lists:ukeysort(1, S#state.only2)),
+ S#state{tree1={call, erlang, element, [1, R]},
+ tree2={call, erlang, element, [2, R]},
+ only1 = Keys,
+ only2 = Keys
+ }.
+
+
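+%% Property: after an arbitrary command sequence, local_compare/2 on the
+%% two trees must report exactly the key differences predicted by the
+%% model state (missing / remote_missing / different).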
+prop_correct() ->
+ ?FORALL({Segments, Width, MemLevels}, make_treevars(),
+ ?FORALL(Cmds,commands(?MODULE, #state{segments=Segments, width=Width,
+ mem_levels=MemLevels}),
+ ?TRAPEXIT(
+ aggregate(command_names(Cmds),
+ begin
+ {H,S,Res} = run_commands(?MODULE,Cmds),
+ ?WHENFAIL(
+ begin
+ io:format("History: ~p\nState: ~p\nRes: ~p\n~p\n",
+ [H,S,Res, zip(tl(Cmds), [Y || {_, Y} <- H])]),
+ catch hashtree:destroy(hashtree:close(S#state.tree1)),
+ catch hashtree:destroy(hashtree:close(S#state.tree2))
+ end,
+ begin
+ ?assertEqual(ok, Res),
+ Unique1 = S#state.only1 -- S#state.only2,
+ Unique2 = S#state.only2 -- S#state.only1,
+ Expected = [{missing, Key} || {Key, _} <-
+ Unique2, not
+ lists:keymember(Key, 1, S#state.only1)] ++
+ [{remote_missing, Key} || {Key, _} <-
+ Unique1, not
+ lists:keymember(Key, 1, S#state.only2)] ++
+ [{different, Key} || Key <-
+ sets:to_list(sets:intersection(sets:from_list([Key
+ || {Key,_} <- Unique1]),
+ sets:from_list([Key || {Key,_}
+ <- Unique2])))],
+
+ case S#state.tree1 == undefined orelse
+ S#state.tree2 == undefined of
+ true ->
+ true;
+ _ ->
+
+ T1 = hashtree:update_tree(S#state.tree1),
+ T2 = hashtree:update_tree(S#state.tree2),
+
+ KeyDiff = hashtree:local_compare(T1, T2),
+
+ ?assertEqual(lists:usort(Expected),
+ lists:usort(KeyDiff)),
+
+ catch hashtree:destroy(hashtree:close(T1)),
+ catch hashtree:destroy(hashtree:close(T2)),
+ true
+ end
+ end
+ )
+ end)))).
+
+-endif.
+-endif.
