Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 295 lines (260 sloc) 10.703 kb
90f6ce8 Andy Gross initial import
argv0 authored
1 %% -------------------------------------------------------------------
2 %%
3 %% riak_core: Core Riak Application
4 %%
5 %% Copyright (c) 2007-2010 Basho Technologies, Inc. All Rights Reserved.
6 %%
7 %% This file is provided to you under the Apache License,
8 %% Version 2.0 (the "License"); you may not use this file
9 %% except in compliance with the License. You may obtain
10 %% a copy of the License at
11 %%
12 %% http://www.apache.org/licenses/LICENSE-2.0
13 %%
14 %% Unless required by applicable law or agreed to in writing,
15 %% software distributed under the License is distributed on an
16 %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17 %% KIND, either express or implied. See the License for the
18 %% specific language governing permissions and limitations
19 %% under the License.
20 %%
21 %% -------------------------------------------------------------------
22
23 %% @doc A simple Erlang implementation of vector clocks as inspired by Lamport logical clocks.
24 %%
25 %% @reference Leslie Lamport (1978). "Time, clocks, and the ordering of events
26 %% in a distributed system". Communications of the ACM 21 (7): 558-565.
27 %%
28 %% @reference Friedemann Mattern (1988). "Virtual Time and Global States of
29 %% Distributed Systems". Workshop on Parallel and Distributed Algorithms:
30 %% pp. 215-226
31
32 -module(vclock).
33
34 -author('Justin Sheehy <justin@basho.com>').
35 -author('Andy Gross <andy@basho.com>').
36
37 -export([fresh/0,descends/2,merge/1,get_counter/2,get_timestamp/2,
97231e0 Add vclock:increment variant that accepts a timestamp.
Jon Meredith authored
38 increment/2,increment/3,all_nodes/1,equal/2,prune/3,timestamp/0]).
90f6ce8 Andy Gross initial import
argv0 authored
39
ec82f3a Andy Gross dialyzer: riak_core fixes
argv0 authored
40 -ifdef(TEST).
90f6ce8 Andy Gross initial import
argv0 authored
41 -include_lib("eunit/include/eunit.hrl").
ec82f3a Andy Gross dialyzer: riak_core fixes
argv0 authored
42 -endif.
90f6ce8 Andy Gross initial import
argv0 authored
43
ec82f3a Andy Gross dialyzer: riak_core fixes
argv0 authored
44 -export_type([vclock/0, timestamp/0, vclock_node/0]).
45
46 -opaque vclock() :: [vc_entry()].
90f6ce8 Andy Gross initial import
argv0 authored
47 % The timestamp is present but not used, in case a client wishes to inspect it.
48 -type vc_entry() :: {vclock_node(), {counter(), timestamp()}}.
49
50 % Nodes can have any term() as a name, but they must differ from each other.
51 -type vclock_node() :: term().
52 -type counter() :: integer().
53 -type timestamp() :: integer().
54
% @doc Build an empty vector clock, i.e. one that holds no node entries.
-spec fresh() -> vclock().
fresh() -> [].
59
% @doc Report whether Va descends from Vb: every node listed in Vb must also
% be listed in Va with a counter that is at least as large. Timestamps are
% ignored. Every vclock descends from itself and from the empty vclock.
-spec descends(Va :: vclock()|[], Vb :: vclock()|[]) -> boolean().
descends(_Va, []) ->
    % nothing (left) in Vb that Va would have to dominate
    true;
descends(Va, Vb) ->
    [{Node, {CounterB, _}} | Remaining] = Vb,
    case lists:keyfind(Node, 1, Va) of
        {_, {CounterA, _}} when CounterA >= CounterB ->
            descends(Va, Remaining);
        {_, {_, _}} ->
            % Va knows the node but has seen fewer events from it
            false;
        false ->
            % Va has never heard of this node
            false
    end.
90f6ce8 Andy Gross initial import
argv0 authored
73
% @doc Combine all VClocks in the input list into their least possible
% common descendant.
%
% For a node that appears in several clocks the entry with the highest
% counter wins. When the counters are equal the later timestamp is kept, so
% the result does not depend on the order of the clocks in the input list.
% Merging two or more clocks yields a list sorted by node; an empty input
% yields [] and a single input clock is returned as is.
-spec merge(VClocks :: [vclock()]) -> vclock() | [].
merge([]) -> [];
merge([SingleVclock]) -> SingleVclock;
merge([First|Rest]) -> merge(Rest, lists:keysort(1, First)).

% @private Fold each remaining clock into NClock, which is sorted by node.
merge([], NClock) -> NClock;
merge([AClock|VClocks], NClock) ->
    merge(VClocks, merge(lists:keysort(1, AClock), NClock, [])).

% @private Merge two node-sorted clocks. AccClock holds the already merged
% prefix in reverse order.
merge([], Left, AccClock) -> lists:reverse(AccClock, Left);
merge(Left, [], AccClock) -> lists:reverse(AccClock, Left);
merge(V=[{Node1,{Ctr1,TS1}=CT1}=Entry1|VClock],
      N=[{Node2,{Ctr2,TS2}=CT2}=Entry2|NClock], AccClock) ->
    if Node1 < Node2 ->
            merge(VClock, N, [Entry1|AccClock]);
       Node1 > Node2 ->
            merge(V, NClock, [Entry2|AccClock]);
       true ->
            % Same node in both clocks: the higher counter wins. On a tie
            % take the later timestamp instead of blindly preferring one
            % side, otherwise merge([A,B]) and merge([B,A]) could differ.
            CT = if Ctr1 > Ctr2 -> CT1;
                    Ctr1 < Ctr2 -> CT2;
                    true -> {Ctr1, erlang:max(TS1, TS2)}
                 end,
            merge(VClock, NClock, [{Node1,CT}|AccClock])
    end.
99
% @doc Look up the counter recorded for Node in VClock.
% Returns `undefined' when VClock has no entry for Node.
-spec get_counter(Node :: vclock_node(), VClock :: vclock()) -> counter() | undefined.
get_counter(Node, VClock) ->
    case lists:keyfind(Node, 1, VClock) of
        false -> undefined;
        {_, {Counter, _}} -> Counter
    end.
107
% @doc Look up the timestamp recorded for Node in VClock.
% Returns `undefined' when VClock has no entry for Node.
-spec get_timestamp(Node :: vclock_node(), VClock :: vclock()) -> timestamp() | undefined.
get_timestamp(Node, VClock) ->
    case lists:keyfind(Node, 1, VClock) of
        false -> undefined;
        {_, {_, Timestamp}} -> Timestamp
    end.
115
% @doc Increment VClock at Node, stamping the entry with the current time
% as returned by timestamp/0.
-spec increment(Node :: vclock_node(), VClock :: vclock()) -> vclock().
increment(Node, VClock) ->
    Now = timestamp(),
    increment(Node, Now, VClock).
120
% @doc Increment VClock at Node and stamp the entry with IncTs.
% A node without an entry starts at counter 1. The new or updated entry is
% placed at the head of the returned clock; the other entries keep their order.
-spec increment(Node :: vclock_node(), IncTs :: timestamp(),
                VClock :: vclock()) -> vclock().
increment(Node, IncTs, VClock) ->
    case lists:keytake(Node, 1, VClock) of
        {value, {_, {Counter, _}}, Others} ->
            [{Node, {Counter + 1, IncTs}} | Others];
        false ->
            [{Node, {1, IncTs}} | VClock]
    end.
132
97231e0 Add vclock:increment variant that accepts a timestamp.
Jon Meredith authored
133
90f6ce8 Andy Gross initial import
argv0 authored
% @doc Return the nodes that have an entry in VClock, in the order in which
% their entries appear.
-spec all_nodes(VClock :: vclock()) -> [vclock_node()].
all_nodes(VClock) ->
    Collect = fun({Node, {_, _}}, Acc) -> [Node | Acc];
                 (_Other, Acc) -> Acc   % skip anything that is not an entry
              end,
    lists:foldr(Collect, [], VClock).
138
213eec2 Erik Søe Sørensen vclock: faster timestamp().
eriksoe authored
% Offset between the base of the Gregorian calendar used by the calendar
% module and the Unix epoch, first in days and then in seconds.
-define(DAYS_FROM_GREGORIAN_BASE_TO_EPOCH, (1970*365+478)).
% == calendar:datetime_to_gregorian_seconds({{1970,1,1},{0,0,0}})
-define(SECONDS_FROM_GREGORIAN_BASE_TO_EPOCH,
        (?DAYS_FROM_GREGORIAN_BASE_TO_EPOCH * 24*60*60)).

% @doc Return a timestamp for a vector clock: the current time in Gregorian
% seconds.
-spec timestamp() -> timestamp().
timestamp() ->
    % Same as calendar:datetime_to_gregorian_seconds(erlang:universaltime()),
    % but significantly faster.
    {MegaSecs, Secs, _MicroSecs} = os:timestamp(),
    SecondsSinceEpoch = MegaSecs * 1000000 + Secs,
    ?SECONDS_FROM_GREGORIAN_BASE_TO_EPOCH + SecondsSinceEpoch.
90f6ce8 Andy Gross initial import
argv0 authored
152
% @doc Compare two VClocks for equality. The order of the entries does not
% matter, but whole entries are compared, so counters and timestamps must
% both match.
-spec equal(VClockA :: vclock(), VClockB :: vclock()) -> boolean().
equal(VA, VB) ->
    SortedA = lists:sort(VA),
    SortedB = lists:sort(VB),
    SortedA =:= SortedB.
90f6ce8 Andy Gross initial import
argv0 authored
157
% @doc Possibly shrink the size of a vclock, depending on current age and size.
% The entries are ordered oldest first and then handed to prune_vclock1/3
% together with Now and BucketProps.
-spec prune(V::vclock(), Now::integer(), BucketProps::term()) -> vclock().
prune(V, Now, BucketProps) ->
    % This sort needs to be deterministic, to avoid spurious merge conflicts
    % later. We achieve this by using the node ID as secondary key.
    OldestFirst = fun({NodeA, {_, TimeA}}, {NodeB, {_, TimeB}}) ->
                          {TimeA, NodeA} < {TimeB, NodeB}
                  end,
    prune_vclock1(lists:sort(OldestFirst, V), Now, BucketProps).
% @private
% Return V unchanged when it has at most `small_vclock' entries, or when its
% head entry is younger than `young_vclock'; otherwise let prune_vclock1/4
% decide whether the head entry is dropped.
% A property that is not set reads as `undefined', which compares greater
% than any number, so that limit never triggers pruning.
prune_vclock1(V, Now, BProps) ->
    case length(V) > get_property(small_vclock, BProps) of
        false ->
            V;
        true ->
            {_, {_, HeadTime}} = hd(V),
            HeadAge = Now - HeadTime,
            case HeadAge >= get_property(young_vclock, BProps) of
                false -> V;
                true -> prune_vclock1(V, Now, BProps, HeadTime)
            end
    end.

% @private
% Precondition: V is longer than small and its head is older than young.
% Drops the head entry and re-checks the rest when V has more than
% `big_vclock' entries or the head is older than `old_vclock'.
prune_vclock1(V, Now, BProps, HeadTime) ->
    DropHead = (length(V) > get_property(big_vclock, BProps))
        orelse ((Now - HeadTime) > get_property(old_vclock, BProps)),
    case DropHead of
        false -> V;
        true -> prune_vclock1(tl(V), Now, BProps)
    end.
184
c4fe87f Erik Søe Sørensen vclock: Replacing proplists:get_value() calls with far faster lists:keyf...
eriksoe authored
% Fetch the value stored under Key in a list of {Key, Value} pairs.
% Returns `undefined' when the list has no tuple keyed by Key.
get_property(Key, PairList) ->
    case lists:keyfind(Key, 1, PairList) of
        false -> undefined;
        {_, Value} -> Value
    end.
c4fe87f Erik Søe Sørensen vclock: Replacing proplists:get_value() calls with far faster lists:keyf...
eriksoe authored
192
ec82f3a Andy Gross dialyzer: riak_core fixes
argv0 authored
193 %% ===================================================================
194 %% EUnit tests
195 %% ===================================================================
196 -ifdef(TEST).
197
% @doc Serves as both a trivial test and some example code.
example_test() ->
    Empty1 = vclock:fresh(),
    Empty2 = vclock:fresh(),
    % two independent actors each record one event
    OnA = vclock:increment(a, Empty1),
    OnB = vclock:increment(b, Empty2),
    true = vclock:descends(OnA, Empty1),
    true = vclock:descends(OnB, Empty2),
    false = vclock:descends(OnA, OnB),
    % a second event on a, then merge both histories and add an event on c
    OnA2 = vclock:increment(a, OnA),
    Merged = vclock:merge([OnA2, OnB]),
    OnC = vclock:increment(c, Merged),
    true = vclock:descends(OnC, OnA2),
    true = vclock:descends(OnC, OnB),
    false = vclock:descends(OnB, OnC),
    false = vclock:descends(OnB, OnA),
    ok.
215
90f6ce8 Andy Gross initial import
argv0 authored
prune_small_test() ->
    % a vclock with fewer entries than small_vclock is left untouched
    Now = riak_core_util:moment(),
    OldTime = Now - 32000000,
    SmallVC = [{Node, {Counter, OldTime}}
               || {Node, Counter} <- [{<<"1">>, 1}, {<<"2">>, 2}, {<<"3">>, 3}]],
    Pruned = prune(SmallVC, Now, [{small_vclock,4}]),
    ?assertEqual(lists:sort(SmallVC), lists:sort(Pruned)).
225
prune_young_test() ->
    % a vclock whose entries are all younger than young_vclock is left untouched
    Now = riak_core_util:moment(),
    NewTime = Now - 1,
    VC = [{Node, {Counter, NewTime}}
          || {Node, Counter} <- [{<<"1">>, 1}, {<<"2">>, 2}, {<<"3">>, 3}]],
    Pruned = prune(VC, Now, [{small_vclock,1},{young_vclock,1000}]),
    ?assertEqual(lists:sort(VC), lists:sort(Pruned)).
235
prune_big_test() ->
    % vclock not preserved by small or young will be pruned down to
    % no larger than big_vclock entries
    Now = riak_core_util:moment(),
    NewTime = Now - 1000,
    VC = [{<<"1">>, {1, NewTime}},
          {<<"2">>, {2, NewTime}},
          {<<"3">>, {3, NewTime}}],
    Props = [{small_vclock,1},{young_vclock,1},
             {big_vclock,2},{old_vclock,100000}],
    % assertEqual reports the actual length when the expectation fails
    ?assertEqual(2, length(prune(VC, Now, Props))).
247
prune_old_test() ->
    % vclock not preserved by small or young will be pruned down to
    % no larger than big_vclock and no entries more than old_vclock ago
    Now = riak_core_util:moment(),
    NewTime = Now - 1000,
    OldTime = Now - 100000,
    VC = [{<<"1">>, {1, NewTime}},
          {<<"2">>, {2, OldTime}},
          {<<"3">>, {3, OldTime}}],
    Props = [{small_vclock,1},{young_vclock,1},
             {big_vclock,2},{old_vclock,10000}],
    % assertEqual reports the actual length when the expectation fails
    ?assertEqual(1, length(prune(VC, Now, Props))).
260
7ce0ed9 Jon Meredith Added test for determinstic vclock pruning for nodes with same timestamp...
jonmeredith authored
prune_order_test() ->
    % two entries with the same timestamp must be pruned down to the same
    % node, whatever order they had in the input
    Now = riak_core_util:moment(),
    OldTime = Now - 100000,
    Forward = [{<<"1">>, {1, OldTime}},
               {<<"2">>, {2, OldTime}}],
    Backward = lists:reverse(Forward),
    Props = [{small_vclock,1},{young_vclock,1},
             {big_vclock,2},{old_vclock,10000}],
    ?assertEqual(prune(Forward, Now, Props), prune(Backward, Now, Props)).
272
90f6ce8 Andy Gross initial import
argv0 authored
accessor_test() ->
    VC = [{<<"1">>, {1, 1}},
          {<<"2">>, {2, 2}}],
    % {Node, expected counter, expected timestamp}; <<"3">> has no entry
    Expectations = [{<<"1">>, 1, 1},
                    {<<"2">>, 2, 2},
                    {<<"3">>, undefined, undefined}],
    lists:foreach(fun({Node, Counter, Timestamp}) ->
                          ?assertEqual(Counter, get_counter(Node, VC)),
                          ?assertEqual(Timestamp, get_timestamp(Node, VC))
                  end, Expectations),
    ?assertEqual([<<"1">>, <<"2">>], all_nodes(VC)).
283
merge_test() ->
    Left = [{<<"1">>, {1, 1}},
            {<<"2">>, {2, 2}},
            {<<"4">>, {4, 4}}],
    Right = [{<<"3">>, {3, 3}},
             {<<"4">>, {3, 3}}],
    % node <<"4">> is in both clocks; the entry with the higher counter wins
    Expected = [{<<"1">>,{1,1}},{<<"2">>,{2,2}},{<<"3">>,{3,3}},{<<"4">>,{4,4}}],
    ?assertEqual([], merge(vclock:fresh())),
    ?assertEqual(Expected, merge([Left, Right])).
ec82f3a Andy Gross dialyzer: riak_core fixes
argv0 authored
293
294 -endif.
Something went wrong with that request. Please try again.