Skip to content
Newer
Older
100644 201 lines (187 sloc) 7.85 KB
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
1 %% -------------------------------------------------------------------
2 %% luwak_mr: utilities for map/reducing on Luwak data
3 %%
4 %% This file is provided to you under the Apache License,
5 %% Version 2.0 (the "License"); you may not use this file
6 %% except in compliance with the License. You may obtain
7 %% a copy of the License at
8 %%
9 %% http://www.apache.org/licenses/LICENSE-2.0
10 %%
11 %% Unless required by applicable law or agreed to in writing,
12 %% software distributed under the License is distributed on an
13 %% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 %% KIND, either express or implied. See the License for the
15 %% specific language governing permissions and limitations
16 %% under the License.
17 %%
18 %% -------------------------------------------------------------------
19
20 %% @doc Word-splitting, as an example of using {@link luwak_mr:file/3}.
21 %%
22 %% This module is an example of how one might use the
23 %% `luwak_mr:file' dynamic map/reduce input generator to run a
24 %% computation over a Luwak file. The example demonstrated is
25 %% that of splitting a file into its constituent words.
26 %% Inspiration for the method was derived from Guy Steele's talk
27 %%<a href="http://www.infoq.com/presentations/Thinking-Parallel-Programming">
28 %% "How to Think about Parallel Programming--Not!"</a>
29 %%
30 %% The basic idea is to use Luwak's division of blocks as the
31 %% division of parallelizable labor. (The green lines on slide
32 %% 55/21 of Guy's presentation.)
33 %%
34 %% Once you have put this module and `luwak_mr' in Riak's code
35 %% path, you can use it by first filling a Luwak file with
36 %% Latin-1 text. Then run:
37 %%```
38 %% luwak_mr_words:in_file(<<"my_file_name">>).
39 %%'''
40 %% The function should return a list of binaries, each binary
41 %% being one word from the file. The words will remain in the
42 %% order they were in the file. "Words" are defined as
43 %% consecutive non-space (code 32) characters.
44 %%
45 %% Do not run this on a very large file in production. It is
46 %% demo code, and therefore does some inefficient things with
47 %% binaries. The `in_file/1' and `reduce/2' functions also expect
48 %% to, at some point, hold the entire result set in memory.
49
e7542dc @beerriot working, undocumented, word-splitter
authored Jan 16, 2011
50 -module(luwak_mr_words).
51
52 -export([
53 in_file/1,
54 map/3,
55 reduce/2
56 ]).
57
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
58 %% A chunk is a block of text whose boundaries we haven't found yet
e7542dc @beerriot working, undocumented, word-splitter
authored Jan 16, 2011
59 -record(chunk, {data}).
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
60 %% A segment is a block of text containing some bounded words, with a
61 %% possible chunk on its left and/or right side.
e7542dc @beerriot working, undocumented, word-splitter
authored Jan 16, 2011
62 -record(segment, {left, words=[], right}).
63
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec in_file(binary()) -> [binary()]
%% @doc Split the Luwak file named `Filename' into its "words" (runs
%%      of characters between spaces).
%%
%%      Runs a map/reduce job whose inputs are generated by
%%      `luwak_mr:file', using this module's map/3 and reduce/2 as
%%      the two phases. The job is submitted through
%%      `riak_kv_mrc_pipe' or through the local client, depending on
%%      what `riak_kv_util:mapred_system()' reports.
%%
%%      Fails with `badmatch' unless the job returns exactly one
%%      `{Start, End, Record}' result.
in_file(Filename) ->
    {ok, Client} = riak:local_client(),
    Inputs = {modfun, luwak_mr, file, Filename},
    MapPhase = {map, {modfun, ?MODULE, map}, none, false},
    ReducePhase = {reduce, {modfun, ?MODULE, reduce}, none, true},
    Query = [MapPhase, ReducePhase],
    Outcome = case riak_kv_util:mapred_system() of
                  pipe -> riak_kv_mrc_pipe:mapred(Inputs, Query);
                  legacy -> Client:mapred(Inputs, Query)
              end,
    {ok, [{_Start, _End, Record}]} = Outcome,
    strip_record(Record).
80
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec strip_record(chunk()|segment()) -> [binary()]
%% @doc Flatten a chunk or segment record into a plain list of words.
%%      A chunk yields its data as a single word. A segment yields its
%%      left edge, then its bounded words, then its right edge; an
%%      edge that is `undefined' is left out.
strip_record(#chunk{data=Word}) ->
    [Word];
strip_record(#segment{left=Left, words=Words, right=Right}) ->
    prepend_edge(Left, append_edge(Words, Right)).

%% @spec append_edge([binary()], undefined|binary()) -> [binary()]
%% @doc Add a right edge to the end of a word list, if there is one.
append_edge(Words, undefined) -> Words;
append_edge(Words, Edge) -> Words ++ [Edge].

%% @spec prepend_edge(undefined|binary(), [binary()]) -> [binary()]
%% @doc Add a left edge to the front of a word list, if there is one.
prepend_edge(undefined, Words) -> Words;
prepend_edge(Edge, Words) -> [Edge | Words].
95
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec map(luwak_block(), integer(), term()) -> [block_result()]
%% @type block_result() = {integer(), integer(), chunk()|segment()}
%% @doc Splits a luwak_block into a chunk or segment. Used in the map
%%      phase of in_file/1.
%%      The result is a one-element list holding a 3-tuple, with
%%      elements:
%%        1: the byte-offset of the start of the block
%%        2: the byte-offset of the next byte after the block
%%        3: the chunk or segment that the block parsed to
%%      The third argument is ignored.
map(Block, Offset, _) ->
    Data = luwak_block:data(Block),
    %% byte_size/1 instead of the polymorphic size/1: words/1 consumes
    %% Data as a binary, so the block length is a byte count.
    [{Offset, Offset + byte_size(Data), words(Data)}].
107
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec reduce([block_result()], term()) -> [block_result()]
%% @doc Combines chunks and segments into larger chunks and segments.
%%      The results are sorted by their start offset and then folded
%%      with reduce_fun/2, starting from the earliest one. An empty
%%      input gives an empty output. The second argument is ignored.
reduce(Maps, _) ->
    case lists:keysort(1, Maps) of
        [] ->
            [];
        [Earliest | Later] ->
            lists:foldl(fun reduce_fun/2, [Earliest], Later)
    end.
114
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec reduce_fun(block_result(), [block_result()]) -> [block_result()]
%% @doc Fold implementation of reduce/2.
%%      When the incoming result starts exactly where the result at
%%      the head of the accumulator ends, the two are merged into one
%%      result spanning both. Otherwise the incoming result is pushed
%%      onto the accumulator unchanged.
%%      The name of this function saddens me.
reduce_fun({NextStart, NextEnd, NextMap},
           [{PrevStart, PrevEnd, PrevMap} | Older])
  when NextStart =:= PrevEnd ->
    [{PrevStart, NextEnd, merge(PrevMap, NextMap)} | Older];
reduce_fun(Result, Merged) ->
    [Result | Merged].
122
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec words(binary()) -> chunk()|segment()
%% @doc Parse a binary into a chunk or a segment.
%%      If words_firstchunk/1 reports `whole' (no space found), the
%%      binary becomes a chunk. Otherwise it becomes a segment: the
%%      text before the first space is the left edge, and
%%      words_rest/1 supplies the bounded words and the right edge.
words(Binary) ->
    case words_firstchunk(Binary) of
        {Left, AfterFirstSpace} ->
            {Bounded, Right} = words_rest(AfterFirstSpace),
            #segment{left=Left, words=Bounded, right=Right};
        whole ->
            #chunk{data=Binary}
    end.
134
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec words_firstchunk(binary()) ->
%%           whole | {undefined|binary(), binary()}
%% @doc Finds the first chunk of a binary. Returns 'whole' if there
%%      are no spaces (so the binary is one big chunk) or a 2-tuple
%%      of:
%%        1: the characters before the first space, or `undefined'
%%           if the binary starts with a space
%%        2: the characters after the first space
%%      Uses binary:split/2 rather than walking the binary one byte
%%      at a time, so no intermediate character list is built.
words_firstchunk(Binary) ->
    case binary:split(Binary, <<$\s>>) of
        [_NoSpace] ->
            whole; %% don't recreate that whole binary
        [<<>>, Rest] ->
            {undefined, Rest};
        [FirstChunk, Rest] ->
            {FirstChunk, Rest}
    end.
152
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec words_rest(binary()) -> {[binary()], undefined|binary()}
%% @doc Splits the rest of a binary on spaces. Returns a 2-tuple of:
%%        1: the list of non-empty space-delimited words, in order
%%        2: the characters after the final space as a binary, or
%%           `undefined' if there are none
%%      Uses binary:split/3 rather than walking the binary one byte
%%      at a time, so no intermediate character lists are built.
words_rest(Binary) ->
    %% The piece after the last space is not yet bounded on its right,
    %% so it is peeled off and reported separately from the words.
    Pieces = binary:split(Binary, <<$\s>>, [global]),
    [Last | RevBounded] = lists:reverse(Pieces),
    %% Consecutive spaces produce empty pieces; those are not words.
    Words = lists:reverse([W || W <- RevBounded, W =/= <<>>]),
    {Words, words_rest_last(Last)}.

%% @spec words_rest_last(binary()) -> undefined|binary()
%% @doc Map an empty trailing piece to `undefined'.
words_rest_last(<<>>) -> undefined;
words_rest_last(Chunk) -> Chunk.
173
7467b94 @beerriot docs for luwak_mr_words
authored Jan 16, 2011
%% @spec merge(chunk()|segment(), chunk()|segment()) -> chunk()|segment()
%% @doc Merge a chunk or segment with another chunk or segment.
%%      The first argument should be the chunk or segment immediately
%%      preceding the chunk or segment in the second argument.
%%
%%      Two chunks concatenate into one chunk. A chunk next to a
%%      segment is joined onto the segment edge that faces it. Two
%%      segments keep the outer edges, and whatever lies on the seam
%%      between them becomes a bounded word.
merge(#chunk{data=Before}, #chunk{data=After}) ->
    #chunk{data=iolist_to_binary([Before, After])};
merge(#chunk{data=Before}, #segment{left=undefined}=Seg) ->
    Seg#segment{left=Before};
merge(#chunk{data=Before}, #segment{left=Left}=Seg) ->
    Seg#segment{left=iolist_to_binary([Before, Left])};
merge(#segment{right=undefined}=Seg, #chunk{data=After}) ->
    Seg#segment{right=After};
merge(#segment{right=Right}=Seg, #chunk{data=After}) ->
    Seg#segment{right=iolist_to_binary([Right, After])};
merge(#segment{left=Left, words=Earlier, right=Tail},
      #segment{left=Head, words=Later, right=Right}) ->
    #segment{left=Left,
             words=Earlier ++ seam_words(Tail, Head) ++ Later,
             right=Right}.

%% @spec seam_words(undefined|binary(), undefined|binary()) -> [binary()]
%% @doc The word, if any, formed where two segments meet: the right
%%      edge of the first joined to the left edge of the second.
seam_words(undefined, undefined) -> [];
seam_words(Tail, undefined) -> [Tail];
seam_words(undefined, Head) -> [Head];
seam_words(Tail, Head) -> [iolist_to_binary([Tail, Head])].
Something went wrong with that request. Please try again.