Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 108 lines (96 sloc) 3.411 kB
e7542dc @beerriot working, undocumented, word-splitter
authored
1 -module(luwak_mr_words).
2
3 -export([
4 in_file/1,
5 map/3,
6 reduce/2
7 ]).
8
9 -record(chunk, {data}).
10 -record(segment, {left, words=[], right}).
11
%% @doc Split the luwak file `Filename' into space-delimited words.
%%
%% Runs a map/reduce job through the local Riak client: `luwak_mr:file'
%% supplies the inputs, map/3 of this module splits each block, and
%% reduce/2 stitches neighbouring blocks back together.
%%
%% Returns the list produced by strip_record/1 for the single reduced
%% result. Crashes with `badmatch' unless the job yields exactly one
%% `{Start, End, Result}' triple.
in_file(Filename) ->
    {ok, Client} = riak:local_client(),
    Inputs = {modfun, luwak_mr, file, Filename},
    MapPhase = {map, {modfun, ?MODULE, map}, none, false},
    ReducePhase = {reduce, {modfun, ?MODULE, reduce}, none, true},
    {ok, [{_From, _To, Result}]} =
        Client:mapred(Inputs, [MapPhase, ReducePhase]),
    strip_record(Result).
19
%% Flatten a final #chunk{} or #segment{} into a plain list.
%%
%% A chunk yields a one-element list holding its data. A segment yields
%% its left piece (when defined), then its words, then its right piece
%% (when defined).
strip_record(#chunk{data=Data}) ->
    [Data];
strip_record(#segment{left=Left, words=Words, right=Right}) ->
    defined_as_list(Left) ++ Words ++ defined_as_list(Right).

%% Wrap a value in a list, mapping the `undefined' placeholder to [].
defined_as_list(undefined) -> [];
defined_as_list(Piece) -> [Piece].
32
%% @doc Map phase function.
%%
%% Fetches the data of `Block' via luwak_block:data/1 and emits a single
%% `{Start, End, Words}' triple, where `Start' is `Offset', `End' is
%% `Offset' plus the byte size of the data, and `Words' is the result of
%% words/1 on that data. The third argument is ignored.
map(Block, Offset, _Arg) ->
    Data = luwak_block:data(Block),
    %% byte_size/1 is the binary-specific BIF; the generic size/1 also
    %% accepts tuples and hides the intent.
    [{Offset, Offset + byte_size(Data), words(Data)}].
36
%% @doc Reduce phase function.
%%
%% Sorts the `{Start, End, Result}' triples by `Start' and folds them
%% with reduce_fun/2, seeding the accumulator with the first triple.
%% An empty input yields an empty list. The second argument is ignored.
reduce(Maps, _Arg) ->
    case lists:keysort(1, Maps) of
        [] ->
            [];
        [Earliest | Later] ->
            lists:foldl(fun reduce_fun/2, [Earliest], Later)
    end.
41
%% Fold step for reduce/2.
%%
%% When the next triple starts exactly where the most recently
%% accumulated triple ends, the two are replaced by one triple spanning
%% both ranges, with their results combined by merge/2. Otherwise the
%% next triple is pushed onto the accumulator unchanged.
reduce_fun({NextStart, NextEnd, NextMap},
           [{PrevStart, PrevEnd, PrevMap} | Older])
  when NextStart =:= PrevEnd ->
    [{PrevStart, NextEnd, merge(PrevMap, NextMap)} | Older];
reduce_fun(Next, Acc) ->
    [Next | Acc].
47
%% Split one binary into its word structure.
%%
%% If the binary contains no space it is returned whole as a #chunk{}.
%% Otherwise the result is a #segment{}: `left' is whatever precedes the
%% first space (or `undefined'), `words' are the words found by
%% words_rest/1 in the remainder, and `right' is the trailing piece
%% after the last space (or `undefined').
words(Binary) ->
    case words_firstchunk(Binary) of
        {Leading, Remainder} ->
            {Complete, Trailing} = words_rest(Remainder),
            #segment{left=Leading, words=Complete, right=Trailing};
        whole ->
            #chunk{data=Binary}
    end.
56
%% Split a binary at its first space.
%%
%% Returns `whole' when the binary contains no space (including the
%% empty binary), so the caller can reuse the original binary.
%% Otherwise returns `{FirstChunk, Rest}', where `FirstChunk' is the
%% bytes before the first space (`undefined' if there are none) and
%% `Rest' is everything after that space.
%%
%% Uses binary:split/2 rather than walking the binary one byte at a
%% time, which avoids building an intermediate character list and
%% copying the chunk.
words_firstchunk(Binary) ->
    case binary:split(Binary, <<" ">>) of
        [_NoSpace] ->
            whole;
        [<<>>, Rest] ->
            {undefined, Rest};
        [FirstChunk, Rest] ->
            {FirstChunk, Rest}
    end.
68
%% Split a binary into complete words plus a trailing piece.
%%
%% Returns `{Words, LastChunk}'. `Words' is the list, in order, of every
%% non-empty run of bytes that is followed by a space. `LastChunk' is
%% the bytes after the final space, or `undefined' when there are none
%% (empty input, or input ending in a space).
%%
%% Uses binary:split/3 with `global' rather than accumulating bytes one
%% at a time; consecutive spaces show up as empty parts and are dropped.
words_rest(Binary) ->
    %% binary:split/3 always returns at least one part; the last part is
    %% the piece after the final space.
    [Trailing | RevLeading] =
        lists:reverse(binary:split(Binary, <<" ">>, [global])),
    Words = lists:reverse([Word || Word <- RevLeading, Word =/= <<>>]),
    LastChunk = case Trailing of
                    <<>> -> undefined;
                    _ -> Trailing
                end,
    {Words, LastChunk}.
84
%% Combine the word structures of two adjacent ranges; the first
%% argument is the earlier range, the second the later one.
%%
%% chunk + chunk     -> one chunk holding the concatenated data.
%% chunk + segment   -> the chunk data is prepended to the segment's
%%                      `left' piece (or becomes it when undefined).
%% segment + chunk   -> the chunk data is appended to the segment's
%%                      `right' piece (or becomes it when undefined).
%% segment + segment -> the earlier `right' and later `left' pieces meet
%%                      at the seam and form at most one word, placed
%%                      between the two word lists.
merge(#chunk{data=Earlier}, #chunk{data=Later}) ->
    #chunk{data=iolist_to_binary([Earlier, Later])};
merge(#chunk{data=Earlier}, #segment{left=undefined}=Seg) ->
    Seg#segment{left=Earlier};
merge(#chunk{data=Earlier}, #segment{left=Left}=Seg) ->
    Seg#segment{left=iolist_to_binary([Earlier, Left])};
merge(#segment{right=undefined}=Seg, #chunk{data=Later}) ->
    Seg#segment{right=Later};
merge(#segment{right=Right}=Seg, #chunk{data=Later}) ->
    Seg#segment{right=iolist_to_binary([Right, Later])};
merge(#segment{left=Left, words=Before, right=Tail},
      #segment{left=Head, words=After, right=Right}) ->
    #segment{left=Left,
             words=Before ++ seam_words(Tail, Head) ++ After,
             right=Right}.

%% Words formed where two segments meet: none when both pieces are
%% undefined, the single defined piece when only one is, or the two
%% pieces joined into one word when both are defined.
seam_words(undefined, undefined) -> [];
seam_words(Tail, undefined) -> [Tail];
seam_words(undefined, Head) -> [Head];
seam_words(Tail, Head) -> [iolist_to_binary([Tail, Head])].
Something went wrong with that request. Please try again.