forked from tsuna/gohbase
/
compressor.go
155 lines (132 loc) · 3.62 KB
/
compressor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Copyright (C) 2020 The GoHBase Authors. All rights reserved.
// This file is part of GoHBase.
// Use of this source code is governed by the Apache License 2.0
// that can be found in the COPYING file.
package region
import (
"encoding/binary"
"fmt"
"io"
"net"
"github.com/baiweiguo/gohbase/compression"
)
// compressor wraps a compression.Codec with helpers for encoding and
// decoding HBase cellblocks in hadoop's compressed block stream format.
type compressor struct {
	compression.Codec
}
// min returns the smaller of x and y, converted to int.
func min(x, y uint32) int {
	if y < x {
		return int(y)
	}
	return int(x)
}
// growBuffer extends b by sz bytes, reusing spare capacity when there
// is enough and appending a zeroed extension otherwise.
func growBuffer(b []byte, sz int) []byte {
	want := len(b) + sz
	if want > cap(b) {
		return append(b, make([]byte, sz)...)
	}
	return b[:want]
}
// resizeBufferCap ensures b has at least the requested capacity while
// keeping its length and visible contents unchanged.
func resizeBufferCap(b []byte, capacity int) []byte {
	if cap(b) >= capacity {
		return b
	}
	n := len(b)
	grown := append(b, make([]byte, capacity-n)...)
	return grown[:n]
}
// compressCellblocks serializes cbs into hadoop's block stream format:
// a 4-byte big-endian total uncompressed length, followed by repeated
// <4-byte chunk length><encoded chunk> pairs, one per codec chunk.
func (c *compressor) compressCellblocks(cbs net.Buffers, uncompressedLen uint32) []byte {
	out := newBuffer(4)
	// Header: total uncompressed length of all the cellblocks.
	binary.BigEndian.PutUint32(out, uncompressedLen)

	// Scratch buffer sized to one codec chunk (or less if the whole
	// payload is smaller than a chunk).
	scratch := newBuffer(min(uncompressedLen, c.ChunkLen()))
	defer freeBuffer(scratch)

	for {
		n, err := cbs.Read(scratch)
		if n == 0 {
			break
		}
		// Reserve 4 bytes for this chunk's length; remember the offset
		// so it can be filled in once the encoded size is known.
		sizeAt := len(out)
		out = growBuffer(out, 4)
		var encoded uint32
		out, encoded = c.Encode(scratch[:n], out)
		binary.BigEndian.PutUint32(out[sizeAt:], encoded)
		switch {
		case err == io.EOF:
			return out
		case err != nil:
			// cbs is an in-memory buffer list; any non-EOF error is a bug.
			panic(err)
		}
	}
	return out
}
// readN splits off the first n bytes of b, returning that prefix, the
// remainder, and an error when b holds fewer than n bytes.
func readN(b []byte, n int) ([]byte, []byte, error) {
	if n > len(b) {
		return nil, nil, fmt.Errorf(
			"short read: want %d bytes, got %d", n, len(b))
	}
	head, tail := b[:n], b[n:]
	return head, tail, nil
}
// readUint32 decodes a big-endian uint32 from the front of b and
// returns it together with the bytes that follow it.
func readUint32(b []byte) (uint32, []byte, error) {
	head, tail, err := readN(b, 4)
	if err == nil {
		return binary.BigEndian.Uint32(head), tail, nil
	}
	return 0, nil, err
}
// decompressCellblocks decodes block stream format of hadoop.
// The wire format is as follows:
//
// <length of uncompressed block>
// <length of compressed chunk><compressed chunk>
// <length of compressed chunk><compressed chunk>
// ...
// <length of compressed chunk><compressed chunk>
// <length of uncompressed block>
// <length of compressed chunk><compressed chunk>
// ...
// ...
func (c *compressor) decompressCellblocks(b []byte) ([]byte, error) {
	var out []byte
	// Each iteration consumes one uncompressed block and its chunks.
	for len(b) > 0 {
		var (
			blockLen uint32
			err      error
		)
		blockLen, b, err = readUint32(b)
		if err != nil {
			return nil, fmt.Errorf("failed to read uncompressed block length: %w", err)
		}
		// Reserve room for the whole block up front so Decode appends
		// without repeated reallocation.
		out = resizeBufferCap(out, len(out)+int(blockLen))

		// Decode chunks until the advertised block length is covered.
		var got uint32
		for got < blockLen {
			var chunkLen uint32
			chunkLen, b, err = readUint32(b)
			if err != nil {
				return nil, fmt.Errorf(
					"failed to read compressed chunk block length: %w", err)
			}
			var chunk []byte
			chunk, b, err = readN(b, int(chunkLen))
			if err != nil {
				return nil, fmt.Errorf("failed to read compressed chunk: %w", err)
			}
			var decoded uint32
			out, decoded, err = c.Decode(chunk, out)
			if err != nil {
				return nil, fmt.Errorf("failed to decode compressed chunk: %w", err)
			}
			got += decoded
		}
		// A final chunk may overshoot the advertised block length.
		if got > blockLen {
			return nil, fmt.Errorf(
				"uncompressed more than expected: expected %d, got %d so far",
				blockLen, got)
		}
	}
	return out, nil
}