/
smaz.go
163 lines (149 loc) · 4.77 KB
/
smaz.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
// Package smaz is an implementation of the smaz library
// (https://github.com/antirez/smaz) for compressing small strings.
package smaz
import "errors"
var (
codeStrings = []string{" ",
"the", "e", "t", "a", "of", "o", "and", "i", "n", "s", "e ", "r", " th",
" t", "in", "he", "th", "h", "he ", "to", "\r\n", "l", "s ", "d", " a", "an",
"er", "c", " o", "d ", "on", " of", "re", "of ", "t ", ", ", "is", "u", "at",
" ", "n ", "or", "which", "f", "m", "as", "it", "that", "\n", "was", "en",
" ", " w", "es", " an", " i", "\r", "f ", "g", "p", "nd", " s", "nd ", "ed ",
"w", "ed", "http://", "for", "te", "ing", "y ", "The", " c", "ti", "r ", "his",
"st", " in", "ar", "nt", ",", " to", "y", "ng", " h", "with", "le", "al", "to ",
"b", "ou", "be", "were", " b", "se", "o ", "ent", "ha", "ng ", "their", "\"",
"hi", "from", " f", "in ", "de", "ion", "me", "v", ".", "ve", "all", "re ",
"ri", "ro", "is ", "co", "f t", "are", "ea", ". ", "her", " m", "er ", " p",
"es ", "by", "they", "di", "ra", "ic", "not", "s, ", "d t", "at ", "ce", "la",
"h ", "ne", "as ", "tio", "on ", "n t", "io", "we", " a ", "om", ", a", "s o",
"ur", "li", "ll", "ch", "had", "this", "e t", "g ", "e\r\n", " wh", "ere",
" co", "e o", "a ", "us", " d", "ss", "\n\r\n", "\r\n\r", "=\"", " be", " e",
"s a", "ma", "one", "t t", "or ", "but", "el", "so", "l ", "e s", "s,", "no",
"ter", " wa", "iv", "ho", "e a", " r", "hat", "s t", "ns", "ch ", "wh", "tr",
"ut", "/", "have", "ly ", "ta", " ha", " on", "tha", "-", " l", "ati", "en ",
"pe", " re", "there", "ass", "si", " fo", "wa", "ec", "our", "who", "its", "z",
"fo", "rs", ">", "ot", "un", "<", "im", "th ", "nc", "ate", "><", "ver", "ad",
" we", "ly", "ee", " n", "id", " cl", "ac", "il", "</", "rt", " wi", "div",
"e, ", " it", "whi", " ma", "ge", "x", "e c", "men", ".com",
}
codes = make([][]byte, len(codeStrings))
codeTrie trieNode
)
func init() {
for i, code := range codeStrings {
codes[i] = []byte(code)
codeTrie.put([]byte(code), byte(i))
}
}
// A trieNode represents a logical vertex in the trie structure.
// The trie maps []byte -> byte.
type trieNode struct {
branches [256]*trieNode
val byte
terminal bool
}
// put inserts the mapping k -> v into the trie, overwriting any previous value.
// It returns true if the element was not previously in t.
func (n *trieNode) put(k []byte, v byte) bool {
for _, c := range k {
next := n.branches[int(c)]
if next == nil {
next = &trieNode{}
n.branches[c] = next
}
n = next
}
n.val = v
if n.terminal {
return false
}
n.terminal = true
return true
}
func flushVerb(out, verb []byte) []byte {
// We can write a max of 255 continuous verbatim characters,
// because the length of the continuous verbatim section is represented
// by a single byte.
var chunk []byte
for len(verb) > 0 {
if len(verb) < 255 {
chunk, verb = verb, nil
} else {
chunk, verb = verb[:255], verb[255:]
}
if len(chunk) == 1 {
// 254 is code for a single verbatim byte.
out = append(out, 254)
} else {
// 255 is code for a verbatim string.
// It is followed by a byte containing the length of the string.
out = append(out, 255, byte(len(chunk)))
}
out = append(out, chunk...)
}
return out
}
// Compress compresses a byte slice and returns the compressed data.
func Compress(input []byte) []byte {
out := make([]byte, 0, len(input)/2) // estimate output size
var verb []byte
for len(input) > 0 {
prefixLen := 0
var code byte
n := &codeTrie
for i, c := range input {
next := n.branches[int(c)]
if next == nil {
break
}
n = next
if n.terminal {
prefixLen = i + 1
code = n.val
}
}
if prefixLen > 0 {
input = input[prefixLen:]
out = flushVerb(out, verb)
verb = verb[:0]
out = append(out, code)
} else {
verb = append(verb, input[0])
input = input[1:]
}
}
return flushVerb(out, verb)
}
// ErrDecompression is returned when decompressing invalid smaz-encoded data.
var ErrDecompression = errors.New("invalid or corrupted compressed data")
// Decompress decompresses a smaz-compressed byte slice and return a new slice
// with the decompressed data.
// err is nil if and only if decompression fails for any reason
// (e.g., corrupted data).
func Decompress(b []byte) ([]byte, error) {
dec := make([]byte, 0, len(b)) // estimate initial size
for len(b) > 0 {
switch b[0] {
case 254: // verbatim byte
if len(b) < 2 {
return nil, ErrDecompression
}
dec = append(dec, b[1])
b = b[2:]
case 255: // verbatim string
if len(b) < 2 {
return nil, ErrDecompression
}
n := int(b[1])
if len(b) < n+2 {
return nil, ErrDecompression
}
dec = append(dec, b[2:n+2]...)
b = b[n+2:]
default: // look up encoded value
dec = append(dec, codes[int(b[0])]...)
b = b[1:]
}
}
return dec, nil
}