forked from dolthub/vitess
/
unicodeloosemd5.go
131 lines (113 loc) · 3.84 KB
/
unicodeloosemd5.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
package vindexes
import (
"bytes"
"fmt"
"sync"
"unicode/utf8"
"golang.org/x/text/collate"
"golang.org/x/text/language"
)
// UnicodeLooseMD5 is a vindex that normalizes and hashes unicode strings
// to a keyspace id. It conservatively converts the string to its base
// characters before hashing. This is also known as UCA level 1.
// Ref: http://www.unicode.org/reports/tr10/#Multi_Level_Comparison.
// This is compatible with MySQL's utf8_unicode_ci collation.
type UnicodeLooseMD5 struct {
name string
}
// NewUnicodeLooseMD5 creates a new UnicodeLooseMD5.
func NewUnicodeLooseMD5(name string, _ map[string]string) (Vindex, error) {
return &UnicodeLooseMD5{name: name}, nil
}
// String returns the name of the vindex.
func (vind *UnicodeLooseMD5) String() string {
return vind.name
}
// Cost returns the cost as 1.
func (vind *UnicodeLooseMD5) Cost() int {
return 1
}
// Verify returns true if ids maps to ksids.
func (vind *UnicodeLooseMD5) Verify(_ VCursor, ids []interface{}, ksids [][]byte) (bool, error) {
if len(ids) != len(ksids) {
return false, fmt.Errorf("UnicodeLooseMD5.Verify: length of ids %v doesn't match length of ksids %v", len(ids), len(ksids))
}
for rowNum := range ids {
data, err := unicodeHash(ids[rowNum])
if err != nil {
return false, fmt.Errorf("UnicodeLooseMD5.Verify: %v", err)
}
if bytes.Compare(data, ksids[rowNum]) != 0 {
return false, nil
}
}
return true, nil
}
// Map returns the corresponding keyspace id values for the given ids.
func (vind *UnicodeLooseMD5) Map(_ VCursor, ids []interface{}) ([][]byte, error) {
out := make([][]byte, 0, len(ids))
for _, id := range ids {
data, err := unicodeHash(id)
if err != nil {
return nil, fmt.Errorf("UnicodeLooseMD5.Map: %v", err)
}
out = append(out, data)
}
return out, nil
}
func unicodeHash(key interface{}) ([]byte, error) {
source, err := getBytes(key)
if err != nil {
return nil, err
}
collator := collatorPool.Get().(pooledCollator)
defer collatorPool.Put(collator)
norm, err := normalize(collator.col, collator.buf, source)
if err != nil {
return nil, err
}
return binHash(norm), nil
}
func normalize(col *collate.Collator, buf *collate.Buffer, in []byte) ([]byte, error) {
// We cannot pass invalid UTF-8 to the collator.
if !utf8.Valid(in) {
return nil, fmt.Errorf("cannot normalize string containing invalid UTF-8: %q", string(in))
}
// Ref: http://dev.mysql.com/doc/refman/5.6/en/char.html.
// Trailing spaces are ignored by MySQL.
in = bytes.TrimRight(in, " ")
// We use the collation key which can be used to
// perform lexical comparisons.
return col.Key(buf, in), nil
}
// pooledCollator pairs a Collator and a Buffer.
// These pairs are pooled to avoid reallocating for every request,
// which would otherwise be required because they can't be used concurrently.
//
// Note that you must ensure no active references into the buffer remain
// before you return this pair back to the pool.
// That is, either do your processing on the result first, or make a copy.
type pooledCollator struct {
col *collate.Collator
buf *collate.Buffer
}
var collatorPool = sync.Pool{New: newPooledCollator}
func newPooledCollator() interface{} {
// Ref: http://www.unicode.org/reports/tr10/#Introduction.
// Unicode seems to define a universal (or default) order.
// But various locales have conflicting order,
// which they have the right to override.
// Unfortunately, the Go library requires you to specify a locale.
// So, I chose English assuming that it won't override
// the Unicode universal order. But I couldn't find an easy
// way to verify this.
// Also, the locale differences are not an issue for level 1,
// because the conservative comparison makes them all equal.
return pooledCollator{
col: collate.New(language.English, collate.Loose),
buf: new(collate.Buffer),
}
}
func init() {
Register("unicode_loose_md5", NewUnicodeLooseMD5)
}