/
cclass.go
138 lines (120 loc) · 3.23 KB
/
cclass.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
// Copyright Suneido Software Corp. All rights reserved.
// Governed by the MIT license found in the LICENSE file.
package regex
import (
"math/bits"
"github.com/apmckinlay/gsuneido/util/ascii"
)
// Character classes are compiled to either listSet or bitSet instructions.
// listSet is used for classes with a small number of characters.
// bitSet is either 128 bits (16 bytes) or 256 bits (32 bytes).
// Note: this is for ASCII only; characters are handled as single bytes.
// Predefined character classes.
// Composite classes are built by or'ing simpler ones into a fresh cclass,
// so no two of them share storage.
var (
	// letters and digits
	digit  = cc().addRange('0', '9')
	lower  = cc().addRange('a', 'z')
	upper  = cc().addRange('A', 'Z')
	alpha  = cc().add(upper).add(lower)
	alnum  = cc().add(alpha).add(digit)
	xdigit = cc().add(digit).addRange('a', 'f').addRange('A', 'F')
	word   = cc().add(alnum).addChar('_')

	// whitespace
	blank = cc().addChar(' ').addChar('\t')
	space = cc().add(blank).addChars("\r\n")

	// visible and printable characters
	punct = cc().addChars("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
	graph = cc().add(punct).add(alnum)
	print = cc().add(graph).addChar(' ')

	// control characters: 0x00-0x1f and 0x7f-0x9f
	cntrl = cc().addRange(0x00, 0x1f).addRange(0x7f, 0x9f)

	// complements
	notDigit = cc().add(digit).negate()
	notWord  = cc().add(word).negate()
	notSpace = cc().add(space).negate()
)
// cclass is a character class held as a bitmap with one bit
// for each of the 256 possible byte values.
type cclass [256 / 8]byte

// cc returns a pointer to a new, empty character class.
func cc() *cclass {
	return new(cclass)
}
// addRange adds every character from "from" through "to" (inclusive)
// to the character class and returns the receiver for chaining.
// If from > to the class is left unchanged.
func (cc *cclass) addRange(from, to byte) *cclass {
	// Count in a wider type so that to == 255 cannot wrap the loop variable.
	for i := uint(from); i <= uint(to); i++ {
		cc[i>>3] |= 1 << (i & 7)
	}
	return cc
}
// addChars adds each byte of s to the character class
// and returns the receiver for chaining.
func (cc *cclass) addChars(s string) *cclass {
	// Iterate over bytes, not runes: the class is indexed by byte value.
	for _, c := range []byte(s) {
		cc[c>>3] |= 1 << (c & 7)
	}
	return cc
}
// addChar adds a single character to the character class
// and returns the receiver for chaining.
func (cc *cclass) addChar(c byte) *cclass {
	index, mask := c>>3, byte(1)<<(c&7)
	cc[index] |= mask
	return cc
}
// add merges b2 into the receiver (set union) and returns the receiver.
// b2 is not modified.
func (cc *cclass) add(b2 *cclass) *cclass {
	for i, b := range b2 {
		cc[i] |= b
	}
	return cc
}
// negate complements the character class in place, so that it contains
// exactly the byte values it did not contain before, and returns the receiver.
func (cc *cclass) negate() *cclass {
	for i, b := range cc {
		cc[i] = ^b
	}
	return cc
}
// ignore makes the character class case insensitive:
// for each letter a-z, if either the lower case letter or its
// ascii.ToUpper counterpart is in the class, both end up in the class.
func (cc *cclass) ignore() {
	for lo := byte('a'); lo <= 'z'; lo++ {
		up := ascii.ToUpper(lo)
		loIdx, loBit := lo>>3, byte(1)<<(lo&7)
		upIdx, upBit := up>>3, byte(1)<<(up&7)
		if cc[loIdx]&loBit == 0 && cc[upIdx]&upBit == 0 {
			continue // neither case present, nothing to do
		}
		cc[loIdx] |= loBit
		cc[upIdx] |= upBit
	}
}
// setLen returns the number of bytes needed to hold the cclass as a bit set:
// 16 if no character with a value of 128 or above is present, otherwise 32.
func (cc *cclass) setLen() int {
	var upper byte
	for _, b := range cc[16:] {
		upper |= b
	}
	if upper == 0 {
		return 16
	}
	return 32
}
// listLen returns the number of characters in the cclass,
// i.e. the length it would have as a list of characters.
func (cc *cclass) listLen() int {
	count := 0
	for i := 0; i < len(cc); i++ {
		count += bits.OnesCount8(cc[i])
	}
	return count
}
// list returns the characters in the cclass as a slice of bytes
// in ascending order. The result is never nil.
func (cc *cclass) list() []byte {
	chars := make([]byte, 0, 16)
	for idx, b := range cc {
		// Walk the set bits of this byte from least to most significant.
		for bit := 0; b != 0; bit, b = bit+1, b>>1 {
			if b&1 != 0 {
				chars = append(chars, byte(idx<<3|bit))
			}
		}
	}
	return chars
}
// matchHalfSet returns whether a character is in a half bit set (16 bytes).
// Characters with a value of 128 or above never match
// and do not index into the set.
func matchHalfSet(set Pattern, c byte) bool {
	if c >= 128 {
		return false
	}
	return (set[c>>3]>>(c&7))&1 != 0
}
// matchFullSet returns whether a character is in a full bit set (32 bytes).
// The byte at index c>>3 holds the bit for c at position c&7.
func matchFullSet(set Pattern, c byte) bool {
	return (set[c>>3]>>(c&7))&1 != 0
}