-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.go
277 lines (247 loc) · 7.89 KB
/
util.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
package util
import (
"io"
"os"
"path/filepath"
"regexp"
"sort"
"sync"
"github.com/adedayo/checkmate-core/pkg/code"
"github.com/adedayo/checkmate-core/pkg/diagnostics"
)
var (
dataChunkSize = 4096 //read in source data 4k-bytes chunks
reNL = regexp.MustCompile(`\n`)
)
//FindFiles recursively searches the directories and files contained in paths and returns a unique list of files
func FindFiles(paths []string) (result []string) {
directoryOrFile := make(map[string]bool)
worklist := make(map[string]struct{})
for _, p := range paths {
path := filepath.Clean(p)
if fileInfo, err := os.Stat(path); !os.IsNotExist(err) {
directoryOrFile[path] = fileInfo.IsDir()
}
}
var nothing struct{}
//collect unique files to analyse
for file, isDir := range directoryOrFile {
if isDir {
for _, f := range getFiles(file) {
worklist[f] = nothing
}
} else {
worklist[file] = nothing
}
}
for path := range worklist {
result = append(result, path)
}
return
}
func getFiles(dir string) (paths []string) {
filepath.WalkDir(dir, func(path string, info os.DirEntry, err error) error {
if err != nil {
return filepath.SkipDir
}
if !info.IsDir() {
paths = append(paths, path)
}
return nil
})
return
}
//ResourceMultiplexer interface defines a path or source reader that can be multiplexed to multiple consumers. It provides
//additional utility such as mapping a source index to the line and character, i.e. the `code.Position` in the source
type ResourceMultiplexer interface {
//SetSource is the source reader to multiplex to multiple consumers, which will be provided with a copy of the source data as it is being streamed in from the source
SetResourceAndConsumers(filePath string, source *io.Reader, provideSourceInDiagnostics bool, consumers ...ResourceConsumer)
}
//PathMultiplexer interface defines an aggregator of analysers that can consume filesystem paths and URIs and process them
type PathMultiplexer interface {
SetPathConsumers(consumers ...PathConsumer)
ConsumePath(path string)
}
type defaultPathMultiplexer struct {
consumers []PathConsumer
}
func (dpm *defaultPathMultiplexer) SetPathConsumers(consumers ...PathConsumer) {
dpm.consumers = consumers
}
func (dpm *defaultPathMultiplexer) ConsumePath(path string) {
for _, c := range dpm.consumers {
c.ConsumePath(path)
}
}
//PositionProvider provides a "global" view of code location, given an arbitrary character index.
type PositionProvider interface {
GetPosition(index int64) code.Position
}
//PathConsumer is a sink for paths and URIs
type PathConsumer interface {
ConsumePath(path string)
diagnostics.ExclusionProvider
}
//NewPathMultiplexer creates a choreographer that orchestrates the consumption of paths by consumers
func NewPathMultiplexer(consumers ...PathConsumer) PathMultiplexer {
dpm := defaultPathMultiplexer{}
dpm.SetPathConsumers(consumers...)
return &dpm
}
//ResourceConsumer is a sink for streaming source
type ResourceConsumer interface {
//Consume allows a source processor receive `source` data streamed in "chunks", with `startIndex` indicating the
//character location of the first character in the stream
Consume(startIndex int64, source string)
//ConsumePath allows resource consumers that process filepaths directly to analyse files on disk
ConsumePath(filePath string)
SetLineKeeper(*LineKeeper)
//ShouldProvideSourceInDiagnostics toggles whether source evidence should be provided with diagnostics, defaults to false
ShouldProvideSourceInDiagnostics(bool)
//used to signal to the consumer that the source stream has ended
End()
}
//NewResourceMultiplexer creates a source multiplexer over an input reader
func NewResourceMultiplexer(filePath string, source *io.Reader, provideSource bool, consumers ...ResourceConsumer) ResourceMultiplexer {
sm := defaultResourceMultiplexer{}
sm.SetResourceAndConsumers(filePath, source, provideSource, consumers...)
return &sm
}
type defaultResourceMultiplexer struct {
filePath string
source *io.Reader
consumers []ResourceConsumer
lineKeeper LineKeeper
}
func (sm *defaultResourceMultiplexer) SetResourceAndConsumers(filePath string, src *io.Reader, provideSource bool, consumers ...ResourceConsumer) {
sm.filePath = filePath
sm.source = src
sm.consumers = consumers
for _, consumer := range consumers {
consumer.SetLineKeeper(&sm.lineKeeper)
consumer.ShouldProvideSourceInDiagnostics(provideSource)
}
sm.start()
}
//begins to stream data from source to the consumers
func (sm *defaultResourceMultiplexer) start() {
startIndex := int64(0)
for data := range readChunks(*sm.source, dataChunkSize) {
locations := reNL.FindAllStringIndex(data, -1)
locs := []int{}
for _, l := range locations {
locs = append(locs, l[0])
}
sm.lineKeeper.appendEOLs(locs)
var wg sync.WaitGroup
consumers := sm.consumers
wg.Add(len(consumers))
for _, c := range consumers {
go func(consumer ResourceConsumer, w *sync.WaitGroup) {
defer w.Done()
consumer.Consume(startIndex, data)
}(c, &wg)
}
wg.Wait()
startIndex += int64(len(data))
}
for _, c := range sm.consumers {
c.ConsumePath(sm.filePath)
}
for _, consumer := range sm.consumers {
consumer.End()
}
}
func (sm *defaultResourceMultiplexer) GetPosition(index int64) code.Position {
return sm.lineKeeper.GetPositionFromCharacterIndex(index)
}
//readChunk reads the `source` in `dataChunkSize` (4Mb) chunks and tries to align
//to the newline \n boundaries - so will sometimes "walk backwards to the last \n"
//and place the remaining data `remnant` in the next chunk
//TODO: write a test for this with various random data sources and compare the Sha256 of
//original data with the combined chunks.
func readChunks(source io.Reader, chunkSize int) chan string {
out := make(chan string)
var largeChunk, remnant string
buf := make([]byte, chunkSize)
go func() {
defer close(out)
for {
len, err := source.Read(buf)
if err == nil {
//find the last newline position in the buffer
nlFound := false
nlLocation := -1
for i := len - 1; i >= 0; i-- {
if buf[i] == '\n' {
nlFound = true
nlLocation = i
break
}
}
if nlFound {
out <- largeChunk + remnant + string(buf[:nlLocation+1])
//the remaining data after newline
remnant = string(buf[nlLocation+1 : len])
largeChunk = ""
} else {
largeChunk += remnant + string(buf[:len])
remnant = ""
}
} else {
out <- largeChunk + remnant + string(buf[:len])
break
}
}
}()
return out
}
//LineKeeper keeps track of line numberson a textual source file and can map character location to the relevant `code.Position`
type LineKeeper struct {
EOLLocations []int // end-of-line locations
lock sync.Mutex
}
func (lk *LineKeeper) appendEOLs(eols []int) {
sorted := sort.IntSlice(eols)
lk.lock.Lock()
//if this is not the first set of EOLs, "continue" from where we stopped last time, by adding the location to
//these set of eol's position. Note this works because we chunk data on EOL boundaries
if len(lk.EOLLocations) > 0 {
last := lk.EOLLocations[len(lk.EOLLocations)-1]
for i := range sorted {
sorted[i] += last
}
}
lk.EOLLocations = append(lk.EOLLocations, sorted...)
lk.lock.Unlock()
}
//GetPositionFromCharacterIndex returns the `code.Position` given the index of the character in the file
func (lk *LineKeeper) GetPositionFromCharacterIndex(pos int64) code.Position {
//lk.EOLLocations are sorted
lk.lock.Lock()
defer lk.lock.Unlock()
if len(lk.EOLLocations) > 0 {
end := int64(len(lk.EOLLocations) - 1)
if pos > int64(lk.EOLLocations[end]) {
return code.Position{
Line: end + 1,
Character: pos - int64(lk.EOLLocations[end]),
}
}
for i, eol := range lk.EOLLocations {
if int64(eol) > pos {
if i > 0 {
return code.Position{
Line: int64(i),
Character: pos - int64(lk.EOLLocations[i-1]),
}
}
break
}
}
}
return code.Position{
Line: 0,
Character: pos,
}
}