/
processor.go
345 lines (275 loc) · 10 KB
/
processor.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
package processor
import (
"encoding/base64"
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"runtime"
"runtime/debug"
"sort"
"strings"
"sync"
)
// Flags set via the CLI which control how the output is displayed.
var (
	// Files indicates whether file output should be produced when formatting.
	Files = false

	// Languages indicates whether the command line should print out the
	// supported languages.
	Languages = false

	// Verbose enables verbose logging output.
	Verbose = false

	// Debug enables debug logging output.
	Debug = false

	// Trace enables trace logging output, which is extremely verbose.
	Trace = false

	// Duplicates enables duplicate file detection.
	Duplicates = false

	// Complexity toggles the complexity calculation. The sense is inverted:
	// processFlags reports the calculation as enabled when this is false, and
	// processLanguageFeature only adds complexity tokens to the combined token
	// trie while it is false.
	Complexity = false

	// More enables wider output with more information in the formatter.
	More = false

	// Cocomo toggles the COCOMO calculation. processFlags prints the negated
	// value in its debug output, so false appears to mean "calculate" —
	// confirm against the formatter.
	Cocomo = false

	// DisableCheckBinary toggles checking for binary files using NUL bytes.
	DisableCheckBinary = false

	// SortBy sets which column of the formatter output should be sorted by.
	SortBy = ""

	// Exclude is a regular expression used to exclude files from being
	// processed.
	Exclude = ""

	// Format sets the output format of the formatter.
	Format = ""

	// FileOutput sets the file that output should be written to. When empty,
	// Process prints the result instead.
	FileOutput = ""

	// PathBlacklist sets the paths that should be skipped.
	PathBlacklist = []string{}
)

// Queue sizes and worker counts for the processing pipeline.
var (
	// FileListQueueSize is the buffer size of the queue of files found and
	// ready to be read into memory.
	FileListQueueSize = runtime.NumCPU()

	// FileReadJobWorkers is the number of processes that read files off disk
	// into memory.
	FileReadJobWorkers = runtime.NumCPU() * 4

	// FileReadContentJobQueueSize is the buffer size of the queue of files
	// ready to be processed.
	FileReadContentJobQueueSize = runtime.NumCPU()

	// FileProcessJobWorkers is the number of workers that process the file
	// collecting stats.
	FileProcessJobWorkers = runtime.NumCPU() * 4

	// FileSummaryJobQueueSize is the buffer size of the queue used to hold
	// processed file statistics before formatting.
	FileSummaryJobQueueSize = runtime.NumCPU()
)

// Remaining configuration and shared state.
var (
	// WhiteListExtensions is a list of extensions which are whitelisted to be
	// processed.
	WhiteListExtensions = []string{}

	// AverageWage is the average wage in dollars used for the COCOMO cost
	// estimate.
	AverageWage int64 = 56286

	// GcFileCount is the number of files to process before turning the GC
	// back on.
	GcFileCount = 10000

	// gcPercent is the value ConfigureGc hands to debug.SetGCPercent; a
	// negative percentage disables garbage collection.
	gcPercent = -1

	// isLazy is set through ConfigureLazy and decides whether language
	// features are built up front or on request.
	isLazy = false

	// DirFilePaths is not set via flags but by the arguments following the
	// flags, naming the file or directory to process.
	DirFilePaths = []string{}

	// languageDatabase is the raw language database as returned by
	// loadDatabase.
	languageDatabase = map[string]Language{}

	// ExtensionToLanguage maps a file extension to the names of the languages
	// using it. It is filled by ProcessConstants from the JSON that is in
	// constants.go.
	ExtensionToLanguage = map[string][]string{}

	// LanguageFeatures contains the processed languages produced by
	// processLanguageFeature, keyed by language name.
	LanguageFeatures = map[string]LanguageFeature{}

	// LanguageFeaturesMutex is the shared mutex used to control getting and
	// setting of language features. It is used rather than sync.Map because
	// it turned out to be marginally faster.
	LanguageFeaturesMutex = sync.Mutex{}
)
// ConfigureGc passes the package's gcPercent value to debug.SetGCPercent and
// stores the setting that was in effect before the call back into gcPercent.
// It lives outside of ProcessConstants because it should only be enabled in
// command line mode, see https://github.com/boyter/scc/issues/32
func ConfigureGc() {
	previous := debug.SetGCPercent(gcPercent)
	gcPercent = previous
}
// ConfigureLazy is a simple setter used to turn on lazy loading, used only by
// the command line. With lazy loading on, ProcessConstants skips building the
// language features and LoadLanguageFeature builds them one language at a
// time; with it off, LoadLanguageFeature returns immediately.
func ConfigureLazy(lazy bool) {
	isLazy = lazy
}
// ProcessConstants is responsible for setting up the language features based
// on the JSON file that is stored in constants. It needs to be called at
// least once in order for anything to actually happen.
//
// It reloads the language database, fills ExtensionToLanguage and, unless
// lazy loading was requested through ConfigureLazy, builds the features of
// every language. Calling it again is safe: a language is only added to an
// extension's list if it is not already there, so repeated calls do not
// accumulate duplicates and entries placed in ExtensionToLanguage by the
// caller are kept.
func ProcessConstants() {
	languageDatabase = loadDatabase()

	startTime := makeTimestampNano()

	// Map iteration order is randomised, so walk the languages by sorted name.
	// Extensions shared by several languages then list them in the same order
	// on every run.
	names := make([]string, 0, len(languageDatabase))
	for name := range languageDatabase {
		names = append(names, name)
	}
	sort.Strings(names)

	for _, name := range names {
		for _, ext := range languageDatabase[name].Extensions {
			known := false
			for _, existing := range ExtensionToLanguage[ext] {
				if existing == name {
					known = true
					break
				}
			}
			if !known {
				ExtensionToLanguage[ext] = append(ExtensionToLanguage[ext], name)
			}
		}
	}

	if Trace {
		printTrace(fmt.Sprintf("nanoseconds build extension to language: %d", makeTimestampNano()-startTime))
	}

	// If lazy is set then we want to load in the features as we find them and
	// not in one go.
	if isLazy {
		printTrace("configured to lazy load language features")
		return
	}

	// Otherwise we are being used as a library so just load them all in.
	startTime = makeTimestampMilli()
	for _, name := range names {
		processLanguageFeature(name, languageDatabase[name])
	}

	if Trace {
		printTrace(fmt.Sprintf("milliseconds build language features: %d", makeTimestampMilli()-startTime))
	}
}
// LoadLanguageFeature will load a single feature as requested given the name.
//
// It does nothing unless lazy loading was enabled through ConfigureLazy, and
// nothing when the features for loadName are already present in
// LanguageFeatures. Otherwise the language is looked up in the language
// database and handed to processLanguageFeature. A name that is not in the
// database is processed as an empty Language, so it ends up with empty
// features rather than the features of some unrelated language.
func LoadLanguageFeature(loadName string) {
	if !isLazy {
		return
	}

	// Check if already loaded and if so return because we don't need to do it
	// again.
	LanguageFeaturesMutex.Lock()
	_, loaded := LanguageFeatures[loadName]
	LanguageFeaturesMutex.Unlock()
	if loaded {
		return
	}

	// A missing key yields the zero Language value.
	value := languageDatabase[loadName]

	startTime := makeTimestampNano()
	processLanguageFeature(loadName, value)

	if Trace {
		printTrace(fmt.Sprintf("nanoseconds to build language %s features: %d", loadName, makeTimestampNano()-startTime))
	}
}
// processLanguageFeature turns one entry of the language database into a
// LanguageFeature and stores it in LanguageFeatures under name, holding
// LanguageFeaturesMutex for the write.
//
// Every token kind (complexity checks, line comments, multi-line comments and
// quotes) gets its own trie, and all of them are also inserted into one
// combined token trie. Complexity checks only go into the combined trie while
// the Complexity flag is false. For each kind a mask is built by OR-ing the
// first byte of every token; ProcessMask is the OR of those masks, again
// including the complexity mask only while Complexity is false.
// NOTE(review): the masks are presumably a cheap pre-check before consulting
// the tries — confirm against the counting code.
//
// Tokens are indexed without a length check, so an empty token or a
// multi-line/quote pair with fewer than two entries will panic.
func processLanguageFeature(name string, value Language) {
	var (
		complexityMask   byte
		lineCommentMask  byte
		blockCommentMask byte
		quoteMask        byte
	)

	complexity := &Trie{}
	lineComments := &Trie{}
	blockComments := &Trie{}
	quotes := &Trie{}
	tokens := &Trie{}

	// The combined trie is filled in the order complexity, line comments,
	// multi-line comments, quotes.
	for _, check := range value.ComplexityChecks {
		complexityMask |= check[0]
		complexity.Insert(TComplexity, []byte(check))
		if !Complexity {
			tokens.Insert(TComplexity, []byte(check))
		}
	}

	for _, marker := range value.LineComment {
		lineCommentMask |= marker[0]
		lineComments.Insert(TSlcomment, []byte(marker))
		tokens.Insert(TSlcomment, []byte(marker))
	}

	for _, pair := range value.MultiLine {
		blockCommentMask |= pair[0][0]
		opener, closer := pair[0], pair[1]
		blockComments.InsertClose(TMlcomment, []byte(opener), []byte(closer))
		tokens.InsertClose(TMlcomment, []byte(opener), []byte(closer))
	}

	for _, pair := range value.Quotes {
		quoteMask |= pair[0][0]
		opener, closer := pair[0], pair[1]
		quotes.InsertClose(TString, []byte(opener), []byte(closer))
		tokens.InsertClose(TString, []byte(opener), []byte(closer))
	}

	processMask := lineCommentMask | blockCommentMask | quoteMask
	if !Complexity {
		processMask |= complexityMask
	}

	feature := LanguageFeature{
		Complexity:            complexity,
		MultiLineComments:     blockComments,
		SingleLineComments:    lineComments,
		Strings:               quotes,
		Tokens:                tokens,
		Nested:                value.NestedMultiLine,
		ComplexityCheckMask:   complexityMask,
		MultiLineCommentMask:  blockCommentMask,
		SingleLineCommentMask: lineCommentMask,
		StringCheckMask:       quoteMask,
		ProcessMask:           processMask,
		Keywords:              value.Keywords,
	}

	LanguageFeaturesMutex.Lock()
	LanguageFeatures[name] = feature
	LanguageFeaturesMutex.Unlock()
}
// processFlags reconciles flag combinations and, when Debug is set, prints
// the effective settings through printDebug.
func processFlags() {
	// If wide/more mode is enabled we want the complexity calculation to
	// happen regardless, as that is the only purpose of the flag. Complexity
	// is inverted: false means the calculation runs.
	if More && Complexity {
		Complexity = false
	}

	if !Debug {
		return
	}

	// Complexity and Cocomo are printed negated.
	settings := []string{
		fmt.Sprintf("Path Black List: %v", PathBlacklist),
		fmt.Sprintf("Sort By: %s", SortBy),
		fmt.Sprintf("White List: %v", WhiteListExtensions),
		fmt.Sprintf("Files Output: %t", Files),
		fmt.Sprintf("Verbose: %t", Verbose),
		fmt.Sprintf("Duplicates Detection: %t", Duplicates),
		fmt.Sprintf("Complexity Calculation: %t", !Complexity),
		fmt.Sprintf("Wide: %t", More),
		fmt.Sprintf("Average Wage: %d", AverageWage),
		fmt.Sprintf("Cocomo: %t", !Cocomo),
	}
	for _, line := range settings {
		printDebug(line)
	}
}
// loadDatabase decodes the base64 encoded languages constant and unmarshals
// the resulting JSON into a map of Language keyed by language name. It panics
// if either the base64 or the JSON is invalid. When Trace is set the elapsed
// milliseconds are reported through printTrace.
func loadDatabase() map[string]Language {
	startTime := makeTimestampMilli()

	raw, err := base64.StdEncoding.DecodeString(languages)
	if err != nil {
		panic(fmt.Sprintf("failed to base64 decode languages: %v", err))
	}

	var database map[string]Language
	err = json.Unmarshal(raw, &database)
	if err != nil {
		panic(fmt.Sprintf("languages json invalid: %v", err))
	}

	if Trace {
		elapsed := makeTimestampMilli() - startTime
		printTrace(fmt.Sprintf("milliseconds unmarshal: %d", elapsed))
	}

	return database
}
// printLanguages prints every language in the database together with its
// extensions, one language per line in the form "Name (ext1,ext2)". Names are
// ordered case-insensitively; names that differ only by case fall back to a
// plain string comparison so the output order is the same on every run.
func printLanguages() {
	database := loadDatabase()

	names := make([]string, 0, len(database))
	for name := range database {
		names = append(names, name)
	}

	sort.Slice(names, func(i, j int) bool {
		a, b := strings.ToLower(names[i]), strings.ToLower(names[j])
		if a != b {
			return a < b
		}
		// sort.Slice is not stable and the names come from a map, so break
		// ties explicitly.
		return names[i] < names[j]
	})

	for _, name := range names {
		fmt.Printf("%s (%s)\n", name, strings.Join(database[name].Extensions, ","))
	}
}
// Process is the main entry point of the command line; it sets everything up
// and starts running.
//
// When Languages is set it only prints the supported languages. Otherwise it
// loads the language constants, applies the flags and checks the target path.
// It then starts the walk, read and process stages as goroutines connected by
// buffered channels and hands the last channel to fileSummarize. The summary
// is printed, or written to FileOutput when that is set.
//
// Only the first entry of DirFilePaths is processed; "." is used when the
// list is empty. Problems with the target path or with writing the output
// file are reported on standard output and end the run.
func Process() {
	if Languages {
		printLanguages()
		return
	}

	ProcessConstants()
	processFlags()

	// Clean up any invalid arguments before setting everything up.
	if len(DirFilePaths) == 0 {
		DirFilePaths = append(DirFilePaths, ".")
	}

	fpath := filepath.Clean(DirFilePaths[0])
	if _, err := os.Stat(fpath); err != nil {
		if os.IsNotExist(err) {
			fmt.Println("file or directory does not exists: " + fpath)
		} else {
			// Any other stat failure (for example permission denied) also
			// means the path cannot be processed.
			fmt.Println("file or directory could not be read: " + fpath + ": " + err.Error())
		}
		return
	}

	SortBy = strings.ToLower(SortBy)

	if Debug {
		printDebug(fmt.Sprintf("NumCPU: %d", runtime.NumCPU()))
		printDebug(fmt.Sprintf("SortBy: %s", SortBy))
		printDebug(fmt.Sprintf("PathBlacklist: %v", PathBlacklist))
	}

	fileListQueue := make(chan *FileJob, FileListQueueSize)                     // Files ready to be read from disk
	fileReadContentJobQueue := make(chan *FileJob, FileReadContentJobQueueSize) // Files ready to be processed
	fileSummaryJobQueue := make(chan *FileJob, FileSummaryJobQueueSize)         // Files ready to be summarised

	go walkDirectoryParallel(fpath, fileListQueue)
	go fileReaderWorker(fileListQueue, fileReadContentJobQueue)
	go fileProcessorWorker(fileReadContentJobQueue, fileSummaryJobQueue)

	result := fileSummarize(fileSummaryJobQueue)

	if FileOutput == "" {
		fmt.Println(result)
		return
	}

	// Only claim success when the file was actually written.
	if err := ioutil.WriteFile(FileOutput, []byte(result), 0600); err != nil {
		fmt.Println("unable to write results to " + FileOutput + ": " + err.Error())
		return
	}
	fmt.Println("results written to " + FileOutput)
}