/
detector.go
238 lines (193 loc) · 6.09 KB
/
detector.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
// SPDX-License-Identifier: MIT OR Unlicense
package processor
import (
"errors"
"fmt"
"sort"
"strings"
)
// DetectLanguage works out the candidate languages for a file from its name
// alone. It returns the list of possible languages together with the string
// that was matched: the name itself for extensionless files, the extension
// for extension matches, or an empty string when the whole name matched an
// entry in ExtensionToLanguage. No error is returned; an unknown file yields
// whatever ExtensionToLanguage holds for a missing key.
func DetectLanguage(name string) ([]string, string) {
	dots := strings.Count(name, ".")

	// A name without any dot, or a dotfile whose only dot is the leading one,
	// has no usable extension. Resolve it by full name (or flag it as a
	// possible #! script). This shortcut only applies when no extension
	// allow list has been configured.
	noExtension := dots == 0 || (name[0] == '.' && dots == 1)
	if noExtension && len(AllowListExtensions) == 0 {
		return checkFullName(name)
	}

	// The complete lower-cased name may itself be a key, so try it first.
	if languages, found := ExtensionToLanguage[strings.ToLower(name)]; found {
		return languages, ""
	}

	// Otherwise fall back to the extension reported by getExtension.
	ext := getExtension(name)
	if languages, found := ExtensionToLanguage[ext]; found {
		return languages, ext
	}

	// Compound extensions such as d.ts get one more reduction (d.ts -> ts)
	// before giving up; the result of this last lookup is returned as is.
	ext = getExtension(ext)
	return ExtensionToLanguage[ext], ext
}
// checkFullName resolves a file that has no usable extension. The lower-cased
// name is looked up in FilenameToLanguage; on a hit that single language is
// returned. Otherwise the file is marked with the SheBang placeholder so its
// content can be inspected for a #! line later. The second return value is
// always the name exactly as it was passed in.
func checkFullName(name string) ([]string, string) {
	if language, known := FilenameToLanguage[strings.ToLower(name)]; known {
		return []string{language}, name
	}

	// Nothing matched, so the best remaining guess is a script with a #!.
	if Verbose {
		printWarn(fmt.Sprintf("possible #! file: %s", name))
	}

	return []string{SheBang}, name
}
// DetectSheBang inspects the start of content for a #! line and maps the
// interpreter it names to a language via ShebangLookup. It returns the
// language key on success. An error is returned when content does not begin
// with #!, when no command can be extracted from the first line, or when the
// command is not listed in ShebangLookup.
func DetectSheBang(content string) (string, error) {
	if !strings.HasPrefix(content, "#!") {
		return "", errors.New("Missing #!")
	}

	// Only the first line matters; without a newline the whole input is used.
	firstLine := content
	if end := strings.IndexByte(content, '\n'); end >= 0 {
		firstLine = content[:end]
	}

	command, err := scanForSheBang([]byte(firstLine))
	if err != nil {
		return "", err
	}

	// The extracted command is the same whether the line used a full path
	// or went through env, so a plain equality check covers both forms.
	for language, commands := range ShebangLookup {
		for _, candidate := range commands {
			if candidate == command {
				return language, nil
			}
		}
	}

	return "", errors.New("Unknown #!")
}
// scanForSheBang extracts the interpreter command from a #! line.
//
// The input is walked once with a small state machine:
//
//	0: skip everything up to the first '/'
//	1: inside the path; remember the last '/' until whitespace or end of
//	   input, giving the first candidate (the path's final element)
//	2: skip whitespace after the first candidate
//	3: inside the second word, which ends at whitespace or end of input
//
// If the first candidate is "env" the second word is returned (this may be
// empty, with a nil error, when nothing follows env). Otherwise the first
// candidate is returned. An error is returned when no first candidate could
// be found, for example when the line contains no '/'.
func scanForSheBang(content []byte) (string, error) {
	state := 0
	lastSlash := 0
	wordStart := 0
	last := len(content) - 1

	candidate1 := ""
	candidate2 := ""

scan:
	for i := range content {
		switch state {
		case 0: // Deals with whitespace after #! and before first /
			if content[i] == '/' {
				lastSlash = i
				state = 1
			}
		case 1: // Once we found the first / keep going till we hit whitespace
			if content[i] == '/' {
				lastSlash = i
			}

			// when at the end pull out the candidate
			if i == last {
				candidate1 = string(content[lastSlash+1 : i+1])
			}

			// between last slash and here is the first candidate which is
			// either env or perl/php/python etc..
			if isWhitespace(content[i]) {
				candidate1 = string(content[lastSlash+1 : i])
				state = 2
			}
		case 2: // We have the first candidate, see if there is another
			if !isWhitespace(content[i]) {
				wordStart = i
				state = 3

				// A one-character word sitting on the final byte never gets
				// another iteration in state 3, so capture it right here.
				if i == last {
					candidate2 = string(content[i : i+1])
				}
			}
		case 3:
			if i == last {
				candidate2 = string(content[wordStart : i+1])
			}

			if isWhitespace(content[i]) {
				candidate2 = string(content[wordStart:i])
				// Both candidates are known; a bare break would only leave
				// the switch, so leave the loop explicitly.
				break scan
			}
		}
	}

	switch {
	case candidate1 == "env":
		return candidate2, nil
	case candidate1 != "":
		return candidate1, nil
	}

	return "", errors.New("Unable to determine #! command")
}
// languageGuess pairs a candidate language with a score so that candidates
// can be ranked against each other when a file maps to several languages.
type languageGuess struct {
// Name is the candidate language's name.
Name string
// Count is the number of that language's keywords found in the content.
Count int
}
// DetermineLanguage picks a single language for a file given its candidate
// languages and content. If multiple candidates are possible it guesses based
// on keywords, similar to how https://github.com/vmchale/polyglot does.
//
// Parameters:
//   - filename: used only in verbose/trace messages.
//   - fallbackLanguage: returned when possibleLanguages is empty, which can
//     happen when called through an API where the language is already set.
//   - possibleLanguages: the candidates; a single candidate is returned as is.
//   - content: the file content; only its first 20,000 bytes are inspected.
//
// With two or more candidates each one is scored by how many of its
// LanguageFeatures keywords occur in the inspected content. The highest score
// wins, with ties broken by name in ascending order. A candidate that has no
// keywords at all is treated as the primary: it is returned unless the best
// score is at least 3.
func DetermineLanguage(filename string, fallbackLanguage string, possibleLanguages []string, content []byte) string {
	// If being called through an API its possible nothing is set here and as
	// such should just return as the Language value should have already been set
	if len(possibleLanguages) == 0 {
		return fallbackLanguage
	}

	// A single possibility needs no guessing.
	if len(possibleLanguages) == 1 {
		return possibleLanguages[0]
	}

	startTime := makeTimestampNano()

	// Truncate the bytes before converting so that only the inspected prefix
	// is copied into a string rather than the entire file.
	sample := content
	if len(sample) > 20_000 {
		sample = sample[:20_000]
	}
	toCheck := string(sample)

	primary := ""
	toSort := make([]languageGuess, 0, len(possibleLanguages))

	for _, lan := range possibleLanguages {
		LanguageFeaturesMutex.Lock()
		langFeatures := LanguageFeatures[lan]
		LanguageFeaturesMutex.Unlock()

		count := 0
		for _, key := range langFeatures.Keywords {
			if strings.Contains(toCheck, key) {
				count++
			}
		}

		// if no features are found that means that this one is considered the primary
		// and as such the default fallback if we don't find a suitable number of matching
		// keywords
		// consider YAML files for example, where cloudformation files can also be YAML
		// YAML can have any form so its not possible to say "this is a yaml file"
		// so we can only say "this is likely to be a cloudformation file", and as such
		// we need to handle a fallback case, which in this case is nothing
		if len(langFeatures.Keywords) == 0 {
			primary = lan
		}

		toSort = append(toSort, languageGuess{Name: lan, Count: count})
	}

	sort.Slice(toSort, func(i, j int) bool {
		if toSort[i].Count == toSort[j].Count {
			return toSort[i].Name < toSort[j].Name
		}
		return toSort[i].Count > toSort[j].Count
	})

	// possibleLanguages has at least two entries here, so toSort is never
	// empty and indexing the first element is safe.
	best := toSort[0]

	// With a primary present we want 3 or more matches before anything else
	// is allowed to override it.
	if primary != "" && best.Count < 3 {
		return primary
	}

	if Verbose {
		printWarn(fmt.Sprintf("guessing language %s for file %s", best.Name, filename))
	}

	if Trace {
		printTrace(fmt.Sprintf("nanoseconds to guess language: %s: %d", filename, makeTimestampNano()-startTime))
	}

	return best.Name
}