
Commit

Merge pull request #1 from boyter/master
merge updated code
foxdd committed Apr 18, 2021
2 parents c916c20 + e8a6442 commit a663333
Showing 15 changed files with 486 additions and 363 deletions.
18 changes: 18 additions & 0 deletions examples/issue246.py
@@ -0,0 +1,18 @@
#!/usr/bin/env python3

"""
Docstrings containing an apostrophe (') are handled incorrectly
The line above is counted as code despite being in the middle of a docstring.
The end of docstring flag seems to be changed to an apostrophe,
which means the next line will not exit the docstring.
"""
# Code containing single quotes will exit the docstring,
# but presuming the quotes are balanced the second
# quote will put us in string scanning mode.
if __name__ == '__main__':
    print('Hello, World!')
    # Not counted as a comment

    # ^ Not counted as a blank line
    # Break out of string scanner with unbalanced single quote: '
    exit(0)
14 changes: 12 additions & 2 deletions languages.json
@@ -6645,10 +6645,20 @@
"== "
],
"extensions": [
"vim"
"vim",
"vimrc",
"gvimrc"
],
"filenames": [
"_vimrc",
".vimrc",
"_gvimrc",
".gvimrc",
"vimrc",
"gvimrc"
],
"line_comment": [
"\\\""
"\"", "#"
],
"multi_line": [],
"quotes": [
37 changes: 37 additions & 0 deletions processor/bloom.go
@@ -0,0 +1,37 @@
package processor

import "math/rand"

var BloomTable [256]uint64

func init() {
for i := range BloomTable {
BloomTable[i] = BloomHash(byte(i))
}
}

func BloomHash(b byte) uint64 {
// Since our input is based on ASCII characters (and majority lower case
// characters) the values are not well distributed through the 0-255 byte
// range. math/rand gives us a way to generate a value with more well
// distributed randomness.
k := rand.New(rand.NewSource(int64(b))).Uint64()

// Mask to slice out a 0-63 value
var mask64 uint64 = 0b00111111

// For a bloom filter we only want a few bits set, but distributed
// through the 64 bit space.
// The logic here is to slice a value between 0 and 63 from k, and set a
// single bit in the output hash based on that.
// Setting three bits this way seems to give the best results. Fewer bits
// makes the hash not unique enough, more leads to overcrowding the bloom
// filter.
var hash uint64
for i := uint64(0); i < 3; i++ {
n := k >> (i*8) & mask64
hash |= 1 << n
}

return hash
}
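
The three-bits-per-byte construction is easy to see by printing a few hashes. The snippet below is a standalone sketch that duplicates the hash locally so it runs on its own; the byte values are arbitrary examples, not tokens taken from any particular language definition.

package main

import (
    "fmt"
    "math/bits"
    "math/rand"
)

// bloomHash mirrors BloomHash above: up to three pseudo-random bits set in a
// 64-bit word, seeded deterministically by the input byte.
func bloomHash(b byte) uint64 {
    k := rand.New(rand.NewSource(int64(b))).Uint64()
    var hash uint64
    for i := uint64(0); i < 3; i++ {
        hash |= 1 << (k >> (i * 8) & 0b00111111)
    }
    return hash
}

func main() {
    for _, b := range []byte{'/', '#', '"'} {
        // At most three bits end up set; fewer if two of the sliced values collide.
        fmt.Printf("%q -> %d bits set\n", b, bits.OnesCount64(bloomHash(b)))
    }
}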
2 changes: 1 addition & 1 deletion processor/constants.go

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions processor/file.go
@@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob {
Extension: extension,
PossibleLanguages: language,
Bytes: fileInfo.Size(),
EndPoint: int(fileInfo.Size() - 1),
}
} else if Verbose {
printWarn(fmt.Sprintf("skipping file unknown extension: %s", name))
18 changes: 9 additions & 9 deletions processor/processor.go
@@ -350,14 +350,14 @@ func processLanguageFeature(name string, value Language) {
stringTrie := &Trie{}
tokenTrie := &Trie{}

complexityMask := byte(0)
singleLineCommentMask := byte(0)
multiLineCommentMask := byte(0)
stringMask := byte(0)
processMask := byte(0)
var complexityMask uint64
var singleLineCommentMask uint64
var multiLineCommentMask uint64
var stringMask uint64
var processMask uint64

for _, v := range value.ComplexityChecks {
complexityMask |= v[0]
complexityMask |= BloomTable[v[0]]
complexityTrie.Insert(TComplexity, []byte(v))
if !Complexity {
tokenTrie.Insert(TComplexity, []byte(v))
@@ -368,21 +368,21 @@ func processLanguageFeature(name string, value Language) {
}

for _, v := range value.LineComment {
singleLineCommentMask |= v[0]
singleLineCommentMask |= BloomTable[v[0]]
slCommentTrie.Insert(TSlcomment, []byte(v))
tokenTrie.Insert(TSlcomment, []byte(v))
}
processMask |= singleLineCommentMask

for _, v := range value.MultiLine {
multiLineCommentMask |= v[0][0]
multiLineCommentMask |= BloomTable[v[0][0]]
mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
}
processMask |= multiLineCommentMask

for _, v := range value.Quotes {
stringMask |= v.Start[0]
stringMask |= BloomTable[v.Start[0]]
stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
}
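
How these masks are meant to be consumed is worth spelling out: the bloom hashes of every token's first byte are ORed together, and a byte can then be skipped outright whenever its own hash is not fully contained in the mask. The helper below is illustrative only and assumes the BloomTable from processor/bloom.go above; scc's actual check is the shouldProcess call used by the new state files further down, which is not part of this diff.

// Illustrative sketch, not scc's shouldProcess.
func couldStartToken(b byte, processMask uint64) bool {
    h := BloomTable[b]
    return processMask&h == h
}

For a hypothetical language whose tokens begin with '/', '#' and '"', processMask would be BloomTable['/'] | BloomTable['#'] | BloomTable['"']; a byte such as 'x' will almost always fail the containment test and be skipped without touching the tries, while a false positive merely costs one wasted trie lookup.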
60 changes: 60 additions & 0 deletions processor/state_blank.go
@@ -0,0 +1,60 @@
package processor

type StateBlank struct {}

func (state *StateBlank) String() string {
return "blank"
}

func (state *StateBlank) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[index:]); tokenType {
case TMlcomment:
commentType := lineType
if commentType == LINE_BLANK {
commentType = LINE_COMMENT
}

index += offsetJump - 1
return index, commentType, NewStateCommentMulti(endString)

case TSlcomment:
commentType := lineType
if commentType == LINE_BLANK {
commentType = LINE_COMMENT
}
return index, commentType, &StateCommentSingle{}

case TString:
index, docString, skipEsc := verifyIgnoreEscape(lang, job, index)

if docString {
commentType := lineType
if commentType == LINE_BLANK {
commentType = LINE_COMMENT
}

return index, commentType, &StateDocString{
End: endString,
SkipEsc: skipEsc,
}
}

return index, LINE_CODE, &StateString{
End: endString,
SkipEsc: skipEsc,
}

case TComplexity:
if index == 0 || isWhitespace(job.Content[index-1]) {
job.Complexity++
}
return index, LINE_BLANK, state

default:
return index, LINE_CODE, &StateCode{}
}
}

func (state *StateBlank) Reset() (LineType, State) {
return LINE_BLANK, state
}
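
Each of the new state types implements the same three methods, so they presumably satisfy a shared State interface whose definition is not among the hunks shown here. Inferred from the method signatures, it would look roughly like this:

// Inferred from the methods above and below, not copied from the diff.
type State interface {
    String() string
    Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State)
    Reset() (LineType, State)
}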
92 changes: 92 additions & 0 deletions processor/state_code.go
@@ -0,0 +1,92 @@
package processor

type StateCode struct {}

func (state *StateCode) String() string {
return "code"
}

func (state *StateCode) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
// Hacky fix to https://github.com/boyter/scc/issues/181
endPoint := job.EndPoint
if endPoint > len(job.Content) {
endPoint--
}

var i int
for i = index; i < endPoint; i++ {
curByte := job.Content[i]

if curByte == '\n' {
return i, LINE_CODE, state
}

if isBinary(i, curByte) {
job.Binary = true
return i, LINE_CODE, state
}

if shouldProcess(curByte, lang.ProcessMask) {
if Duplicates {
// Technically this is wrong because we skip bytes so this is not a true
// hash of the file contents, but for duplicate files it shouldn't matter
// as both will skip the same way
digestible := []byte{job.Content[index]}
job.Hash.Write(digestible)
}

switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[i:]); tokenType {
case TString:
// If we are in string state then check what sort of string so we know if docstring OR ignoreescape string

// It is safe to -1 here as to enter the code state we need to have
// transitioned from blank to here hence i should always be >= 1
// This check is to ensure we aren't in a character declaration
// TODO this should use language features
if job.Content[i-1] == '\\' {
break // from switch, not from the loop
}

i, docString, skipEsc := verifyIgnoreEscape(lang, job, i)

if docString {
commentType := lineType
if commentType == LINE_BLANK {
commentType = LINE_COMMENT
}

return i, commentType, &StateDocString{
End: endString,
SkipEsc: skipEsc,
}
}

// i += offsetJump - 1
return i, LINE_CODE, &StateString{
End: endString,
SkipEsc: skipEsc,
}

case TSlcomment:
i += offsetJump - 1
return i, LINE_CODE, &StateCommentSingle{}

case TMlcomment:
i += offsetJump - 1

return i, LINE_CODE, NewStateCommentMulti(endString)

case TComplexity:
if i == 0 || isWhitespace(job.Content[i-1]) {
job.Complexity++
}
}
}
}

return i, LINE_CODE, state
}

func (state *StateCode) Reset() (LineType, State) {
return LINE_BLANK, &StateBlank{}
}
70 changes: 70 additions & 0 deletions processor/state_comment_multi.go
@@ -0,0 +1,70 @@
package processor

type StateCommentMulti struct {
Stack [][]byte
}

func (state *StateCommentMulti) String() string {
return "multiline-comment"
}

func NewStateCommentMulti(token []byte) *StateCommentMulti {
return &StateCommentMulti{
Stack: [][]byte{token},
}
}

func (state *StateCommentMulti) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
var i int
for i = index; i < job.EndPoint; i++ {
curByte := job.Content[i]

if curByte == '\n' {
break
}

endToken := state.peek()
if checkForMatchSingle(curByte, i, job.EndPoint, endToken, job) {
// set offset jump here
i += len(endToken) - 1

if len(state.Stack) == 1 {
return i, lineType, &StateBlank{}
} else {
state.pop()
return i, lineType, state
}
}

// Check if we are entering another multiline comment
// This should come below check for match single as it speeds up processing
if lang.Nested {
if ok, offsetJump, endString := lang.MultiLineComments.Match(job.Content[i:]); ok != 0 {
i += offsetJump - 1
state.push(endString)
return i, lineType, state
}
}
}

return i, lineType, state
}

func (state *StateCommentMulti) Reset() (LineType, State) {
return LINE_COMMENT, state
}

func (state *StateCommentMulti) peek() []byte {
i := len(state.Stack) - 1
return state.Stack[i]
}

func (state *StateCommentMulti) push(token []byte) {
state.Stack = append(state.Stack, token)
}

func (state *StateCommentMulti) pop() {
i := len(state.Stack) - 1

state.Stack = state.Stack[:i]
}
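
To see the stack in action, consider a hypothetical language with Nested enabled and block comments delimited by (* and *):

// Input:           (* a (* b *) c *)
// enter comment:   NewStateCommentMulti("*)")   Stack: ["*)"]
// at inner "(*":   push("*)")                   Stack: ["*)", "*)"]
// at first "*)":   len(Stack) > 1, so pop()     Stack: ["*)"]
// at final "*)":   len(Stack) == 1, so return to &StateBlank{}

The outer end token stays on the stack until every nested opener has been matched, which is exactly what the peek, push and pop helpers above provide.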
24 changes: 24 additions & 0 deletions processor/state_comment_single.go
@@ -0,0 +1,24 @@
package processor

type StateCommentSingle struct {}

func (state *StateCommentSingle) String() string {
return "comment"
}

func (state *StateCommentSingle) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
var i int
for i = index; i < job.EndPoint; i++ {
curByte := job.Content[i]

if curByte == '\n' {
break
}
}

return i, lineType, state
}

func (state *StateCommentSingle) Reset() (LineType, State) {
return LINE_BLANK, &StateBlank{}
}
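
Taken together, the states form a per-byte machine. The loop below is a toy driver, not scc's real counting loop (which is not part of this diff and also handles BOM skipping, line tallies and the various flags); it only sketches how Process and Reset appear intended to combine: Process consumes bytes until the state changes or a newline is reached, and Reset records the finished line's classification and picks the state the next line starts in.

// Toy driver only; FileJob, LanguageFeature, State and the LINE_* constants
// come from this package, everything else here is a sketch.
func walkSketch(job *FileJob, lang *LanguageFeature) {
    var state State = &StateBlank{}
    var lineType LineType = LINE_BLANK

    for index := 0; index < job.EndPoint; index++ {
        index, lineType, state = state.Process(job, lang, index, lineType)

        if index >= len(job.Content) || job.Content[index] == '\n' {
            // A real driver would tally lineType here (code / comment / blank).
            lineType, state = state.Reset()
        }
    }
}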
