
Commit

Merge pull request #1 from boyter/master
merge updated code
foxdd committed Apr 18, 2021
2 parents c916c20 + e8a6442 commit a663333
Showing 15 changed files with 486 additions and 363 deletions.
18 changes: 18 additions & 0 deletions examples/issue246.py
@@ -0,0 +1,18 @@
#!/usr/bin/env python3

"""
Docstrings containing an apostrophe (') are handled incorrectly
The line above is counted as code despite being in the middle of a docstring.
The end of docstring flag seems to be changed to an apostrophe,
which means the next line will not exit the docstring.
"""
# Code containing single quotes will exit the docstring,
# but presuming the quotes are balanced the second
# quote will put us in string scanning mode.
if __name__ == '__main__':
    print('Hello, World!')
    # Not counted as a comment

    # ^ Not counted as a blank line
    # Break out of string scanner with unbalanced single quote: '
    exit(0)
14 changes: 12 additions & 2 deletions languages.json
@@ -6645,10 +6645,20 @@
"== "
],
"extensions": [
"vim"
"vim",
"vimrc",
"gvimrc"
],
"filenames": [
"_vimrc",
".vimrc",
"_gvimrc",
".gvimrc",
"vimrc",
"gvimrc"
],
"line_comment": [
"\\\""
"\"", "#"
],
"multi_line": [],
"quotes": [
37 changes: 37 additions & 0 deletions processor/bloom.go
@@ -0,0 +1,37 @@
package processor

import "math/rand"

var BloomTable [256]uint64

func init() {
for i := range BloomTable {
BloomTable[i] = BloomHash(byte(i))
}
}

func BloomHash(b byte) uint64 {
// Since our input is based on ASCII characters (and majority lower case
// characters) the values are not well distributed through the 0-255 byte
// range. math/rand gives us a way to generate a value with more well
// distributed randomness.
k := rand.New(rand.NewSource(int64(b))).Uint64()

// Mask to slice out a 0-63 value
var mask64 uint64 = 0b00111111

// For a bloom filter we only want a few bits set, but distributed
// through the 64 bit space.
// The logic here is to slice a value between 0 and 63 from k, and set a
// single bit in the output hash based on that.
// Setting three bits this way seems to give the best results. Fewer bits
// makes the hash not unique enough, more leads to overcrowding the bloom
// filter.
var hash uint64
for i := uint64(0); i < 3; i++ {
n := k >> (i*8) & mask64
hash |= 1 << n
}

return hash
}
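
The three-bits-per-byte construction is easy to see by printing a few hashes. The snippet below is a standalone sketch that duplicates the hash locally so it runs on its own; the byte values are arbitrary examples, not tokens taken from any particular language definition.

package main

import (
    "fmt"
    "math/bits"
    "math/rand"
)

// bloomHash mirrors BloomHash above: up to three pseudo-random bits set in a
// 64-bit word, seeded deterministically by the input byte.
func bloomHash(b byte) uint64 {
    k := rand.New(rand.NewSource(int64(b))).Uint64()
    var hash uint64
    for i := uint64(0); i < 3; i++ {
        hash |= 1 << (k >> (i * 8) & 0b00111111)
    }
    return hash
}

func main() {
    for _, b := range []byte{'/', '#', '"'} {
        // At most three bits end up set; fewer if two of the sliced values collide.
        fmt.Printf("%q -> %d bits set\n", b, bits.OnesCount64(bloomHash(b)))
    }
}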
2 changes: 1 addition & 1 deletion processor/constants.go

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions processor/file.go
@@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob {
Extension: extension,
PossibleLanguages: language,
Bytes: fileInfo.Size(),
EndPoint: int(fileInfo.Size() - 1),
}
} else if Verbose {
printWarn(fmt.Sprintf("skipping file unknown extension: %s", name))
18 changes: 9 additions & 9 deletions processor/processor.go
@@ -350,14 +350,14 @@ func processLanguageFeature(name string, value Language) {
stringTrie := &Trie{}
tokenTrie := &Trie{}

complexityMask := byte(0)
singleLineCommentMask := byte(0)
multiLineCommentMask := byte(0)
stringMask := byte(0)
processMask := byte(0)
var complexityMask uint64
var singleLineCommentMask uint64
var multiLineCommentMask uint64
var stringMask uint64
var processMask uint64

for _, v := range value.ComplexityChecks {
complexityMask |= v[0]
complexityMask |= BloomTable[v[0]]
complexityTrie.Insert(TComplexity, []byte(v))
if !Complexity {
tokenTrie.Insert(TComplexity, []byte(v))
@@ -368,21 +368,21 @@ func processLanguageFeature(name string, value Language) {
}

for _, v := range value.LineComment {
singleLineCommentMask |= v[0]
singleLineCommentMask |= BloomTable[v[0]]
slCommentTrie.Insert(TSlcomment, []byte(v))
tokenTrie.Insert(TSlcomment, []byte(v))
}
processMask |= singleLineCommentMask

for _, v := range value.MultiLine {
multiLineCommentMask |= v[0][0]
multiLineCommentMask |= BloomTable[v[0][0]]
mlCommentTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
tokenTrie.InsertClose(TMlcomment, []byte(v[0]), []byte(v[1]))
}
processMask |= multiLineCommentMask

for _, v := range value.Quotes {
stringMask |= v.Start[0]
stringMask |= BloomTable[v.Start[0]]
stringTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
tokenTrie.InsertClose(TString, []byte(v.Start), []byte(v.End))
}
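
How these masks are meant to be consumed is worth spelling out: the bloom hashes of every token's first byte are ORed together, and a byte can then be skipped outright whenever its own hash is not fully contained in the mask. The helper below is illustrative only and assumes the BloomTable from processor/bloom.go above; scc's actual check is the shouldProcess call used by the new state files further down, which is not part of this diff.

// Illustrative sketch, not scc's shouldProcess.
func couldStartToken(b byte, processMask uint64) bool {
    h := BloomTable[b]
    return processMask&h == h
}

For a hypothetical language whose tokens begin with '/', '#' and '"', processMask would be BloomTable['/'] | BloomTable['#'] | BloomTable['"']; a byte such as 'x' will almost always fail the containment test and be skipped without touching the tries, while a false positive merely costs one wasted trie lookup.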
60 changes: 60 additions & 0 deletions processor/state_blank.go
@@ -0,0 +1,60 @@
package processor

type StateBlank struct {}

func (state *StateBlank) String() string {
return "blank"
}

func (state *StateBlank) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[index:]); tokenType {
case TMlcomment:
commentType := lineType
if commentType == LINE_BLANK {
commentType = LINE_COMMENT
}

index += offsetJump - 1
return index, commentType, NewStateCommentMulti(endString)

case TSlcomment:
commentType := lineType
if commentType == LINE_BLANK {
commentType = LINE_COMMENT
}
return index, commentType, &StateCommentSingle{}

case TString:
index, docString, skipEsc := verifyIgnoreEscape(lang, job, index)

if docString {
commentType := lineType
if commentType == LINE_BLANK {
commentType = LINE_COMMENT
}

return index, commentType, &StateDocString{
End: endString,
SkipEsc: skipEsc,
}
}

return index, LINE_CODE, &StateString{
End: endString,
SkipEsc: skipEsc,
}

case TComplexity:
if index == 0 || isWhitespace(job.Content[index-1]) {
job.Complexity++
}
return index, LINE_BLANK, state

default:
return index, LINE_CODE, &StateCode{}
}
}

func (state *StateBlank) Reset() (LineType, State) {
return LINE_BLANK, state
}
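
Each of the new state types implements the same three methods, so they presumably satisfy a shared State interface whose definition is not among the hunks shown here. Inferred from the method signatures, it would look roughly like this:

// Inferred from the methods above and below, not copied from the diff.
type State interface {
    String() string
    Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State)
    Reset() (LineType, State)
}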
92 changes: 92 additions & 0 deletions processor/state_code.go
@@ -0,0 +1,92 @@
package processor

type StateCode struct {}

func (state *StateCode) String() string {
return "code"
}

func (state *StateCode) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
// Hacky fix to https://github.com/boyter/scc/issues/181
endPoint := job.EndPoint
if endPoint > len(job.Content) {
endPoint--
}

var i int
for i = index; i < endPoint; i++ {
curByte := job.Content[i]

if curByte == '\n' {
return i, LINE_CODE, state
}

if isBinary(i, curByte) {
job.Binary = true
return i, LINE_CODE, state
}

if shouldProcess(curByte, lang.ProcessMask) {
if Duplicates {
// Technically this is wrong because we skip bytes so this is not a true
// hash of the file contents, but for duplicate files it shouldn't matter
// as both will skip the same way
digestible := []byte{job.Content[index]}
job.Hash.Write(digestible)
}

switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[i:]); tokenType {
case TString:
// If we are in string state then check what sort of string so we know if docstring OR ignoreescape string

// It is safe to -1 here as to enter the code state we need to have
// transitioned from blank to here hence i should always be >= 1
// This check is to ensure we aren't in a character declaration
// TODO this should use language features
if job.Content[i-1] == '\\' {
break // from switch, not from the loop
}

i, docString, skipEsc := verifyIgnoreEscape(lang, job, i)

if docString {
commentType := lineType
if commentType == LINE_BLANK {
commentType = LINE_COMMENT
}

return i, commentType, &StateDocString{
End: endString,
SkipEsc: skipEsc,
}
}

// i += offsetJump - 1
return i, LINE_CODE, &StateString{
End: endString,
SkipEsc: skipEsc,
}

case TSlcomment:
i += offsetJump - 1
return i, LINE_CODE, &StateCommentSingle{}

case TMlcomment:
i += offsetJump - 1

return i, LINE_CODE, NewStateCommentMulti(endString)

case TComplexity:
if i == 0 || isWhitespace(job.Content[i-1]) {
job.Complexity++
}
}
}
}

return i, LINE_CODE, state
}

func (state *StateCode) Reset() (LineType, State) {
return LINE_BLANK, &StateBlank{}
}
70 changes: 70 additions & 0 deletions processor/state_comment_multi.go
@@ -0,0 +1,70 @@
package processor

type StateCommentMulti struct {
Stack [][]byte
}

func (state *StateCommentMulti) String() string {
return "multiline-comment"
}

func NewStateCommentMulti(token []byte) *StateCommentMulti {
return &StateCommentMulti{
Stack: [][]byte{token},
}
}

func (state *StateCommentMulti) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
var i int
for i = index; i < job.EndPoint; i++ {
curByte := job.Content[i]

if curByte == '\n' {
break
}

endToken := state.peek()
if checkForMatchSingle(curByte, i, job.EndPoint, endToken, job) {
// set offset jump here
i += len(endToken) - 1

if len(state.Stack) == 1 {
return i, lineType, &StateBlank{}
} else {
state.pop()
return i, lineType, state
}
}

// Check if we are entering another multiline comment
// This should come below check for match single as it speeds up processing
if lang.Nested {
if ok, offsetJump, endString := lang.MultiLineComments.Match(job.Content[i:]); ok != 0 {
i += offsetJump - 1
state.push(endString)
return i, lineType, state
}
}
}

return i, lineType, state
}

func (state *StateCommentMulti) Reset() (LineType, State) {
return LINE_COMMENT, state
}

func (state *StateCommentMulti) peek() []byte {
i := len(state.Stack) - 1
return state.Stack[i]
}

func (state *StateCommentMulti) push(token []byte) {
state.Stack = append(state.Stack, token)
}

func (state *StateCommentMulti) pop() {
i := len(state.Stack) - 1

state.Stack = state.Stack[:i]
}
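
To see the stack in action, consider a hypothetical language with Nested enabled and block comments delimited by (* and *):

// Input:           (* a (* b *) c *)
// enter comment:   NewStateCommentMulti("*)")   Stack: ["*)"]
// at inner "(*":   push("*)")                   Stack: ["*)", "*)"]
// at first "*)":   len(Stack) > 1, so pop()     Stack: ["*)"]
// at final "*)":   len(Stack) == 1, so return to &StateBlank{}

The outer end token stays on the stack until every nested opener has been matched, which is exactly what the peek, push and pop helpers above provide.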
24 changes: 24 additions & 0 deletions processor/state_comment_single.go
@@ -0,0 +1,24 @@
package processor

type StateCommentSingle struct {}

func (state *StateCommentSingle) String() string {
return "comment"
}

func (state *StateCommentSingle) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
var i int
for i = index; i < job.EndPoint; i++ {
curByte := job.Content[i]

if curByte == '\n' {
break
}
}

return i, lineType, state
}

func (state *StateCommentSingle) Reset() (LineType, State) {
return LINE_BLANK, &StateBlank{}
}
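
Taken together, the states form a per-byte machine. The loop below is a toy driver, not scc's real counting loop (which is not part of this diff and also handles BOM skipping, line tallies and the various flags); it only sketches how Process and Reset appear intended to combine: Process consumes bytes until the state changes or a newline is reached, and Reset records the finished line's classification and picks the state the next line starts in.

// Toy driver only; FileJob, LanguageFeature, State and the LINE_* constants
// come from this package, everything else here is a sketch.
func walkSketch(job *FileJob, lang *LanguageFeature) {
    var state State = &StateBlank{}
    var lineType LineType = LINE_BLANK

    for index := 0; index < job.EndPoint; index++ {
        index, lineType, state = state.Process(job, lang, index, lineType)

        if index >= len(job.Content) || job.Content[index] == '\n' {
            // A real driver would tally lineType here (code / comment / blank).
            lineType, state = state.Reset()
        }
    }
}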
