Skip to content

Commit

Permalink
Refactor CountStats state machine
Browse files Browse the repository at this point in the history
  • Loading branch information
dbaggerman committed Mar 28, 2021
1 parent d98709e commit 9770aba
Show file tree
Hide file tree
Showing 11 changed files with 419 additions and 344 deletions.
18 changes: 18 additions & 0 deletions examples/issue246.py
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# NOTE(review): this file is a counting fixture (examples/issue246.py) — its
# comment/blank/code line tallies are themselves the expected test output, so
# any line added or removed here changes what the counter must report.

"""
Docstrings containing an apostrophe (') are handled incorrectly
The line above is counted as code despite being in the middle of a docstring.
The end of docstring flag seems to be changed to an apostrophe,
which means the next line will not exit the docstring.
"""
# Code containing single quotes will exit the docstring,
# but presuming the quotes are balanced the second
# quote will put us in string scanning mode.
if __name__ == '__main__':
print('Hello, World!')
# Not counted as a comment

# ^ Not counted as a blank line
# Break out of string scanner with unbalanced single quote: '
exit(0)
1 change: 1 addition & 0 deletions processor/file.go
Expand Up @@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob {
Extension: extension,
PossibleLanguages: language,
Bytes: fileInfo.Size(),
EndPoint: int(fileInfo.Size() - 1),
}
} else if Verbose {
printWarn(fmt.Sprintf("skipping file unknown extension: %s", name))
Expand Down
60 changes: 60 additions & 0 deletions processor/state_blank.go
@@ -0,0 +1,60 @@
package processor

// StateBlank is the state-machine state for a line that has so far held no
// classified content; it decides which state the next token transitions to.
type StateBlank struct{}

// String names this state for debug output.
func (state *StateBlank) String() string {
	const name = "blank"
	return name
}

// Process inspects the token at index while nothing has yet been seen on the
// current line, and transitions to the matching state: multi-line comment,
// single-line comment, doc string, string, or (by default) code. It returns
// the updated index, the line classification so far, and the next state.
func (state *StateBlank) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[index:]); tokenType {
	case TMlcomment:
		// A comment opening on an otherwise-blank line makes the line a comment.
		commentType := lineType
		if commentType == LINE_BLANK {
			commentType = LINE_COMMENT
		}

		// Skip over the opening token. NOTE(review): the -1 presumably
		// compensates for an increment in the caller's loop — confirm.
		index += offsetJump - 1
		return index, commentType, NewStateCommentMulti(endString)

	case TSlcomment:
		commentType := lineType
		if commentType == LINE_BLANK {
			commentType = LINE_COMMENT
		}
		return index, commentType, &StateCommentSingle{}

	case TString:
		// Distinguish doc strings from plain strings; note := shadows the
		// index parameter for the remainder of this case.
		index, docString, skipEsc := verifyIgnoreEscape(lang, job, index)

		if docString {
			// A doc string opening on a blank line counts as a comment.
			commentType := lineType
			if commentType == LINE_BLANK {
				commentType = LINE_COMMENT
			}

			return index, commentType, &StateDocString{
				End: endString,
				SkipEsc: skipEsc,
			}
		}

		return index, LINE_CODE, &StateString{
			End: endString,
			SkipEsc: skipEsc,
		}

	case TComplexity:
		// Complexity tokens count only at file start or after whitespace,
		// so tokens embedded in identifiers are ignored.
		if index == 0 || isWhitespace(job.Content[index-1]) {
			job.Complexity++
		}
		return index, LINE_BLANK, state

	default:
		// Anything else means ordinary code.
		return index, LINE_CODE, &StateCode{}
	}
}

// Reset starts the next line: a blank state carries over unchanged.
func (state *StateBlank) Reset() (LineType, State) {
	next := State(state)
	return LINE_BLANK, next
}
92 changes: 92 additions & 0 deletions processor/state_code.go
@@ -0,0 +1,92 @@
package processor

// StateCode is the state-machine state used while scanning ordinary source
// code on the current line.
type StateCode struct{}

// String names this state for debug output.
func (state *StateCode) String() string {
	const name = "code"
	return name
}

// Process scans job.Content from index while in the code state. It returns
// at the next newline (so the line can be tallied as code), flags binary
// content, feeds bytes into the duplicate-detection hash, and on a token
// match transitions to the string, doc-string, or comment state. It returns
// the updated index, the line classification, and the next state.
func (state *StateCode) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	// Hacky fix to https://github.com/boyter/scc/issues/181
	endPoint := job.EndPoint
	if endPoint > len(job.Content) {
		endPoint--
	}

	var i int
	for i = index; i < endPoint; i++ {
		curByte := job.Content[i]

		// End of line: report code and stay in this state for the next line.
		if curByte == '\n' {
			return i, LINE_CODE, state
		}

		if isBinary(i, curByte) {
			job.Binary = true
			return i, LINE_CODE, state
		}

		if shouldProcess(curByte, lang.ProcessMask) {
			if Duplicates {
				// Technically this is wrong because we skip bytes so this is not a true
				// hash of the file contents, but for duplicate files it shouldn't matter
				// as both will skip the same way.
				// FIX: hash the byte under the cursor (curByte); the previous
				// code hashed job.Content[index], feeding the same starting
				// byte into the digest on every iteration of this loop.
				digestible := []byte{curByte}
				job.Hash.Write(digestible)
			}

			switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[i:]); tokenType {
			case TString:
				// If we are in string state then check what sort of string so we know if docstring OR ignoreescape string

				// It is safe to -1 here as to enter the code state we need to have
				// transitioned from blank to here hence i should always be >= 1
				// This check is to ensure we aren't in a character declaration
				// TODO this should use language features
				if job.Content[i-1] == '\\' {
					break // from switch, not from the loop
				}

				// Note := shadows i for the remainder of this case.
				i, docString, skipEsc := verifyIgnoreEscape(lang, job, i)

				if docString {
					// A doc string opening keeps an otherwise-blank line
					// counted as a comment rather than code.
					commentType := lineType
					if commentType == LINE_BLANK {
						commentType = LINE_COMMENT
					}

					return i, commentType, &StateDocString{
						End: endString,
						SkipEsc: skipEsc,
					}
				}

				return i, LINE_CODE, &StateString{
					End: endString,
					SkipEsc: skipEsc,
				}

			case TSlcomment:
				// Skip the opening token; the single-comment state eats the rest.
				i += offsetJump - 1
				return i, LINE_CODE, &StateCommentSingle{}

			case TMlcomment:
				i += offsetJump - 1

				return i, LINE_CODE, NewStateCommentMulti(endString)

			case TComplexity:
				// Complexity tokens count only at file start or after
				// whitespace, so tokens inside identifiers are ignored.
				if i == 0 || isWhitespace(job.Content[i-1]) {
					job.Complexity++
				}
			}
		}
	}

	return i, LINE_CODE, state
}

// Reset starts the next line fresh in the blank state.
func (state *StateCode) Reset() (LineType, State) {
	fresh := &StateBlank{}
	return LINE_BLANK, fresh
}
70 changes: 70 additions & 0 deletions processor/state_comment_multi.go
@@ -0,0 +1,70 @@
package processor

// StateCommentMulti is the state while inside a multi-line comment.
type StateCommentMulti struct {
	// Stack holds the end token of every currently-open comment,
	// innermost last, so nested comments (when lang.Nested is set)
	// can be unwound one level at a time.
	Stack [][]byte
}

// String names this state for debug output.
func (state *StateCommentMulti) String() string {
	const name = "multiline-comment"
	return name
}

// NewStateCommentMulti returns a multi-line-comment state whose stack is
// seeded with the end token of the comment that was just opened.
func NewStateCommentMulti(endToken []byte) *StateCommentMulti {
	initial := [][]byte{endToken}
	return &StateCommentMulti{Stack: initial}
}

// Process scans job.Content from index while inside a multi-line comment.
// It stops at the next newline (state unchanged so the comment continues on
// the following line), pops a nesting level when the innermost end token is
// found (returning to blank when the stack empties), and pushes a level when
// a new opening token appears and the language supports nesting.
func (state *StateCommentMulti) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	var i int
	for i = index; i < job.EndPoint; i++ {
		curByte := job.Content[i]

		if curByte == '\n' {
			break
		}

		// The innermost open comment's end token is the one to close first.
		endToken := state.peek()
		if checkForMatchSingle(curByte, i, job.EndPoint, endToken, job) {
			// set offset jump here
			i += len(endToken) - 1

			if len(state.Stack) == 1 {
				// Last level closed: the comment is over.
				return i, lineType, &StateBlank{}
			} else {
				state.pop()
				return i, lineType, state
			}
		}

		// Check if we are entering another multiline comment
		// This should come below check for match single as it speeds up processing
		if lang.Nested {
			if ok, offsetJump, endString := lang.MultiLineComments.Match(job.Content[i:]); ok != 0 {
				i += offsetJump - 1
				state.push(endString)
				return i, lineType, state
			}
		}
	}

	return i, lineType, state
}

// Reset starts the next line still inside the comment; the line begins as a
// comment line.
func (state *StateCommentMulti) Reset() (LineType, State) {
	next := State(state)
	return LINE_COMMENT, next
}

// peek returns the end token of the innermost open comment without
// removing it.
func (state *StateCommentMulti) peek() []byte {
	return state.Stack[len(state.Stack)-1]
}

// push records the end token of a newly opened nested comment.
func (state *StateCommentMulti) push(endToken []byte) {
	state.Stack = append(state.Stack, endToken)
}

// pop discards the innermost open comment's end token.
func (state *StateCommentMulti) pop() {
	last := len(state.Stack) - 1
	state.Stack = state.Stack[:last]
}
24 changes: 24 additions & 0 deletions processor/state_comment_single.go
@@ -0,0 +1,24 @@
package processor

// StateCommentSingle is the state while inside a single-line comment.
type StateCommentSingle struct{}

// String names this state for debug output.
func (state *StateCommentSingle) String() string {
	const name = "comment"
	return name
}

// Process consumes bytes up to (but not past) the end of the current line;
// nothing inside a single-line comment can change the line's classification.
func (state *StateCommentSingle) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	i := index
	for i < job.EndPoint && job.Content[i] != '\n' {
		i++
	}

	return i, lineType, state
}

// Reset starts the next line fresh in the blank state; a single-line comment
// never spans lines.
func (state *StateCommentSingle) Reset() (LineType, State) {
	fresh := &StateBlank{}
	return LINE_BLANK, fresh
}
53 changes: 53 additions & 0 deletions processor/state_docstring.go
@@ -0,0 +1,53 @@
package processor

import (
"fmt"
)

// StateDocString is the state while inside a documentation string.
type StateDocString struct {
	// End is the token that terminates the doc string.
	End []byte
	// SkipEsc mirrors StateString's escape-skipping flag.
	// NOTE(review): it is carried here but never consulted in this
	// state's Process, which always checks for a preceding backslash.
	SkipEsc bool
}

// String names this state for debug output.
func (state *StateDocString) String() string {
	const name = "docstring"
	return name
}

// Process scans job.Content from index while inside a doc string. It returns
// at the next newline (state unchanged), otherwise looks for the unescaped
// end token. When the end token is found, the rest of the line decides the
// classification: only whitespace up to the next newline makes the line a
// comment, anything else makes it code; either way the next state is blank.
func (state *StateDocString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	var i int
	for i = index; i < job.EndPoint; i++ {
		if job.Content[i] == '\n' {
			return i, lineType, state
		}

		// NOTE(review): assumes i >= 1 whenever this state is active (a doc
		// string is entered only after its opening token) — i == 0 would
		// panic on the index below; confirm against the callers.
		if job.Content[i-1] != '\\' {
			if checkForMatchSingle(job.Content[i], i, job.EndPoint, state.End, job) {
				// So we have hit end of docstring at this point in which case check if only whitespace characters till the next
				// newline and if so we change to a comment otherwise to code
				// need to start the loop after ending definition of docstring, therefore adding the length of the string to
				// the index
				for j := i + len(state.End); j <= job.EndPoint; j++ {
					if job.Content[j] == '\n' {
						if Debug {
							printDebug("Found newline so docstring is comment")
						}
						return j, LINE_COMMENT, &StateBlank{}
					}

					if !isWhitespace(job.Content[j]) {
						if Debug {
							printDebug(fmt.Sprintf("Found something not whitespace so is code: %s", string(job.Content[j])))
						}
						return j, LINE_CODE, &StateBlank{}
					}
				}
			}
		}
	}

	return i, lineType, state
}

// Reset starts the next line still inside the doc string; the line begins as
// a comment line.
func (state *StateDocString) Reset() (LineType, State) {
	next := State(state)
	return LINE_COMMENT, next
}
37 changes: 37 additions & 0 deletions processor/state_string.go
@@ -0,0 +1,37 @@
package processor

import "fmt"

// StateString is the state while inside an ordinary (non-doc) string literal.
type StateString struct {
	// End is the token that closes the string.
	End []byte
	// SkipEsc reports whether backslash-escape checking is skipped when
	// matching End, i.e. the end token counts even after a backslash.
	SkipEsc bool
}

// String renders the state together with its end token and escape flag for
// debugging.
func (state *StateString) String() string {
	flag := fmt.Sprintf("%v", state.SkipEsc)
	return "string[end=" + string(state.End) + ",skipesc=" + flag + "]"
}

// Process scans job.Content from index while inside a string literal. It
// returns at the next newline (the line counts as code, state unchanged) or
// when the unescaped closing token is found (back to the code state).
func (state *StateString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	var i int
	for i = index; i < job.EndPoint; i++ {
		// If we hit a newline, return because we want to count the stats but keep
		// the current state so we end up back in this loop when the outer
		// one calls again
		if job.Content[i] == '\n' {
			return i, LINE_CODE, state
		}

		// If we are in a literal string we want to ignore the \ check OR we aren't checking for special ones
		// NOTE(review): assumes i >= 1 here (state entered after the opening
		// quote) — i == 0 would panic on the index below; confirm.
		if state.SkipEsc || job.Content[i-1] != '\\' {
			if checkForMatchSingle(job.Content[i], i, job.EndPoint, state.End, job) {
				return i, LINE_CODE, &StateCode{}
			}
		}
	}

	return i, LINE_CODE, state
}

// Reset starts the next line still inside the string; the line begins as code.
func (state *StateString) Reset() (LineType, State) {
	next := State(state)
	return LINE_CODE, next
}
6 changes: 6 additions & 0 deletions processor/states.go
@@ -0,0 +1,6 @@
package processor

// State is one node of the line-counting state machine.
type State interface {
	// Process consumes bytes of the FileJob's content starting at the given
	// index and returns the updated index, the line classification so far,
	// and the state to continue in.
	Process(*FileJob, *LanguageFeature, int, LineType) (int, LineType, State)
	// Reset yields the line type and state the next line starts with —
	// presumably invoked by the driver at each newline; confirm against the
	// outer counting loop.
	Reset() (LineType, State)
}
4 changes: 3 additions & 1 deletion processor/structs.go
Expand Up @@ -5,6 +5,7 @@ package processor
import (
"bytes"
"sync"
"hash"
)

// Used by trie structure to store the types
Expand Down Expand Up @@ -76,11 +77,12 @@ type FileJob struct {
Blank int64
Complexity int64
WeightedComplexity float64
Hash []byte
Hash hash.Hash
Callback FileJobCallback
Binary bool
Minified bool
Generated bool
EndPoint int
}

// LanguageSummary is used to hold summarised results for a single language
Expand Down

0 comments on commit 9770aba

Please sign in to comment.