Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor CountStats state machine #247

Merged
merged 1 commit into from
Mar 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 18 additions & 0 deletions examples/issue246.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env python3
# Reproduction input for issue 246 (per the file name). The docstring and
# comments below describe how the line counter misclassified each line, so
# their exact position relative to the code is part of the example.

"""
Docstrings containing an apostrophe (') are handled incorrectly
The line above is counted as code despite being in the middle of a docstring.
The end of docstring flag seems to be changed to an apostrophe,
which means the next line will not exit the docstring.
"""
# Code containing single quotes will exit the docstring,
# but presuming the quotes are balanced the second
# quote will put us in string scanning mode.
if __name__ == '__main__':
print('Hello, World!')
# Not counted as a comment

# ^ Not counted as a blank line
# Break out of string scanner with unbalanced single quote: '
exit(0)
1 change: 1 addition & 0 deletions processor/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob {
Extension: extension,
PossibleLanguages: language,
Bytes: fileInfo.Size(),
EndPoint: int(fileInfo.Size() - 1),
}
} else if Verbose {
printWarn(fmt.Sprintf("skipping file unknown extension: %s", name))
Expand Down
60 changes: 60 additions & 0 deletions processor/state_blank.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package processor

// StateBlank is the state used while nothing but blank content has been seen;
// it carries no data of its own.
type StateBlank struct{}

// String returns the fixed display name of this state.
func (state *StateBlank) String() string {
	const name = "blank"
	return name
}

// Process matches the token that starts at job.Content[index] and chooses the
// state that handles what follows.
//
// It returns the index to continue from, the line type to record and the next
// state. A line that was still LINE_BLANK is promoted to LINE_COMMENT when a
// comment or docstring opens on it; any other incoming line type is kept.
func (state *StateBlank) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[index:])

	// Line type to report if a comment or docstring starts here.
	asComment := lineType
	if asComment == LINE_BLANK {
		asComment = LINE_COMMENT
	}

	switch tokenType {
	case TMlcomment:
		// Land on the last byte of the opening token.
		return index + offsetJump - 1, asComment, NewStateCommentMulti(endString)

	case TSlcomment:
		return index, asComment, &StateCommentSingle{}

	case TString:
		// verifyIgnoreEscape supplies the index to continue from, whether this
		// is a docstring, and the skip-escape flag (per its result names here).
		next, docString, skipEsc := verifyIgnoreEscape(lang, job, index)
		if docString {
			return next, asComment, &StateDocString{End: endString, SkipEsc: skipEsc}
		}
		return next, LINE_CODE, &StateString{End: endString, SkipEsc: skipEsc}

	case TComplexity:
		// Only count the token when it starts the content or follows whitespace.
		if index == 0 || isWhitespace(job.Content[index-1]) {
			job.Complexity++
		}
		return index, LINE_BLANK, state
	}

	// Anything else is ordinary code.
	return index, LINE_CODE, &StateCode{}
}

// Reset reports LINE_BLANK as the line type and keeps this same blank state.
func (state *StateBlank) Reset() (LineType, State) {
	var next State = state
	return LINE_BLANK, next
}
92 changes: 92 additions & 0 deletions processor/state_code.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package processor

// StateCode is the state used while scanning ordinary code; it carries no
// data of its own.
type StateCode struct{}

// String returns the fixed display name of this state.
func (state *StateCode) String() string {
	const name = "code"
	return name
}

// Process scans code bytes starting at index until the line or the content
// ends, or until a string, docstring or comment token opens.
//
// While scanning it feeds processed bytes to job.Hash when Duplicates is set,
// marks the job as binary when isBinary reports true, and bumps
// job.Complexity for complexity tokens that start the content or follow
// whitespace.
//
// It returns the index at which scanning stopped, the line type to record and
// the state that should handle the following bytes. The receiver is returned
// as the next state on a newline, on binary detection and when the scan limit
// is reached.
func (state *StateCode) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	// Hacky fix to https://github.com/boyter/scc/issues/181
	// EndPoint may be larger than the content actually held in memory. Clamp
	// it to the content length so the scan can never index past Content,
	// however large the mismatch is.
	endPoint := job.EndPoint
	if endPoint > len(job.Content) {
		endPoint = len(job.Content)
	}

	var i int
	for i = index; i < endPoint; i++ {
		curByte := job.Content[i]

		if curByte == '\n' {
			return i, LINE_CODE, state
		}

		if isBinary(i, curByte) {
			job.Binary = true
			return i, LINE_CODE, state
		}

		if !shouldProcess(curByte, lang.ProcessMask) {
			continue
		}

		if Duplicates {
			// Technically this is wrong because we skip bytes so this is not a true
			// hash of the file contents, but for duplicate files it shouldn't matter
			// as both will skip the same way.
			//
			// Hash the byte under the cursor, not the byte at the starting
			// index. Re-slicing Content avoids allocating a slice per byte.
			// A hash.Hash Write never returns an error, so the result is
			// deliberately ignored.
			job.Hash.Write(job.Content[i : i+1])
		}

		switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[i:]); tokenType {
		case TString:
			// If we are in string state then check what sort of string so we
			// know if docstring OR ignoreescape string.
			//
			// A string token directly preceded by a backslash is skipped; this
			// check is to ensure we aren't in a character declaration. The
			// i > 0 guard keeps the lookbehind in bounds.
			// TODO this should use language features
			if i > 0 && job.Content[i-1] == '\\' {
				continue
			}

			next, docString, skipEsc := verifyIgnoreEscape(lang, job, i)
			if docString {
				// A docstring opening on a still-blank line makes it a comment line.
				commentType := lineType
				if commentType == LINE_BLANK {
					commentType = LINE_COMMENT
				}
				return next, commentType, &StateDocString{
					End:     endString,
					SkipEsc: skipEsc,
				}
			}

			return next, LINE_CODE, &StateString{
				End:     endString,
				SkipEsc: skipEsc,
			}

		case TSlcomment:
			// Land on the last byte of the comment opener.
			return i + offsetJump - 1, LINE_CODE, &StateCommentSingle{}

		case TMlcomment:
			return i + offsetJump - 1, LINE_CODE, NewStateCommentMulti(endString)

		case TComplexity:
			if i == 0 || isWhitespace(job.Content[i-1]) {
				job.Complexity++
			}
		}
	}

	return i, LINE_CODE, state
}

// Reset reports LINE_BLANK as the line type and hands over to a fresh
// StateBlank.
func (state *StateCode) Reset() (LineType, State) {
	return LINE_BLANK, new(StateBlank)
}
70 changes: 70 additions & 0 deletions processor/state_comment_multi.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package processor

// StateCommentMulti is the state used inside a multi-line comment.
type StateCommentMulti struct {
	// Stack holds the closing token of every comment currently open; the
	// last element belongs to the innermost comment.
	Stack [][]byte
}

// String returns the fixed display name of this state.
func (state *StateCommentMulti) String() string {
	const name = "multiline-comment"
	return name
}

// NewStateCommentMulti builds a multi-line comment state whose stack contains
// only token, the closing token of the comment that was just opened.
func NewStateCommentMulti(token []byte) *StateCommentMulti {
	stack := make([][]byte, 1)
	stack[0] = token

	created := new(StateCommentMulti)
	created.Stack = stack
	return created
}

// Process scans from index until a newline, job.EndPoint, the closing token of
// the innermost open comment, or (for languages with lang.Nested set) the
// opening of another multi-line comment.
//
// The incoming lineType is always passed back unchanged. The returned state is
// a new StateBlank when the only open comment closes and the receiver in every
// other case.
func (state *StateCommentMulti) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	pos := index
	for ; pos < job.EndPoint; pos++ {
		current := job.Content[pos]
		if current == '\n' {
			return pos, lineType, state
		}

		closer := state.peek()
		if checkForMatchSingle(current, pos, job.EndPoint, closer, job) {
			// Land on the last byte of the closing token.
			pos += len(closer) - 1

			if len(state.Stack) == 1 {
				return pos, lineType, &StateBlank{}
			}

			// An inner comment closed; the outer one is still open.
			state.pop()
			return pos, lineType, state
		}

		// Nested openers are only looked for after the closing-token check
		// (the original comment notes this ordering speeds up processing).
		if !lang.Nested {
			continue
		}

		if matched, offsetJump, endString := lang.MultiLineComments.Match(job.Content[pos:]); matched != 0 {
			state.push(endString)
			return pos + offsetJump - 1, lineType, state
		}
	}

	return pos, lineType, state
}

// Reset reports LINE_COMMENT as the line type and keeps this same state, so
// the open comment (and its stack) carries over.
func (state *StateCommentMulti) Reset() (LineType, State) {
	var next State = state
	return LINE_COMMENT, next
}

// peek returns the closing token on top of the stack without removing it.
// The stack must not be empty.
func (state *StateCommentMulti) peek() []byte {
	return state.Stack[len(state.Stack)-1]
}

// push places token on top of the stack of closing tokens.
func (state *StateCommentMulti) push(token []byte) {
	grown := append(state.Stack, token)
	state.Stack = grown
}

// pop discards the closing token on top of the stack.
// The stack must not be empty.
func (state *StateCommentMulti) pop() {
	state.Stack = state.Stack[:len(state.Stack)-1]
}
24 changes: 24 additions & 0 deletions processor/state_comment_single.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package processor

// StateCommentSingle is the state used inside a single-line comment; it
// carries no data of its own.
type StateCommentSingle struct{}

// String returns the fixed display name of this state.
func (state *StateCommentSingle) String() string {
	const name = "comment"
	return name
}

// Process skips forward from index to the next newline, or to job.EndPoint if
// no newline comes first. It returns that position together with the
// unchanged lineType and the receiver as the next state.
func (state *StateCommentSingle) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	pos := index
	for pos < job.EndPoint && job.Content[pos] != '\n' {
		pos++
	}

	return pos, lineType, state
}

// Reset reports LINE_BLANK as the line type and hands over to a fresh
// StateBlank.
func (state *StateCommentSingle) Reset() (LineType, State) {
	return LINE_BLANK, new(StateBlank)
}
53 changes: 53 additions & 0 deletions processor/state_docstring.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package processor

import (
"fmt"
)

// StateDocString is the state used inside a docstring.
type StateDocString struct {
	// End is the byte sequence that closes the docstring.
	End []byte
	// SkipEsc records the skip-escape flag that verifyIgnoreEscape reported
	// when the docstring was opened.
	SkipEsc bool
}

// String returns the fixed display name of this state.
func (state *StateDocString) String() string {
	const name = "docstring"
	return name
}

// Process scans from index for the end of the current line or the end of the
// docstring.
//
// On a newline it returns the incoming lineType and the receiver, so the
// docstring continues on the next line. When the closing token state.End is
// found (and is not directly preceded by a backslash), the bytes after it
// decide the result: a newline reached through whitespace only yields
// LINE_COMMENT, any other byte yields LINE_CODE. Both cases hand over to a
// new StateBlank and return the index of the deciding byte.
//
// All reads of job.Content are bounded by the content length as well as by
// job.EndPoint, because EndPoint can exceed the content held in memory.
func (state *StateDocString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	// Never scan past the content, even if EndPoint claims more bytes.
	limit := job.EndPoint
	if limit > len(job.Content) {
		limit = len(job.Content)
	}

	var i int
	for i = index; i < limit; i++ {
		if job.Content[i] == '\n' {
			return i, lineType, state
		}

		// A closing token directly after a backslash is treated as escaped.
		// The i > 0 guard keeps the lookbehind in bounds.
		if i > 0 && job.Content[i-1] == '\\' {
			continue
		}

		if !checkForMatchSingle(job.Content[i], i, job.EndPoint, state.End, job) {
			continue
		}

		// So we have hit end of docstring at this point in which case check if only whitespace characters till the next
		// newline and if so we change to a comment otherwise to code
		// need to start the loop after ending definition of docstring, therefore adding the length of the string to
		// the index
		for j := i + len(state.End); j <= job.EndPoint && j < len(job.Content); j++ {
			if job.Content[j] == '\n' {
				if Debug {
					printDebug("Found newline so docstring is comment")
				}
				return j, LINE_COMMENT, &StateBlank{}
			}

			if !isWhitespace(job.Content[j]) {
				if Debug {
					printDebug(fmt.Sprintf("Found something not whitespace so is code: %s", string(job.Content[j])))
				}
				return j, LINE_CODE, &StateBlank{}
			}
		}
		// Only whitespace (or nothing) up to the end of the content: keep
		// scanning in the docstring state, as before.
	}

	return i, lineType, state
}

// Reset reports LINE_COMMENT as the line type and keeps this same state, so
// the open docstring carries over.
func (state *StateDocString) Reset() (LineType, State) {
	var next State = state
	return LINE_COMMENT, next
}
37 changes: 37 additions & 0 deletions processor/state_string.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package processor

import "fmt"

// StateString is the state used inside a string literal.
type StateString struct {
	// End is the byte sequence that closes the string.
	End []byte
	// SkipEsc disables the backslash-escape check when looking for End.
	SkipEsc bool
}

// String renders the state name together with its closing token and
// skip-escape flag.
func (state *StateString) String() string {
	closing := string(state.End)
	return fmt.Sprintf("string[end=%s,skipesc=%v]", closing, state.SkipEsc)
}

// Process scans from index for the end of the line or the end of the string.
//
// A newline returns the receiver so the string continues on the next line;
// finding the closing token state.End hands over to a new StateCode. The line
// type reported is always LINE_CODE.
func (state *StateString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) {
	pos := index
	for ; pos < job.EndPoint; pos++ {
		current := job.Content[pos]

		// Stop at the newline but stay in this state, so the caller can
		// account for the line and then resume inside the string.
		if current == '\n' {
			return pos, LINE_CODE, state
		}

		// Unless escapes are being skipped, a byte right after a backslash
		// can never close the string.
		escaped := !state.SkipEsc && job.Content[pos-1] == '\\'
		if escaped {
			continue
		}

		if checkForMatchSingle(current, pos, job.EndPoint, state.End, job) {
			return pos, LINE_CODE, &StateCode{}
		}
	}

	return pos, LINE_CODE, state
}

// Reset reports LINE_CODE as the line type and keeps this same state, so the
// open string carries over.
func (state *StateString) Reset() (LineType, State) {
	var next State = state
	return LINE_CODE, next
}
6 changes: 6 additions & 0 deletions processor/states.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package processor

// State is one state of the line-counting state machine. Each implementation
// handles one kind of content (blank, code, comment, docstring, string).
type State interface {
	// Process examines job.Content starting at index and returns the index
	// it stopped at, the line type to record and the state to use next.
	Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (nextIndex int, nextType LineType, next State)

	// Reset returns a line type and a state.
	// NOTE(review): presumably called when a new line starts; confirm
	// against the caller.
	Reset() (LineType, State)
}
4 changes: 3 additions & 1 deletion processor/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package processor
import (
"bytes"
"sync"
"hash"
)

// Used by trie structure to store the types
Expand Down Expand Up @@ -76,11 +77,12 @@ type FileJob struct {
Blank int64
Complexity int64
WeightedComplexity float64
Hash []byte
Hash hash.Hash
Callback FileJobCallback
Binary bool
Minified bool
Generated bool
EndPoint int
}

// LanguageSummary is used to hold summarised results for a single language
Expand Down