From 9770aba7cd9d4d46c8a4615151eab9776e7141eb Mon Sep 17 00:00:00 2001 From: David Baggerman Date: Tue, 23 Mar 2021 20:00:33 +1100 Subject: [PATCH] Refactor CountStats state machine --- examples/issue246.py | 18 ++ processor/file.go | 1 + processor/state_blank.go | 60 +++++ processor/state_code.go | 92 +++++++ processor/state_comment_multi.go | 70 ++++++ processor/state_comment_single.go | 24 ++ processor/state_docstring.go | 53 ++++ processor/state_string.go | 37 +++ processor/states.go | 6 + processor/structs.go | 4 +- processor/workers.go | 398 +++++------------------------- 11 files changed, 419 insertions(+), 344 deletions(-) create mode 100644 examples/issue246.py create mode 100644 processor/state_blank.go create mode 100644 processor/state_code.go create mode 100644 processor/state_comment_multi.go create mode 100644 processor/state_comment_single.go create mode 100644 processor/state_docstring.go create mode 100644 processor/state_string.go create mode 100644 processor/states.go diff --git a/examples/issue246.py b/examples/issue246.py new file mode 100644 index 00000000..35bec5dd --- /dev/null +++ b/examples/issue246.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +""" +Docstrings containing an apostrophe (') are handled incorrectly +The line above is counted as code despite being in the middle of a docstring. +The end of docstring flag seems to be changed to an apostrophe, +which means the next line will not exit the docstring. +""" +# Code containing single quotes will exit the docstring, +# but presuming the quotes are balanced the second +# quote will put us in string scanning mode. +if __name__ == '__main__': + print('Hello, World!') +# Not counted as a comment + +# ^ Not counted as a blank line +# Break out of string scanner with unbalanced single quote: ' + exit(0) diff --git a/processor/file.go b/processor/file.go index fd96fd00..15e67c0b 100644 --- a/processor/file.go +++ b/processor/file.go @@ -259,6 +259,7 @@ func newFileJob(path, name string, fileInfo os.FileInfo) *FileJob { Extension: extension, PossibleLanguages: language, Bytes: fileInfo.Size(), + EndPoint: int(fileInfo.Size() - 1), } } else if Verbose { printWarn(fmt.Sprintf("skipping file unknown extension: %s", name)) diff --git a/processor/state_blank.go b/processor/state_blank.go new file mode 100644 index 00000000..be743b78 --- /dev/null +++ b/processor/state_blank.go @@ -0,0 +1,60 @@ +package processor + +type StateBlank struct {} + +func (state *StateBlank) String() string { + return "blank" +} + +func (state *StateBlank) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) { + switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[index:]); tokenType { + case TMlcomment: + commentType := lineType + if commentType == LINE_BLANK { + commentType = LINE_COMMENT + } + + index += offsetJump - 1 + return index, commentType, NewStateCommentMulti(endString) + + case TSlcomment: + commentType := lineType + if commentType == LINE_BLANK { + commentType = LINE_COMMENT + } + return index, commentType, &StateCommentSingle{} + + case TString: + index, docString, skipEsc := verifyIgnoreEscape(lang, job, index) + + if docString { + commentType := lineType + if commentType == LINE_BLANK { + commentType = LINE_COMMENT + } + + return index, commentType, &StateDocString{ + End: endString, + SkipEsc: skipEsc, + } + } + + return index, LINE_CODE, &StateString{ + End: endString, + SkipEsc: skipEsc, + } + + case TComplexity: + if index == 0 || isWhitespace(job.Content[index-1]) { + job.Complexity++ + } + return index, LINE_BLANK, state + + default: + return index, LINE_CODE, &StateCode{} + } +} + +func (state *StateBlank) Reset() (LineType, State) { + return LINE_BLANK, state +} diff --git a/processor/state_code.go b/processor/state_code.go new file mode 100644 index 00000000..c2c496a8 --- /dev/null +++ b/processor/state_code.go @@ -0,0 +1,92 @@ +package processor + +type StateCode struct {} + +func (state *StateCode) String() string { + return "code" +} + +func (state *StateCode) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) { + // Hacky fix to https://github.com/boyter/scc/issues/181 + endPoint := job.EndPoint + if endPoint > len(job.Content) { + endPoint-- + } + + var i int + for i = index; i < endPoint; i++ { + curByte := job.Content[i] + + if curByte == '\n' { + return i, LINE_CODE, state + } + + if isBinary(i, curByte) { + job.Binary = true + return i, LINE_CODE, state + } + + if shouldProcess(curByte, lang.ProcessMask) { + if Duplicates { + // Technically this is wrong because we skip bytes so this is not a true + // hash of the file contents, but for duplicate files it shouldn't matter + // as both will skip the same way + digestible := []byte{job.Content[index]} + job.Hash.Write(digestible) + } + + switch tokenType, offsetJump, endString := lang.Tokens.Match(job.Content[i:]); tokenType { + case TString: + // If we are in string state then check what sort of string so we know if docstring OR ignoreescape string + + // It is safe to -1 here as to enter the code state we need to have + // transitioned from blank to here hence i should always be >= 1 + // This check is to ensure we aren't in a character declaration + // TODO this should use language features + if job.Content[i-1] == '\\' { + break // from switch, not from the loop + } + + i, docString, skipEsc := verifyIgnoreEscape(lang, job, i) + + if docString { + commentType := lineType + if commentType == LINE_BLANK { + commentType = LINE_COMMENT + } + + return i, commentType, &StateDocString{ + End: endString, + SkipEsc: skipEsc, + } + } + + // i += offsetJump - 1 + return i, LINE_CODE, &StateString{ + End: endString, + SkipEsc: skipEsc, + } + + case TSlcomment: + i += offsetJump - 1 + return i, LINE_CODE, &StateCommentSingle{} + + case TMlcomment: + i += offsetJump - 1 + + return i, LINE_CODE, NewStateCommentMulti(endString) + + case TComplexity: + if i == 0 || isWhitespace(job.Content[i-1]) { + job.Complexity++ + } + } + } + } + + return i, LINE_CODE, state +} + +func (state *StateCode) Reset() (LineType, State) { + return LINE_BLANK, &StateBlank{} +} diff --git a/processor/state_comment_multi.go b/processor/state_comment_multi.go new file mode 100644 index 00000000..4385db73 --- /dev/null +++ b/processor/state_comment_multi.go @@ -0,0 +1,70 @@ +package processor + +type StateCommentMulti struct { + Stack [][]byte +} + +func (state *StateCommentMulti) String() string { + return "multiline-comment" +} + +func NewStateCommentMulti(token []byte) *StateCommentMulti { + return &StateCommentMulti{ + Stack: [][]byte{token}, + } +} + +func (state *StateCommentMulti) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) { + var i int + for i = index; i < job.EndPoint; i++ { + curByte := job.Content[i] + + if curByte == '\n' { + break + } + + endToken := state.peek() + if checkForMatchSingle(curByte, i, job.EndPoint, endToken, job) { + // set offset jump here + i += len(endToken) - 1 + + if len(state.Stack) == 1 { + return i, lineType, &StateBlank{} + } else { + state.pop() + return i, lineType, state + } + } + + // Check if we are entering another multiline comment + // This should come below check for match single as it speeds up processing + if lang.Nested { + if ok, offsetJump, endString := lang.MultiLineComments.Match(job.Content[i:]); ok != 0 { + i += offsetJump - 1 + state.push(endString) + return i, lineType, state + } + } + } + + return i, lineType, state +} + +func (state *StateCommentMulti) Reset() (LineType, State) { + return LINE_COMMENT, state +} + +func (state *StateCommentMulti) peek() []byte { + i := len(state.Stack) - 1 + return state.Stack[i] +} + +func (state *StateCommentMulti) push(token []byte) { + state.Stack = append(state.Stack, token) +} + +func (state *StateCommentMulti) pop() { + i := len(state.Stack) - 1 + + state.Stack = state.Stack[:i] +} diff --git a/processor/state_comment_single.go b/processor/state_comment_single.go new file mode 100644 index 00000000..dc978942 --- /dev/null +++ b/processor/state_comment_single.go @@ -0,0 +1,24 @@ +package processor + +type StateCommentSingle struct {} + +func (state *StateCommentSingle) String() string { + return "comment" +} + +func (state *StateCommentSingle) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) { + var i int + for i = index; i < job.EndPoint; i++ { + curByte := job.Content[i] + + if curByte == '\n' { + break + } + } + + return i, lineType, state +} + +func (state *StateCommentSingle) Reset() (LineType, State) { + return LINE_BLANK, &StateBlank{} +} diff --git a/processor/state_docstring.go b/processor/state_docstring.go new file mode 100644 index 00000000..3af62a5a --- /dev/null +++ b/processor/state_docstring.go @@ -0,0 +1,53 @@ +package processor + +import ( + "fmt" +) + +type StateDocString struct { + End []byte + SkipEsc bool +} + +func (state *StateDocString) String() string { + return "docstring" +} + +func (state *StateDocString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) { + var i int + for i = index; i < job.EndPoint; i++ { + if job.Content[i] == '\n' { + return i, lineType, state + } + + if job.Content[i-1] != '\\' { + if checkForMatchSingle(job.Content[i], i, job.EndPoint, state.End, job) { + // So we have hit end of docstring at this point in which case check if only whitespace characters till the next + // newline and if so we change to a comment otherwise to code + // need to start the loop after ending definition of docstring, therefore adding the length of the string to + // the index + for j := i + len(state.End); j <= job.EndPoint; j++ { + if job.Content[j] == '\n' { + if Debug { + printDebug("Found newline so docstring is comment") + } + return j, LINE_COMMENT, &StateBlank{} + } + + if !isWhitespace(job.Content[j]) { + if Debug { + printDebug(fmt.Sprintf("Found something not whitespace so is code: %s", string(job.Content[j]))) + } + return j, LINE_CODE, &StateBlank{} + } + } + } + } + } + + return i, lineType, state +} + +func (state *StateDocString) Reset() (LineType, State) { + return LINE_COMMENT, state +} diff --git a/processor/state_string.go b/processor/state_string.go new file mode 100644 index 00000000..91194e08 --- /dev/null +++ b/processor/state_string.go @@ -0,0 +1,37 @@ +package processor + +import "fmt" + +type StateString struct { + End []byte + SkipEsc bool +} + +func (state *StateString) String() string { + return fmt.Sprintf("string[end=%s,skipesc=%v]", state.End, state.SkipEsc) +} + +func (state *StateString) Process(job *FileJob, lang *LanguageFeature, index int, lineType LineType) (int, LineType, State) { + var i int + for i = index; i < job.EndPoint; i++ { + // If we hit a newline, return because we want to count the stats but keep + // the current state so we end up back in this loop when the outer + // one calls again + if job.Content[i] == '\n' { + return i, LINE_CODE, state + } + + // If we are in a literal string we want to ignore the \ check OR we aren't checking for special ones + if state.SkipEsc || job.Content[i-1] != '\\' { + if checkForMatchSingle(job.Content[i], i, job.EndPoint, state.End, job) { + return i, LINE_CODE, &StateCode{} + } + } + } + + return i, LINE_CODE, state +} + +func (state *StateString) Reset() (LineType, State) { + return LINE_CODE, state +} diff --git a/processor/states.go b/processor/states.go new file mode 100644 index 00000000..64a40d68 --- /dev/null +++ b/processor/states.go @@ -0,0 +1,6 @@ +package processor + +type State interface { + Process(*FileJob, *LanguageFeature, int, LineType) (int, LineType, State) + Reset() (LineType, State) +} diff --git a/processor/structs.go b/processor/structs.go index 36e48822..5c2353e2 100644 --- a/processor/structs.go +++ b/processor/structs.go @@ -5,6 +5,7 @@ package processor import ( "bytes" "sync" + "hash" ) // Used by trie structure to store the types @@ -76,11 +77,12 @@ type FileJob struct { Blank int64 Complexity int64 WeightedComplexity float64 - Hash []byte + Hash hash.Hash Callback FileJobCallback Binary bool Minified bool Generated bool + EndPoint int } // LanguageSummary is used to hold summarised results for a single language diff --git a/processor/workers.go b/processor/workers.go index 899bdce7..164277d1 100644 --- a/processor/workers.go +++ b/processor/workers.go @@ -5,7 +5,6 @@ package processor import ( "bytes" "fmt" - "hash" "runtime/debug" "strings" "sync" @@ -33,6 +32,19 @@ const SheBang string = "#!" // LineType what type of line are are processing type LineType int32 +func (lt LineType) String() string { + switch lt { + case LINE_BLANK: + return "blank" + case LINE_CODE: + return "code" + case LINE_COMMENT: + return "comment" + default: + return fmt.Sprintf("%d", lt) + } +} + // These are not meant to be CAMEL_CASE but as it us used by an external project we cannot change it const ( LINE_BLANK LineType = iota @@ -107,253 +119,10 @@ func shouldProcess(currentByte byte, processBytesMask uint64) bool { return true } -func resetState(currentState int64) int64 { - if currentState == SMulticomment || currentState == SMulticommentCode { - currentState = SMulticomment - } else if currentState == SString { - currentState = SString - } else { - currentState = SBlank - } - - return currentState -} - -func stringState(fileJob *FileJob, index int, endPoint int, stringTrie *Trie, endString []byte, currentState int64, ignoreEscape bool) (int, int64) { - // Its not possible to enter this state without checking at least 1 byte so it is safe to check -1 here - // without checking if it is out of bounds first - for i := index; i < endPoint; i++ { - index = i - - // If we hit a newline, return because we want to count the stats but keep - // the current state so we end up back in this loop when the outer - // one calls again - if fileJob.Content[i] == '\n' { - return i, currentState - } - - // If we are in a literal string we want to ignore the \ check OR we aren't checking for special ones - if ignoreEscape || fileJob.Content[i-1] != '\\' { - if checkForMatchSingle(fileJob.Content[i], index, endPoint, endString, fileJob) { - return i, SCode - } - } - } - - return index, currentState -} - -// This is a special state check pretty much only ever used by Python codebases -// but potentially it could be expanded to deal with other types -func docStringState(fileJob *FileJob, index int, endPoint int, stringTrie *Trie, endString []byte, currentState int64) (int, int64) { - // Its not possible to enter this state without checking at least 1 byte so it is safe to check -1 here - // without checking if it is out of bounds first - for i := index; i < endPoint; i++ { - index = i - - if fileJob.Content[i] == '\n' { - return i, currentState - } - - if fileJob.Content[i-1] != '\\' { - if ok, _, _ := stringTrie.Match(fileJob.Content[i:]); ok != 0 { - // So we have hit end of docstring at this point in which case check if only whitespace characters till the next - // newline and if so we change to a comment otherwise to code - // need to start the loop after ending definition of docstring, therefore adding the length of the string to - // the index - for j := index + len(endString); j <= endPoint; j++ { - if fileJob.Content[j] == '\n' { - if Debug { - printDebug("Found newline so docstring is comment") - } - return i, SComment - } - - if !isWhitespace(fileJob.Content[j]) { - if Debug { - printDebug(fmt.Sprintf("Found something not whitespace so is code: %s", string(fileJob.Content[j]))) - } - return i, SCode - } - } - - return i, SCode - } - } - } - - return index, currentState -} - -func codeState( - fileJob *FileJob, - index int, - endPoint int, - currentState int64, - endString []byte, - endComments [][]byte, - langFeatures LanguageFeature, - digest *hash.Hash, -) (int, int64, []byte, [][]byte, bool) { - // Hacky fix to https://github.com/boyter/scc/issues/181 - if endPoint > len(fileJob.Content) { - endPoint-- - } - - for i := index; i < endPoint; i++ { - curByte := fileJob.Content[i] - index = i - - if curByte == '\n' { - return i, currentState, endString, endComments, false - } - - if isBinary(i, curByte) { - fileJob.Binary = true - return i, currentState, endString, endComments, false - } - - if shouldProcess(curByte, langFeatures.ProcessMask) { - if Duplicates { - // Technically this is wrong because we skip bytes so this is not a true - // hash of the file contents, but for duplicate files it shouldn't matter - // as both will skip the same way - digestible := []byte{fileJob.Content[index]} - (*digest).Write(digestible) - } - - switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[i:]); tokenType { - case TString: - // If we are in string state then check what sort of string so we know if docstring OR ignoreescape string - i, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index) - - // It is safe to -1 here as to enter the code state we need to have - // transitioned from blank to here hence i should always be >= 1 - // This check is to ensure we aren't in a character declaration - // TODO this should use language features - if fileJob.Content[i-1] != '\\' { - currentState = SString - } - - return i, currentState, endString, endComments, ignoreEscape - - case TSlcomment: - currentState = SCommentCode - return i, currentState, endString, endComments, false - - case TMlcomment: - if langFeatures.Nested || len(endComments) == 0 { - endComments = append(endComments, endString) - currentState = SMulticommentCode - i += offsetJump - 1 - - return i, currentState, endString, endComments, false - } - - case TComplexity: - if index == 0 || isWhitespace(fileJob.Content[index-1]) { - fileJob.Complexity++ - } - } - } - } - - return index, currentState, endString, endComments, false -} - -func commentState(fileJob *FileJob, index int, endPoint int, currentState int64, endComments [][]byte, endString []byte, langFeatures LanguageFeature) (int, int64, []byte, [][]byte) { - for i := index; i < endPoint; i++ { - curByte := fileJob.Content[i] - index = i - - if curByte == '\n' { - return i, currentState, endString, endComments - } - - if checkForMatchSingle(curByte, index, endPoint, endComments[len(endComments)-1], fileJob) { - // set offset jump here - offsetJump := len(endComments[len(endComments)-1]) - endComments = endComments[:len(endComments)-1] - - if len(endComments) == 0 { - // If we started as multiline code switch back to code so we count correctly - // IE i := 1 /* for the lols */ - // TODO is that required? Might still be required to count correctly - if currentState == SMulticommentCode { - currentState = SCode // TODO pointless to change here, just set S_MULTICOMMENT_BLANK - } else { - currentState = SMulticommentBlank - } - } - - i += offsetJump - 1 - return i, currentState, endString, endComments - } - // Check if we are entering another multiline comment - // This should come below check for match single as it speeds up processing - if langFeatures.Nested || len(endComments) == 0 { - if ok, offsetJump, endString := langFeatures.MultiLineComments.Match(fileJob.Content[i:]); ok != 0 { - endComments = append(endComments, endString) - i += offsetJump - 1 - - return i, currentState, endString, endComments - } - } - } - - return index, currentState, endString, endComments -} - -func blankState( - fileJob *FileJob, - index int, - endPoint int, - currentState int64, - endComments [][]byte, - endString []byte, - langFeatures LanguageFeature, -) (int, int64, []byte, [][]byte, bool) { - switch tokenType, offsetJump, endString := langFeatures.Tokens.Match(fileJob.Content[index:]); tokenType { - case TMlcomment: - if langFeatures.Nested || len(endComments) == 0 { - endComments = append(endComments, endString) - currentState = SMulticomment - index += offsetJump - 1 - return index, currentState, endString, endComments, false - } - - case TSlcomment: - currentState = SComment - return index, currentState, endString, endComments, false - - case TString: - index, ignoreEscape := verifyIgnoreEscape(langFeatures, fileJob, index) - - for _, v := range langFeatures.Quotes { - if v.End == string(endString) && v.DocString { - currentState = SDocString - return index, currentState, endString, endComments, ignoreEscape - } - } - currentState = SString - return index, currentState, endString, endComments, ignoreEscape - - case TComplexity: - currentState = SCode - if index == 0 || isWhitespace(fileJob.Content[index-1]) { - fileJob.Complexity++ - } - - default: - currentState = SCode - } - - return index, currentState, endString, endComments, false -} - // Some languages such as C# have quoted strings like @"\" where no escape character is required // this checks if there is one so we can cater for these cases -func verifyIgnoreEscape(langFeatures LanguageFeature, fileJob *FileJob, index int) (int, bool) { +func verifyIgnoreEscape(langFeatures *LanguageFeature, fileJob *FileJob, index int) (int, bool, bool) { + docString := false ignoreEscape := false // loop over the string states and if we have the special flag match, and if so we need to ensure we can handle them @@ -370,13 +139,14 @@ func verifyIgnoreEscape(langFeatures LanguageFeature, fileJob *FileJob, index in // If we have a match then jump ahead enough so we don't pick it up again for cases like @" if isMatch { + docString = langFeatures.Quotes[i].DocString ignoreEscape = true index = index + len(langFeatures.Quotes[i].Start) } } } - return index, ignoreEscape + return index, docString, ignoreEscape } // CountStats will process the fileJob @@ -411,22 +181,17 @@ func CountStats(fileJob *FileJob) { langFeatures.Tokens = &Trie{} } - endPoint := int(fileJob.Bytes - 1) - currentState := SBlank - endComments := [][]byte{} - endString := []byte{} - - // TODO needs to be set via langFeatures.Quotes[0].IgnoreEscape for the matching feature - ignoreEscape := false + var lineStart int + var lineType LineType + var currentState State = &StateBlank{} // For determining duplicates we need the below. The reason for creating // the byte array here is to avoid GC pressure. MD5 is in the standard library // and is fast enough to not warrant murmur3 hashing. No need to be // crypto secure here either so no need to eat the performance cost of a better // hash method - var digest hash.Hash if Duplicates { - digest = blake2b.New256() + fileJob.Hash = blake2b.New256() } for index := checkBomSkip(fileJob); index < int(fileJob.Bytes); index++ { @@ -435,47 +200,9 @@ func CountStats(fileJob *FileJob) { // changing anything in here and profile/measure afterwards! // NB that the order of the if statements matters and has been set to what in benchmarks is most efficient if !isWhitespace(fileJob.Content[index]) { - - switch currentState { - case SCode: - index, currentState, endString, endComments, ignoreEscape = codeState( - fileJob, - index, - endPoint, - currentState, - endString, - endComments, - langFeatures, - &digest, - ) - case SString: - index, currentState = stringState(fileJob, index, endPoint, langFeatures.Strings, endString, currentState, ignoreEscape) - case SDocString: - // For a docstring we can either move into blank in which case we count it as a docstring - // or back into code in which case it should be counted as code - index, currentState = docStringState(fileJob, index, endPoint, langFeatures.Strings, endString, currentState) - case SMulticomment, SMulticommentCode: - index, currentState, endString, endComments = commentState( - fileJob, - index, - endPoint, - currentState, - endComments, - endString, - langFeatures, - ) - case SBlank, SMulticommentBlank: - // From blank we can move into comment, move into a multiline comment - // or move into code but we can only do one. - index, currentState, endString, endComments, ignoreEscape = blankState( - fileJob, - index, - endPoint, - currentState, - endComments, - endString, - langFeatures, - ) + index, lineType, currentState = currentState.Process(fileJob, &langFeatures, index, lineType) + if Trace { + printTrace(fmt.Sprintf("state transition @ %d:%d: line=%s state=%s", fileJob.Lines+1, index-lineStart, lineType, currentState)) } } @@ -494,7 +221,7 @@ func CountStats(fileJob *FileJob) { // This means the end of processing the line so calculate the stats according to what state // we are currently in - if fileJob.Content[index] == '\n' || index >= endPoint { + if fileJob.Content[index] == '\n' || index >= fileJob.EndPoint { fileJob.Lines++ if NoLarge && fileJob.Lines >= LargeLineCount { @@ -503,55 +230,39 @@ func CountStats(fileJob *FileJob) { return } - switch currentState { - case SCode, SString, SCommentCode, SMulticommentCode: + switch lineType { + case LINE_CODE: fileJob.Code++ - currentState = resetState(currentState) - if fileJob.Callback != nil { - if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_CODE) { - return - } - } - if Trace { - printTrace(fmt.Sprintf("%s line %d ended with state: %d: counted as code", fileJob.Location, fileJob.Lines, currentState)) - } - case SComment, SMulticomment, SMulticommentBlank: + case LINE_COMMENT: fileJob.Comment++ - currentState = resetState(currentState) - if fileJob.Callback != nil { - if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) { - return - } - } - if Trace { - printTrace(fmt.Sprintf("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)) - } - case SBlank: + case LINE_BLANK: fileJob.Blank++ - if fileJob.Callback != nil { - if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_BLANK) { - return - } - } - if Trace { - printTrace(fmt.Sprintf("%s line %d ended with state: %d: counted as blank", fileJob.Location, fileJob.Lines, currentState)) - } - case SDocString: - fileJob.Comment++ - if fileJob.Callback != nil { - if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, LINE_COMMENT) { - return - } - } - if Trace { - printTrace(fmt.Sprintf("%s line %d ended with state: %d: counted as comment", fileJob.Location, fileJob.Lines, currentState)) + } + + if Trace { + printTrace(fmt.Sprintf( + "%s line %d [%s] ended with state: %v: counted as %v", + fileJob.Location, + fileJob.Lines, + string(fileJob.Content[lineStart:index]), + currentState, + lineType, + )) + //printTrace(fmt.Sprintf(`line %d: "%s"`, fileJob.Lines, string(fileJob.Content[lineStart:index]))) + + // lineStart is only used to produce the line trace, so it's + // safe to update it inside the condition + lineStart = index+1 + } + + if fileJob.Callback != nil { + if !fileJob.Callback.ProcessLine(fileJob, fileJob.Lines, lineType) { + return } } - } - } - if Duplicates { - fileJob.Hash = digest.Sum(nil) + lineType, currentState = currentState.Reset() + } } isGenerated := false @@ -736,7 +447,8 @@ func processFile(job *FileJob) bool { if Duplicates { duplicates.mux.Lock() - if duplicates.Check(job.Bytes, job.Hash) { + jobHash := job.Hash.Sum(nil) + if duplicates.Check(job.Bytes, jobHash) { if Verbose { printWarn(fmt.Sprintf("skipping duplicate file: %s", job.Location)) } @@ -745,7 +457,7 @@ func processFile(job *FileJob) bool { return false } - duplicates.Add(job.Bytes, job.Hash) + duplicates.Add(job.Bytes, jobHash) duplicates.mux.Unlock() }