Merge branch 'master' of github.com:boyter/cs

boyter · Mar 13, 2020 · 3673f74 · 3673f74
2 parents 31ff190 + 6e04b5f
commit 3673f74
Show file tree

Hide file tree

Showing 71 changed files with 5,867 additions and 6,116 deletions.
diff --git a/Gopkg.lock b/Gopkg.lock
diff --git a/README.md b/README.md
@@ -18,7 +18,9 @@ BUGS
 search for cs --hidden --no-gitignore --no-ignore 英文 cuts in the middle of a rune
 score from TF/IDF appears to be negative in some cases (overflow??)
 searches in TUI for very large directories clobber each other making UI unresponsive
-search cs result_ranker highlights the whole section
+highlight on windows command line not escaped correctly
+
+http://localhost:8080/?q=test&ss=300 bug where display is all yellow
 
 go install && cs --hidden --no-gitignore --no-ignore ten thousand a year <<<<< out of range BUG
 
@@ -34,7 +36,15 @@ TODO
 clean up parser so multiple spaces aren't tokens or flag em to be ignored
 add proximity search "this is"~5 which means they need to be within 5 bytes of each other
 add limit to number of results
-investigate string match limit
+investigate string match limit (might be wrong for unicode insensitive)
+JSON endpoint for HTTP
+JSON output for cli
+Save to disk output
+
+MAYBE
+
+HTML parser option
+https://stackoverflow.com/questions/44441665/how-to-extract-only-text-from-html-in-golang
 ```
 
 template example (from root)

diff --git a/processor/parser.go b/processor/parser.go
@@ -5,12 +5,12 @@ import (
 )
 
 const (
-	Default int64 = 0
-	Quoted  int64 = 1
-	Regex   int64 = 2
-	Negated int64 = 3
-	Fuzzy1  int64 = 4
-	Fuzzy2  int64 = 5
+	Default = iota
+	Quoted
+	Regex
+	Negated
+	Fuzzy1
+	Fuzzy2
 )
 
 type searchParams struct {

diff --git a/processor/snippet.go b/processor/snippet.go
@@ -66,17 +66,6 @@ func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLeng
 	// which makes things easy to sort and deal with
 	for k, v := range res.MatchLocations {
 		for _, i := range v {
-			// For filename matches the mark is from 0 to 0 so we don't highlight anything
-			// however it means we don't match anything either so set it to the full length
-			// of what we need to display
-			if i[0] == 0 && i[1] == 0 {
-				if relLength > len(res.Content) {
-					i[1] = len(res.Content)
-				} else {
-					i[1] = relLength
-				}
-			}
-
 			rv3 = append(rv3, relevantV3{
 				Word:  k,
 				Start: i[0],
@@ -85,10 +74,32 @@ func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLeng
 		}
 	}
 
+	// If we have a single result and it's a filename match which has
+	// no real start or end position
+	// it means we have no content to look through so just display the first
+	// chunk of the file
+	if len(rv3) == 1 && rv3[0].Start == 0 && rv3[0].End == 0 {
+		endPos := 300
+		if len(res.Content) < 300 {
+			endPos = len(res.Content)
+		}
+
+		// TODO check for if we cut in the middle of a multibyte character
+		return []Snippet{
+			{
+				Content:  string(res.Content[:endPos]),
+				StartPos: 0,
+				EndPos:   0,
+			},
+		}
+	}
+
+	// Sort the results so when we slide around everything is in order
 	sort.Slice(rv3, func(i, j int) bool {
 		return rv3[i].Start < rv3[j].Start
 	})
 
+
 	// Slide around looking for matches that fit in the length
 	for i := 0; i < len(rv3); i++ {
 		m := bestMatch{
@@ -183,16 +194,16 @@ func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLeng
 		// how good a match it is and hopefully display to the user what they
 		// were actually looking for
 		m.Score += float64(len(m.Relevant))     // Factor in how many matches we have
-		m.Score += float64(m.EndPos - m.StartPos) // Factor in how large the snippet is 
+		//m.Score += float64(m.EndPos - m.StartPos) // Factor in how large the snippet is NB weight this as it makes things worse sometimes
 
 		// Apply higher score where the words are near each other
 		mid := rv3[i].Start + (rv3[i].End-rv3[i].End)/2 // match word midpoint
 		for _, v := range m.Relevant {
 			p := v.Start + (v.End-v.Start)/2 // comparison word midpoint
 
 			// If the word is within a reasonable distance of this word boost the score
-			// weighted by how common that word is so that matches like a impact the rank
-			// less than something like cromulent
+			// weighted by how common that word is so that matches like 'a' impact the rank
+			// less than something like 'cromulent' which in theory should not occur as much
 			if abs(mid-p) < (relLength / 3) {
 				m.Score += 100 / float64(documentFrequencies[v.Word])
 			}

diff --git a/processor/worker_summarize.go b/processor/worker_summarize.go
@@ -3,21 +3,25 @@ package processor
 import (
 	"fmt"
 	str "github.com/boyter/cs/string"
-	. "github.com/logrusorgru/aurora"
+	"github.com/fatih/color"
+	"github.com/mattn/go-isatty"
+	"os"
 )
 
 type ResultSummarizer struct {
 	input            chan *fileJob
 	ResultLimit      int64
 	FileReaderWorker *FileReaderWorker
 	SnippetCount     int
+	NoColor          bool
 }
 
 func NewResultSummarizer(input chan *fileJob) ResultSummarizer {
 	return ResultSummarizer{
 		input:        input,
 		ResultLimit:  -1,
 		SnippetCount: 1,
+		NoColor:      os.Getenv("TERM") == "dumb" || (!isatty.IsTerminal(os.Stdout.Fd()) && !isatty.IsCygwinTerminal(os.Stdout.Fd())),
 	}
 }
 
@@ -39,11 +43,15 @@ func (f *ResultSummarizer) Start() {
 
 	fmtBegin := "\033[1;31m"
 	fmtEnd := "\033[0m"
+	if f.NoColor {
+		fmtBegin = ""
+		fmtEnd = ""
+	}
 
 	documentFrequency := calculateDocumentFrequency(results)
 
 	for _, res := range results {
-		fmt.Printf("%s %s%.3f%s\n", Magenta(res.Location), Magenta("("), Magenta(res.Score), Magenta(")"))
+		color.Magenta(fmt.Sprintf("%s (%.3f)", res.Location, res.Score))
 
 		v3 := extractRelevantV3(res, documentFrequency, int(SnippetLength), "…")[0]
 
@@ -62,9 +70,15 @@ func (f *ResultSummarizer) Start() {
 			}
 		}
 
-		coloredContent := str.HighlightString(v3.Content, l, fmtBegin, fmtEnd)
+		displayContent := v3.Content
+
+		// If the start and end pos are 0 then we don't need to highlight because there is
+		// nothing to highlight, which means it's likely to be a filename match with no content
+		if v3.StartPos != 0 && v3.EndPos != 0 {
+			displayContent = str.HighlightString(v3.Content, l, fmtBegin, fmtEnd)
+		}
 
-		fmt.Println(coloredContent)
+		fmt.Println(displayContent)
 		fmt.Println("")
 	}
 }
diff --git a/string/index.go b/string/index.go
@@ -105,8 +105,8 @@ func IndexAllIgnoreCaseUnicode(haystack string, needle string, limit int) [][]in
 	// If the needle is over some amount of characters long you chop off the first few
 	// and then search for those. However this means you are not finding actual matches and as such
 	// you then need to validate a potential match after you have found one.
-	// In this case the confirmation match is done using regular expressions
-	// because its faster than checking for all case options for longer needles.
+	// The confirmation match is done in a loop because for some literals a regular expression
+	// is still too slow, although for most it's a valid option.
 
 	locs := [][]int{}
 	// Char limit is the cut-off where we switch from all case permutations
@@ -187,10 +187,14 @@ func IndexAllIgnoreCaseUnicode(haystack string, needle string, limit int) [][]in
 				isMatch := false
 				for i := 0; i < len(toMatch); i++ {
 					isMatch = false
-					if toMatch[i] != needleRune[i] {
+
+					// Check against the actual term and if that's a match we can avoid folding
+					// and doing those comparisons to hopefully save some CPU time
+					// TODO confirm this actually makes a difference
+					if toMatch[i] == needleRune[i] {
 						isMatch = true
 					} else {
-						// case fold and check
+						// Not a match so case fold to actually check
 						for _, j := range AllSimpleFold(toMatch[i]) {
 							if j == needleRune[i] {
 								isMatch = true
@@ -199,6 +203,7 @@ func IndexAllIgnoreCaseUnicode(haystack string, needle string, limit int) [][]in
 					}
 
 					// Bail out as there is no point to continue checking at this point
+					// as we found no match and there is no point burning more CPU checking
 					if !isMatch {
 						break
 					}
@@ -208,7 +213,7 @@ func IndexAllIgnoreCaseUnicode(haystack string, needle string, limit int) [][]in
 					// When we have confirmed a match we add it to our total
 					// but adjust the positions to the match and the length of the
 					// needle to ensure the byte count lines up
-					locs = append(locs, []int{match[0], match[0] + len(toMatch)})
+					locs = append(locs, []int{match[0], match[0] + len(string(toMatch))})
 
 					if limit > 0 && len(locs) > limit {
 						return locs[:limit]

diff --git a/string/index_test.go b/string/index_test.go
diff --git a/vendor/github.com/fatih/color/LICENSE.md b/vendor/github.com/fatih/color/LICENSE.md