Skip to content

Commit

Permalink
Merge branch 'master' of github.com:boyter/cs
Browse files Browse the repository at this point in the history
  • Loading branch information
min committed Mar 13, 2020
2 parents 31ff190 + 6e04b5f commit 3673f74
Show file tree
Hide file tree
Showing 71 changed files with 5,867 additions and 6,116 deletions.
34 changes: 25 additions & 9 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 12 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ BUGS
search for cs --hidden --no-gitignore --no-ignore 英文 cuts in the middle of a rune
score from TF/IDF appears to be negative in some cases (overflow??)
searches in TUI for very large directories clobber each other making UI unresponsive
search cs result_ranker highlights the whole section
highlight on windows command line not escaped correctly
http://localhost:8080/?q=test&ss=300 bug where display is all yellow
go install && cs --hidden --no-gitignore --no-ignore ten thousand a year <<<<< out of range BUG
Expand All @@ -34,7 +36,15 @@ TODO
clean up parser so multiple spaces aren't tokens or flag em to be ignored
add proximity search "this is"~5 which means they need to be within 5 bytes of each other
add limit to number of results
investigate string match limit
investigate string match limit (might be wrong for unicode insensitive)
JSON endpoint for HTTP
JSON output for cli
Save to disk output
MAYBE
HTML parser option
https://stackoverflow.com/questions/44441665/how-to-extract-only-text-from-html-in-golang
```

template example (from root)
Expand Down
12 changes: 6 additions & 6 deletions processor/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ import (
)

const (
Default int64 = 0
Quoted int64 = 1
Regex int64 = 2
Negated int64 = 3
Fuzzy1 int64 = 4
Fuzzy2 int64 = 5
Default = iota
Quoted
Regex
Negated
Fuzzy1
Fuzzy2
)

type searchParams struct {
Expand Down
39 changes: 25 additions & 14 deletions processor/snippet.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,6 @@ func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLeng
// which makes things easy to sort and deal with
for k, v := range res.MatchLocations {
for _, i := range v {
// For filename matches the mark is from 0 to 0 so we don't highlight anything
// however it means we don't match anything either so set it to the full length
// of what we need to display
if i[0] == 0 && i[1] == 0 {
if relLength > len(res.Content) {
i[1] = len(res.Content)
} else {
i[1] = relLength
}
}

rv3 = append(rv3, relevantV3{
Word: k,
Start: i[0],
Expand All @@ -85,10 +74,32 @@ func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLeng
}
}

// If we have a single result and it's a filename match, which has
// no real start or end position,
// it means we have no content to look through, so just display the first
// chunk of the file
if len(rv3) == 1 && rv3[0].Start == 0 && rv3[0].End == 0 {
endPos := 300
if len(res.Content) < 300 {
endPos = len(res.Content)
}

// TODO check whether we cut in the middle of a multibyte character
return []Snippet{
{
Content: string(res.Content[:endPos]),
StartPos: 0,
EndPos: 0,
},
}
}

// Sort the results so when we slide around everything is in order
sort.Slice(rv3, func(i, j int) bool {
return rv3[i].Start < rv3[j].Start
})


// Slide around looking for matches that fit in the length
for i := 0; i < len(rv3); i++ {
m := bestMatch{
Expand Down Expand Up @@ -183,16 +194,16 @@ func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLeng
// how good a match it is and hopefully display to the user what they
// were actually looking for
m.Score += float64(len(m.Relevant)) // Factor in how many matches we have
m.Score += float64(m.EndPos - m.StartPos) // Factor in how large the snippet is
//m.Score += float64(m.EndPos - m.StartPos) // Factor in how large the snippet is NB weight this as it makes things worse sometimes

// Apply higher score where the words are near each other
mid := rv3[i].Start + (rv3[i].End-rv3[i].End)/2 // match word midpoint
for _, v := range m.Relevant {
p := v.Start + (v.End-v.Start)/2 // comparison word midpoint

// If the word is within a reasonable distance of this word boost the score
// weighted by how common that word is so that matches like a impact the rank
// less than something like cromulent
// weighted by how common that word is so that matches like 'a' impact the rank
// less than something like 'cromulent' which in theory should not occur as much
if abs(mid-p) < (relLength / 3) {
m.Score += 100 / float64(documentFrequencies[v.Word])
}
Expand Down
22 changes: 18 additions & 4 deletions processor/worker_summarize.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,25 @@ package processor
import (
"fmt"
str "github.com/boyter/cs/string"
. "github.com/logrusorgru/aurora"
"github.com/fatih/color"
"github.com/mattn/go-isatty"
"os"
)

// ResultSummarizer holds the input channel and the display settings used
// when summarizing search results.
type ResultSummarizer struct {
// input is the channel of *fileJob values the summarizer is constructed with
input chan *fileJob
// ResultLimit is initialised to -1 by NewResultSummarizer
// NOTE(review): presumably -1 means "no limit" — confirm against the code that reads it
ResultLimit int64
// FileReaderWorker is left unset by NewResultSummarizer
FileReaderWorker *FileReaderWorker
// SnippetCount is initialised to 1 by NewResultSummarizer
SnippetCount int
// NoColor, when true, makes Start use empty strings instead of ANSI escape
// sequences for the highlight begin/end markers
NoColor bool
}

// NewResultSummarizer builds a ResultSummarizer that reads from input.
// ResultLimit starts at -1 and SnippetCount at 1. NoColor is switched on
// when TERM is "dumb" or when stdout is neither a terminal nor a Cygwin
// terminal.
func NewResultSummarizer(input chan *fileJob) ResultSummarizer {
	// A dumb terminal disables colour outright; only when that is not the
	// case do we need to inspect stdout itself.
	disableColor := os.Getenv("TERM") == "dumb"
	if !disableColor {
		interactive := isatty.IsTerminal(os.Stdout.Fd()) || isatty.IsCygwinTerminal(os.Stdout.Fd())
		disableColor = !interactive
	}

	summarizer := ResultSummarizer{
		input:        input,
		ResultLimit:  -1,
		SnippetCount: 1,
		NoColor:      disableColor,
	}
	return summarizer
}

Expand All @@ -39,11 +43,15 @@ func (f *ResultSummarizer) Start() {

fmtBegin := "\033[1;31m"
fmtEnd := "\033[0m"
if f.NoColor {
fmtBegin = ""
fmtEnd = ""
}

documentFrequency := calculateDocumentFrequency(results)

for _, res := range results {
fmt.Printf("%s %s%.3f%s\n", Magenta(res.Location), Magenta("("), Magenta(res.Score), Magenta(")"))
color.Magenta(fmt.Sprintf("%s (%.3f)", res.Location, res.Score))

v3 := extractRelevantV3(res, documentFrequency, int(SnippetLength), "…")[0]

Expand All @@ -62,9 +70,15 @@ func (f *ResultSummarizer) Start() {
}
}

coloredContent := str.HighlightString(v3.Content, l, fmtBegin, fmtEnd)
displayContent := v3.Content

// If the start and end pos are 0 then there is nothing to highlight,
// which means it's likely to be a filename match with no content
if v3.StartPos != 0 && v3.EndPos != 0 {
displayContent = str.HighlightString(v3.Content, l, fmtBegin, fmtEnd)
}

fmt.Println(coloredContent)
fmt.Println(displayContent)
fmt.Println("")
}
}
15 changes: 10 additions & 5 deletions string/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ func IndexAllIgnoreCaseUnicode(haystack string, needle string, limit int) [][]in
// If the needle is over some amount of characters long you chop off the first few
// and then search for those. However this means you are not finding actual matches and as such
// you then need to validate a potential match after you have found one.
// In this case the confirmation match is done using regular expressions
// because its faster than checking for all case options for longer needles.
// The confirmation match is done in a loop because for some literals a regular expression
// is still too slow, although for most it's a valid option.

locs := [][]int{}
// Char limit is the cut-off where we switch from all case permutations
Expand Down Expand Up @@ -187,10 +187,14 @@ func IndexAllIgnoreCaseUnicode(haystack string, needle string, limit int) [][]in
isMatch := false
for i := 0; i < len(toMatch); i++ {
isMatch = false
if toMatch[i] != needleRune[i] {

// Check against the actual term and if that's a match we can avoid folding
// and doing those comparisons to hopefully save some CPU time
// TODO confirm this actually makes a difference
if toMatch[i] == needleRune[i] {
isMatch = true
} else {
// case fold and check
// Not a match so case fold to actually check
for _, j := range AllSimpleFold(toMatch[i]) {
if j == needleRune[i] {
isMatch = true
Expand All @@ -199,6 +203,7 @@ func IndexAllIgnoreCaseUnicode(haystack string, needle string, limit int) [][]in
}

// Bail out as there is no point to continue checking at this point
// as we found no match and there is no point burning more CPU checking
if !isMatch {
break
}
Expand All @@ -208,7 +213,7 @@ func IndexAllIgnoreCaseUnicode(haystack string, needle string, limit int) [][]in
// When we have confirmed a match we add it to our total
// but adjust the positions to the match and the length of the
// needle to ensure the byte count lines up
locs = append(locs, []int{match[0], match[0] + len(toMatch)})
locs = append(locs, []int{match[0], match[0] + len(string(toMatch))})

if limit > 0 && len(locs) > limit {
return locs[:limit]
Expand Down
26 changes: 9 additions & 17 deletions string/index_test.go

Large diffs are not rendered by default.

20 changes: 20 additions & 0 deletions vendor/github.com/fatih/color/LICENSE.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3673f74

Please sign in to comment.