Skip to content

Commit

Permalink
various updates
Browse files Browse the repository at this point in the history
  • Loading branch information
boyter committed Jun 23, 2020
1 parent 0d89f04 commit abc49e4
Show file tree
Hide file tree
Showing 17 changed files with 320 additions and 189 deletions.
332 changes: 231 additions & 101 deletions asset/tui/main.go

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func main() {
&processor.Ranker,
"ranker",
"bm25",
"set ranking algorithm [wc, tfidf, tfidf2, bm25]",
"set ranking algorithm [simple, tfidf, tfidf2, bm25]",
)
flags.StringVarP(
&processor.FileOutput,
Expand Down
6 changes: 3 additions & 3 deletions processor/fuzz.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@ func Fuzz(data []byte) int {
freq := map[string]int{}
freq[find[:2]] = 5

res := &fileJob{
res := &FileJob{
Content: data,
MatchLocations: loc,
}

extractRelevantV3(res, freq, 300, "...")

findSpaceRight(&fileJob{Content: data}, 0, 10000)
findSpaceLeft(&fileJob{Content: data}, len(data)-1, 10000)
findSpaceRight(&FileJob{Content: data}, 0, 10000)
findSpaceLeft(&FileJob{Content: data}, len(data)-1, 10000)

return 1
}
8 changes: 4 additions & 4 deletions processor/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ func StartHttpServer() {
page := tryParseInt(r.URL.Query().Get("p"), 0)
pageSize := 20

var results []*fileJob
var results []*FileJob
var fileCount int64

log.Info().
Expand All @@ -136,8 +136,8 @@ func StartHttpServer() {
}

fileQueue := make(chan *file.File, 1000) // Files ready to be read from disk NB we buffer here because http runs till finished or the process is cancelled
toProcessQueue := make(chan *fileJob, runtime.NumCPU()) // Files to be read into memory for processing
summaryQueue := make(chan *fileJob, runtime.NumCPU()) // Files that match and need to be displayed
toProcessQueue := make(chan *FileJob, runtime.NumCPU()) // Files to be read into memory for processing
summaryQueue := make(chan *FileJob, runtime.NumCPU()) // Files that match and need to be displayed

fileWalker := file.NewFileWalker(directory, fileQueue)
fileWalker.PathExclude = PathDenylist
Expand Down Expand Up @@ -314,7 +314,7 @@ func calculateExtensionFacet(extensionFacets map[string]int, query string, snipp
return ef
}

func calculatePages(results []*fileJob, pageSize int, query string, snippetLength int) []pageResult {
func calculatePages(results []*FileJob, pageSize int, query string, snippetLength int) []pageResult {
var pages []pageResult

if len(results) == 0 {
Expand Down
20 changes: 10 additions & 10 deletions processor/http_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@ import (
)

func TestCalculatePagesNone(t *testing.T) {
var pages = calculatePages([]*fileJob{}, 20, "", 100)
var pages = calculatePages([]*FileJob{}, 20, "", 100)

if len(pages) != 0 {
t.Error("expected no result")
}
}

func TestCalculatePagesSingle(t *testing.T) {
var pages = calculatePages([]*fileJob{
var pages = calculatePages([]*FileJob{
{},
}, 20, "", 100)

Expand All @@ -33,9 +33,9 @@ func TestCalculatePagesSingle(t *testing.T) {
}

func TestCalculatePagesEdgeStart(t *testing.T) {
var fj []*fileJob
var fj []*FileJob
for i := 0; i < 20; i++ {
fj = append(fj, &fileJob{})
fj = append(fj, &FileJob{})
}

var pages = calculatePages(fj, 20, "", 100)
Expand All @@ -46,9 +46,9 @@ func TestCalculatePagesEdgeStart(t *testing.T) {
}

func TestCalculatePagesEdgeOver(t *testing.T) {
var fj []*fileJob
var fj []*FileJob
for i := 0; i < 21; i++ {
fj = append(fj, &fileJob{})
fj = append(fj, &FileJob{})
}

var pages = calculatePages(fj, 20, "", 100)
Expand All @@ -59,9 +59,9 @@ func TestCalculatePagesEdgeOver(t *testing.T) {
}

func TestCalculatePagesSecondPageEdge(t *testing.T) {
var fj []*fileJob
var fj []*FileJob
for i := 0; i < 40; i++ {
fj = append(fj, &fileJob{})
fj = append(fj, &FileJob{})
}

var pages = calculatePages(fj, 20, "", 100)
Expand All @@ -72,9 +72,9 @@ func TestCalculatePagesSecondPageEdge(t *testing.T) {
}

func TestCalculatePagesSecondPageEdgeOver(t *testing.T) {
var fj []*fileJob
var fj []*FileJob
for i := 0; i < 41; i++ {
fj = append(fj, &fileJob{})
fj = append(fj, &FileJob{})
}

var pages = calculatePages(fj, 20, "", 100)
Expand Down
4 changes: 2 additions & 2 deletions processor/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ func (process *Process) StartProcess() {
}

fileQueue := make(chan *file.File, 1000) // Files ready to be read from disk NB we buffer here because CLI runs till finished or the process is cancelled
toProcessQueue := make(chan *fileJob, runtime.NumCPU()) // Files to be read into memory for processing
summaryQueue := make(chan *fileJob, runtime.NumCPU()) // Files that match and need to be displayed
toProcessQueue := make(chan *FileJob, runtime.NumCPU()) // Files to be read into memory for processing
summaryQueue := make(chan *FileJob, runtime.NumCPU()) // Files that match and need to be displayed

fileWalker := file.NewFileWalker(process.Directory, fileQueue)
fileWalker.PathExclude = PathDenylist
Expand Down
24 changes: 13 additions & 11 deletions processor/result_ranker.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,22 @@ import (
// Note that this method will evolve over time
// and as such you should never rely on the returned results being
// the same
func rankResults(corpusCount int, results []*fileJob) []*fileJob {
func rankResults(corpusCount int, results []*FileJob) []*FileJob {
// needs to come first because it resets the scores
switch Ranker {
case "wc":
results = results
case "simple":
// in this case the results are already ranked by the number of matches
case "bm25":
results = rankResultsBM25(corpusCount, results, calculateDocumentFrequency(results))
results = rankResultsLocation(results)
case "tfidf2":
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), false)
results = rankResultsLocation(results)
default:
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), true)
results = rankResultsLocation(results)
}

results = rankResultsLocation(results)
// TODO maybe need to add something here to reward phrases
sortResults(results)
return results
Expand All @@ -47,7 +49,7 @@ const (
// heavy. This is fairly similar to how the snippet extraction works but with less work because it does
// not need to deal with cutting between unicode endpoints
// NB this is one of the more expensive parts of the ranking
func rankResultsPhrase(results []*fileJob, documentFrequencies map[string]int) []*fileJob {
func rankResultsPhrase(results []*FileJob, documentFrequencies map[string]int) []*FileJob {
for i := 0; i < len(results); i++ {
rv3 := convertToRelevant(results[i])

Expand All @@ -72,7 +74,7 @@ func rankResultsPhrase(results []*fileJob, documentFrequencies map[string]int) [
// file location field.
// This is not using TF-IDF or any fancy algorithm just basic checks
// and boosts
func rankResultsLocation(results []*fileJob) []*fileJob {
func rankResultsLocation(results []*FileJob) []*FileJob {
for i := 0; i < len(results); i++ {
foundTerms := 0
for key := range results[i].MatchLocations {
Expand Down Expand Up @@ -134,7 +136,7 @@ func rankResultsLocation(results []*fileJob) []*fileJob {
// NB loops in here use increment to avoid duffcopy
// https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot
// due to how often it is called by things like the TUI mode
func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int, classic bool) []*fileJob {
func rankResultsTFIDF(corpusCount int, results []*FileJob, documentFrequencies map[string]int, classic bool) []*FileJob {
var weight float64
for i := 0; i < len(results); i++ {
weight = 0
Expand Down Expand Up @@ -198,7 +200,7 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies m
// IDF * TF * (k1 + 1)
// BM25 = sum ----------------------------
// TF + k1 * (1 - b + b * D / L)
func rankResultsBM25(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
func rankResultsBM25(corpusCount int, results []*FileJob, documentFrequencies map[string]int) []*FileJob {
var weight float64

// Get the average number of words across all documents because we need that in BM25 to calculate correctly
Expand Down Expand Up @@ -247,7 +249,7 @@ func rankResultsBM25(corpusCount int, results []*fileJob, documentFrequencies ma
// Calculate the document term frequency for all words across all documents
// letting us know how many times a term appears across the corpus
// This is mostly used for snippet extraction
func calculateDocumentTermFrequency(results []*fileJob) map[string]int {
func calculateDocumentTermFrequency(results []*FileJob) map[string]int {
documentFrequencies := map[string]int{}
for i := 0; i < len(results); i++ {
for k := range results[i].MatchLocations {
Expand All @@ -261,7 +263,7 @@ func calculateDocumentTermFrequency(results []*fileJob) map[string]int {
// Calculate the document frequency for all words across all documents
// allowing us to know the number of documents for which a term appears
// This is mostly used for TF-IDF calculation
func calculateDocumentFrequency(results []*fileJob) map[string]int {
func calculateDocumentFrequency(results []*FileJob) map[string]int {
documentFrequencies := map[string]int{}
for i := 0; i < len(results); i++ {
for k := range results[i].MatchLocations {
Expand All @@ -276,7 +278,7 @@ func calculateDocumentFrequency(results []*fileJob) map[string]int {
// and then sort based on location to stop any nondeterministic ordering happening
// since the location includes the filename we should never have two matches
// that are 100% equal based on the two criteria we use.
func sortResults(results []*fileJob) {
func sortResults(results []*FileJob) {
sort.Slice(results, func(i, j int) bool {
if results[i].Score == results[j].Score {
return strings.Compare(results[i].Location, results[j].Location) < 0
Expand Down
14 changes: 7 additions & 7 deletions processor/result_ranker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ func TestRankResultsTFIDFTraditional(t *testing.T) {
ml2 := map[string][][]int{}
ml2["example"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml1,
Location: "/test/other.go",
Expand Down Expand Up @@ -42,7 +42,7 @@ func TestRankResultsTFIDFComparison(t *testing.T) {
ml1 := map[string][][]int{}
ml1["example"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml1,
Location: "/test/other.go",
Expand All @@ -65,7 +65,7 @@ func TestRankResultsRankerComparison(t *testing.T) {
ml1 := map[string][][]int{}
ml1["example"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml1,
Location: "/test/other.go",
Expand All @@ -91,7 +91,7 @@ func TestRankResultsLocation(t *testing.T) {
ml := map[string][][]int{}
ml["test"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml,
Location: "/test/other.go",
Expand All @@ -113,7 +113,7 @@ func TestCalculateDocumentFrequency(t *testing.T) {
ml := map[string][][]int{}
ml["test"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml,
},
Expand All @@ -130,7 +130,7 @@ func TestCalculateDocumentFrequency(t *testing.T) {
}

func TestSortResults(t *testing.T) {
s := []*fileJob{
s := []*FileJob{
{
Filename: "1",
Location: "",
Expand All @@ -150,7 +150,7 @@ func TestSortResults(t *testing.T) {
}

func TestSortResultsEqualScore(t *testing.T) {
s := []*fileJob{
s := []*FileJob{
{
Filename: "1",
Location: "2",
Expand Down
8 changes: 4 additions & 4 deletions processor/snippet.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ type Snippet struct {
// to differ between people. Heck a few times I have been disappointed with results that I was previously happy with.
// As such this is not tested as much as other methods and you should not rely on the results being static over time
// as the internals will be modified to produce better results where possible
func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLength int, indicator string) []Snippet {
func extractRelevantV3(res *FileJob, documentFrequencies map[string]int, relLength int, indicator string) []Snippet {
wrapLength := relLength / 2
var bestMatches []bestMatch

Expand Down Expand Up @@ -275,7 +275,7 @@ func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLeng

// Get all of the locations into a new data structure
// which makes things easy to sort and deal with
func convertToRelevant(res *fileJob) []relevantV3 {
func convertToRelevant(res *FileJob) []relevantV3 {
var rv3 []relevantV3

for k, v := range res.MatchLocations {
Expand All @@ -299,7 +299,7 @@ func convertToRelevant(res *fileJob) []relevantV3 {
// Looks for a nearby whitespace character near this position (`pos`)
// up to `distance` away. Returns index of space if a space was found and
// true, otherwise returns the original index and false
func findSpaceRight(res *fileJob, pos int, distance int) (int, bool) {
func findSpaceRight(res *FileJob, pos int, distance int) (int, bool) {
if len(res.Content) == 0 {
return pos, false
}
Expand All @@ -322,7 +322,7 @@ func findSpaceRight(res *fileJob, pos int, distance int) (int, bool) {
// Looks for nearby whitespace character near this position
// up to distance away. Returns index of space if a space was found and true,
// otherwise the original index is returned and false
func findSpaceLeft(res *fileJob, pos int, distance int) (int, bool) {
func findSpaceLeft(res *FileJob, pos int, distance int) (int, bool) {
if len(res.Content) == 0 {
return pos, false
}
Expand Down
Loading

0 comments on commit abc49e4

Please sign in to comment.