Commit a01fa93

fix ranking algorithm

boyter committed Jun 10, 2020
1 parent 6eac844 commit a01fa93
Showing 7 changed files with 107 additions and 71 deletions.
1 change: 0 additions & 1 deletion README.md
@@ -15,7 +15,6 @@ cs t NOT something test~1 "ten thousand a year" "/pr[e-i]de/"

```
BUGS
- score from TF/IDF appears to be negative in some cases (overflow??)
need to clear pdf cache
searching for http://localhost:8080/?q=%22about+to+explore%22&ss=300 seems to highlight to for some reason, same in TUI mode
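A note on the bug line removed above: the negative scores were most likely not an overflow. As the result_ranker.go diff below shows, the old code divided corpusCount by a corpus-wide match count rather than a per-document count and took Log2 of the result. A minimal sketch of how that goes negative, with illustrative numbers that are not from the repository:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// Pre-commit weighting: idf = corpusCount / documentFrequencies[key],
	// where documentFrequencies actually held corpus-wide match counts.
	corpusCount := 2.0
	termMatches := 10.0 // a term matched ten times across two documents

	idf := corpusCount / termMatches // 0.2, which is below 1
	fmt.Println(math.Log2(idf))      // -2.3219..., so scores went negative
}
```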
4 changes: 2 additions & 2 deletions processor/http.go
@@ -196,7 +196,7 @@ func StartHttpServer() {
fmtBegin := hex.EncodeToString(md5Digest.Sum([]byte(fmt.Sprintf("begin_%d", makeTimestampNano()))))
fmtEnd := hex.EncodeToString(md5Digest.Sum([]byte(fmt.Sprintf("end_%d", makeTimestampNano()))))

- documentFrequency := calculateDocumentFrequency(results)
+ documentTermFrequency := calculateDocumentTermFrequency(results)

var searchResults []searchResult
extensionFacets := map[string]int{}
@@ -223,7 +223,7 @@ func StartHttpServer() {
}

for _, res := range displayResults {
- v3 := extractRelevantV3(res, documentFrequency, snippetLength, "…")[0]
+ v3 := extractRelevantV3(res, documentTermFrequency, snippetLength, "…")[0]

// We have the snippet so now we need to highlight it
// we get all the locations that fall in the snippet length
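The snippet extractor now receives corpus-wide term counts rather than per-document counts. A minimal standalone sketch of the difference between the two helpers that appear in the result_ranker.go diff below, using a simplified stand-in for the fileJob.MatchLocations field (the type and names here are illustrative, not the package's API):

```go
package main

import "fmt"

// matchLocations stands in for fileJob.MatchLocations: term -> match positions.
type matchLocations map[string][][]int

func main() {
	docs := []matchLocations{
		{"test": {{0, 4}, {10, 14}}}, // "test" matched twice in the first document
		{"test": {{3, 7}}},           // and once in the second
	}

	termFrequency := map[string]int{}     // every match across the corpus
	documentFrequency := map[string]int{} // documents containing the term
	for _, d := range docs {
		for term, locations := range d {
			termFrequency[term] += len(locations)
			documentFrequency[term]++
		}
	}

	fmt.Println(termFrequency["test"])     // 3, the count snippet extraction wants
	fmt.Println(documentFrequency["test"]) // 2, the count TF-IDF wants
}
```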
57 changes: 34 additions & 23 deletions processor/result_ranker.go
@@ -17,11 +17,9 @@ import (
// and as such you should never rely on the returned results being
// the same
func rankResults(corpusCount int, results []*fileJob) []*fileJob {
- documentFrequencies := calculateDocumentFrequency(results)
-
- results = rankResultsTFIDF(corpusCount, results, documentFrequencies) // needs to come first because it resets the scores
- results = rankResultsPhrase(results, documentFrequencies)
+ results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results)) // needs to come first because it resets the scores
results = rankResultsLocation(results)
+ // TODO maybe need to add something here to reward phrases
sortResults(results)
return results
}
@@ -30,6 +28,7 @@ func rankResults(corpusCount int, results []*fileJob) []*fileJob {
// should be boosted by
const (
LocationBoostValue = 0.05
+ DefaultScoreValue = 0.01
PhraseBoostValue = 1.00
)

@@ -50,7 +49,6 @@ func rankResultsPhrase(results []*fileJob, documentFrequencies map[string]int) []*fileJob {
// weighted by how common that word is so that matches like 'a' impact the rank
// less than something like 'cromulent' which in theory should not occur as much
if rv3[j].Start-rv3[j-1].End < 5 {
- // Set to 1 which seems to produce reasonable results by only boosting a little per term
results[i].Score += PhraseBoostValue / float64(documentFrequencies[rv3[j].Word])
}
}
@@ -67,17 +65,17 @@ func rankResultsLocation(results []*fileJob) []*fileJob {
for i := 0; i < len(results); i++ {
foundTerms := 0
for key := range results[i].MatchLocations {
- locs := str.IndexAllIgnoreCaseUnicode(results[i].Location, key, -1)
+ l := str.IndexAllIgnoreCaseUnicode(results[i].Location, key, -1)

// Boost the rank slightly based on number of matches and on
// how long a match it is as we should reward longer matches
- if len(locs) != 0 {
+ if len(l) != 0 {
foundTerms++

// If the rank is ever 0 than nothing will change, so set it
// to a small value to at least introduce some ranking here
if results[i].Score == 0 || math.IsNaN(results[i].Score) {
- results[i].Score = 0.1
+ results[i].Score = DefaultScoreValue
}

// Set the score to be itself * 1.something where something
@@ -90,13 +88,13 @@ func rankResultsLocation(results []*fileJob) []*fileJob {
// Of course this assumes that they have the text test in the
// content otherwise the match is discarded
results[i].Score = results[i].Score * (1.0 +
- (LocationBoostValue * float64(len(locs)) * float64(len(key))))
+ (LocationBoostValue * float64(len(l)) * float64(len(key))))

// If the location is closer to the start boost or rather don't
// affect negatively as much because we reduce the score slightly based on
// how far away from the start it is
low := math.MaxInt32
- for _, l := range locs {
+ for _, l := range l {
if l[0] < low {
low = l[0]
}
@@ -126,23 +124,22 @@
// https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot
// due to how often it is called by things like the TUI mode
func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
- // Get the number of docs with each word in it, which is just the number of results because we are AND only
- // and as such each document must contain all the words although they may have different counts
var weight float64
for i := 0; i < len(results); i++ {
weight = 0

// We don't know how many words are actually in this document... and I don't want to check
// because its going to slow things down. Keep in mind that this works inside the words themselves
// I.E. partial matches are the norm so it makes sense to base it on the number of bytes
// where we assume about 50 "words" per 1000 bytes of text.
// Also ensure that it is at least 1 to avoid divide by zero errors later on.
- words := float64(maxInt(1, results[i].Bytes/20))
+ words := float64(maxInt(1, results[i].Bytes/2))

- for key, value := range results[i].MatchLocations {
+ // word in the case is the word we are dealing with IE what the user actually searched for
+ // and wordCount is the locations of those words allowing us to know the number of words matching
+ for word, wordCount := range results[i].MatchLocations {
// Technically the IDF for this is wrong because we only
// have the count for the matches of the document not all the terms
- // that are actually required I.E.
+ // that are actually required
// its likely that a search for "a b" is missing the counts
// for documents that have a but not b and as such
// the document frequencies are off with respect to the total
@@ -157,10 +154,10 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
// TF = number of this words in this document / words in entire document
// IDF = number of documents that contain this word

- tf := float64(len(value)) / words
- idf := float64(corpusCount) / float64(documentFrequencies[key])
+ tf := float64(len(wordCount)) / words
+ idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word]))

- weight += tf * math.Log2(idf)
+ weight += tf * idf
}

// Override the score here because we don't want whatever we got originally
@@ -172,10 +169,10 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
return results
}

- // Calculate the document frequency for all words across all documents
- // that we have to get the term frequency for each allowing us to determine
- // how rare or common a word is across the corpus
- func calculateDocumentFrequency(results []*fileJob) map[string]int {
+ // Calculate the document term frequency for all words across all documents
+ // letting us know how many times a term appears across the corpus
+ // This is mostly used for snippet extraction
+ func calculateDocumentTermFrequency(results []*fileJob) map[string]int {
documentFrequencies := map[string]int{}
for i := 0; i < len(results); i++ {
for k := range results[i].MatchLocations {
@@ -186,6 +183,20 @@ func calculateDocumentFrequency(results []*fileJob) map[string]int {
return documentFrequencies
}

+ // Calculate the document frequency for all words across all documents
+ // allowing us to know the number of documents for which a term appears
+ // This is mostly used for TF-IDF calculation
+ func calculateDocumentFrequency(results []*fileJob) map[string]int {
+ documentFrequencies := map[string]int{}
+ for i := 0; i < len(results); i++ {
+ for k := range results[i].MatchLocations {
+ documentFrequencies[k] = documentFrequencies[k] + 1
+ }
+ }
+
+ return documentFrequencies
+ }

// Sort a slice of filejob results based on their score for displaying
// and then sort based on location to stop any undeterministic ordering happening
// as since the location includes the filename we should never have two matches
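The revised weighting in one place: TF is the match count over an estimated word count (Bytes/2, floored at 1), and IDF is Log10(corpusCount / documentFrequency), which can no longer go negative because a term appears in at most corpusCount documents. A standalone sketch under simplified types, not the package's internal signature:

```go
package main

import (
	"fmt"
	"math"
)

// scoreTFIDF mirrors the revised rankResultsTFIDF weighting for one document.
func scoreTFIDF(corpusCount int, docBytes int, matchCounts, docFrequencies map[string]int) float64 {
	// Estimate words from bytes and floor at 1 to avoid dividing by zero.
	words := float64(docBytes / 2)
	if words < 1 {
		words = 1
	}

	weight := 0.0
	for term, count := range matchCounts {
		tf := float64(count) / words
		// docFrequencies[term] <= corpusCount, so the ratio is >= 1 and Log10 >= 0.
		idf := math.Log10(float64(corpusCount) / float64(docFrequencies[term]))
		weight += tf * idf
	}
	return weight
}

func main() {
	// Two documents; "example" matches three times in a 12 byte document and
	// appears in only one of the two documents.
	score := scoreTFIDF(2, 12, map[string]int{"example": 3}, map[string]int{"example": 1})
	fmt.Printf("%.4f\n", score) // 0.1505
}
```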
71 changes: 36 additions & 35 deletions processor/result_ranker_test.go
@@ -3,40 +3,41 @@

package processor

import "testing"

- //func TestRankResultsTFIDF(t *testing.T) {
- // ml1 := map[string][][]int{}
- // ml1["this"] = [][]int{{1}}
- // ml1["is"] = [][]int{{1}}
- // ml1["a"] = [][]int{{1}, {2}}
- // ml1["sample"] = [][]int{{1}}
- //
- // ml2 := map[string][][]int{}
- // ml2["this"] = [][]int{{1}}
- // ml2["is"] = [][]int{{1}}
- // ml2["another"] = [][]int{{1}, {2}}
- // ml2["example"] = [][]int{{1}, {2}, {3}}
- //
- // s := []*fileJob{
- // {
- // MatchLocations: ml1,
- // Location: "/test/other.go",
- // Bytes: 12,
- // },
- // {
- // MatchLocations: ml2,
- // Location: "/test/test.go",
- // Bytes: 12,
- // },
- // }
- //
- // s = rankResultsTFIDF(2, s, calculateDocumentFrequency(s))
- //
- // if s[0].Score > s[1].Score {
- // t.Error("index 0 should have lower score than 1")
- // }
- //}
+ import (
+ "testing"
+ )

+ // This is based roughly the example provided by https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+ // with the output for it compared to ensure the results are fairly similar
+ func TestRankResultsTFIDF(t *testing.T) {
+ ml1 := map[string][][]int{}
+
+ ml2 := map[string][][]int{}
+ ml2["example"] = [][]int{{1}, {2}, {3}}
+
+ s := []*fileJob{
+ {
+ MatchLocations: ml1,
+ Location: "/test/other.go",
+ Bytes: 12,
+ },
+ {
+ MatchLocations: ml2,
+ Location: "/test/test.go",
+ Bytes: 12,
+ },
+ }
+
+ s = rankResultsTFIDF(2, s, calculateDocumentFrequency(s))
+
+ if s[0].Score > s[1].Score {
+ t.Error("index 0 should have lower score than 1")
+ }
+
+ if s[1].Score < 0.13 || s[1].Score > 0.16 {
+ t.Error("score should be in this range")
+ }
+ }

func TestRankResultsLocation(t *testing.T) {
ml := map[string][][]int{}
@@ -73,7 +74,7 @@ func TestCalculateDocumentFrequency(t *testing.T) {
},
}

- freq := calculateDocumentFrequency(s)
+ freq := calculateDocumentTermFrequency(s)

if len(freq) != 1 || freq["test"] != 6 {
t.Error("did not work as expected")
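For the record, the 0.13 to 0.16 band asserted above falls straight out of the new formula: Bytes of 12 gives a word estimate of maxInt(1, 12/2) = 6, "example" matches 3 times so TF = 3/6 = 0.5, and it appears in 1 of the 2 documents so IDF = log10(2/1) ≈ 0.301. The score is therefore 0.5 * 0.301 ≈ 0.1505, comfortably inside the range; the sketch after the result_ranker.go diff prints exactly this value.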
37 changes: 31 additions & 6 deletions processor/snippet_test.go
@@ -104,7 +104,7 @@ func TestExtractRelevantV3PaintedShip(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(rhymeOfTheAncient, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `Day after day, day after day,
@@ -132,7 +132,7 @@ func TestExtractRelevantV3WaterWaterEverywhere(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(rhymeOfTheAncient, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `Water, water, every where,
@@ -158,7 +158,7 @@ func TestExtractRelevantV3GroanedDead(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(rhymeOfTheAncient, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `They groaned, they stirred, they all uprose,
@@ -184,7 +184,7 @@ func TestExtractRelevantV3DeathFires(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(rhymeOfTheAncient, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `About, about, in reel and rout
@@ -210,7 +210,7 @@ func TestExtractRelevantV3PoorNerves(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(prideAndPrejudice, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `You take delight in vexing me. You have no compassion for my poor
@@ -236,11 +236,36 @@ func TestExtractRelevantV3TenThousandAYear(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(prideAndPrejudice, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `of his having
ten thousand a year. The gentlemen pronounced him to be a fine`) {
t.Error("expected to have snippet")
}
}

+ func TestExtractRelevantV3StrangerParents(t *testing.T) {
+ terms := []string{
+ "stranger",
+ "parents",
+ }
+
+ res := &fileJob{
+ Content: []byte(prideAndPrejudice),
+ MatchLocations: map[string][][]int{},
+ }
+
+ for _, t := range terms {
+ res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(prideAndPrejudice, t, -1)
+ }
+
+ df := calculateDocumentTermFrequency([]*fileJob{res})
+ snippets := extractRelevantV3(res, df, 300, "")
+
+ if !strings.Contains(snippets[0].Content, `An unhappy alternative is before you, Elizabeth. From this day
+ you must be a stranger to one of your parents. Your mother will
+ never see you again if you`) {
+ t.Error("expected to have snippet")
+ }
+ }
4 changes: 2 additions & 2 deletions processor/tui.go
@@ -181,10 +181,10 @@ func drawResults(app *tview.Application, results []*fileJob, textView *tview.Tex
var resultText string
resultText += fmt.Sprintf("%d results(s) for '%s' from %d files %s\n\n", len(results), searchTerm, fileCount, inProgress)

- documentFrequency := calculateDocumentFrequency(results)
+ documentTermFrequency := calculateDocumentTermFrequency(results)
for i, res := range pResults {
// NB this just gets the first snippet which should in theory be the most relevant
- v3 := extractRelevantV3(res, documentFrequency, int(SnippetLength), "…")[0]
+ v3 := extractRelevantV3(res, documentTermFrequency, int(SnippetLength), "…")[0]

resultText += fmt.Sprintf("[purple]%d. %s (%.3f)", i+1, res.Location, res.Score) + "[white]\n\n"

4 changes: 2 additions & 2 deletions processor/worker_summarize.go
@@ -59,7 +59,7 @@ func (f *ResultSummarizer) Start() {
func (f *ResultSummarizer) formatJson(results []*fileJob) {
jsonResults := []jsonResult{}

- documentFrequency := calculateDocumentFrequency(results)
+ documentFrequency := calculateDocumentTermFrequency(results)

for _, res := range results {
v3 := extractRelevantV3(res, documentFrequency, int(SnippetLength), "…")[0]
@@ -100,7 +100,7 @@ func (f *ResultSummarizer) formatDefault(results []*fileJob) {
fmtEnd = ""
}

- documentFrequency := calculateDocumentFrequency(results)
+ documentFrequency := calculateDocumentTermFrequency(results)

for _, res := range results {
color.Magenta(fmt.Sprintf("%s (%.3f)", res.Location, res.Score))