modify ranking switch to be better

boyter · Jun 11, 2020 · c0fc7fc · c0fc7fc
1 parent b85edab
commit c0fc7fc
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 9 deletions.
diff --git a/processor/result_ranker.go b/processor/result_ranker.go
@@ -17,11 +17,16 @@ import (
 // and as such you should never rely on the returned results being
 // the same
 func rankResults(corpusCount int, results []*fileJob) []*fileJob {
-	if Ranker == "bm25" {
+	// needs to come first because it resets the scores
+	switch Ranker {
+	case "bm25":
 		results = rankResultsBM25(corpusCount, results, calculateDocumentFrequency(results))
-	} else {
-		results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results)) // needs to come first because it resets the scores
+	case "tfidfl":
+		results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), false)
+	default:
+		results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), true)
 	}
+
 	results = rankResultsLocation(results)
 	// TODO maybe need to add something here to reward phrases
 	sortResults(results)
@@ -128,7 +133,7 @@ func rankResultsLocation(results []*fileJob) []*fileJob {
 // NB loops in here use increment to avoid duffcopy
 // https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot
 // due to how often it is called by things like the TUI mode
-func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
+func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int, classic bool) []*fileJob {
 	var weight float64
 	for i := 0; i < len(results); i++ {
 		weight = 0
@@ -162,11 +167,11 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies m
 			tf := float64(len(wordCount)) / words
 			idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word]))
 
-			if Ranker == "tfidfl" {
+			if classic {
+				weight += tf * idf
+			} else {
 				// Lucene modification to improve results https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
 				weight += math.Sqrt(tf) * idf * (1 / math.Sqrt(words))
-			} else {
-				weight += tf * idf
 			}
 		}
 

diff --git a/processor/result_ranker_test.go b/processor/result_ranker_test.go
@@ -9,7 +9,7 @@ import (
 
 // This is based roughly on the example provided by https://en.wikipedia.org/wiki/Tf%E2%80%93idf
 // with the output for it compared to ensure the results are fairly similar
-func TestRankResultsTFIDF(t *testing.T) {
+func TestRankResultsTFIDFTraditional(t *testing.T) {
 	ml1 := map[string][][]int{}
 
 	ml2 := map[string][][]int{}
@@ -28,7 +28,7 @@ func TestRankResultsTFIDF(t *testing.T) {
 		},
 	}
 
-	s = rankResultsTFIDF(2, s, calculateDocumentFrequency(s))
+	s = rankResultsTFIDF(2, s, calculateDocumentFrequency(s), true)
 
 	if s[0].Score > s[1].Score {
 		t.Error("index 0 should have lower score than 1")