Skip to content

Commit

Permalink
modify ranking switch to be better
Browse files Browse the repository at this point in the history
  • Loading branch information
boyter committed Jun 11, 2020
1 parent b85edab commit c0fc7fc
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 9 deletions.
19 changes: 12 additions & 7 deletions processor/result_ranker.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@ import (
// and as such you should never rely on the returned results being
// the same
func rankResults(corpusCount int, results []*fileJob) []*fileJob {
if Ranker == "bm25" {
// needs to come first because it resets the scores
switch Ranker {
case "bm25":
results = rankResultsBM25(corpusCount, results, calculateDocumentFrequency(results))
} else {
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results)) // needs to come first because it resets the scores
case "tfidfl":
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), false)
default:
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), true)
}

results = rankResultsLocation(results)
// TODO maybe need to add something here to reward phrases
sortResults(results)
Expand Down Expand Up @@ -128,7 +133,7 @@ func rankResultsLocation(results []*fileJob) []*fileJob {
// NB loops in here use increment to avoid duffcopy
// https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot
// due to how often it is called by things like the TUI mode
func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int, classic bool) []*fileJob {
var weight float64
for i := 0; i < len(results); i++ {
weight = 0
Expand Down Expand Up @@ -162,11 +167,11 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies m
tf := float64(len(wordCount)) / words
idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word]))

if Ranker == "tfidfl" {
if classic {
weight += tf * idf
} else {
// Lucene modification to improve results https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
weight += math.Sqrt(tf) * idf * (1 / math.Sqrt(words))
} else {
weight += tf * idf
}
}

Expand Down
4 changes: 2 additions & 2 deletions processor/result_ranker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (

// This is based roughly on the example provided by https://en.wikipedia.org/wiki/Tf%E2%80%93idf
// with the output for it compared to ensure the results are fairly similar
func TestRankResultsTFIDF(t *testing.T) {
func TestRankResultsTFIDFTraditional(t *testing.T) {
ml1 := map[string][][]int{}

ml2 := map[string][][]int{}
Expand All @@ -28,7 +28,7 @@ func TestRankResultsTFIDF(t *testing.T) {
},
}

s = rankResultsTFIDF(2, s, calculateDocumentFrequency(s))
s = rankResultsTFIDF(2, s, calculateDocumentFrequency(s), true)

if s[0].Score > s[1].Score {
t.Error("index 0 should have lower score than 1")
Expand Down

0 comments on commit c0fc7fc

Please sign in to comment.