From 26f8936eb75b9e0d51ad528480e7ebdd336b01f4 Mon Sep 17 00:00:00 2001
From: Ben Boyter
Date: Thu, 11 Jun 2020 09:19:33 +1000
Subject: [PATCH] update ranker

---
 main.go                    | 6 ++++++
 processor/arguments.go     | 3 +++
 processor/result_ranker.go | 7 ++++++-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/main.go b/main.go
index dd84117..dc6103f 100644
--- a/main.go
+++ b/main.go
@@ -167,6 +167,12 @@ func main() {
 		"text",
 		"set output format [text, json]",
 	)
+	flags.StringVar(
+		&processor.Ranker,
+		"ranker",
+		"tfidfl",
+		"set ranking algorithm [tfidf, tfidfl]",
+	)
 
 	// the below flags we want but are not enabled as yet
 
diff --git a/processor/arguments.go b/processor/arguments.go
index f12652a..1868786 100644
--- a/processor/arguments.go
+++ b/processor/arguments.go
@@ -41,6 +41,9 @@ var IncludeBinaryFiles = false
 // Format sets the output format of the formatter
 var Format = ""
 
+// Ranker sets which ranking algorithm to use
+var Ranker = ""
+
 // FileOutput sets the file that output should be written to
 var FileOutput = ""
 
diff --git a/processor/result_ranker.go b/processor/result_ranker.go
index 6f2e5b4..e674fb2 100644
--- a/processor/result_ranker.go
+++ b/processor/result_ranker.go
@@ -157,7 +157,12 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies m
 			tf := float64(len(wordCount)) / words
 			idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word]))
 
-			weight += tf * idf
+			// TODO adding math.Sqrt around tf can improve results https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/
+			if Ranker == "tfidfl" {
+				weight += math.Sqrt(tf) * idf * (1/math.Sqrt(words))
+			} else {
+				weight += tf * idf
+			}
 		}
 
 		// Override the score here because we don't want whatever we got originally