From a5db98663157624be94e2064edb7a57a26e003ea Mon Sep 17 00:00:00 2001 From: Ben Boyter Date: Thu, 11 Jun 2020 11:23:53 +1000 Subject: [PATCH] add in bm25 ranking --- main.go | 4 +-- processor/result_ranker.go | 70 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/main.go b/main.go index dc6103f..286ddaa 100644 --- a/main.go +++ b/main.go @@ -170,8 +170,8 @@ func main() { flags.StringVar( &processor.Ranker, "ranker", - "tfidfl", - "set ranking algorithm [tfidf, tfidfl]", + "tfidf", + "set ranking algorithm [tfidf, tfidfl, bm25]", ) // the below flags we want but are not enabled as yet diff --git a/processor/result_ranker.go b/processor/result_ranker.go index e674fb2..0134d4c 100644 --- a/processor/result_ranker.go +++ b/processor/result_ranker.go @@ -17,7 +17,11 @@ import ( // and as such you should never rely on the returned results being // the same func rankResults(corpusCount int, results []*fileJob) []*fileJob { - results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results)) // needs to come first because it resets the scores + if Ranker == "bm25" { + results = rankResultsBM25(corpusCount, results, calculateDocumentFrequency(results)) + } else { + results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results)) // needs to come first because it resets the scores + } results = rankResultsLocation(results) // TODO maybe need to add something here to reward phrases sortResults(results) @@ -30,6 +34,7 @@ const ( LocationBoostValue = 0.05 DefaultScoreValue = 0.01 PhraseBoostValue = 1.00 + BytesWordDivisor = 2 ) // Given the results boost based on how close the phrases are to each other IE make it slightly phrase @@ -132,7 +137,7 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies m // because its going to slow things down. Keep in mind that this works inside the words themselves // I.E. partial matches are the norm so it makes sense to base it on the number of bytes // Also ensure that it is at least 1 to avoid divide by zero errors later on. - words := float64(maxInt(1, results[i].Bytes/2)) + words := float64(maxInt(1, results[i].Bytes/BytesWordDivisor)) // word in the case is the word we are dealing with IE what the user actually searched for // and wordCount is the locations of those words allowing us to know the number of words matching @@ -157,8 +162,8 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies m tf := float64(len(wordCount)) / words idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word])) - // TODO adding math.Sqrt around tf can improve results https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/ if Ranker == "tfidfl" { + // Lucene modification to improve results https://opensourceconnections.com/blog/2015/10/16/bm25-the-next-generation-of-lucene-relevation/ weight += math.Sqrt(tf) * idf * (1/math.Sqrt(words)) } else { weight += tf * idf @@ -174,6 +179,65 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies m return results } +// BM25 implementation which ranks the results +// Technically this is not a real BM25 because we don't +// have counts of terms for documents that don't match +// so the IDF value is not correctly calculated +// https://en.wikipedia.org/wiki/Okapi_BM25 +// +// NB loops in here use increment to avoid duffcopy +// https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot +// due to how often it is called by things like the TUI mode +// +// IDF * TF * (k1 + 1) +// BM25 = sum ---------------------------- +// TF + k1 * (1 - b + b * D / L) +func rankResultsBM25(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob { + var weight float64 + + // Get the average number of words across all documents because we need that in BM25 to calculate correctly + var averageDocumentWords float64 + for i := 0; i < len(results); i++ { + averageDocumentWords += float64(maxInt(1, results[i].Bytes/BytesWordDivisor)) + } + averageDocumentWords = averageDocumentWords/float64(len(results)) + + k1 := 1.2 + b := 0.75 + + for i := 0; i < len(results); i++ { + weight = 0 + + // We don't know how many words are actually in this document... and I don't want to check + // because its going to slow things down. Keep in mind that this works inside the words themselves + // I.E. partial matches are the norm so it makes sense to base it on the number of bytes + // Also ensure that it is at least 1 to avoid divide by zero errors later on. + words := float64(maxInt(1, results[i].Bytes/BytesWordDivisor)) + + // word in the case is the word we are dealing with IE what the user actually searched for + // and wordCount is the locations of those words allowing us to know the number of words matching + for word, wordCount := range results[i].MatchLocations { + + // TF = number of this words in this document / words in entire document + // IDF = number of documents that contain this word + tf := float64(len(wordCount)) / words + idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word])) + + step1 := idf * tf * (k1 + 1) + step2 := tf + k1 * (1 - b + (b * words / averageDocumentWords)) + + weight += step1 / step2 + } + + // Override the score here because we don't want whatever we got originally + // which is just based on the number of keyword matches... of course this assumes + // that + results[i].Score = weight + } + + return results +} + // Calculate the document term frequency for all words across all documents // letting us know how many times a term appears across the corpus // This is mostly used for snippet extraction