Commit a01fa93

fix ranking algorithm

boyter committed Jun 10, 2020
1 parent 6eac844 commit a01fa93
Showing 7 changed files with 107 additions and 71 deletions.
1 change: 0 additions & 1 deletion README.md
@@ -15,7 +15,6 @@ cs t NOT something test~1 "ten thousand a year" "/pr[e-i]de/"

```
BUGS
- score from TF/IDF appears to be negative in some cases (overflow??)
need to clear pdf cache
searching for http://localhost:8080/?q=%22about+to+explore%22&ss=300 seems to highlight to for some reason, same in TUI mode
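A note on the bug line removed above: the negative scores were most likely not an overflow. As the result_ranker.go diff below shows, the old code divided corpusCount by a corpus-wide match count rather than a per-document count and took Log2 of the result. A minimal sketch of how that goes negative, with illustrative numbers that are not from the repository:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// Pre-commit weighting: idf = corpusCount / documentFrequencies[key],
	// where documentFrequencies actually held corpus-wide match counts.
	corpusCount := 2.0
	termMatches := 10.0 // a term matched ten times across two documents

	idf := corpusCount / termMatches // 0.2, which is below 1
	fmt.Println(math.Log2(idf))      // -2.3219..., so scores went negative
}
```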
4 changes: 2 additions & 2 deletions processor/http.go
@@ -196,7 +196,7 @@ func StartHttpServer() {
fmtBegin := hex.EncodeToString(md5Digest.Sum([]byte(fmt.Sprintf("begin_%d", makeTimestampNano()))))
fmtEnd := hex.EncodeToString(md5Digest.Sum([]byte(fmt.Sprintf("end_%d", makeTimestampNano()))))

- documentFrequency := calculateDocumentFrequency(results)
+ documentTermFrequency := calculateDocumentTermFrequency(results)

var searchResults []searchResult
extensionFacets := map[string]int{}
@@ -223,7 +223,7 @@ func StartHttpServer() {
}

for _, res := range displayResults {
- v3 := extractRelevantV3(res, documentFrequency, snippetLength, "…")[0]
+ v3 := extractRelevantV3(res, documentTermFrequency, snippetLength, "…")[0]

// We have the snippet so now we need to highlight it
// we get all the locations that fall in the snippet length
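The snippet extractor now receives corpus-wide term counts rather than per-document counts. A minimal standalone sketch of the difference between the two helpers that appear in the result_ranker.go diff below, using a simplified stand-in for the fileJob.MatchLocations field (the type and names here are illustrative, not the package's API):

```go
package main

import "fmt"

// matchLocations stands in for fileJob.MatchLocations: term -> match positions.
type matchLocations map[string][][]int

func main() {
	docs := []matchLocations{
		{"test": {{0, 4}, {10, 14}}}, // "test" matched twice in the first document
		{"test": {{3, 7}}},           // and once in the second
	}

	termFrequency := map[string]int{}     // every match across the corpus
	documentFrequency := map[string]int{} // documents containing the term
	for _, d := range docs {
		for term, locations := range d {
			termFrequency[term] += len(locations)
			documentFrequency[term]++
		}
	}

	fmt.Println(termFrequency["test"])     // 3, the count snippet extraction wants
	fmt.Println(documentFrequency["test"]) // 2, the count TF-IDF wants
}
```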
57 changes: 34 additions & 23 deletions processor/result_ranker.go
@@ -17,11 +17,9 @@ import (
// and as such you should never rely on the returned results being
// the same
func rankResults(corpusCount int, results []*fileJob) []*fileJob {
- documentFrequencies := calculateDocumentFrequency(results)
-
- results = rankResultsTFIDF(corpusCount, results, documentFrequencies) // needs to come first because it resets the scores
- results = rankResultsPhrase(results, documentFrequencies)
+ results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results)) // needs to come first because it resets the scores
results = rankResultsLocation(results)
+ // TODO maybe need to add something here to reward phrases
sortResults(results)
return results
}
@@ -30,6 +28,7 @@ func rankResults(corpusCount int, results []*fileJob) []*fileJob {
// should be boosted by
const (
LocationBoostValue = 0.05
+ DefaultScoreValue = 0.01
PhraseBoostValue = 1.00
)

@@ -50,7 +49,6 @@ func rankResultsPhrase(results []*fileJob, documentFrequencies map[string]int) []*fileJob {
// weighted by how common that word is so that matches like 'a' impact the rank
// less than something like 'cromulent' which in theory should not occur as much
if rv3[j].Start-rv3[j-1].End < 5 {
- // Set to 1 which seems to produce reasonable results by only boosting a little per term
results[i].Score += PhraseBoostValue / float64(documentFrequencies[rv3[j].Word])
}
}
@@ -67,17 +65,17 @@ func rankResultsLocation(results []*fileJob) []*fileJob {
for i := 0; i < len(results); i++ {
foundTerms := 0
for key := range results[i].MatchLocations {
- locs := str.IndexAllIgnoreCaseUnicode(results[i].Location, key, -1)
+ l := str.IndexAllIgnoreCaseUnicode(results[i].Location, key, -1)

// Boost the rank slightly based on number of matches and on
// how long a match it is as we should reward longer matches
- if len(locs) != 0 {
+ if len(l) != 0 {
foundTerms++

// If the rank is ever 0 than nothing will change, so set it
// to a small value to at least introduce some ranking here
if results[i].Score == 0 || math.IsNaN(results[i].Score) {
- results[i].Score = 0.1
+ results[i].Score = DefaultScoreValue
}

// Set the score to be itself * 1.something where something
@@ -90,13 +88,13 @@ func rankResultsLocation(results []*fileJob) []*fileJob {
// Of course this assumes that they have the text test in the
// content otherwise the match is discarded
results[i].Score = results[i].Score * (1.0 +
- (LocationBoostValue * float64(len(locs)) * float64(len(key))))
+ (LocationBoostValue * float64(len(l)) * float64(len(key))))

// If the location is closer to the start boost or rather don't
// affect negatively as much because we reduce the score slightly based on
// how far away from the start it is
low := math.MaxInt32
- for _, l := range locs {
+ for _, l := range l {
if l[0] < low {
low = l[0]
}
@@ -126,23 +124,22 @@
// https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot
// due to how often it is called by things like the TUI mode
func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
- // Get the number of docs with each word in it, which is just the number of results because we are AND only
- // and as such each document must contain all the words although they may have different counts
var weight float64
for i := 0; i < len(results); i++ {
weight = 0

// We don't know how many words are actually in this document... and I don't want to check
// because its going to slow things down. Keep in mind that this works inside the words themselves
// I.E. partial matches are the norm so it makes sense to base it on the number of bytes
// where we assume about 50 "words" per 1000 bytes of text.
// Also ensure that it is at least 1 to avoid divide by zero errors later on.
- words := float64(maxInt(1, results[i].Bytes/20))
+ words := float64(maxInt(1, results[i].Bytes/2))

- for key, value := range results[i].MatchLocations {
+ // word in the case is the word we are dealing with IE what the user actually searched for
+ // and wordCount is the locations of those words allowing us to know the number of words matching
+ for word, wordCount := range results[i].MatchLocations {
// Technically the IDF for this is wrong because we only
// have the count for the matches of the document not all the terms
- // that are actually required I.E.
+ // that are actually required
// its likely that a search for "a b" is missing the counts
// for documents that have a but not b and as such
// the document frequencies are off with respect to the total
@@ -157,10 +154,10 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
// TF = number of this words in this document / words in entire document
// IDF = number of documents that contain this word

- tf := float64(len(value)) / words
- idf := float64(corpusCount) / float64(documentFrequencies[key])
+ tf := float64(len(wordCount)) / words
+ idf := math.Log10(float64(corpusCount) / float64(documentFrequencies[word]))

- weight += tf * math.Log2(idf)
+ weight += tf * idf
}

// Override the score here because we don't want whatever we got originally
@@ -172,10 +169,10 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
return results
}

- // Calculate the document frequency for all words across all documents
- // that we have to get the term frequency for each allowing us to determine
- // how rare or common a word is across the corpus
- func calculateDocumentFrequency(results []*fileJob) map[string]int {
+ // Calculate the document term frequency for all words across all documents
+ // letting us know how many times a term appears across the corpus
+ // This is mostly used for snippet extraction
+ func calculateDocumentTermFrequency(results []*fileJob) map[string]int {
documentFrequencies := map[string]int{}
for i := 0; i < len(results); i++ {
for k := range results[i].MatchLocations {
@@ -186,6 +183,20 @@ func calculateDocumentFrequency(results []*fileJob) map[string]int {
return documentFrequencies
}

+ // Calculate the document frequency for all words across all documents
+ // allowing us to know the number of documents for which a term appears
+ // This is mostly used for TF-IDF calculation
+ func calculateDocumentFrequency(results []*fileJob) map[string]int {
+ documentFrequencies := map[string]int{}
+ for i := 0; i < len(results); i++ {
+ for k := range results[i].MatchLocations {
+ documentFrequencies[k] = documentFrequencies[k] + 1
+ }
+ }
+
+ return documentFrequencies
+ }

// Sort a slice of filejob results based on their score for displaying
// and then sort based on location to stop any undeterministic ordering happening
// as since the location includes the filename we should never have two matches
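The revised weighting in one place: TF is the match count over an estimated word count (Bytes/2, floored at 1), and IDF is Log10(corpusCount / documentFrequency), which can no longer go negative because a term appears in at most corpusCount documents. A standalone sketch under simplified types, not the package's internal signature:

```go
package main

import (
	"fmt"
	"math"
)

// scoreTFIDF mirrors the revised rankResultsTFIDF weighting for one document.
func scoreTFIDF(corpusCount int, docBytes int, matchCounts, docFrequencies map[string]int) float64 {
	// Estimate words from bytes and floor at 1 to avoid dividing by zero.
	words := float64(docBytes / 2)
	if words < 1 {
		words = 1
	}

	weight := 0.0
	for term, count := range matchCounts {
		tf := float64(count) / words
		// docFrequencies[term] <= corpusCount, so the ratio is >= 1 and Log10 >= 0.
		idf := math.Log10(float64(corpusCount) / float64(docFrequencies[term]))
		weight += tf * idf
	}
	return weight
}

func main() {
	// Two documents; "example" matches three times in a 12 byte document and
	// appears in only one of the two documents.
	score := scoreTFIDF(2, 12, map[string]int{"example": 3}, map[string]int{"example": 1})
	fmt.Printf("%.4f\n", score) // 0.1505
}
```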
71 changes: 36 additions & 35 deletions processor/result_ranker_test.go
@@ -3,40 +3,41 @@

package processor

import "testing"

- //func TestRankResultsTFIDF(t *testing.T) {
- // ml1 := map[string][][]int{}
- // ml1["this"] = [][]int{{1}}
- // ml1["is"] = [][]int{{1}}
- // ml1["a"] = [][]int{{1}, {2}}
- // ml1["sample"] = [][]int{{1}}
- //
- // ml2 := map[string][][]int{}
- // ml2["this"] = [][]int{{1}}
- // ml2["is"] = [][]int{{1}}
- // ml2["another"] = [][]int{{1}, {2}}
- // ml2["example"] = [][]int{{1}, {2}, {3}}
- //
- // s := []*fileJob{
- // {
- // MatchLocations: ml1,
- // Location: "/test/other.go",
- // Bytes: 12,
- // },
- // {
- // MatchLocations: ml2,
- // Location: "/test/test.go",
- // Bytes: 12,
- // },
- // }
- //
- // s = rankResultsTFIDF(2, s, calculateDocumentFrequency(s))
- //
- // if s[0].Score > s[1].Score {
- // t.Error("index 0 should have lower score than 1")
- // }
- //}
+ import (
+ "testing"
+ )

+ // This is based roughly the example provided by https://en.wikipedia.org/wiki/Tf%E2%80%93idf
+ // with the output for it compared to ensure the results are fairly similar
+ func TestRankResultsTFIDF(t *testing.T) {
+ ml1 := map[string][][]int{}
+
+ ml2 := map[string][][]int{}
+ ml2["example"] = [][]int{{1}, {2}, {3}}
+
+ s := []*fileJob{
+ {
+ MatchLocations: ml1,
+ Location: "/test/other.go",
+ Bytes: 12,
+ },
+ {
+ MatchLocations: ml2,
+ Location: "/test/test.go",
+ Bytes: 12,
+ },
+ }
+
+ s = rankResultsTFIDF(2, s, calculateDocumentFrequency(s))
+
+ if s[0].Score > s[1].Score {
+ t.Error("index 0 should have lower score than 1")
+ }
+
+ if s[1].Score < 0.13 || s[1].Score > 0.16 {
+ t.Error("score should be in this range")
+ }
+ }

func TestRankResultsLocation(t *testing.T) {
ml := map[string][][]int{}
@@ -73,7 +74,7 @@ func TestCalculateDocumentFrequency(t *testing.T) {
},
}

- freq := calculateDocumentFrequency(s)
+ freq := calculateDocumentTermFrequency(s)

if len(freq) != 1 || freq["test"] != 6 {
t.Error("did not work as expected")
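For the record, the 0.13 to 0.16 band asserted above falls straight out of the new formula: Bytes of 12 gives a word estimate of maxInt(1, 12/2) = 6, "example" matches 3 times so TF = 3/6 = 0.5, and it appears in 1 of the 2 documents so IDF = log10(2/1) ≈ 0.301. The score is therefore 0.5 * 0.301 ≈ 0.1505, comfortably inside the range; the sketch after the result_ranker.go diff prints exactly this value.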
37 changes: 31 additions & 6 deletions processor/snippet_test.go
@@ -104,7 +104,7 @@ func TestExtractRelevantV3PaintedShip(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(rhymeOfTheAncient, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `Day after day, day after day,
@@ -132,7 +132,7 @@ func TestExtractRelevantV3WaterWaterEverywhere(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(rhymeOfTheAncient, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `Water, water, every where,
@@ -158,7 +158,7 @@ func TestExtractRelevantV3GroanedDead(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(rhymeOfTheAncient, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `They groaned, they stirred, they all uprose,
@@ -184,7 +184,7 @@ func TestExtractRelevantV3DeathFires(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(rhymeOfTheAncient, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `About, about, in reel and rout
@@ -210,7 +210,7 @@ func TestExtractRelevantV3PoorNerves(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(prideAndPrejudice, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `You take delight in vexing me. You have no compassion for my poor
@@ -236,11 +236,36 @@ func TestExtractRelevantV3TenThousandAYear(t *testing.T) {
res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(prideAndPrejudice, t, -1)
}

- df := calculateDocumentFrequency([]*fileJob{res})
+ df := calculateDocumentTermFrequency([]*fileJob{res})
snippets := extractRelevantV3(res, df, 300, "")

if !strings.Contains(snippets[0].Content, `of his having
ten thousand a year. The gentlemen pronounced him to be a fine`) {
t.Error("expected to have snippet")
}
}

+ func TestExtractRelevantV3StrangerParents(t *testing.T) {
+ terms := []string{
+ "stranger",
+ "parents",
+ }
+
+ res := &fileJob{
+ Content: []byte(prideAndPrejudice),
+ MatchLocations: map[string][][]int{},
+ }
+
+ for _, t := range terms {
+ res.MatchLocations[t] = str.IndexAllIgnoreCaseUnicode(prideAndPrejudice, t, -1)
+ }
+
+ df := calculateDocumentTermFrequency([]*fileJob{res})
+ snippets := extractRelevantV3(res, df, 300, "")
+
+ if !strings.Contains(snippets[0].Content, `An unhappy alternative is before you, Elizabeth. From this day
+ you must be a stranger to one of your parents. Your mother will
+ never see you again if you`) {
+ t.Error("expected to have snippet")
+ }
+ }
4 changes: 2 additions & 2 deletions processor/tui.go
@@ -181,10 +181,10 @@ func drawResults(app *tview.Application, results []*fileJob, textView *tview.Tex
var resultText string
resultText += fmt.Sprintf("%d results(s) for '%s' from %d files %s\n\n", len(results), searchTerm, fileCount, inProgress)

- documentFrequency := calculateDocumentFrequency(results)
+ documentTermFrequency := calculateDocumentTermFrequency(results)
for i, res := range pResults {
// NB this just gets the first snippet which should in theory be the most relevant
- v3 := extractRelevantV3(res, documentFrequency, int(SnippetLength), "…")[0]
+ v3 := extractRelevantV3(res, documentTermFrequency, int(SnippetLength), "…")[0]

resultText += fmt.Sprintf("[purple]%d. %s (%.3f)", i+1, res.Location, res.Score) + "[white]\n\n"

4 changes: 2 additions & 2 deletions processor/worker_summarize.go
@@ -59,7 +59,7 @@ func (f *ResultSummarizer) Start() {
func (f *ResultSummarizer) formatJson(results []*fileJob) {
jsonResults := []jsonResult{}

- documentFrequency := calculateDocumentFrequency(results)
+ documentFrequency := calculateDocumentTermFrequency(results)

for _, res := range results {
v3 := extractRelevantV3(res, documentFrequency, int(SnippetLength), "…")[0]
@@ -100,7 +100,7 @@ func (f *ResultSummarizer) formatDefault(results []*fileJob) {
fmtEnd = ""
}

- documentFrequency := calculateDocumentFrequency(results)
+ documentFrequency := calculateDocumentTermFrequency(results)

for _, res := range results {
color.Magenta(fmt.Sprintf("%s (%.3f)", res.Location, res.Score))