Skip to content

Commit

Permalink
various updates
Browse files Browse the repository at this point in the history
  • Loading branch information
boyter committed Jun 23, 2020
1 parent 0d89f04 commit abc49e4
Show file tree
Hide file tree
Showing 17 changed files with 320 additions and 189 deletions.
332 changes: 231 additions & 101 deletions asset/tui/main.go

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ func main() {
&processor.Ranker,
"ranker",
"bm25",
"set ranking algorithm [wc, tfidf, tfidf2, bm25]",
"set ranking algorithm [simple, tfidf, tfidf2, bm25]",
)
flags.StringVarP(
&processor.FileOutput,
Expand Down
6 changes: 3 additions & 3 deletions processor/fuzz.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@ func Fuzz(data []byte) int {
freq := map[string]int{}
freq[find[:2]] = 5

res := &fileJob{
res := &FileJob{
Content: data,
MatchLocations: loc,
}

extractRelevantV3(res, freq, 300, "...")

findSpaceRight(&fileJob{Content: data}, 0, 10000)
findSpaceLeft(&fileJob{Content: data}, len(data)-1, 10000)
findSpaceRight(&FileJob{Content: data}, 0, 10000)
findSpaceLeft(&FileJob{Content: data}, len(data)-1, 10000)

return 1
}
8 changes: 4 additions & 4 deletions processor/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ func StartHttpServer() {
page := tryParseInt(r.URL.Query().Get("p"), 0)
pageSize := 20

var results []*fileJob
var results []*FileJob
var fileCount int64

log.Info().
Expand All @@ -136,8 +136,8 @@ func StartHttpServer() {
}

fileQueue := make(chan *file.File, 1000) // Files ready to be read from disk NB we buffer here because http runs till finished or the process is cancelled
toProcessQueue := make(chan *fileJob, runtime.NumCPU()) // Files to be read into memory for processing
summaryQueue := make(chan *fileJob, runtime.NumCPU()) // Files that match and need to be displayed
toProcessQueue := make(chan *FileJob, runtime.NumCPU()) // Files to be read into memory for processing
summaryQueue := make(chan *FileJob, runtime.NumCPU()) // Files that match and need to be displayed

fileWalker := file.NewFileWalker(directory, fileQueue)
fileWalker.PathExclude = PathDenylist
Expand Down Expand Up @@ -314,7 +314,7 @@ func calculateExtensionFacet(extensionFacets map[string]int, query string, snipp
return ef
}

func calculatePages(results []*fileJob, pageSize int, query string, snippetLength int) []pageResult {
func calculatePages(results []*FileJob, pageSize int, query string, snippetLength int) []pageResult {
var pages []pageResult

if len(results) == 0 {
Expand Down
20 changes: 10 additions & 10 deletions processor/http_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@ import (
)

func TestCalculatePagesNone(t *testing.T) {
var pages = calculatePages([]*fileJob{}, 20, "", 100)
var pages = calculatePages([]*FileJob{}, 20, "", 100)

if len(pages) != 0 {
t.Error("expected no result")
}
}

func TestCalculatePagesSingle(t *testing.T) {
var pages = calculatePages([]*fileJob{
var pages = calculatePages([]*FileJob{
{},
}, 20, "", 100)

Expand All @@ -33,9 +33,9 @@ func TestCalculatePagesSingle(t *testing.T) {
}

func TestCalculatePagesEdgeStart(t *testing.T) {
var fj []*fileJob
var fj []*FileJob
for i := 0; i < 20; i++ {
fj = append(fj, &fileJob{})
fj = append(fj, &FileJob{})
}

var pages = calculatePages(fj, 20, "", 100)
Expand All @@ -46,9 +46,9 @@ func TestCalculatePagesEdgeStart(t *testing.T) {
}

func TestCalculatePagesEdgeOver(t *testing.T) {
var fj []*fileJob
var fj []*FileJob
for i := 0; i < 21; i++ {
fj = append(fj, &fileJob{})
fj = append(fj, &FileJob{})
}

var pages = calculatePages(fj, 20, "", 100)
Expand All @@ -59,9 +59,9 @@ func TestCalculatePagesEdgeOver(t *testing.T) {
}

func TestCalculatePagesSecondPageEdge(t *testing.T) {
var fj []*fileJob
var fj []*FileJob
for i := 0; i < 40; i++ {
fj = append(fj, &fileJob{})
fj = append(fj, &FileJob{})
}

var pages = calculatePages(fj, 20, "", 100)
Expand All @@ -72,9 +72,9 @@ func TestCalculatePagesSecondPageEdge(t *testing.T) {
}

func TestCalculatePagesSecondPageEdgeOver(t *testing.T) {
var fj []*fileJob
var fj []*FileJob
for i := 0; i < 41; i++ {
fj = append(fj, &fileJob{})
fj = append(fj, &FileJob{})
}

var pages = calculatePages(fj, 20, "", 100)
Expand Down
4 changes: 2 additions & 2 deletions processor/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ func (process *Process) StartProcess() {
}

fileQueue := make(chan *file.File, 1000) // Files ready to be read from disk NB we buffer here because CLI runs till finished or the process is cancelled
toProcessQueue := make(chan *fileJob, runtime.NumCPU()) // Files to be read into memory for processing
summaryQueue := make(chan *fileJob, runtime.NumCPU()) // Files that match and need to be displayed
toProcessQueue := make(chan *FileJob, runtime.NumCPU()) // Files to be read into memory for processing
summaryQueue := make(chan *FileJob, runtime.NumCPU()) // Files that match and need to be displayed

fileWalker := file.NewFileWalker(process.Directory, fileQueue)
fileWalker.PathExclude = PathDenylist
Expand Down
24 changes: 13 additions & 11 deletions processor/result_ranker.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,22 @@ import (
// Note that this method will evolve over time
// and as such you should never rely on the returned results being
// the same
func rankResults(corpusCount int, results []*fileJob) []*fileJob {
func rankResults(corpusCount int, results []*FileJob) []*FileJob {
// needs to come first because it resets the scores
switch Ranker {
case "wc":
results = results
case "simple":
// in this case the results are already ranked by the number of matches
case "bm25":
results = rankResultsBM25(corpusCount, results, calculateDocumentFrequency(results))
results = rankResultsLocation(results)
case "tfidf2":
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), false)
results = rankResultsLocation(results)
default:
results = rankResultsTFIDF(corpusCount, results, calculateDocumentFrequency(results), true)
results = rankResultsLocation(results)
}

results = rankResultsLocation(results)
// TODO maybe need to add something here to reward phrases
sortResults(results)
return results
Expand All @@ -47,7 +49,7 @@ const (
// heavy. This is fairly similar to how the snippet extraction works but with less work because it does
// not need to deal with cutting between unicode endpoints
// NB this is one of the more expensive parts of the ranking
func rankResultsPhrase(results []*fileJob, documentFrequencies map[string]int) []*fileJob {
func rankResultsPhrase(results []*FileJob, documentFrequencies map[string]int) []*FileJob {
for i := 0; i < len(results); i++ {
rv3 := convertToRelevant(results[i])

Expand All @@ -72,7 +74,7 @@ func rankResultsPhrase(results []*fileJob, documentFrequencies map[string]int) [
// file location field.
// This is not using TF-IDF or any fancy algorithm just basic checks
// and boosts
func rankResultsLocation(results []*fileJob) []*fileJob {
func rankResultsLocation(results []*FileJob) []*FileJob {
for i := 0; i < len(results); i++ {
foundTerms := 0
for key := range results[i].MatchLocations {
Expand Down Expand Up @@ -134,7 +136,7 @@ func rankResultsLocation(results []*fileJob) []*fileJob {
// NB loops in here use increment to avoid duffcopy
// https://stackoverflow.com/questions/45786687/runtime-duffcopy-is-called-a-lot
// due to how often it is called by things like the TUI mode
func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies map[string]int, classic bool) []*fileJob {
func rankResultsTFIDF(corpusCount int, results []*FileJob, documentFrequencies map[string]int, classic bool) []*FileJob {
var weight float64
for i := 0; i < len(results); i++ {
weight = 0
Expand Down Expand Up @@ -198,7 +200,7 @@ func rankResultsTFIDF(corpusCount int, results []*fileJob, documentFrequencies m
// IDF * TF * (k1 + 1)
// BM25 = sum ----------------------------
// TF + k1 * (1 - b + b * D / L)
func rankResultsBM25(corpusCount int, results []*fileJob, documentFrequencies map[string]int) []*fileJob {
func rankResultsBM25(corpusCount int, results []*FileJob, documentFrequencies map[string]int) []*FileJob {
var weight float64

// Get the average number of words across all documents because we need that in BM25 to calculate correctly
Expand Down Expand Up @@ -247,7 +249,7 @@ func rankResultsBM25(corpusCount int, results []*fileJob, documentFrequencies ma
// Calculate the document term frequency for all words across all documents
// letting us know how many times a term appears across the corpus
// This is mostly used for snippet extraction
func calculateDocumentTermFrequency(results []*fileJob) map[string]int {
func calculateDocumentTermFrequency(results []*FileJob) map[string]int {
documentFrequencies := map[string]int{}
for i := 0; i < len(results); i++ {
for k := range results[i].MatchLocations {
Expand All @@ -261,7 +263,7 @@ func calculateDocumentTermFrequency(results []*fileJob) map[string]int {
// Calculate the document frequency for all words across all documents
// allowing us to know the number of documents for which a term appears
// This is mostly used for TF-IDF calculation
func calculateDocumentFrequency(results []*fileJob) map[string]int {
func calculateDocumentFrequency(results []*FileJob) map[string]int {
documentFrequencies := map[string]int{}
for i := 0; i < len(results); i++ {
for k := range results[i].MatchLocations {
Expand All @@ -276,7 +278,7 @@ func calculateDocumentFrequency(results []*fileJob) map[string]int {
// and then sort based on location to stop any nondeterministic ordering happening
// since the location includes the filename we should never have two matches
// that are 100% equal based on the two criteria we use.
func sortResults(results []*fileJob) {
func sortResults(results []*FileJob) {
sort.Slice(results, func(i, j int) bool {
if results[i].Score == results[j].Score {
return strings.Compare(results[i].Location, results[j].Location) < 0
Expand Down
14 changes: 7 additions & 7 deletions processor/result_ranker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ func TestRankResultsTFIDFTraditional(t *testing.T) {
ml2 := map[string][][]int{}
ml2["example"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml1,
Location: "/test/other.go",
Expand Down Expand Up @@ -42,7 +42,7 @@ func TestRankResultsTFIDFComparison(t *testing.T) {
ml1 := map[string][][]int{}
ml1["example"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml1,
Location: "/test/other.go",
Expand All @@ -65,7 +65,7 @@ func TestRankResultsRankerComparison(t *testing.T) {
ml1 := map[string][][]int{}
ml1["example"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml1,
Location: "/test/other.go",
Expand All @@ -91,7 +91,7 @@ func TestRankResultsLocation(t *testing.T) {
ml := map[string][][]int{}
ml["test"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml,
Location: "/test/other.go",
Expand All @@ -113,7 +113,7 @@ func TestCalculateDocumentFrequency(t *testing.T) {
ml := map[string][][]int{}
ml["test"] = [][]int{{1}, {2}, {3}}

s := []*fileJob{
s := []*FileJob{
{
MatchLocations: ml,
},
Expand All @@ -130,7 +130,7 @@ func TestCalculateDocumentFrequency(t *testing.T) {
}

func TestSortResults(t *testing.T) {
s := []*fileJob{
s := []*FileJob{
{
Filename: "1",
Location: "",
Expand All @@ -150,7 +150,7 @@ func TestSortResults(t *testing.T) {
}

func TestSortResultsEqualScore(t *testing.T) {
s := []*fileJob{
s := []*FileJob{
{
Filename: "1",
Location: "2",
Expand Down
8 changes: 4 additions & 4 deletions processor/snippet.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ type Snippet struct {
// to differ between people. Heck a few times I have been disappointed with results that I was previously happy with.
// As such this is not tested as much as other methods and you should not rely on the results being static over time
// as the internals will be modified to produce better results where possible
func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLength int, indicator string) []Snippet {
func extractRelevantV3(res *FileJob, documentFrequencies map[string]int, relLength int, indicator string) []Snippet {
wrapLength := relLength / 2
var bestMatches []bestMatch

Expand Down Expand Up @@ -275,7 +275,7 @@ func extractRelevantV3(res *fileJob, documentFrequencies map[string]int, relLeng

// Get all of the locations into a new data structure
// which makes things easy to sort and deal with
func convertToRelevant(res *fileJob) []relevantV3 {
func convertToRelevant(res *FileJob) []relevantV3 {
var rv3 []relevantV3

for k, v := range res.MatchLocations {
Expand All @@ -299,7 +299,7 @@ func convertToRelevant(res *fileJob) []relevantV3 {
// Looks for a nearby whitespace character near this position (`pos`)
// up to `distance` away. Returns index of space if a space was found and
// true, otherwise returns the original index and false
func findSpaceRight(res *fileJob, pos int, distance int) (int, bool) {
func findSpaceRight(res *FileJob, pos int, distance int) (int, bool) {
if len(res.Content) == 0 {
return pos, false
}
Expand All @@ -322,7 +322,7 @@ func findSpaceRight(res *fileJob, pos int, distance int) (int, bool) {
// Looks for nearby whitespace character near this position
// up to distance away. Returns index of space if a space was found and true,
// otherwise the original index is returned and false
func findSpaceLeft(res *fileJob, pos int, distance int) (int, bool) {
func findSpaceLeft(res *FileJob, pos int, distance int) (int, bool) {
if len(res.Content) == 0 {
return pos, false
}
Expand Down
Loading

0 comments on commit abc49e4

Please sign in to comment.