Skip to content

Commit

Permalink
bleve v2.0.0 proposal PR (#1494)
Browse files Browse the repository at this point in the history
* refactor to remove circular deps

* reduce index and segment API surface area

* update Advanced() method (#1509)

* relocate store pkg inside upsidedown (#1510)

* remove Advanced() method from scorch (#1511)

* remove use of dump methods from index reader (#1516)

* remove remaining blevex imports (#1520)

* update analysis.TokenFrequency to use options (#1522)

* switch default index to scorch with latest zap (#1528)

* MatchOperator should be that type instead of int (#1410)

Co-authored-by: Sreekanth Sivasankaran <sreekanth.sivasankaran@couchbase.com>
Co-authored-by: Mario de Frutos Dieguez <mario@defrutos.org>
  • Loading branch information
3 people committed Jan 12, 2021
1 parent eaa06ad commit 89234a6
Show file tree
Hide file tree
Showing 513 changed files with 2,488 additions and 5,471 deletions.
4 changes: 2 additions & 2 deletions analysis/analyzer/custom/custom.go
Expand Up @@ -17,8 +17,8 @@ package custom
import (
"fmt"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "custom"
Expand Down
6 changes: 3 additions & 3 deletions analysis/analyzer/keyword/keyword.go
Expand Up @@ -15,9 +15,9 @@
package keyword

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/tokenizer/single"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/single"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "keyword"
Expand Down
8 changes: 4 additions & 4 deletions analysis/analyzer/simple/simple.go
Expand Up @@ -15,10 +15,10 @@
package simple

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "simple"
Expand Down
10 changes: 5 additions & 5 deletions analysis/analyzer/standard/standard.go
Expand Up @@ -15,11 +15,11 @@
package standard

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/lang/en"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/lang/en"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "standard"
Expand Down
10 changes: 5 additions & 5 deletions analysis/analyzer/web/web.go
Expand Up @@ -15,11 +15,11 @@
package web

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/lang/en"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/web"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/lang/en"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/web"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "web"
Expand Down
9 changes: 5 additions & 4 deletions analysis/benchmark_test.go
Expand Up @@ -15,11 +15,12 @@
package analysis_test

import (
index "github.com/blevesearch/bleve_index_api"
"testing"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/analyzer/standard"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/analyzer/standard"
"github.com/blevesearch/bleve/v2/registry"
)

func BenchmarkAnalysis(b *testing.B) {
Expand All @@ -32,7 +33,7 @@ func BenchmarkAnalysis(b *testing.B) {
}

ts := analyzer.Analyze(bleveWikiArticle)
freqs := analysis.TokenFrequency(ts, nil, true)
freqs := analysis.TokenFrequency(ts, nil, index.IncludeTermVectors)
if len(freqs) != 511 {
b.Errorf("expected %d freqs, got %d", 511, len(freqs))
}
Expand Down
4 changes: 2 additions & 2 deletions analysis/char/asciifolding/asciifolding.go
Expand Up @@ -18,8 +18,8 @@
package asciifolding

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "asciifolding"
Expand Down
6 changes: 3 additions & 3 deletions analysis/char/html/html.go
Expand Up @@ -17,9 +17,9 @@ package html
import (
"regexp"

"github.com/blevesearch/bleve/analysis"
regexpCharFilter "github.com/blevesearch/bleve/analysis/char/regexp"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
regexpCharFilter "github.com/blevesearch/bleve/v2/analysis/char/regexp"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "html"
Expand Down
4 changes: 2 additions & 2 deletions analysis/char/regexp/regexp.go
Expand Up @@ -18,8 +18,8 @@ import (
"fmt"
"regexp"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "regexp"
Expand Down
6 changes: 3 additions & 3 deletions analysis/char/zerowidthnonjoiner/zerowidthnonjoiner.go
Expand Up @@ -17,9 +17,9 @@ package zerowidthnonjoiner
import (
"regexp"

"github.com/blevesearch/bleve/analysis"
regexpCharFilter "github.com/blevesearch/bleve/analysis/char/regexp"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
regexpCharFilter "github.com/blevesearch/bleve/v2/analysis/char/regexp"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "zero_width_spaces"
Expand Down
4 changes: 2 additions & 2 deletions analysis/datetime/flexible/flexible.go
Expand Up @@ -18,8 +18,8 @@ import (
"fmt"
"time"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "flexiblego"
Expand Down
2 changes: 1 addition & 1 deletion analysis/datetime/flexible/flexible_test.go
Expand Up @@ -19,7 +19,7 @@ import (
"testing"
"time"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/v2/analysis"
)

func TestFlexibleDateTimeParser(t *testing.T) {
Expand Down
6 changes: 3 additions & 3 deletions analysis/datetime/optional/optional.go
Expand Up @@ -17,9 +17,9 @@ package optional
import (
"time"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/datetime/flexible"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/datetime/flexible"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "dateTimeOptional"
Expand Down
124 changes: 21 additions & 103 deletions analysis/freq.go
Expand Up @@ -15,105 +15,18 @@
package analysis

import (
"reflect"

"github.com/blevesearch/bleve/size"
index "github.com/blevesearch/bleve_index_api"
)

var reflectStaticSizeTokenLocation int
var reflectStaticSizeTokenFreq int

// init caches the static (fixed-field) sizes of the core structs once at
// package load, so the Size() methods can report memory usage without
// paying for a reflect call on every invocation.
func init() {
	reflectStaticSizeTokenLocation = int(reflect.TypeOf(TokenLocation{}).Size())
	reflectStaticSizeTokenFreq = int(reflect.TypeOf(TokenFreq{}).Size())
}

// TokenLocation represents one occurrence of a term at a particular location in
// a field. Start, End and Position have the same meaning as in analysis.Token.
// Field and ArrayPositions identify the field value in the source document.
// See document.Field for details.
type TokenLocation struct {
	Field          string   // name of the document field the occurrence was found in
	ArrayPositions []uint64 // indexes locating the value inside nested arrays of the source document
	Start          int      // start offset of the token (same meaning as analysis.Token.Start)
	End            int      // end offset of the token (same meaning as analysis.Token.End)
	Position       int      // token position within the stream (same meaning as analysis.Token.Position)
}

// Size reports the approximate number of bytes this TokenLocation occupies
// in memory: the struct's static size plus its ArrayPositions backing array.
func (tl *TokenLocation) Size() int {
	return reflectStaticSizeTokenLocation +
		len(tl.ArrayPositions)*size.SizeOfUint64
}

// TokenFreq represents all the occurrences of a term in all fields of a
// document.
type TokenFreq struct {
	Term      []byte           // the term bytes
	Locations []*TokenLocation // every location at which the term occurred
	frequency int              // total occurrence count; read via Frequency()
}

// Size reports the approximate memory footprint of this TokenFreq,
// including the term bytes and every recorded location.
func (tf *TokenFreq) Size() int {
	total := reflectStaticSizeTokenFreq + len(tf.Term)
	for _, loc := range tf.Locations {
		total += loc.Size()
	}
	return total
}

// Frequency returns how many times this term occurred across all fields
// of the document.
func (tf *TokenFreq) Frequency() int { return tf.frequency }

// TokenFrequencies maps document terms to their combined frequencies from all
// fields.
type TokenFrequencies map[string]*TokenFreq
func TokenFrequency(tokens TokenStream, arrayPositions []uint64, options index.FieldIndexingOptions) index.TokenFrequencies {
rv := make(map[string]*index.TokenFreq, len(tokens))

// Size reports the approximate memory footprint of the whole map: the map
// header, per-entry key/pointer overhead, and the sizes of every key string
// and TokenFreq value.
func (tfs TokenFrequencies) Size() int {
	total := size.SizeOfMap
	total += len(tfs) * (size.SizeOfString + size.SizeOfPtr)
	for term, freq := range tfs {
		total += len(term) + freq.Size()
	}
	return total
}

// MergeAll folds the token frequencies from other into tfs, first rewriting
// each incoming location's Field to remoteField. Terms already present have
// the incoming locations appended and the frequencies summed; new terms are
// inserted with a private copy of the incoming locations slice.
//
// NOTE(review): the incoming *TokenLocation values themselves are shared
// (only the slice is copied), so mutating other after the merge also
// affects tfs — presumably intentional; confirm against callers.
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
	for term, incoming := range other {
		// stamp the remote field name onto every incoming location
		for _, loc := range incoming.Locations {
			loc.Field = remoteField
		}

		if existing, ok := tfs[term]; ok {
			existing.Locations = append(existing.Locations, incoming.Locations...)
			existing.frequency += incoming.frequency
			continue
		}

		merged := &TokenFreq{
			Term:      incoming.Term,
			frequency: incoming.frequency,
			Locations: make([]*TokenLocation, len(incoming.Locations)),
		}
		copy(merged.Locations, incoming.Locations)
		tfs[term] = merged
	}
}

func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
rv := make(map[string]*TokenFreq, len(tokens))

if includeTermVectors {
tls := make([]TokenLocation, len(tokens))
if options.IncludeTermVectors() {
tls := make([]index.TokenLocation, len(tokens))
tlNext := 0

for _, token := range tokens {
tls[tlNext] = TokenLocation{
tls[tlNext] = index.TokenLocation{
ArrayPositions: arrayPositions,
Start: token.Start,
End: token.End,
Expand All @@ -123,27 +36,32 @@ func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVect
curr, ok := rv[string(token.Term)]
if ok {
curr.Locations = append(curr.Locations, &tls[tlNext])
curr.frequency++
} else {
rv[string(token.Term)] = &TokenFreq{
curr = &index.TokenFreq{
Term: token.Term,
Locations: []*TokenLocation{&tls[tlNext]},
frequency: 1,
Locations: []*index.TokenLocation{&tls[tlNext]},
}
rv[string(token.Term)] = curr
}

if !options.SkipFreqNorm() {
curr.SetFrequency(curr.Frequency() + 1)
}

tlNext++
}
} else {
for _, token := range tokens {
curr, exists := rv[string(token.Term)]
if exists {
curr.frequency++
} else {
rv[string(token.Term)] = &TokenFreq{
Term: token.Term,
frequency: 1,
if !exists {
curr = &index.TokenFreq{
Term: token.Term,
}
rv[string(token.Term)] = curr
}

if !options.SkipFreqNorm() {
curr.SetFrequency(curr.Frequency() + 1)
}
}
}
Expand Down

0 comments on commit 89234a6

Please sign in to comment.