stringclassifier/searchset/searchset.go

// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package searchset generates hashes for all substrings of a text. Potential
// matches between two SearchSet objects can then be determined quickly.
// Generating the hashes can be expensive, so it's best to perform it once. If
// the text is part of a known corpus, then the SearchSet can be serialized and
// kept in an archive.
//
// Matching occurs by "mapping" ranges from the source text into the target
// text but still retaining the source order:
//
//	SOURCE: |-----------------------------|
//
//	TARGET: |*****************************************|
//
//	MAP SOURCE SECTIONS ONTO TARGET IN SOURCE ORDER:
//
//	  S:  |-[--]-----[---]------[----]------|
//	         /         |           \
//	      |---|   |---------|   |-------------|
//	  T: |*****************************************|
//
// Note that a single source range may match many different ranges in the
// target. The matching algorithm untangles these so that all matched ranges
// are in order with respect to the source ranges. This is especially important
// since the source text may occur more than once in the target text. The
// algorithm finds each potential occurrence of S in T and returns all as
// potential matched ranges.
package searchset

import (
	"encoding/gob"
	"fmt"
	"io"
	"sort"

	"github.com/bugfender-contrib/licenseclassifier/stringclassifier/searchset/tokenizer"
)

// DefaultGranularity is the minimum size (in words) of the hash chunks.
const DefaultGranularity = 3

// SearchSet is a set of substrings that have hashes associated with them,
// making it fast to search for potential matches.
type SearchSet struct {
	// Tokens is a tokenized list of the original input string.
	Tokens tokenizer.Tokens
	// Hashes is a map of checksums to a range of tokens.
	Hashes tokenizer.Hash
	// Checksums is a list of checksums ordered from longest range to
	// shortest.
	Checksums []uint32
	// ChecksumRanges are the token ranges for the above checksums.
	ChecksumRanges tokenizer.TokenRanges

	nodes []*node
}

// node consists of a range of tokens along with the checksum for those tokens.
type node struct {
	checksum uint32
	tokens   *tokenizer.TokenRange
}

func (n *node) String() string {
	return fmt.Sprintf("[%d:%d]", n.tokens.Start, n.tokens.End)
}

// New creates a new SearchSet object. It generates a hash for each substring of "s".
func New(s string, granularity int) *SearchSet {
	toks := tokenizer.Tokenize(s)

	// Start generating hash values for all substrings within the text.
	h := make(tokenizer.Hash)
	checksums, tokenRanges := toks.GenerateHashes(h, func(a, b int) int {
		if a < b {
			return a
		}
		return b
	}(len(toks), granularity))
	sset := &SearchSet{
		Tokens:         toks,
		Hashes:         h,
		Checksums:      checksums,
		ChecksumRanges: tokenRanges,
	}
	sset.GenerateNodeList()
	return sset
}

// GenerateNodeList creates a node list out of the search set.
func (s *SearchSet) GenerateNodeList() {
	if len(s.Tokens) == 0 {
		return
	}

	for i := 0; i < len(s.Checksums); i++ {
		s.nodes = append(s.nodes, &node{
			checksum: s.Checksums[i],
			tokens:   s.ChecksumRanges[i],
		})
	}
}

// Serialize emits the SearchSet out so that it can be recreated at a later
// time.
func (s *SearchSet) Serialize(w io.Writer) error {
	return gob.NewEncoder(w).Encode(s)
}

// Deserialize reads a file with a serialized SearchSet in it and reconstructs it.
func Deserialize(r io.Reader, s *SearchSet) error {
	if err := gob.NewDecoder(r).Decode(&s); err != nil {
		return err
	}
	s.GenerateNodeList()
	return nil
}

// MatchRange is the range within the source text that is a match to the range
// in the target text.
type MatchRange struct {
	// Offsets into the source tokens.
	SrcStart, SrcEnd int
	// Offsets into the target tokens.
	TargetStart, TargetEnd int
}

// in returns true if the start and end are enclosed in the match range.
func (m *MatchRange) in(start, end int) bool {
	return start >= m.TargetStart && end <= m.TargetEnd
}

func (m *MatchRange) String() string {
	return fmt.Sprintf("[%v, %v)->[%v, %v)", m.SrcStart, m.SrcEnd, m.TargetStart, m.TargetEnd)
}

// MatchRanges is a list of "MatchRange"s. The ranges are monotonically
// increasing in value and indicate a single potential occurrence of the source
// text in the target text.
type MatchRanges []*MatchRange

func (m MatchRanges) Len() int      { return len(m) }
func (m MatchRanges) Swap(i, j int) { m[i], m[j] = m[j], m[i] }
func (m MatchRanges) Less(i, j int) bool {
	if m[i].TargetStart < m[j].TargetStart {
		return true
	}
	return m[i].TargetStart == m[j].TargetStart && m[i].SrcStart < m[j].SrcStart
}

// TargetRange is the start and stop token offsets into the target text.
func (m MatchRanges) TargetRange(target *SearchSet) (start, end int) {
	start = target.Tokens[m[0].TargetStart].Offset
	end = target.Tokens[m[len(m)-1].TargetEnd-1].Offset + len(target.Tokens[m[len(m)-1].TargetEnd-1].Text)
	return start, end
}

// Size is the number of source tokens that were matched.
func (m MatchRanges) Size() int {
	sum := 0
	for _, mr := range m {
		sum += mr.SrcEnd - mr.SrcStart
	}
	return sum
}

// FindPotentialMatches returns the ranges in the target (unknown) text that
// are best potential matches to the source (known) text.
func FindPotentialMatches(src, target *SearchSet) []MatchRanges {
	matchedRanges := getMatchedRanges(src, target)
	if len(matchedRanges) == 0 {
		return nil
	}

	// Cleanup the matching ranges so that we get the longest contiguous ranges.
	for i := 0; i < len(matchedRanges); i++ {
		matchedRanges[i] = coalesceMatchRanges(matchedRanges[i])
	}
	return matchedRanges
}

// getMatchedRanges finds the ranges in the target text that match the source
// text. There can be multiple occurrences of the source text within the target
// text. Each separate occurrence is an entry in the returned slice.
func getMatchedRanges(src, target *SearchSet) []MatchRanges {
	matched := targetMatchedRanges(src, target)
	if len(matched) == 0 {
		return nil
	}
	sort.Sort(matched)
	matched = untangleSourceRanges(matched)
	matchedRanges := splitRanges(matched)
	return mergeConsecutiveRanges(matchedRanges)
}

func extendsAny(tr tokenizer.TokenRanges, mr []MatchRanges) bool {
	if len(mr) == 0 {
		return false
	}
	for _, tv := range tr {
		for _, mv := range mr {
			if tv.Start >= mv[0].TargetStart && tv.Start <= mv[len(mv)-1].TargetEnd {
				return true
			}
		}
	}
	return false
}

// targetMatchedRanges finds matching sequences in target and src ordered by target position
func targetMatchedRanges(src, target *SearchSet) MatchRanges {
	if src.nodes == nil {
		return nil
	}

	var matched MatchRanges
	var previous *node
	var possible []MatchRanges
	for _, tgtNode := range target.nodes {
		sr, ok := src.Hashes[tgtNode.checksum]
		if !ok || (previous != nil && tgtNode.tokens.Start > previous.tokens.End) || !extendsAny(sr, possible) {
			for _, r := range possible {
				matched = append(matched, r...)
			}
			possible = possible[:0]
			previous = nil
		}
		if !ok {
			// There isn't a match in the source.
			continue
		}

		// Maps index within `possible` to the slice of ranges extended by a new range
		extended := make(map[int]*MatchRanges)
		// Go over the set of source ranges growing lists of `possible` match ranges.
		tv := tgtNode.tokens
		for _, sv := range sr {
			r := &MatchRange{
				SrcStart:    sv.Start,
				SrcEnd:      sv.End,
				TargetStart: tv.Start,
				TargetEnd:   tv.End,
			}
			found := false
			// Grow or extend each abutting `possible` match range.
			for i, p := range possible {
				last := p[len(p)-1]
				if sv.Start >= last.SrcStart && sv.Start <= last.SrcEnd && tv.Start >= last.TargetStart && tv.Start <= last.TargetEnd {
					found = true
					possible[i] = append(possible[i], r)
					extended[i] = &possible[i]
				}
			}
			if !found {
				// Did not abut any existing ranges, start a new `possible` match range.
				mrs := make(MatchRanges, 0, 2)
				mrs = append(mrs, r)
				possible = append(possible, mrs)
				extended[len(possible)-1] = &possible[len(possible)-1]
			}
		}
		if len(extended) < len(possible) {
			// Ranges not extended--add to `matched` if not included in other range.
			for i := 0; i < len(possible); {
				_, updated := extended[i]
				if updated {
					i++ // Keep in `possible` and advance to next index.
					continue
				}
				p1 := possible[i]
				found := false // whether found as subrange of another `possible` match.
				for _, p2 := range extended {
					if p1[0].SrcStart >= (*p2)[0].SrcStart && p1[0].TargetStart >= (*p2)[0].TargetStart {
						found = true
						break
					}
				}
				if !found {
					matched = append(matched, p1...)
				} // else included in other match.
				// Finished -- delete from `possible` and continue from same index.
				possible = append(possible[:i], possible[i+1:]...)
			}
		}
		previous = tgtNode
	}
	// At end of file, terminate all `possible` match ranges.
	for i := 0; i < len(possible); i++ {
		p1 := possible[i]
		found := false // whether found as subrange of another `possible` match.
		for j := i + 1; j < len(possible); {
			p2 := possible[j]
			if p1[0].SrcStart <= p2[0].SrcStart && p1[0].TargetStart <= p2[0].TargetStart {
				// Delete later sub-ranges included in this range.
				possible = append(possible[:j], possible[j+1:]...)
				continue
			}
			// Skip if subrange of a later range
			if p1[0].SrcStart >= p2[0].SrcStart && p1[0].TargetStart >= p2[0].TargetStart {
				found = true
			}
			j++
		}
		if !found {
			matched = append(matched, p1...)
		}
	}
	return matched
}

// untangleSourceRanges goes through the ranges and removes any whose source
// ranges are "out of order". A source range is "out of order" if the source
// range is out of sequence with the source ranges before and after it. This
// happens when more than one source range maps to the same target range.
// E.g.:
//
//	   SrcStart: 20, SrcEnd: 30, TargetStart: 127, TargetEnd: 137
//	1: SrcStart: 12, SrcEnd: 17, TargetStart: 138, TargetEnd: 143
//	2: SrcStart: 32, SrcEnd: 37, TargetStart: 138, TargetEnd: 143
//	   SrcStart: 38, SrcEnd: 40, TargetStart: 144, TargetEnd: 146
//
// Here (1) is out of order, because the source range [12, 17) is out of
// sequence with the surrounding source sequences, but [32, 37) is.
func untangleSourceRanges(matched MatchRanges) MatchRanges {
	mr := MatchRanges{matched[0]}
NEXT:
	for i := 1; i < len(matched); i++ {
		if mr[len(mr)-1].TargetStart == matched[i].TargetStart && mr[len(mr)-1].TargetEnd == matched[i].TargetEnd {
			// The matched range has already been added.
			continue
		}

		if i+1 < len(matched) && equalTargetRange(matched[i], matched[i+1]) {
			// A sequence of ranges match the same target range.
			// Find the first one that has a source range greater
			// than the currently matched range. Omit all others.
			if matched[i].SrcStart > mr[len(mr)-1].SrcStart {
				mr = append(mr, matched[i])
				continue
			}

			for j := i + 1; j < len(matched) && equalTargetRange(matched[i], matched[j]); j++ {
				// Check subsequent ranges to see if we can
				// find one that matches in the correct order.
				if matched[j].SrcStart > mr[len(mr)-1].SrcStart {
					mr = append(mr, matched[j])
					i = j
					continue NEXT
				}
			}
		}

		mr = append(mr, matched[i])
	}
	return mr
}

// equalTargetRange returns true if the two MatchRange's cover the same target range.
func equalTargetRange(this, that *MatchRange) bool {
	return this.TargetStart == that.TargetStart && this.TargetEnd == that.TargetEnd
}

// splitRanges splits the matched ranges so that a single match range has a
// monotonically increasing source range (indicating a single, potential
// instance of the source in the target).
func splitRanges(matched MatchRanges) []MatchRanges {
	var matchedRanges []MatchRanges
	mr := MatchRanges{matched[0]}
	for i := 1; i < len(matched); i++ {
		if mr[len(mr)-1].SrcStart > matched[i].SrcStart {
			matchedRanges = append(matchedRanges, mr)
			mr = MatchRanges{matched[i]}
		} else {
			mr = append(mr, matched[i])
		}
	}
	matchedRanges = append(matchedRanges, mr)
	return matchedRanges
}

// mergeConsecutiveRanges goes through the matched ranges and merges
// consecutive ranges. Two ranges are consecutive if the end of the previous
// matched range and beginning of the next matched range overlap. "matched"
// should have 1 or more MatchRanges, each with one or more MatchRange objects.
func mergeConsecutiveRanges(matched []MatchRanges) []MatchRanges {
	mr := []MatchRanges{matched[0]}

	// Convenience functions.
	prevMatchedRange := func() MatchRanges {
		return mr[len(mr)-1]
	}
	prevMatchedRangeLastElem := func() *MatchRange {
		return prevMatchedRange()[len(prevMatchedRange())-1]
	}

	// This algorithm compares the start of each MatchRanges object to the
	// end of the previous MatchRanges object. If they overlap, then it
	// tries to combine them. Note that a 0 offset into a MatchRanges
	// object (e.g., matched[i][0]) is its first MatchRange, which
	// indicates the start of the whole matched range.
NEXT:
	for i := 1; i < len(matched); i++ {
		if prevMatchedRangeLastElem().TargetEnd > matched[i][0].TargetStart {
			// Consecutive matched ranges overlap. Merge them.
			if prevMatchedRangeLastElem().TargetStart < matched[i][0].TargetStart {
				// The last element of the previous matched
				// range overlaps with the first element of the
				// current matched range. Concatenate them.
				if prevMatchedRangeLastElem().TargetEnd < matched[i][0].TargetEnd {
					prevMatchedRangeLastElem().SrcEnd += matched[i][0].TargetEnd - prevMatchedRangeLastElem().TargetEnd
					prevMatchedRangeLastElem().TargetEnd = matched[i][0].TargetEnd
				}
				mr[len(mr)-1] = append(prevMatchedRange(), matched[i][1:]...)
				continue
			}

			for j := 1; j < len(matched[i]); j++ {
				// Find the positions in the ranges where the
				// tail end of the previous matched range
				// overlaps with the start of the next matched
				// range.
				for k := len(prevMatchedRange()) - 1; k > 0; k-- {
					if prevMatchedRange()[k].SrcStart < matched[i][j].SrcStart &&
						prevMatchedRange()[k].TargetStart < matched[i][j].TargetStart {
						// Append the next range to the previous range.
						if prevMatchedRange()[k].TargetEnd < matched[i][j].TargetStart {
							// Coalesce the ranges.
							prevMatchedRange()[k].SrcEnd += matched[i][j-1].TargetEnd - prevMatchedRange()[k].TargetEnd
							prevMatchedRange()[k].TargetEnd = matched[i][j-1].TargetEnd
						}
						mr[len(mr)-1] = append(prevMatchedRange()[:k+1], matched[i][j:]...)
						continue NEXT
					}
				}
			}
		}
		mr = append(mr, matched[i])
	}
	return mr
}

// coalesceMatchRanges coalesces overlapping match ranges into a single
// contiguous match range.
func coalesceMatchRanges(matchedRanges MatchRanges) MatchRanges {
	coalesced := MatchRanges{matchedRanges[0]}
	for i := 1; i < len(matchedRanges); i++ {
		c := coalesced[len(coalesced)-1]
		mr := matchedRanges[i]

		if mr.SrcStart <= c.SrcEnd && mr.SrcStart >= c.SrcStart {
			var se, ts, te int
			if mr.SrcEnd > c.SrcEnd {
				se = mr.SrcEnd
			} else {
				se = c.SrcEnd
			}
			if mr.TargetStart < c.TargetStart {
				ts = mr.TargetStart
			} else {
				ts = c.TargetStart
			}
			if mr.TargetEnd > c.TargetEnd {
				te = mr.TargetEnd
			} else {
				te = c.TargetEnd
			}
			coalesced[len(coalesced)-1] = &MatchRange{
				SrcStart:    c.SrcStart,
				SrcEnd:      se,
				TargetStart: ts,
				TargetEnd:   te,
			}
		} else {
			coalesced = append(coalesced, mr)
		}
	}
	return coalesced
}