initial checkin

blevesearch · Oct 16, 2014 · 2291e99 · 2291e99
commit 2291e99
Show file tree

Hide file tree

Showing 11 changed files with 15,057 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+#*
+*.sublime-*
+*~
+.#*
+.project
+.settings
+.DS_Store
+/maketables
+/maketesttables
diff --git a/Makefile b/Makefile
@@ -0,0 +1,9 @@
+# Build the table generators. A single rule builds both programs, so it
+# reruns whenever either source file is newer than the maketables binary.
+maketables: maketables.go maketesttables.go
+	go build maketables.go
+	go build maketesttables.go
+
+# Regenerate tables.go and tables_test.go from the generators' standard
+# output and format them. The target is declared phony because no file named
+# "tables" is ever produced; without the declaration a stray file of that
+# name would stop the recipe from running.
+.PHONY: tables
+tables:	maketables
+	./maketables > tables.go
+	gofmt -w tables.go
+	./maketesttables > tables_test.go
+	gofmt -w tables_test.go
diff --git a/README.md b/README.md
@@ -0,0 +1,38 @@
+# segment
+
+A library for performing Unicode Text Segmentation
+as described in [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/)
+
+## Features
+
+* Currently only segmentation at Word Boundaries is supported.
+
+## Usage
+
+The functionality is exposed in two ways:
+
+1.  You can use a bufio.Scanner with the SplitWords implementation of SplitFunc.
+The SplitWords function will identify the appropriate word boundaries in the input
+text and the Scanner will return tokens at the appropriate place.
+
+		scanner := bufio.NewScanner(...)
+		scanner.Split(segment.SplitWords)
+		for scanner.Scan() {
+			tokenBytes := scanner.Bytes()
+		}
+		if err := scanner.Err(); err != nil {
+			t.Fatal(err)
+		}
+
+2.  Sometimes you would also like information returned about the type of token.
+To do this we have introduced a new type named Segmenter.  It works just like Scanner
+but additionally a token type is returned.
+
+		segmenter := segment.NewWordSegmenter(...)
+		for segmenter.Segment() {
+			tokenBytes := segmenter.Bytes()
+			tokenType := segmenter.Type()
+		}
+		if err := segmenter.Err(); err != nil {
+			t.Fatal(err)
+		}
diff --git a/doc.go b/doc.go
@@ -0,0 +1,45 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+
+/*
+Package segment is a library for performing Unicode Text Segmentation
+as described in Unicode Standard Annex #29 http://www.unicode.org/reports/tr29/
+
+Currently only segmentation at Word Boundaries is supported.
+
+The functionality is exposed in two ways:
+
+1.  You can use a bufio.Scanner with the SplitWords implementation of SplitFunc.
+The SplitWords function will identify the appropriate word boundaries in the input
+text and the Scanner will return tokens at the appropriate place.
+
+		scanner := bufio.NewScanner(...)
+		scanner.Split(segment.SplitWords)
+		for scanner.Scan() {
+			tokenBytes := scanner.Bytes()
+		}
+		if err := scanner.Err(); err != nil {
+			t.Fatal(err)
+		}
+
+2.  Sometimes you would also like information returned about the type of token.
+To do this we have introduced a new type named Segmenter.  It works just like Scanner
+but additionally a token type is returned.
+
+		segmenter := segment.NewWordSegmenter(...)
+		for segmenter.Segment() {
+			tokenBytes := segmenter.Bytes()
+			tokenType := segmenter.Type()
+		}
+		if err := segmenter.Err(); err != nil {
+			t.Fatal(err)
+		}
+
+*/
+package segment
diff --git a/maketables.go b/maketables.go
@@ -0,0 +1,235 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+
+// +build ignore
+
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+	"unicode"
+)
+
+// Command-line flags selecting where the Unicode data files are read from.
+var (
+	// url is the directory the data files are fetched from; file names are
+	// appended to it directly, so it must end with a slash.
+	url = flag.String(
+		"url",
+		"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
+		"URL of Unicode database directory",
+	)
+
+	// verbose is registered as a flag here.
+	// NOTE(review): no use of it is visible in this part of the file.
+	verbose = flag.Bool("verbose", false, "write data to stdout as it is parsed")
+
+	// localFiles makes the program read the data files from the current
+	// directory instead of fetching them over HTTP.
+	localFiles = flag.Bool(
+		"local",
+		false,
+		"data files have been copied to the current directory; for debugging only",
+	)
+)
+
+// main loads the grapheme, word and sentence break property files and then
+// writes the generated Go source (header followed by one range table per
+// property) to standard output.
+func main() {
+	flag.Parse()
+
+	// One entry per break property file; prefix becomes part of the names
+	// of the generated variables.
+	sources := []struct {
+		prefix string
+		file   string
+		ranges map[string]*unicode.RangeTable
+	}{
+		{"Grapheme", "GraphemeBreakProperty.txt", make(map[string]*unicode.RangeTable)},
+		{"Word", "WordBreakProperty.txt", make(map[string]*unicode.RangeTable)},
+		{"Sentence", "SentenceBreakProperty.txt", make(map[string]*unicode.RangeTable)},
+	}
+
+	// Everything is loaded before anything is printed, so a failed load
+	// cannot leave a partially written output behind.
+	for _, src := range sources {
+		loadUnicodeData(src.file, src.ranges)
+	}
+
+	fmt.Printf(fileHeader, *url)
+	for _, src := range sources {
+		generateTables(src.prefix, src.ranges)
+	}
+}
+
+// openReader returns a reader for the named Unicode data file. With -local
+// the file is opened from the current directory; otherwise it is fetched over
+// HTTP from the directory given by -url. Any failure, including a response
+// status other than 200, ends the program through the log package. Closing
+// the returned reader is left to the caller.
+//
+// The files read through it, such as WordBreakProperty.txt, have the form:
+// 05F0..05F2    ; Hebrew_Letter # Lo   [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
+// FB1D          ; Hebrew_Letter # Lo       HEBREW LETTER YOD WITH HIRIQ
+func openReader(file string) (input io.ReadCloser) {
+	if *localFiles {
+		f, err := os.Open(file)
+		if err != nil {
+			log.Fatal(err)
+		}
+		return f
+	}
+
+	resp, err := http.Get(*url + file)
+	if err != nil {
+		log.Fatal(err)
+	}
+	if resp.StatusCode != http.StatusOK {
+		// Fatalf rather than Fatal: Fatal formats like fmt.Print, which puts
+		// no space between two string operands and so glued the status
+		// directly onto the file name.
+		log.Fatalf("bad GET status for %s: %s", file, resp.Status)
+	}
+	return resp.Body
+}
+
+// loadUnicodeData reads the named Unicode data file line by line and passes
+// every line to parseLine, which records the code points it names in
+// propertyRanges. A read error other than io.EOF ends the program, because
+// continuing would silently generate tables from truncated data.
+func loadUnicodeData(filename string, propertyRanges map[string]*unicode.RangeTable) {
+	f := openReader(filename)
+	defer f.Close()
+
+	reader := bufio.NewReader(f)
+	for {
+		line, err := reader.ReadString('\n')
+		switch err {
+		case nil:
+			parseLine(line, propertyRanges)
+		case io.EOF:
+			// ReadString hands back whatever preceded the end of input, so
+			// a final line without a trailing newline still gets parsed.
+			parseLine(line, propertyRanges)
+			return
+		default:
+			log.Fatalf("reading %s: %v", filename, err)
+		}
+	}
+}
+
+// Delimiters used when parsing a line of a Unicode data file.
+const (
+	comment = "#"  // starts a comment that runs to the end of the line
+	sep     = ";"  // separates the code point field from the property name
+	rnge    = ".." // separates the first and last code point of a range
+)
+
+func parseLine(line string, propertyRanges map[string]*unicode.RangeTable) {
+	if strings.HasPrefix(line, comment) {
+		return
+	}
+	line = strings.TrimSpace(line)
+	if len(line) == 0 {
+		return
+	}
+	commentStart := strings.Index(line, comment)
+	if commentStart > 0 {
+		line = line[0:commentStart]
+	}
+	pieces := strings.Split(line, sep)
+	if len(pieces) != 2 {
+		log.Printf("unexpected %d pieces in %s", len(pieces), line)
+		return
+	}
+
+	propertyName := strings.TrimSpace(pieces[1])
+
+	rangeTable, ok := propertyRanges[propertyName]
+	if !ok {
+		rangeTable = &unicode.RangeTable{
+			LatinOffset: 0,
+		}
+		propertyRanges[propertyName] = rangeTable
+	}
+
+	codepointRange := strings.TrimSpace(pieces[0])
+	rngeIndex := strings.Index(codepointRange, rnge)
+
+	if rngeIndex < 0 {
+		// single codepoint, not range
+		codepointInt, err := strconv.ParseUint(codepointRange, 16, 64)
+		if err != nil {
+			log.Printf("error parsing int: %v", err)
+			return
+		}
+		if codepointInt < 0x10000 {
+			r16 := unicode.Range16{
+				Lo:     uint16(codepointInt),
+				Hi:     uint16(codepointInt),
+				Stride: 1,
+			}
+			addR16ToTable(rangeTable, r16)
+		} else {
+			r32 := unicode.Range32{
+				Lo:     uint32(codepointInt),
+				Hi:     uint32(codepointInt),
+				Stride: 1,
+			}
+			addR32ToTable(rangeTable, r32)
+		}
+	} else {
+		rngeStart := codepointRange[0:rngeIndex]
+		rngeEnd := codepointRange[rngeIndex+2:]
+		rngeStartInt, err := strconv.ParseUint(rngeStart, 16, 64)
+		if err != nil {
+			log.Printf("error parsing int: %v", err)
+			return
+		}
+		rngeEndInt, err := strconv.ParseUint(rngeEnd, 16, 64)
+		if err != nil {
+			log.Printf("error parsing int: %v", err)
+			return
+		}
+		if rngeStartInt < 0x10000 && rngeEndInt < 0x10000 {
+			r16 := unicode.Range16{
+				Lo:     uint16(rngeStartInt),
+				Hi:     uint16(rngeEndInt),
+				Stride: 1,
+			}
+			addR16ToTable(rangeTable, r16)
+		} else if rngeStartInt >= 0x10000 && rngeEndInt >= 0x10000 {
+			r32 := unicode.Range32{
+				Lo:     uint32(rngeStartInt),
+				Hi:     uint32(rngeEndInt),
+				Stride: 1,
+			}
+			addR32ToTable(rangeTable, r32)
+		} else {
+			log.Printf("unexpected range")
+		}
+	}
+}
+
+// addR16ToTable appends r16 to the table's R16 slice. When the whole range
+// lies at or below unicode.MaxLatin1 it also increments LatinOffset, which
+// therefore counts the R16 entries that fall entirely within Latin-1.
+func addR16ToTable(r *unicode.RangeTable, r16 unicode.Range16) {
+	if r16.Hi <= unicode.MaxLatin1 {
+		r.LatinOffset++
+	}
+	// append allocates the slice on first use, so no explicit make is needed.
+	r.R16 = append(r.R16, r16)
+}
+
+// addR32ToTable appends r32 to the table's R32 slice. LatinOffset is left
+// alone.
+func addR32ToTable(r *unicode.RangeTable, r32 unicode.Range32) {
+	// append allocates the slice on first use, so no explicit make is needed.
+	r.R32 = append(r.R32, r32)
+}
+
+// generateTables prints one Go variable declaration per property to standard
+// output. Each variable is named _<prefix><property> and is initialised with
+// the source text that generateRangeTable produces for the property's table.
+//
+// Properties are emitted in sorted name order. Go randomises map iteration
+// order, so ranging over the map directly would shuffle the generated file
+// on every run and produce spurious diffs.
+func generateTables(prefix string, propertyRanges map[string]*unicode.RangeTable) {
+	keys := make([]string, 0, len(propertyRanges))
+	for key := range propertyRanges {
+		keys = append(keys, key)
+	}
+	sort.Strings(keys)
+
+	for _, key := range keys {
+		fmt.Printf("var _%s%s = %s\n", prefix, key, generateRangeTable(propertyRanges[key]))
+	}
+}
+
+// generateRangeTable returns Go source text for a pointer to a
+// unicode.RangeTable composite literal equivalent to rt. The R16 and R32
+// fields are written only when the corresponding slice is non-nil;
+// LatinOffset is always written. The text is not gofmt-clean.
+func generateRangeTable(rt *unicode.RangeTable) string {
+	// Collect the fragments first and join them once at the end.
+	parts := []string{"&unicode.RangeTable{\n"}
+
+	if rt.R16 != nil {
+		parts = append(parts, "\tR16: []unicode.Range16{\n")
+		for i := range rt.R16 {
+			parts = append(parts, fmt.Sprintf("\t\t%#v,\n", rt.R16[i]))
+		}
+		parts = append(parts, "\t},\n")
+	}
+
+	if rt.R32 != nil {
+		parts = append(parts, "\tR32: []unicode.Range32{\n")
+		for i := range rt.R32 {
+			parts = append(parts, fmt.Sprintf("\t\t%#v,\n", rt.R32[i]))
+		}
+		parts = append(parts, "\t},\n")
+	}
+
+	parts = append(parts,
+		fmt.Sprintf("\t\tLatinOffset: %d,\n", rt.LatinOffset),
+		"}\n",
+	)
+	return strings.Join(parts, "")
+}
+
+// fileHeader is the preamble written at the top of the generated Go source:
+// a "generated" notice, the package clause and the import of unicode. It is
+// a format string whose single %s verb is filled with the data URL.
+const fileHeader = "// Generated by running\n" +
+	"//      maketables --url=%s\n" +
+	"// DO NOT EDIT\n" +
+	"\n" +
+	"package segment\n" +
+	"\n" +
+	"import(\n" +
+	"\t\"unicode\"\n" +
+	")\n"