From 633c4d30b74dbd69dbfc2c70c01c7638317c2d6a Mon Sep 17 00:00:00 2001 From: Marvin Humphrey Date: Sun, 19 Jul 2015 16:47:23 -0700 Subject: [PATCH 1/8] Remove RegexTokenizer_Set_Token_RE. It was used internally a long time ago and is now obsolete. --- c/src/Lucy/Analysis/RegexTokenizer.c | 7 -- core/Lucy/Analysis/RegexTokenizer.cfh | 6 -- .../src/Lucy/Analysis/RegexTokenizer.c | 5 -- perl/xs/Lucy/Analysis/RegexTokenizer.c | 75 +++++-------------- 4 files changed, 20 insertions(+), 73 deletions(-) diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c index a811979e0..d47b3ea23 100644 --- a/c/src/Lucy/Analysis/RegexTokenizer.c +++ b/c/src/Lucy/Analysis/RegexTokenizer.c @@ -86,13 +86,6 @@ RegexTokenizer_init(RegexTokenizer *self, String *pattern) { return self; } -void -RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) { - UNUSED_VAR(self); - UNUSED_VAR(token_re); - THROW(ERR, "TODO"); -} - void RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); diff --git a/core/Lucy/Analysis/RegexTokenizer.cfh b/core/Lucy/Analysis/RegexTokenizer.cfh index 9e352025e..ec14d51e8 100644 --- a/core/Lucy/Analysis/RegexTokenizer.cfh +++ b/core/Lucy/Analysis/RegexTokenizer.cfh @@ -84,12 +84,6 @@ public class Lucy::Analysis::RegexTokenizer Tokenize_Utf8(RegexTokenizer *self, const char *text, size_t len, Inversion *inversion); - /** Set the compiled regular expression for matching a token. Also sets - * `pattern` as a side effect. 
- */ - void - Set_Token_RE(RegexTokenizer *self, void *token_re); - public incremented Obj* Dump(RegexTokenizer *self); diff --git a/example-lang/src/Lucy/Analysis/RegexTokenizer.c b/example-lang/src/Lucy/Analysis/RegexTokenizer.c index 2f21afb91..92f42ecb7 100644 --- a/example-lang/src/Lucy/Analysis/RegexTokenizer.c +++ b/example-lang/src/Lucy/Analysis/RegexTokenizer.c @@ -28,11 +28,6 @@ lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, UNREACHABLE_RETURN(lucy_RegexTokenizer*); } -void -lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) { - THROW(LUCY_ERR, "TODO"); -} - void lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) { THROW(LUCY_ERR, "TODO"); diff --git a/perl/xs/Lucy/Analysis/RegexTokenizer.c b/perl/xs/Lucy/Analysis/RegexTokenizer.c index 4c6e1f11e..f95cf0f20 100644 --- a/perl/xs/Lucy/Analysis/RegexTokenizer.c +++ b/perl/xs/Lucy/Analysis/RegexTokenizer.c @@ -27,13 +27,6 @@ static SV* S_compile_token_re(pTHX_ cfish_String *pattern); -static void -S_set_token_re_but_not_pattern(pTHX_ lucy_RegexTokenizer *self, - void *token_re); - -static void -S_set_pattern_from_token_re(pTHX_ lucy_RegexTokenizer *self, void *token_re); - bool lucy_RegexTokenizer_is_available(void) { return true; @@ -61,36 +54,7 @@ lucy_RegexTokenizer_init(lucy_RegexTokenizer *self, // Acquire a compiled regex engine for matching one token. 
dTHX; - SV *token_re_sv = S_compile_token_re(aTHX_ ivars->pattern); - S_set_token_re_but_not_pattern(aTHX_ self, SvRV(token_re_sv)); - SvREFCNT_dec(token_re_sv); - - return self; -} - -static SV* -S_compile_token_re(pTHX_ cfish_String *pattern) { - dSP; - ENTER; - SAVETMPS; - EXTEND(SP, 1); - PUSHMARK(SP); - XPUSHs((SV*)CFISH_Str_To_Host(pattern)); - PUTBACK; - call_pv("Lucy::Analysis::RegexTokenizer::_compile_token_re", G_SCALAR); - SPAGAIN; - SV *token_re_sv = POPs; - (void)SvREFCNT_inc(token_re_sv); - PUTBACK; - FREETMPS; - LEAVE; - return token_re_sv; -} - -static void -S_set_token_re_but_not_pattern(pTHX_ lucy_RegexTokenizer *self, - void *token_re) { - lucy_RegexTokenizerIVARS *const ivars = lucy_RegexTokenizer_IVARS(self); + SV *token_re = S_compile_token_re(aTHX_ ivars->pattern); #if (PERL_VERSION > 10) REGEXP *rx = SvRX((SV*)token_re); #else @@ -107,29 +71,30 @@ S_set_token_re_but_not_pattern(pTHX_ lucy_RegexTokenizer *self, THROW(CFISH_ERR, "Failed to extract REGEXP from token_re '%s'", SvPV_nolen((SV*)token_re)); } - if (ivars->token_re) { ReREFCNT_dec(((REGEXP*)ivars->token_re)); } ivars->token_re = rx; (void)ReREFCNT_inc(((REGEXP*)ivars->token_re)); -} + SvREFCNT_dec(token_re); -static void -S_set_pattern_from_token_re(pTHX_ lucy_RegexTokenizer *self, void *token_re) { - lucy_RegexTokenizerIVARS *const ivars = lucy_RegexTokenizer_IVARS(self); - SV *rv = newRV((SV*)token_re); - STRLEN len = 0; - char *ptr = SvPVutf8((SV*)rv, len); - CFISH_DECREF(ivars->pattern); - ivars->pattern = cfish_Str_new_from_trusted_utf8(ptr, len); - SvREFCNT_dec(rv); + return self; } -void -LUCY_RegexTokenizer_Set_Token_RE_IMP(lucy_RegexTokenizer *self, - void *token_re) { - dTHX; - S_set_token_re_but_not_pattern(aTHX_ self, token_re); - // Set pattern as a side effect. 
- S_set_pattern_from_token_re(aTHX_ self, token_re); +static SV* +S_compile_token_re(pTHX_ cfish_String *pattern) { + dSP; + ENTER; + SAVETMPS; + EXTEND(SP, 1); + PUSHMARK(SP); + XPUSHs((SV*)CFISH_Str_To_Host(pattern)); + PUTBACK; + call_pv("Lucy::Analysis::RegexTokenizer::_compile_token_re", G_SCALAR); + SPAGAIN; + SV *token_re_sv = POPs; + (void)SvREFCNT_inc(token_re_sv); + PUTBACK; + FREETMPS; + LEAVE; + return token_re_sv; } void From fed6ca76b6b8310c1db97bfbbed1b56af47e6194 Mon Sep 17 00:00:00 2001 From: Marvin Humphrey Date: Tue, 14 Jul 2015 18:57:11 -0700 Subject: [PATCH 2/8] Copy C host-specific code to Go. Copy the C binding code in anticipation of replacing it with Go-specific binding code. --- common/charmonizer.c | 2 +- common/charmonizer.main | 2 +- go/cfext/lucy.c | 483 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 485 insertions(+), 2 deletions(-) create mode 100644 go/cfext/lucy.c diff --git a/common/charmonizer.c b/common/charmonizer.c index d3c3dc38a..f0ebfcd26 100644 --- a/common/charmonizer.c +++ b/common/charmonizer.c @@ -8134,7 +8134,7 @@ lucy_MakeFile_new(chaz_CLI *cli) { self->host_src_dir = "xs"; } else if (chaz_CLI_defined(cli, "enable-go")) { - self->host_src_dir = "../c/src"; + self->host_src_dir = "cfext"; } else { self->host_src_dir = "src"; diff --git a/common/charmonizer.main b/common/charmonizer.main index 991593b51..800442e7b 100644 --- a/common/charmonizer.main +++ b/common/charmonizer.main @@ -252,7 +252,7 @@ lucy_MakeFile_new(chaz_CLI *cli) { self->host_src_dir = "xs"; } else if (chaz_CLI_defined(cli, "enable-go")) { - self->host_src_dir = "../c/src"; + self->host_src_dir = "cfext"; } else { self->host_src_dir = "src"; diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c new file mode 100644 index 000000000..d1044dfe3 --- /dev/null +++ b/go/cfext/lucy.c @@ -0,0 +1,483 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +#define C_LUCY_REGEXTOKENIZER +#define C_LUCY_DOC +#define C_LUCY_DOCREADER +#define C_LUCY_DEFAULTDOCREADER +#define C_LUCY_INVERTER +#define C_LUCY_INVERTERENTRY +#define CFISH_USE_SHORT_NAMES +#define LUCY_USE_SHORT_NAMES + + + +#include + +#include "charmony.h" + +#include "Lucy/Analysis/RegexTokenizer.h" +#include "Lucy/Document/Doc.h" +#include "Lucy/Index/DocReader.h" +#include "Lucy/Index/Inverter.h" +#include "Clownfish/Blob.h" +#include "Clownfish/String.h" +#include "Clownfish/Err.h" +#include "Clownfish/Hash.h" +#include "Clownfish/HashIterator.h" +#include "Clownfish/Num.h" +#include "Clownfish/Vector.h" +#include "Clownfish/Class.h" +#include "Clownfish/Util/Memory.h" +#include "Clownfish/Util/StringHelper.h" +#include "Lucy/Analysis/Token.h" +#include "Lucy/Analysis/Inversion.h" +#include "Lucy/Document/HitDoc.h" +#include "Lucy/Index/Segment.h" +#include "Lucy/Plan/FieldType.h" +#include "Lucy/Plan/Schema.h" +#include "Lucy/Store/InStream.h" +#include "Lucy/Store/OutStream.h" +#include "Lucy/Util/Freezer.h" + +#if defined(CHY_HAS_PCRE_H) + +#include + +static uint32_t +S_count_code_points(const char *string, size_t len); + +bool +RegexTokenizer_is_available(void) { + return true; +} + +RegexTokenizer* +RegexTokenizer_init(RegexTokenizer *self, String 
*pattern) { + Analyzer_init((Analyzer*)self); + RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); + + char *pattern_buf = NULL; + const char *pattern_ptr; + if (pattern) { + ivars->pattern = Str_Clone(pattern); + pattern_buf = Str_To_Utf8(ivars->pattern); + pattern_ptr = pattern_buf; + } + else { + pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*"; + ivars->pattern + = Str_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr)); + } + + int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK; +#ifdef PCRE_BSR_UNICODE + // Available since PCRE 7.4 + options |= PCRE_BSR_UNICODE; +#endif +#ifdef PCRE_NEWLINE_LF + // Available since PCRE 6.7 + options |= PCRE_NEWLINE_LF; +#endif + const char *err_ptr; + int err_offset; + pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL); + if (pattern_buf) { + FREEMEM(pattern_buf); + } + if (!re) { + THROW(ERR, "%s", err_ptr); + } + + // TODO: Check whether pcre_study improves performance + + ivars->token_re = re; + + return self; +} + +void +RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { + RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); + DECREF(ivars->pattern); + pcre *re = (pcre*)ivars->token_re; + if (re) { + pcre_free(re); + } + SUPER_DESTROY(self, REGEXTOKENIZER); +} + +void +RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string, + size_t string_len, Inversion *inversion) { + RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); + pcre *re = (pcre*)ivars->token_re; + int byte_offset = 0; + uint32_t cp_offset = 0; // Code points + int options = PCRE_NO_UTF8_CHECK; + int ovector[3]; + + int return_code = pcre_exec(re, NULL, string, string_len, byte_offset, + options, ovector, 3); + while (return_code >= 0) { + const char *match = string + ovector[0]; + size_t match_len = ovector[1] - ovector[0]; + + uint32_t cp_before = S_count_code_points(string + byte_offset, + ovector[0] - byte_offset); + uint32_t cp_start = cp_offset + cp_before; + uint32_t cp_matched = 
S_count_code_points(match, match_len); + uint32_t cp_end = cp_start + cp_matched; + + // Add a token to the new inversion. + Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1); + Inversion_Append(inversion, token); + + byte_offset = ovector[1]; + cp_offset = cp_end; + return_code = pcre_exec(re, NULL, string, string_len, byte_offset, + options, ovector, 3); + } + + if (return_code != PCRE_ERROR_NOMATCH) { + THROW(ERR, "pcre_exec failed: %d", return_code); + } +} + +static uint32_t +S_count_code_points(const char *string, size_t len) { + uint32_t num_code_points = 0; + size_t i = 0; + + while (i < len) { + i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; + ++num_code_points; + } + + if (i != len) { + THROW(ERR, "Match between code point boundaries in '%s'", string); + } + + return num_code_points; +} + +#else // CHY_HAS_PCRE_H + +bool +RegexTokenizer_is_available(void) { + return false; +} + +RegexTokenizer* +RegexTokenizer_init(RegexTokenizer *self, String *pattern) { + UNUSED_VAR(self); + UNUSED_VAR(pattern); + THROW(ERR, + "RegexTokenizer is not available because Lucy was compiled" + " without PCRE."); + UNREACHABLE_RETURN(RegexTokenizer*); +} + +void +RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) { + UNUSED_VAR(self); + UNUSED_VAR(token_re); + THROW(ERR, + "RegexTokenizer is not available because Lucy was compiled" + " without PCRE."); +} + +void +RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { + UNUSED_VAR(self); + THROW(ERR, + "RegexTokenizer is not available because Lucy was compiled" + " without PCRE."); +} + +void +RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string, + size_t string_len, Inversion *inversion) { + UNUSED_VAR(self); + UNUSED_VAR(string); + UNUSED_VAR(string_len); + UNUSED_VAR(inversion); + THROW(ERR, + "RegexTokenizer is not available because Lucy was compiled" + " without PCRE."); +} + +#endif // CHY_HAS_PCRE_H + +/********************************** Doc 
********************************/ + +Doc* +Doc_init(Doc *self, void *fields, int32_t doc_id) { + DocIVARS *const ivars = Doc_IVARS(self); + Hash *hash; + + if (fields) { + hash = (Hash *)INCREF(CERTIFY(fields, HASH)); + } + else { + hash = Hash_new(0); + } + ivars->fields = hash; + ivars->doc_id = doc_id; + + return self; +} + +void +Doc_Set_Fields_IMP(Doc *self, void *fields) { + DocIVARS *const ivars = Doc_IVARS(self); + DECREF(ivars->fields); + ivars->fields = CERTIFY(fields, HASH); +} + +uint32_t +Doc_Get_Size_IMP(Doc *self) { + Hash *hash = (Hash*)Doc_IVARS(self)->fields; + return Hash_Get_Size(hash); +} + +void +Doc_Store_IMP(Doc *self, String *field, Obj *value) { + Hash *hash = (Hash*)Doc_IVARS(self)->fields; + Hash_Store(hash, field, INCREF(value)); +} + +void +Doc_Serialize_IMP(Doc *self, OutStream *outstream) { + DocIVARS *const ivars = Doc_IVARS(self); + Hash *hash = (Hash*)ivars->fields; + Freezer_serialize_hash(hash, outstream); + OutStream_Write_C32(outstream, ivars->doc_id); +} + +Doc* +Doc_Deserialize_IMP(Doc *self, InStream *instream) { + DocIVARS *const ivars = Doc_IVARS(self); + ivars->fields = Freezer_read_hash(instream); + ivars->doc_id = InStream_Read_C32(instream); + return self; +} + +Obj* +Doc_Extract_IMP(Doc *self, String *field) { + Hash *hash = (Hash*)Doc_IVARS(self)->fields; + return INCREF(Hash_Fetch(hash, field)); +} + +Hash* +Doc_Dump_IMP(Doc *self) { + UNUSED_VAR(self); + THROW(ERR, "TODO"); + UNREACHABLE_RETURN(Hash*); +} + +Doc* +Doc_Load_IMP(Doc *self, Obj *dump) { + UNUSED_VAR(self); + UNUSED_VAR(dump); + THROW(ERR, "TODO"); + UNREACHABLE_RETURN(Doc*); +} + +bool +Doc_Equals_IMP(Doc *self, Obj *other) { + if ((Doc*)other == self) { return true; } + if (!Obj_is_a(other, DOC)) { return false; } + DocIVARS *const ivars = Doc_IVARS(self); + DocIVARS *const ovars = Doc_IVARS((Doc*)other); + return Hash_Equals((Hash*)ivars->fields, (Obj*)ovars->fields); +} + +void +Doc_Destroy_IMP(Doc *self) { + DocIVARS *const ivars = 
Doc_IVARS(self); + DECREF(ivars->fields); + SUPER_DESTROY(self, DOC); +} + + +/**************************** DocReader *****************************/ + +HitDoc* +DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) { + DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self); + Schema *const schema = ivars->schema; + InStream *const dat_in = ivars->dat_in; + InStream *const ix_in = ivars->ix_in; + Hash *const fields = Hash_new(1); + int64_t start; + uint32_t num_fields; + uint32_t field_name_cap = 31; + char *field_name = (char*)MALLOCATE(field_name_cap + 1); + + // Get data file pointer from index, read number of fields. + InStream_Seek(ix_in, (int64_t)doc_id * 8); + start = InStream_Read_U64(ix_in); + InStream_Seek(dat_in, start); + num_fields = InStream_Read_C32(dat_in); + + // Decode stored data and build up the doc field by field. + while (num_fields--) { + uint32_t field_name_len; + Obj *value; + FieldType *type; + + // Read field name. + field_name_len = InStream_Read_C32(dat_in); + if (field_name_len > field_name_cap) { + field_name_cap = field_name_len; + field_name = (char*)REALLOCATE(field_name, + field_name_cap + 1); + } + InStream_Read_Bytes(dat_in, field_name, field_name_len); + + // Find the Field's FieldType. + String *field_name_str = SSTR_WRAP_UTF8(field_name, field_name_len); + type = Schema_Fetch_Type(schema, field_name_str); + + // Read the field value. 
+ switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) { + case FType_TEXT: { + uint32_t value_len = InStream_Read_C32(dat_in); + char *buf = (char*)MALLOCATE(value_len + 1); + InStream_Read_Bytes(dat_in, buf, value_len); + buf[value_len] = '\0'; + value = (Obj*)Str_new_steal_utf8(buf, value_len); + break; + } + case FType_BLOB: { + uint32_t value_len = InStream_Read_C32(dat_in); + char *buf = (char*)MALLOCATE(value_len); + InStream_Read_Bytes(dat_in, buf, value_len); + value = (Obj*)Blob_new_steal(buf, value_len); + break; + } + case FType_FLOAT32: + value = (Obj*)Float_new(InStream_Read_F32(dat_in)); + break; + case FType_FLOAT64: + value = (Obj*)Float_new(InStream_Read_F64(dat_in)); + break; + case FType_INT32: + value = (Obj*)Int_new((int32_t)InStream_Read_C32(dat_in)); + break; + case FType_INT64: + value = (Obj*)Int_new((int64_t)InStream_Read_C64(dat_in)); + break; + default: + value = NULL; + THROW(ERR, "Unrecognized type: %o", type); + } + + // Store the value. + Hash_Store_Utf8(fields, field_name, field_name_len, value); + } + FREEMEM(field_name); + + HitDoc *retval = HitDoc_new(fields, doc_id, 0.0); + DECREF(fields); + return retval; +} + +/**************************** Inverter *****************************/ + +static InverterEntry* +S_fetch_entry(InverterIVARS *ivars, String *field) { + Schema *const schema = ivars->schema; + int32_t field_num = Seg_Field_Num(ivars->segment, field); + if (!field_num) { + // This field seems not to be in the segment yet. Try to find it in + // the Schema. + if (Schema_Fetch_Type(schema, field)) { + // The field is in the Schema. Get a field num from the Segment. + field_num = Seg_Add_Field(ivars->segment, field); + } + else { + // We've truly failed to find the field. The user must + // not have spec'd it. 
+ THROW(ERR, "Unknown field name: '%o'", field); + } + } + + InverterEntry *entry + = (InverterEntry*)Vec_Fetch(ivars->entry_pool, field_num); + if (!entry) { + entry = InvEntry_new(schema, (String*)field, field_num); + Vec_Store(ivars->entry_pool, field_num, (Obj*)entry); + } + return entry; +} + +void +Inverter_Invert_Doc_IMP(Inverter *self, Doc *doc) { + InverterIVARS *const ivars = Inverter_IVARS(self); + Hash *const fields = (Hash*)Doc_Get_Fields(doc); + + // Prepare for the new doc. + Inverter_Set_Doc(self, doc); + + // Extract and invert the doc's fields. + HashIterator *iter = HashIter_new(fields); + while (HashIter_Next(iter)) { + String *field = HashIter_Get_Key(iter); + Obj *obj = HashIter_Get_Value(iter); + + InverterEntry *inventry = S_fetch_entry(ivars, field); + InverterEntryIVARS *inventry_ivars = InvEntry_IVARS(inventry); + FieldType *type = inventry_ivars->type; + + // Get the field value. + switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) { + case FType_TEXT: { + CERTIFY(obj, STRING); + break; + } + case FType_BLOB: { + CERTIFY(obj, BLOB); + break; + } + case FType_INT32: + case FType_INT64: { + CERTIFY(obj, INTEGER); + break; + } + case FType_FLOAT32: + case FType_FLOAT64: { + CERTIFY(obj, FLOAT); + break; + } + default: + THROW(ERR, "Unrecognized type: %o", type); + } + + if (inventry_ivars->value != obj) { + DECREF(inventry_ivars->value); + inventry_ivars->value = INCREF(obj); + } + + Inverter_Add_Field(self, inventry); + } + DECREF(iter); +} + + From dab9a88d5c654a2539e4a81db4d74f88cc976cf3 Mon Sep 17 00:00:00 2001 From: Marvin Humphrey Date: Thu, 16 Jul 2015 20:32:45 -0700 Subject: [PATCH 3/8] Make it possible to reference Go objects from C. This patch is a variant on sample code written by Nick Wellnhofer. 
--- go/lucy/registry.go | 135 +++++++++++++++++++++++++++++++++++++++ go/lucy/registry_test.go | 84 ++++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 go/lucy/registry.go create mode 100644 go/lucy/registry_test.go diff --git a/go/lucy/registry.go b/go/lucy/registry.go new file mode 100644 index 000000000..86719316b --- /dev/null +++ b/go/lucy/registry.go @@ -0,0 +1,135 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package lucy + +import "sync" + +type indexInt uintptr + +type objRegistry struct { + // Use pointer to array to guarantee atomic update for lock-free reads. + // Assume that loads and stores of the pointer are atomic. + entries *[]interface{} + freeListHead indexInt + mutex sync.Mutex +} + +func newObjRegistry(size uintptr) *objRegistry { + entries := make([]interface{}, size) + + // Each empty entry points to the index of the next empty entry. Index 0 + // is unused. The last slot is set to a terminating sentinel value of 0. 
+ entries[0] = indexInt(0) // unused + for i := uintptr(1); i < size - 1; i++ { + entries[i] = indexInt(i + 1) + } + entries[size-1] = indexInt(0) + + reg := &objRegistry{} + reg.entries = &entries + reg.freeListHead = indexInt(1) + + return reg +} + +func (reg *objRegistry) store(obj interface{}) uintptr { + reg.mutex.Lock() + + // Find the index of the next empty slot. + index := uintptr(reg.freeListHead) + + entries := reg.entries + + if (index != 0) { + // A slot is available. It contains the index of the next available + // slot; put that index into the freeListHead. + reg.freeListHead = (*entries)[index].(indexInt) + } else { + // The sentinel value was encountered, indicating that we are out of + // space and must grow the entries array. + + // The list head was 0, a slot we don't want to use. Figure out what + // slot we're going to use instead. If the current size of the + // entries array is 8, and will soon be 16, use slot 8. + index = uintptr(len(*entries)) + + // Duplicate the array and copy in the existing entries data. + newSize := index * 2 + newEntries := make([]interface{}, newSize) + copy(newEntries, *entries) + + // Set up each new empty slot to point at another new empty slot, up + // to the final slot which will get the sentinel value 0. + for i := index + 1; i < newSize - 1; i++ { + newEntries[i] = indexInt(i + 1) + } + newEntries[newSize - 1] = indexInt(0) + entries = &newEntries + reg.entries = entries + + // Set the freeListHead to one greater than the slot we're using this + // time -- i.e. if the current size is 8, the new size is 16, and the + // slot we use for the supplied value is 8, then the new list head + // will be 9. + reg.freeListHead = indexInt(index + 1) + } + + // Store the supplied value in the slot. + (*entries)[index] = obj + + reg.mutex.Unlock() + + return index +} + +func (reg *objRegistry) fetch(index uintptr) interface{} { + + // Ignore an out of range request. 
+ if index >= uintptr(len(*reg.entries)) { + return nil + } + entry := (*reg.entries)[index] + if _, ok := entry.(indexInt); ok { + // Return nil if the slot is empty. + return nil + } + return entry +} + +func (reg *objRegistry) delete(index uintptr) { + reg.mutex.Lock() + + // Overwrite the value at the supplied index with the freeListHead. For + // example, if you are storing strings and the entries array consists of + // {0, "A", "B", "C", 5, 6, 7, 0}, with freeListHead at 4, then deleting + // index 2 (string value "B") will result in the following state: + // {0, "A", 4, "C", 5, 6, 7, 0} and freeListHead at 2. + // + // Some potential errors are ignored: + // * Index is greater than or equal to the size of the array. + // * Slot is empty. + if index < uintptr(len(*reg.entries)) { + _, isIndexInt := (*reg.entries)[index].(indexInt) + if !isIndexInt { + (*reg.entries)[index] = reg.freeListHead + reg.freeListHead = indexInt(index) + } + } + + reg.mutex.Unlock() +} + diff --git a/go/lucy/registry_test.go b/go/lucy/registry_test.go new file mode 100644 index 000000000..ea4f9da61 --- /dev/null +++ b/go/lucy/registry_test.go @@ -0,0 +1,84 @@ +/* Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package lucy + +import "testing" +import "math/rand" + +func TestRegistrySingle(t *testing.T) { + reg := newObjRegistry(4) + index := reg.store(42) + if intVal, ok := reg.fetch(index).(int); !ok || intVal != 42 { + t.Error("Failed to store/fetch int") + } + reg.delete(index) + if reg.fetch(index) != nil { + t.Error("Failed to delete int") + } +} + +func TestRegistryMany(t *testing.T) { + reg := newObjRegistry(4) + stored := make(map[int]uintptr) + deleted := make(map[int]uintptr) + for i := 0; i < 1000; i++ { + if rand.Intn(10) == 0 { + // Randomly delete an element 10% of the time. + goner := rand.Intn(i - 1) + if index, ok := stored[goner]; ok { + reg.delete(index) + delete(stored, goner) + deleted[goner] = index + } + } + stored[i] = reg.store(i) + } + for expected, index := range stored { + got, ok := reg.fetch(index).(int) + if !ok { + t.Errorf("Failed to fetch stored value %d at index %d", expected, index) + } else if got != expected { + t.Errorf("Expected %d got %d", expected, got) + } + } + for i := 0; i < len(*reg.entries) - 1; i++ { + got, ok := reg.fetch(uintptr(i)).(int) + if ok { + if _, wasDeleted := deleted[got]; wasDeleted { + t.Errorf("Deleted item %d still present at index %d", got, i) + } + } + } +} + +func TestRegistryStringSlice(t *testing.T) { + reg := newObjRegistry(4) + s := make([]int, 2) + index := reg.store(&s) + s2 := reg.fetch(index).(*[]int) + (*s2)[1] = 1000 + if s[1] != 1000 { + t.Error("Not the same slice") + } +} + +func TestRegistryRange(t *testing.T) { + reg := newObjRegistry(4) + if reg.fetch(uintptr(10)) != nil { + t.Error("Out of range index should return nil") + } +} From 44fc440fdc419b655fb4c482afb63b9020138011 Mon Sep 17 00:00:00 2001 From: Marvin Humphrey Date: Sun, 19 Jul 2015 12:57:13 -0700 Subject: [PATCH 4/8] Port RegexTokenizer stubs to CGO. 
--- go/cfext/lucy.c | 157 +++--------------------------------------------- go/lucy/lucy.go | 48 +++++++++++++++ 2 files changed, 56 insertions(+), 149 deletions(-) diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c index d1044dfe3..5773f16fe 100644 --- a/go/cfext/lucy.c +++ b/go/cfext/lucy.c @@ -55,175 +55,34 @@ #include "Lucy/Store/OutStream.h" #include "Lucy/Util/Freezer.h" -#if defined(CHY_HAS_PCRE_H) - -#include - -static uint32_t -S_count_code_points(const char *string, size_t len); - bool RegexTokenizer_is_available(void) { - return true; + return false; } RegexTokenizer* -RegexTokenizer_init(RegexTokenizer *self, String *pattern) { - Analyzer_init((Analyzer*)self); - RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); - - char *pattern_buf = NULL; - const char *pattern_ptr; - if (pattern) { - ivars->pattern = Str_Clone(pattern); - pattern_buf = Str_To_Utf8(ivars->pattern); - pattern_ptr = pattern_buf; - } - else { - pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*"; - ivars->pattern - = Str_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr)); - } - - int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK; -#ifdef PCRE_BSR_UNICODE - // Available since PCRE 7.4 - options |= PCRE_BSR_UNICODE; -#endif -#ifdef PCRE_NEWLINE_LF - // Available since PCRE 6.7 - options |= PCRE_NEWLINE_LF; -#endif - const char *err_ptr; - int err_offset; - pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL); - if (pattern_buf) { - FREEMEM(pattern_buf); - } - if (!re) { - THROW(ERR, "%s", err_ptr); - } - - // TODO: Check whether pcre_study improves performance - - ivars->token_re = re; - - return self; -} - -void -RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { - RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); - DECREF(ivars->pattern); - pcre *re = (pcre*)ivars->token_re; - if (re) { - pcre_free(re); - } - SUPER_DESTROY(self, REGEXTOKENIZER); -} - -void -RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string, - size_t string_len, 
Inversion *inversion) { - RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self); - pcre *re = (pcre*)ivars->token_re; - int byte_offset = 0; - uint32_t cp_offset = 0; // Code points - int options = PCRE_NO_UTF8_CHECK; - int ovector[3]; - - int return_code = pcre_exec(re, NULL, string, string_len, byte_offset, - options, ovector, 3); - while (return_code >= 0) { - const char *match = string + ovector[0]; - size_t match_len = ovector[1] - ovector[0]; - - uint32_t cp_before = S_count_code_points(string + byte_offset, - ovector[0] - byte_offset); - uint32_t cp_start = cp_offset + cp_before; - uint32_t cp_matched = S_count_code_points(match, match_len); - uint32_t cp_end = cp_start + cp_matched; - - // Add a token to the new inversion. - Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1); - Inversion_Append(inversion, token); - - byte_offset = ovector[1]; - cp_offset = cp_end; - return_code = pcre_exec(re, NULL, string, string_len, byte_offset, - options, ovector, 3); - } - - if (return_code != PCRE_ERROR_NOMATCH) { - THROW(ERR, "pcre_exec failed: %d", return_code); - } -} - -static uint32_t -S_count_code_points(const char *string, size_t len) { - uint32_t num_code_points = 0; - size_t i = 0; - - while (i < len) { - i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; - ++num_code_points; - } - - if (i != len) { - THROW(ERR, "Match between code point boundaries in '%s'", string); - } - - return num_code_points; -} - -#else // CHY_HAS_PCRE_H - -bool -RegexTokenizer_is_available(void) { - return false; -} +(*GOLUCY_RegexTokenizer_init_BRIDGE)(RegexTokenizer *self, String *pattern); RegexTokenizer* RegexTokenizer_init(RegexTokenizer *self, String *pattern) { - UNUSED_VAR(self); - UNUSED_VAR(pattern); - THROW(ERR, - "RegexTokenizer is not available because Lucy was compiled" - " without PCRE."); - UNREACHABLE_RETURN(RegexTokenizer*); + return GOLUCY_RegexTokenizer_init_BRIDGE(self, pattern); } -void -RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, 
void *token_re) { - UNUSED_VAR(self); - UNUSED_VAR(token_re); - THROW(ERR, - "RegexTokenizer is not available because Lucy was compiled" - " without PCRE."); -} +RegexTokenizer_Destroy_t GOLUCY_RegexTokenizer_Destroy_BRIDGE; void RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { - UNUSED_VAR(self); - THROW(ERR, - "RegexTokenizer is not available because Lucy was compiled" - " without PCRE."); + GOLUCY_RegexTokenizer_Destroy_BRIDGE(self); } +RegexTokenizer_Tokenize_Utf8_t GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE; + void RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string, size_t string_len, Inversion *inversion) { - UNUSED_VAR(self); - UNUSED_VAR(string); - UNUSED_VAR(string_len); - UNUSED_VAR(inversion); - THROW(ERR, - "RegexTokenizer is not available because Lucy was compiled" - " without PCRE."); + GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE(self, string, string_len, inversion); } -#endif // CHY_HAS_PCRE_H - /********************************** Doc ********************************/ Doc* diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go index 908599af3..13bdafa96 100644 --- a/go/lucy/lucy.go +++ b/go/lucy/lucy.go @@ -17,11 +17,59 @@ package lucy /* +#define C_LUCY_REGEXTOKENIZER + #include "lucy_parcel.h" +#include "Lucy/Analysis/RegexTokenizer.h" + +extern lucy_RegexTokenizer* +GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern); +extern lucy_RegexTokenizer* +(*GOLUCY_RegexTokenizer_init_BRIDGE)(lucy_RegexTokenizer *self, + cfish_String *pattern); +extern void +GOLUCY_RegexTokenizer_Destroy(lucy_RegexTokenizer *self); +extern void +(*GOLUCY_RegexTokenizer_Destroy_BRIDGE)(lucy_RegexTokenizer *self); +extern void +GOLUCY_RegexTokenizer_Tokenize_Utf8(lucy_RegexTokenizer *self, char *str, + size_t string_len, lucy_Inversion *inversion); +extern void +(*GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE)(lucy_RegexTokenizer *self, const char *str, + size_t string_len, lucy_Inversion *inversion); + + +// C symbols linked into a 
Go-built package archive are not visible to +// external C code -- but internal code *can* see symbols from outside. +// This allows us to fake up symbol export by assigning values only known +// internally to external symbols during Go package initialization. +static CFISH_INLINE void +GOLUCY_glue_exported_symbols() { + GOLUCY_RegexTokenizer_init_BRIDGE = GOLUCY_RegexTokenizer_init; + GOLUCY_RegexTokenizer_Destroy_BRIDGE = GOLUCY_RegexTokenizer_Destroy; + GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE + = (LUCY_RegexTokenizer_Tokenize_Utf8_t)GOLUCY_RegexTokenizer_Tokenize_Utf8; +} + */ import "C" import _ "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish" func init() { + C.GOLUCY_glue_exported_symbols() C.lucy_bootstrap_parcel() } + +//export GOLUCY_RegexTokenizer_init +func GOLUCY_RegexTokenizer_init(rt *C.lucy_RegexTokenizer, pattern *C.cfish_String) *C.lucy_RegexTokenizer { + return nil +} + +//export GOLUCY_RegexTokenizer_Destroy +func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) { +} + +//export GOLUCY_RegexTokenizer_Tokenize_Utf8 +func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char, + stringLen C.size_t, inversion *C.lucy_Inversion) { +} From 8f634425b5390423d7c4013b28710e4ddc92bf0b Mon Sep 17 00:00:00 2001 From: Marvin Humphrey Date: Sun, 19 Jul 2015 12:57:13 -0700 Subject: [PATCH 5/8] Port Doc code to CGO.
--- go/cfext/lucy.c | 67 ++++++++++------------- go/lucy/lucy.go | 141 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+), 39 deletions(-) diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c index 5773f16fe..e2719bc2d 100644 --- a/go/cfext/lucy.c +++ b/go/cfext/lucy.c @@ -86,61 +86,53 @@ RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string, /********************************** Doc ********************************/ Doc* -Doc_init(Doc *self, void *fields, int32_t doc_id) { - DocIVARS *const ivars = Doc_IVARS(self); - Hash *hash; - - if (fields) { - hash = (Hash *)INCREF(CERTIFY(fields, HASH)); - } - else { - hash = Hash_new(0); - } - ivars->fields = hash; - ivars->doc_id = doc_id; +(*GOLUCY_Doc_init_BRIDGE)(Doc *self, void *fields, int32_t doc_id); - return self; +Doc* +Doc_init(Doc *self, void *fields, int32_t doc_id) { + return GOLUCY_Doc_init_BRIDGE(self, fields, doc_id); } +Doc_Set_Fields_t GOLUCY_Doc_Set_Fields_BRIDGE; + void Doc_Set_Fields_IMP(Doc *self, void *fields) { - DocIVARS *const ivars = Doc_IVARS(self); - DECREF(ivars->fields); - ivars->fields = CERTIFY(fields, HASH); + GOLUCY_Doc_Set_Fields_BRIDGE(self, fields); } +Doc_Get_Size_t GOLUCY_Doc_Get_Size_BRIDGE; + uint32_t Doc_Get_Size_IMP(Doc *self) { - Hash *hash = (Hash*)Doc_IVARS(self)->fields; - return Hash_Get_Size(hash); + return GOLUCY_Doc_Get_Size_BRIDGE(self); } +Doc_Store_t GOLUCY_Doc_Store_BRIDGE; + void Doc_Store_IMP(Doc *self, String *field, Obj *value) { - Hash *hash = (Hash*)Doc_IVARS(self)->fields; - Hash_Store(hash, field, INCREF(value)); + GOLUCY_Doc_Store_BRIDGE(self, field, value); } +Doc_Serialize_t GOLUCY_Doc_Serialize_BRIDGE; + void Doc_Serialize_IMP(Doc *self, OutStream *outstream) { - DocIVARS *const ivars = Doc_IVARS(self); - Hash *hash = (Hash*)ivars->fields; - Freezer_serialize_hash(hash, outstream); - OutStream_Write_C32(outstream, ivars->doc_id); + GOLUCY_Doc_Serialize_BRIDGE(self, outstream); } +Doc_Deserialize_t 
GOLUCY_Doc_Deserialize_BRIDGE; + Doc* Doc_Deserialize_IMP(Doc *self, InStream *instream) { - DocIVARS *const ivars = Doc_IVARS(self); - ivars->fields = Freezer_read_hash(instream); - ivars->doc_id = InStream_Read_C32(instream); - return self; + return GOLUCY_Doc_Deserialize_BRIDGE(self, instream); } +Doc_Extract_t GOLUCY_Doc_Extract_BRIDGE; + Obj* Doc_Extract_IMP(Doc *self, String *field) { - Hash *hash = (Hash*)Doc_IVARS(self)->fields; - return INCREF(Hash_Fetch(hash, field)); + return GOLUCY_Doc_Extract_BRIDGE(self, field); } Hash* @@ -158,23 +150,20 @@ Doc_Load_IMP(Doc *self, Obj *dump) { UNREACHABLE_RETURN(Doc*); } +Doc_Equals_t GOLUCY_Doc_Equals_BRIDGE; + bool Doc_Equals_IMP(Doc *self, Obj *other) { - if ((Doc*)other == self) { return true; } - if (!Obj_is_a(other, DOC)) { return false; } - DocIVARS *const ivars = Doc_IVARS(self); - DocIVARS *const ovars = Doc_IVARS((Doc*)other); - return Hash_Equals((Hash*)ivars->fields, (Obj*)ovars->fields); + return GOLUCY_Doc_Equals_BRIDGE(self, other); } +Doc_Destroy_t GOLUCY_Doc_Destroy_BRIDGE; + void Doc_Destroy_IMP(Doc *self) { - DocIVARS *const ivars = Doc_IVARS(self); - DECREF(ivars->fields); - SUPER_DESTROY(self, DOC); + GOLUCY_Doc_Destroy_BRIDGE(self); } - /**************************** DocReader *****************************/ HitDoc* diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go index 13bdafa96..7d5579887 100644 --- a/go/lucy/lucy.go +++ b/go/lucy/lucy.go @@ -17,10 +17,17 @@ package lucy /* +#define C_LUCY_DOC #define C_LUCY_REGEXTOKENIZER #include "lucy_parcel.h" #include "Lucy/Analysis/RegexTokenizer.h" +#include "Lucy/Document/Doc.h" + +#include "Clownfish/Hash.h" +#include "Lucy/Store/InStream.h" +#include "Lucy/Store/OutStream.h" +#include "Lucy/Util/Freezer.h" extern lucy_RegexTokenizer* GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern); @@ -38,6 +45,44 @@ extern void (*GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE)(lucy_RegexTokenizer *self, const char *str, size_t string_len, 
lucy_Inversion *inversion); +extern lucy_Doc* +GOLUCY_Doc_init(lucy_Doc *doc, void *fields, int32_t doc_id); +extern lucy_Doc* +(*GOLUCY_Doc_init_BRIDGE)(lucy_Doc *doc, void *fields, int32_t doc_id); +extern void +GOLUCY_Doc_Set_Fields(lucy_Doc *self, void *fields); +extern void +(*GOLUCY_Doc_Set_Fields_BRIDGE)(lucy_Doc *self, void *fields); +extern uint32_t +GOLUCY_Doc_Get_Size(lucy_Doc *self); +extern uint32_t +(*GOLUCY_Doc_Get_Size_BRIDGE)(lucy_Doc *self); +extern void +GOLUCY_Doc_Store(lucy_Doc *self, cfish_String *field, cfish_Obj *value); +extern void +(*GOLUCY_Doc_Store_BRIDGE)(lucy_Doc *self, cfish_String *field, cfish_Obj *value); +extern void +GOLUCY_Doc_Serialize(lucy_Doc *self, lucy_OutStream *outstream); +extern void +(*GOLUCY_Doc_Serialize_BRIDGE)(lucy_Doc *self, lucy_OutStream *outstream); +extern lucy_Doc* +GOLUCY_Doc_Deserialize(lucy_Doc *self, lucy_InStream *instream); +extern lucy_Doc* +(*GOLUCY_Doc_Deserialize_BRIDGE)(lucy_Doc *self, lucy_InStream *instream); +extern cfish_Obj* +GOLUCY_Doc_Extract(lucy_Doc *self, cfish_String *field); +extern cfish_Obj* +(*GOLUCY_Doc_Extract_BRIDGE)(lucy_Doc *self, cfish_String *field); +extern bool +GOLUCY_Doc_Equals(lucy_Doc *self, cfish_Obj *other); +extern bool +(*GOLUCY_Doc_Equals_BRIDGE)(lucy_Doc *self, cfish_Obj *other); +extern void +GOLUCY_Doc_Destroy(lucy_Doc *self); +extern void +(*GOLUCY_Doc_Destroy_BRIDGE)(lucy_Doc *self); + + // C symbols linked into a Go-built package archive are not visible to // external C code -- but internal code *can* see symbols from outside. 
@@ -49,10 +94,20 @@ GOLUCY_glue_exported_symbols() { GOLUCY_RegexTokenizer_Destroy_BRIDGE = GOLUCY_RegexTokenizer_Destroy; GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE = (LUCY_RegexTokenizer_Tokenize_Utf8_t)GOLUCY_RegexTokenizer_Tokenize_Utf8; + GOLUCY_Doc_init_BRIDGE = GOLUCY_Doc_init; + GOLUCY_Doc_Set_Fields_BRIDGE = GOLUCY_Doc_Set_Fields; + GOLUCY_Doc_Get_Size_BRIDGE = GOLUCY_Doc_Get_Size; + GOLUCY_Doc_Store_BRIDGE = GOLUCY_Doc_Store; + GOLUCY_Doc_Serialize_BRIDGE = GOLUCY_Doc_Serialize; + GOLUCY_Doc_Deserialize_BRIDGE = GOLUCY_Doc_Deserialize; + GOLUCY_Doc_Extract_BRIDGE = GOLUCY_Doc_Extract; + GOLUCY_Doc_Equals_BRIDGE = GOLUCY_Doc_Equals; + GOLUCY_Doc_Destroy_BRIDGE = GOLUCY_Doc_Destroy; } */ import "C" +import "unsafe" import _ "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish" func init() { @@ -73,3 +128,89 @@ func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) { func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char, stringLen C.size_t, inversion *C.lucy_Inversion) { } + +func NewDoc(docID int32) Doc { + retvalCF := C.lucy_Doc_new(nil, C.int32_t(docID)) + return WRAPDoc(unsafe.Pointer(retvalCF)) +} + +//export GOLUCY_Doc_init +func GOLUCY_Doc_init(d *C.lucy_Doc, fields unsafe.Pointer, docID C.int32_t) *C.lucy_Doc { + ivars := C.lucy_Doc_IVARS(d) + if fields != nil { + ivars.fields = unsafe.Pointer(C.cfish_inc_refcount(fields)) + } else { + ivars.fields = unsafe.Pointer(C.cfish_Hash_new(0)) + } + ivars.doc_id = docID + return d +} + +//export GOLUCY_Doc_Set_Fields +func GOLUCY_Doc_Set_Fields(d *C.lucy_Doc, fields unsafe.Pointer) { + ivars := C.lucy_Doc_IVARS(d) + temp := ivars.fields + ivars.fields = unsafe.Pointer(C.cfish_inc_refcount(fields)) + C.cfish_decref(temp) +} + +//export GOLUCY_Doc_Get_Size +func GOLUCY_Doc_Get_Size(d *C.lucy_Doc) C.uint32_t { + ivars := C.lucy_Doc_IVARS(d) + hash := ((*C.cfish_Hash)(ivars.fields)) + return C.uint32_t(C.CFISH_Hash_Get_Size(hash)) +} + +//export 
GOLUCY_Doc_Store +func GOLUCY_Doc_Store(d *C.lucy_Doc, field *C.cfish_String, value *C.cfish_Obj) { + ivars := C.lucy_Doc_IVARS(d) + hash := (*C.cfish_Hash)(ivars.fields) + C.CFISH_Hash_Store(hash, field, C.cfish_inc_refcount(unsafe.Pointer(value))) +} + +//export GOLUCY_Doc_Serialize +func GOLUCY_Doc_Serialize(d *C.lucy_Doc, outstream *C.lucy_OutStream) { + ivars := C.lucy_Doc_IVARS(d) + hash := (*C.cfish_Hash)(ivars.fields) + C.lucy_Freezer_serialize_hash(hash, outstream) + C.LUCY_OutStream_Write_C32(outstream, C.uint32_t(ivars.doc_id)) +} + +//export GOLUCY_Doc_Deserialize +func GOLUCY_Doc_Deserialize(d *C.lucy_Doc, instream *C.lucy_InStream) *C.lucy_Doc { + ivars := C.lucy_Doc_IVARS(d) + ivars.fields = unsafe.Pointer(C.lucy_Freezer_read_hash(instream)) + ivars.doc_id = C.int32_t(C.LUCY_InStream_Read_C32(instream)) + return d +} + +//export GOLUCY_Doc_Extract +func GOLUCY_Doc_Extract(d *C.lucy_Doc, field *C.cfish_String) *C.cfish_Obj { + ivars := C.lucy_Doc_IVARS(d) + hash := (*C.cfish_Hash)(ivars.fields) + val := C.CFISH_Hash_Fetch(hash, field) + return C.cfish_inc_refcount(unsafe.Pointer(val)) +} + +//export GOLUCY_Doc_Equals +func GOLUCY_Doc_Equals(d *C.lucy_Doc, other *C.cfish_Obj) C.bool { + twin := (*C.lucy_Doc)(unsafe.Pointer(other)) + if twin == d { + return true + } + if !C.cfish_Obj_is_a(other, C.LUCY_DOC) { + return false + } + ivars := C.lucy_Doc_IVARS(d) + ovars := C.lucy_Doc_IVARS(twin) + hash := (*C.cfish_Hash)(ivars.fields) + otherHash := (*C.cfish_Obj)(ovars.fields) + return C.CFISH_Hash_Equals(hash, otherHash) +} + +//export GOLUCY_Doc_Destroy +func GOLUCY_Doc_Destroy(d *C.lucy_Doc) { + ivars := C.lucy_Doc_IVARS(d) + C.cfish_decref(unsafe.Pointer(ivars.fields)) + C.cfish_super_destroy(unsafe.Pointer(d), C.LUCY_DOC) +} From acd74d27985e635606c775a44d7c5c718583d76f Mon Sep 17 00:00:00 2001 From: Marvin Humphrey Date: Sat, 18 Jul 2015 14:49:30 -0700 Subject: [PATCH 6/8] Port Inverter and InverterEntry to CGO. 
--- go/cfext/lucy.c | 76 +----------------------------------- go/lucy/lucy.go | 101 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 75 deletions(-) diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c index e2719bc2d..f10a23f93 100644 --- a/go/cfext/lucy.c +++ b/go/cfext/lucy.c @@ -249,83 +249,11 @@ DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) { /**************************** Inverter *****************************/ -static InverterEntry* -S_fetch_entry(InverterIVARS *ivars, String *field) { - Schema *const schema = ivars->schema; - int32_t field_num = Seg_Field_Num(ivars->segment, field); - if (!field_num) { - // This field seems not to be in the segment yet. Try to find it in - // the Schema. - if (Schema_Fetch_Type(schema, field)) { - // The field is in the Schema. Get a field num from the Segment. - field_num = Seg_Add_Field(ivars->segment, field); - } - else { - // We've truly failed to find the field. The user must - // not have spec'd it. - THROW(ERR, "Unknown field name: '%o'", field); - } - } - - InverterEntry *entry - = (InverterEntry*)Vec_Fetch(ivars->entry_pool, field_num); - if (!entry) { - entry = InvEntry_new(schema, (String*)field, field_num); - Vec_Store(ivars->entry_pool, field_num, (Obj*)entry); - } - return entry; -} +Inverter_Invert_Doc_t GOLUCY_Inverter_Invert_Doc_BRIDGE; void Inverter_Invert_Doc_IMP(Inverter *self, Doc *doc) { - InverterIVARS *const ivars = Inverter_IVARS(self); - Hash *const fields = (Hash*)Doc_Get_Fields(doc); - - // Prepare for the new doc. - Inverter_Set_Doc(self, doc); - - // Extract and invert the doc's fields. - HashIterator *iter = HashIter_new(fields); - while (HashIter_Next(iter)) { - String *field = HashIter_Get_Key(iter); - Obj *obj = HashIter_Get_Value(iter); - - InverterEntry *inventry = S_fetch_entry(ivars, field); - InverterEntryIVARS *inventry_ivars = InvEntry_IVARS(inventry); - FieldType *type = inventry_ivars->type; - - // Get the field value. 
- switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) { - case FType_TEXT: { - CERTIFY(obj, STRING); - break; - } - case FType_BLOB: { - CERTIFY(obj, BLOB); - break; - } - case FType_INT32: - case FType_INT64: { - CERTIFY(obj, INTEGER); - break; - } - case FType_FLOAT32: - case FType_FLOAT64: { - CERTIFY(obj, FLOAT); - break; - } - default: - THROW(ERR, "Unrecognized type: %o", type); - } - - if (inventry_ivars->value != obj) { - DECREF(inventry_ivars->value); - inventry_ivars->value = INCREF(obj); - } - - Inverter_Add_Field(self, inventry); - } - DECREF(iter); + GOLUCY_Inverter_Invert_Doc_BRIDGE(self, doc); } diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go index 7d5579887..664a2001b 100644 --- a/go/lucy/lucy.go +++ b/go/lucy/lucy.go @@ -19,12 +19,20 @@ package lucy /* #define C_LUCY_DOC #define C_LUCY_REGEXTOKENIZER +#define C_LUCY_INVERTER +#define C_LUCY_INVERTERENTRY #include "lucy_parcel.h" #include "Lucy/Analysis/RegexTokenizer.h" #include "Lucy/Document/Doc.h" +#include "Lucy/Index/Inverter.h" #include "Clownfish/Hash.h" +#include "Clownfish/HashIterator.h" +#include "Clownfish/Vector.h" +#include "Lucy/Plan/FieldType.h" +#include "Lucy/Plan/Schema.h" +#include "Lucy/Index/Segment.h" #include "Lucy/Store/InStream.h" #include "Lucy/Store/OutStream.h" #include "Lucy/Util/Freezer.h" @@ -82,6 +90,10 @@ GOLUCY_Doc_Destroy(lucy_Doc *self); extern void (*GOLUCY_Doc_Destroy_BRIDGE)(lucy_Doc *self); +extern void +GOLUCY_Inverter_Invert_Doc(lucy_Inverter *self, lucy_Doc *doc); +extern void +(*GOLUCY_Inverter_Invert_Doc_BRIDGE)(lucy_Inverter *self, lucy_Doc *doc); // C symbols linked into a Go-built package archive are not visible to @@ -103,12 +115,14 @@ GOLUCY_glue_exported_symbols() { GOLUCY_Doc_Extract_BRIDGE = GOLUCY_Doc_Extract; GOLUCY_Doc_Equals_BRIDGE = GOLUCY_Doc_Equals; GOLUCY_Doc_Destroy_BRIDGE = GOLUCY_Doc_Destroy; + GOLUCY_Inverter_Invert_Doc_BRIDGE = GOLUCY_Inverter_Invert_Doc; } */ import "C" import "unsafe" -import _
"git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish" +import "fmt" +import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish" func init() { C.GOLUCY_glue_exported_symbols() @@ -214,3 +228,88 @@ func GOLUCY_Doc_Destroy(d *C.lucy_Doc) { C.cfish_decref(unsafe.Pointer(ivars.fields)) C.cfish_super_destroy(unsafe.Pointer(d), C.LUCY_DOC) } + +func fetchEntry(ivars *C.lucy_InverterIVARS, field *C.cfish_String) *C.lucy_InverterEntry { + schema := ivars.schema + fieldNum := C.LUCY_Seg_Field_Num(ivars.segment, field) + if fieldNum == 0 { + // This field seems not to be in the segment yet. Try to find it in + // the Schema. + if C.LUCY_Schema_Fetch_Type(schema, field) != nil { + // The field is in the Schema. Get a field num from the Segment. + fieldNum = C.LUCY_Seg_Add_Field(ivars.segment, field) + } else { + // We've truly failed to find the field. The user must + // not have spec'd it. + fieldGo := clownfish.CFStringToGo(unsafe.Pointer(field)) + err := clownfish.NewErr("Unknown field name: '" + fieldGo + "'") + panic(err) + } + } + entry := C.CFISH_Vec_Fetch(ivars.entry_pool, C.size_t(fieldNum)) + if entry == nil { + newEntry := C.lucy_InvEntry_new(schema, field, fieldNum) + C.CFISH_Vec_Store(ivars.entry_pool, C.size_t(fieldNum), + (*C.cfish_Obj)(unsafe.Pointer(newEntry))) + return newEntry + } + return (*C.lucy_InverterEntry)(unsafe.Pointer(entry)) +} + +//export GOLUCY_Inverter_Invert_Doc +func GOLUCY_Inverter_Invert_Doc(inverter *C.lucy_Inverter, doc *C.lucy_Doc) { + ivars := C.lucy_Inverter_IVARS(inverter) + fields := (*C.cfish_Hash)(C.LUCY_Doc_Get_Fields(doc)) + + // Prepare for the new doc. + C.LUCY_Inverter_Set_Doc(inverter, doc) + + // Extract and invert the doc's fields.
+ iter := C.cfish_HashIter_new(fields) + for C.CFISH_HashIter_Next(iter) { + field := C.CFISH_HashIter_Get_Key(iter) + obj := C.CFISH_HashIter_Get_Value(iter) + if obj == nil { + mess := "Invalid nil value for field" + clownfish.CFStringToGo(unsafe.Pointer(field)) + panic(clownfish.NewErr(mess)) + } + + inventry := fetchEntry(ivars, field) + inventryIvars := C.lucy_InvEntry_IVARS(inventry) + fieldType := inventryIvars._type + + // Get the field value. + var expectedType *C.cfish_Class + switch C.LUCY_FType_Primitive_ID(fieldType) & C.lucy_FType_PRIMITIVE_ID_MASK { + case C.lucy_FType_TEXT: + expectedType = C.CFISH_STRING + case C.lucy_FType_BLOB: + expectedType = C.CFISH_BLOB + case C.lucy_FType_INT32: + expectedType = C.CFISH_INTEGER + case C.lucy_FType_INT64: + expectedType = C.CFISH_INTEGER + case C.lucy_FType_FLOAT32: + expectedType = C.CFISH_FLOAT + case C.lucy_FType_FLOAT64: + expectedType = C.CFISH_FLOAT + default: + panic(clownfish.NewErr("Internal Lucy error: bad type id for field " + + clownfish.CFStringToGo(unsafe.Pointer(field)))) + } + if !C.cfish_Obj_is_a(obj, expectedType) { + className := C.cfish_Obj_get_class_name((*C.cfish_Obj)(unsafe.Pointer(fieldType))) + mess := fmt.Sprintf("Invalid type for field '%s': '%s'", + clownfish.CFStringToGo(unsafe.Pointer(field)), + clownfish.CFStringToGo(unsafe.Pointer(className))) + panic(clownfish.NewErr(mess)) + } + if inventryIvars.value != obj { + C.cfish_decref(unsafe.Pointer(inventryIvars.value)) + inventryIvars.value = C.cfish_inc_refcount(unsafe.Pointer(obj)) + } + + C.LUCY_Inverter_Add_Field(inverter, inventry) + } + C.cfish_dec_refcount(unsafe.Pointer(iter)) +} From 7749e595b2a6c32af57b904d1a00b066691ace37 Mon Sep 17 00:00:00 2001 From: Marvin Humphrey Date: Sun, 19 Jul 2015 12:57:13 -0700 Subject: [PATCH 7/8] Port DefDocReader code to CGO. 
--- go/cfext/lucy.c | 80 ++--------------------------------------- go/lucy/lucy.go | 94 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 77 deletions(-) diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c index f10a23f93..9e9b840f4 100644 --- a/go/cfext/lucy.c +++ b/go/cfext/lucy.c @@ -166,85 +166,11 @@ Doc_Destroy_IMP(Doc *self) { /**************************** DocReader *****************************/ +DefDocReader_Fetch_Doc_t GOLUCY_DefDocReader_Fetch_Doc_BRIDGE; + HitDoc* DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) { - DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self); - Schema *const schema = ivars->schema; - InStream *const dat_in = ivars->dat_in; - InStream *const ix_in = ivars->ix_in; - Hash *const fields = Hash_new(1); - int64_t start; - uint32_t num_fields; - uint32_t field_name_cap = 31; - char *field_name = (char*)MALLOCATE(field_name_cap + 1); - - // Get data file pointer from index, read number of fields. - InStream_Seek(ix_in, (int64_t)doc_id * 8); - start = InStream_Read_U64(ix_in); - InStream_Seek(dat_in, start); - num_fields = InStream_Read_C32(dat_in); - - // Decode stored data and build up the doc field by field. - while (num_fields--) { - uint32_t field_name_len; - Obj *value; - FieldType *type; - - // Read field name. - field_name_len = InStream_Read_C32(dat_in); - if (field_name_len > field_name_cap) { - field_name_cap = field_name_len; - field_name = (char*)REALLOCATE(field_name, - field_name_cap + 1); - } - InStream_Read_Bytes(dat_in, field_name, field_name_len); - - // Find the Field's FieldType. - String *field_name_str = SSTR_WRAP_UTF8(field_name, field_name_len); - type = Schema_Fetch_Type(schema, field_name_str); - - // Read the field value. 
- switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) { - case FType_TEXT: { - uint32_t value_len = InStream_Read_C32(dat_in); - char *buf = (char*)MALLOCATE(value_len + 1); - InStream_Read_Bytes(dat_in, buf, value_len); - buf[value_len] = '\0'; - value = (Obj*)Str_new_steal_utf8(buf, value_len); - break; - } - case FType_BLOB: { - uint32_t value_len = InStream_Read_C32(dat_in); - char *buf = (char*)MALLOCATE(value_len); - InStream_Read_Bytes(dat_in, buf, value_len); - value = (Obj*)Blob_new_steal(buf, value_len); - break; - } - case FType_FLOAT32: - value = (Obj*)Float_new(InStream_Read_F32(dat_in)); - break; - case FType_FLOAT64: - value = (Obj*)Float_new(InStream_Read_F64(dat_in)); - break; - case FType_INT32: - value = (Obj*)Int_new((int32_t)InStream_Read_C32(dat_in)); - break; - case FType_INT64: - value = (Obj*)Int_new((int64_t)InStream_Read_C64(dat_in)); - break; - default: - value = NULL; - THROW(ERR, "Unrecognized type: %o", type); - } - - // Store the value. - Hash_Store_Utf8(fields, field_name, field_name_len, value); - } - FREEMEM(field_name); - - HitDoc *retval = HitDoc_new(fields, doc_id, 0.0); - DECREF(fields); - return retval; + return GOLUCY_DefDocReader_Fetch_Doc_BRIDGE(self, doc_id); } /**************************** Inverter *****************************/ diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go index 664a2001b..556235e4c 100644 --- a/go/lucy/lucy.go +++ b/go/lucy/lucy.go @@ -17,19 +17,28 @@ package lucy /* + +#include + #define C_LUCY_DOC #define C_LUCY_REGEXTOKENIZER +#define C_LUCY_DEFAULTDOCREADER #define C_LUCY_INVERTER #define C_LUCY_INVERTERENTRY #include "lucy_parcel.h" #include "Lucy/Analysis/RegexTokenizer.h" #include "Lucy/Document/Doc.h" +#include "Lucy/Index/DocReader.h" #include "Lucy/Index/Inverter.h" +#include "Clownfish/String.h" +#include "Clownfish/Blob.h" +#include "Clownfish/Num.h" #include "Clownfish/Hash.h" #include "Clownfish/HashIterator.h" #include "Clownfish/Vector.h" +#include "Lucy/Document/HitDoc.h" 
#include "Lucy/Plan/FieldType.h" #include "Lucy/Plan/Schema.h" #include "Lucy/Index/Segment.h" @@ -90,6 +99,11 @@ GOLUCY_Doc_Destroy(lucy_Doc *self); extern void (*GOLUCY_Doc_Destroy_BRIDGE)(lucy_Doc *self); +extern lucy_HitDoc* +GOLUCY_DefDocReader_Fetch_Doc(lucy_DefaultDocReader *self, int32_t doc_id); +extern lucy_HitDoc* +(*GOLUCY_DefDocReader_Fetch_Doc_BRIDGE)(lucy_DefaultDocReader *self, int32_t doc_id); + extern void GOLUCY_Inverter_Invert_Doc(lucy_Inverter *self, lucy_Doc *doc); extern void @@ -115,9 +129,16 @@ GOLUCY_glue_exported_symbols() { GOLUCY_Doc_Extract_BRIDGE = GOLUCY_Doc_Extract; GOLUCY_Doc_Equals_BRIDGE = GOLUCY_Doc_Equals; GOLUCY_Doc_Destroy_BRIDGE = GOLUCY_Doc_Destroy; + GOLUCY_DefDocReader_Fetch_Doc_BRIDGE = GOLUCY_DefDocReader_Fetch_Doc; GOLUCY_Inverter_Invert_Doc_BRIDGE = GOLUCY_Inverter_Invert_Doc; } + +static void +null_terminate_string(char *string, size_t len) { + string[len] = '\0'; +} + */ import "C" import "unsafe" @@ -256,6 +277,79 @@ func fetchEntry(ivars *C.lucy_InverterIVARS, field *C.cfish_String) *C.lucy_Inve return (*C.lucy_InverterEntry)(unsafe.Pointer(entry)) } +//export GOLUCY_DefDocReader_Fetch_Doc +func GOLUCY_DefDocReader_Fetch_Doc(ddr *C.lucy_DefaultDocReader, + docID C.int32_t) *C.lucy_HitDoc { + ivars := C.lucy_DefDocReader_IVARS(ddr) + schema := ivars.schema + datInstream := ivars.dat_in + ixInstream := ivars.ix_in + fields := C.cfish_Hash_new(1) + fieldNameCap := C.size_t(31) + var fieldName *C.char = ((*C.char)(C.malloc(fieldNameCap + 1))) + + // Get data file pointer from index, read number of fields. + C.LUCY_InStream_Seek(ixInstream, C.int64_t(docID)*8) + start := C.LUCY_InStream_Read_U64(ixInstream) + C.LUCY_InStream_Seek(datInstream, C.int64_t(start)) + numFields := uint32(C.LUCY_InStream_Read_C32(datInstream)) + + // Decode stored data and build up the doc field by field. + for i := uint32(0); i < numFields; i++ { + // Read field name.
+ fieldNameLen := C.size_t(C.LUCY_InStream_Read_C32(datInstream)) + if fieldNameLen > fieldNameCap { + fieldNameCap = fieldNameLen + fieldName = ((*C.char)(C.realloc(unsafe.Pointer(fieldName), fieldNameCap+1))) + } + C.LUCY_InStream_Read_Bytes(datInstream, fieldName, fieldNameLen) + + // Find the Field's FieldType. + // TODO: Creating and destroying a new string each time is + // inefficient. The solution should be to add a private + // Schema_Fetch_Type_Utf8 method which takes char* and size_t. + fieldNameStr := C.cfish_Str_new_from_utf8(fieldName, fieldNameLen) + fieldType := C.LUCY_Schema_Fetch_Type(schema, fieldNameStr) + C.cfish_dec_refcount(unsafe.Pointer(fieldNameStr)) + + // Read the field value. + var value *C.cfish_Obj + switch C.LUCY_FType_Primitive_ID(fieldType) & C.lucy_FType_PRIMITIVE_ID_MASK { + case C.lucy_FType_TEXT: + valueLen := C.size_t(C.LUCY_InStream_Read_C32(datInstream)) + buf := ((*C.char)(C.malloc(valueLen + 1))) + C.LUCY_InStream_Read_Bytes(datInstream, buf, valueLen) + C.null_terminate_string(buf, valueLen) + value = ((*C.cfish_Obj)(C.cfish_Str_new_steal_utf8(buf, valueLen))) + case C.lucy_FType_BLOB: + valueLen := C.size_t(C.LUCY_InStream_Read_C32(datInstream)) + buf := ((*C.char)(C.malloc(valueLen))) + C.LUCY_InStream_Read_Bytes(datInstream, buf, valueLen) + value = ((*C.cfish_Obj)(C.cfish_Blob_new_steal(buf, valueLen))) + case C.lucy_FType_FLOAT32: + value = ((*C.cfish_Obj)(C.cfish_Float_new(C.double(C.LUCY_InStream_Read_F32(datInstream))))) + case C.lucy_FType_FLOAT64: + value = ((*C.cfish_Obj)(C.cfish_Float_new(C.LUCY_InStream_Read_F64(datInstream)))) + case C.lucy_FType_INT32: + value = ((*C.cfish_Obj)(C.cfish_Int_new(C.int64_t(C.int32_t(C.LUCY_InStream_Read_C32(datInstream)))))) + case C.lucy_FType_INT64: + value = ((*C.cfish_Obj)(C.cfish_Int_new(C.int64_t(C.LUCY_InStream_Read_C64(datInstream))))) + default: + value = nil + panic(clownfish.NewErr("Internal Lucy error: bad type id for field " + + C.GoStringN(fieldName, C.int(fieldNameLen)))) +
} + + // Store the value. + C.CFISH_Hash_Store_Utf8(fields, fieldName, fieldNameLen, value) + } + C.free(unsafe.Pointer(fieldName)) + + retval := C.lucy_HitDoc_new(unsafe.Pointer(fields), docID, 0.0) + C.cfish_dec_refcount(unsafe.Pointer(fields)) + return retval +} + //export GOLUCY_Inverter_Invert_Doc func GOLUCY_Inverter_Invert_Doc(inverter *C.lucy_Inverter, doc *C.lucy_Doc) { ivars := C.lucy_Inverter_IVARS(inverter) From 5f00a21335c3304c074223b2dba4567a62d9c97a Mon Sep 17 00:00:00 2001 From: Marvin Humphrey Date: Mon, 20 Jul 2015 12:41:34 -0700 Subject: [PATCH 8/8] Port RegexTokenizer to Go and CGO. Use Go's regular expression engine, the `regexp` package. Store Go `regexp` objects using the registry which allows them to be referenced by integer from C. --- go/lucy/lucy.go | 81 +++++++++++++++++++++++++++++++++++++++++++- go/lucy/lucy_test.go | 10 ++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go index 556235e4c..bc2e9f817 100644 --- a/go/lucy/lucy.go +++ b/go/lucy/lucy.go @@ -38,6 +38,11 @@ package lucy #include "Clownfish/Hash.h" #include "Clownfish/HashIterator.h" #include "Clownfish/Vector.h" +#include "Clownfish/Err.h" +#include "Clownfish/Util/StringHelper.h" +#include "Lucy/Analysis/Analyzer.h" +#include "Lucy/Analysis/Inversion.h" +#include "Lucy/Analysis/Token.h" #include "Lucy/Document/HitDoc.h" #include "Lucy/Plan/FieldType.h" #include "Lucy/Plan/Schema.h" @@ -133,6 +138,35 @@ GOLUCY_glue_exported_symbols() { GOLUCY_Inverter_Invert_Doc_BRIDGE = GOLUCY_Inverter_Invert_Doc; } +static uint32_t +S_count_code_points(const char *string, size_t len) { + uint32_t num_code_points = 0; + size_t i = 0; + + while (i < len) { + i += cfish_StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; + ++num_code_points; + } + + if (i != len) { + CFISH_THROW(CFISH_ERR, "Match between code point boundaries in '%s'", string); + } + + return num_code_points; +} + +// Returns the number of code points through the end of the match. 
+static int +push_token(const char *str, int start, int end, int last_end, + int cp_count, lucy_Inversion *inversion) { + const char *match = str + start; + int match_len = end - start; + int cp_start = cp_count + S_count_code_points(str + last_end, start - last_end); + int cp_end = cp_start + S_count_code_points(match, match_len); + lucy_Token *token = lucy_Token_new(match, match_len, cp_start, cp_end, 1.0f, 1); + LUCY_Inversion_Append(inversion, token); + return cp_end; +} static void null_terminate_string(char *string, size_t len) { @@ -143,25 +177,70 @@ null_terminate_string(char *string, size_t len) { import "C" import "unsafe" import "fmt" +import "regexp" import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish" +var registry *objRegistry + func init() { C.GOLUCY_glue_exported_symbols() C.lucy_bootstrap_parcel() + registry = newObjRegistry(16) } //export GOLUCY_RegexTokenizer_init func GOLUCY_RegexTokenizer_init(rt *C.lucy_RegexTokenizer, pattern *C.cfish_String) *C.lucy_RegexTokenizer { - return nil + C.lucy_Analyzer_init(((*C.lucy_Analyzer)(unsafe.Pointer(rt)))) + + ivars := C.lucy_RegexTokenizer_IVARS(rt) + ivars.pattern = C.CFISH_Str_Clone(pattern) + + var patternGo string + if pattern == nil { + patternGo = "\\w+(?:['\\x{2019}]\\w+)*" + } else { + patternGo = clownfish.CFStringToGo(unsafe.Pointer(pattern)) + } + rx, err := regexp.Compile(patternGo) + if err != nil { + panic(err) + } + rxID := registry.store(rx) + ivars.token_re = unsafe.Pointer(rxID) + + return rt } //export GOLUCY_RegexTokenizer_Destroy func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) { + ivars := C.lucy_RegexTokenizer_IVARS(rt) + rxID := uintptr(ivars.token_re) + registry.delete(rxID) + C.cfish_super_destroy(unsafe.Pointer(rt), C.LUCY_REGEXTOKENIZER) } //export GOLUCY_RegexTokenizer_Tokenize_Utf8 func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char, stringLen C.size_t, inversion *C.lucy_Inversion) { + + ivars := 
C.lucy_RegexTokenizer_IVARS(rt) + rxID := uintptr(ivars.token_re) + rx, ok := registry.fetch(rxID).(*regexp.Regexp) + if !ok { + mess := fmt.Sprintf("Failed to Fetch *RegExp with id %d and pattern %s", + rxID, clownfish.CFStringToGo(unsafe.Pointer(ivars.pattern))) + panic(clownfish.NewErr(mess)) + } + + buf := C.GoBytes(unsafe.Pointer(str), C.int(stringLen)) + found := rx.FindAllIndex(buf, int(stringLen)) + lastEnd := 0 + cpCount := 0 + for _, startEnd := range found { + cpCount = int(C.push_token(str, C.int(startEnd[0]), C.int(startEnd[1]), + C.int(lastEnd), C.int(cpCount), inversion)) + lastEnd = startEnd[1] + } } func NewDoc(docID int32) Doc { diff --git a/go/lucy/lucy_test.go b/go/lucy/lucy_test.go index 94e4f0aa0..82ba87899 100644 --- a/go/lucy/lucy_test.go +++ b/go/lucy/lucy_test.go @@ -18,6 +18,7 @@ package lucy import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish" import "testing" +import "reflect" func TestStuff(t *testing.T) { NewSchema() @@ -29,3 +30,12 @@ func TestOpenIndexer(t *testing.T) { t.Error("Didn't catch exception opening indexer") } } + +func TestRegex(t *testing.T) { + tokenizer := NewRegexTokenizer("\\S+") + var expected []interface{} = []interface{}{"foo", "bar", "baz"} + got := tokenizer.Split("foo bar baz") + if !reflect.DeepEqual(got, expected) { + t.Errorf("Expected %v, got %v", expected, got) + } +}