From 633c4d30b74dbd69dbfc2c70c01c7638317c2d6a Mon Sep 17 00:00:00 2001
From: Marvin Humphrey <marvin@rectangular.com>
Date: Sun, 19 Jul 2015 16:47:23 -0700
Subject: [PATCH 1/8] Remove RegexTokenizer_Set_Token_RE.

It was used internally a long time ago and is now obsolete.
---
 c/src/Lucy/Analysis/RegexTokenizer.c          |  7 --
 core/Lucy/Analysis/RegexTokenizer.cfh         |  6 --
 .../src/Lucy/Analysis/RegexTokenizer.c        |  5 --
 perl/xs/Lucy/Analysis/RegexTokenizer.c        | 75 +++++--------------
 4 files changed, 20 insertions(+), 73 deletions(-)

diff --git a/c/src/Lucy/Analysis/RegexTokenizer.c b/c/src/Lucy/Analysis/RegexTokenizer.c
index a811979e0..d47b3ea23 100644
--- a/c/src/Lucy/Analysis/RegexTokenizer.c
+++ b/c/src/Lucy/Analysis/RegexTokenizer.c
@@ -86,13 +86,6 @@ RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
     return self;
 }
 
-void
-RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) {
-    UNUSED_VAR(self);
-    UNUSED_VAR(token_re);
-    THROW(ERR, "TODO");
-}
-
 void
 RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
     RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
diff --git a/core/Lucy/Analysis/RegexTokenizer.cfh b/core/Lucy/Analysis/RegexTokenizer.cfh
index 9e352025e..ec14d51e8 100644
--- a/core/Lucy/Analysis/RegexTokenizer.cfh
+++ b/core/Lucy/Analysis/RegexTokenizer.cfh
@@ -84,12 +84,6 @@ public class Lucy::Analysis::RegexTokenizer
     Tokenize_Utf8(RegexTokenizer *self, const char *text, size_t len,
                   Inversion *inversion);
 
-    /** Set the compiled regular expression for matching a token.  Also sets
-     * `pattern` as a side effect.
-     */
-    void
-    Set_Token_RE(RegexTokenizer *self, void *token_re);
-
     public incremented Obj*
     Dump(RegexTokenizer *self);
 
diff --git a/example-lang/src/Lucy/Analysis/RegexTokenizer.c b/example-lang/src/Lucy/Analysis/RegexTokenizer.c
index 2f21afb91..92f42ecb7 100644
--- a/example-lang/src/Lucy/Analysis/RegexTokenizer.c
+++ b/example-lang/src/Lucy/Analysis/RegexTokenizer.c
@@ -28,11 +28,6 @@ lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
     UNREACHABLE_RETURN(lucy_RegexTokenizer*);
 }
 
-void
-lucy_RegexTokenizer_set_token_re(lucy_RegexTokenizer *self, void *token_re) {
-    THROW(LUCY_ERR, "TODO");
-}
-
 void
 lucy_RegexTokenizer_destroy(lucy_RegexTokenizer *self) {
     THROW(LUCY_ERR, "TODO");
diff --git a/perl/xs/Lucy/Analysis/RegexTokenizer.c b/perl/xs/Lucy/Analysis/RegexTokenizer.c
index 4c6e1f11e..f95cf0f20 100644
--- a/perl/xs/Lucy/Analysis/RegexTokenizer.c
+++ b/perl/xs/Lucy/Analysis/RegexTokenizer.c
@@ -27,13 +27,6 @@
 static SV*
 S_compile_token_re(pTHX_ cfish_String *pattern);
 
-static void
-S_set_token_re_but_not_pattern(pTHX_ lucy_RegexTokenizer *self,
-                               void *token_re);
-
-static void
-S_set_pattern_from_token_re(pTHX_ lucy_RegexTokenizer *self, void *token_re);
-
 bool
 lucy_RegexTokenizer_is_available(void) {
     return true;
@@ -61,36 +54,7 @@ lucy_RegexTokenizer_init(lucy_RegexTokenizer *self,
 
     // Acquire a compiled regex engine for matching one token.
     dTHX;
-    SV *token_re_sv = S_compile_token_re(aTHX_ ivars->pattern);
-    S_set_token_re_but_not_pattern(aTHX_ self, SvRV(token_re_sv));
-    SvREFCNT_dec(token_re_sv);
-
-    return self;
-}
-
-static SV*
-S_compile_token_re(pTHX_ cfish_String *pattern) {
-    dSP;
-    ENTER;
-    SAVETMPS;
-    EXTEND(SP, 1);
-    PUSHMARK(SP);
-    XPUSHs((SV*)CFISH_Str_To_Host(pattern));
-    PUTBACK;
-    call_pv("Lucy::Analysis::RegexTokenizer::_compile_token_re", G_SCALAR);
-    SPAGAIN;
-    SV *token_re_sv = POPs;
-    (void)SvREFCNT_inc(token_re_sv);
-    PUTBACK;
-    FREETMPS;
-    LEAVE;
-    return token_re_sv;
-}
-
-static void
-S_set_token_re_but_not_pattern(pTHX_ lucy_RegexTokenizer *self,
-                               void *token_re) {
-    lucy_RegexTokenizerIVARS *const ivars = lucy_RegexTokenizer_IVARS(self);
+    SV *token_re = S_compile_token_re(aTHX_ ivars->pattern);
 #if (PERL_VERSION > 10)
     REGEXP *rx = SvRX((SV*)token_re);
 #else
@@ -107,29 +71,30 @@ S_set_token_re_but_not_pattern(pTHX_ lucy_RegexTokenizer *self,
         THROW(CFISH_ERR, "Failed to extract REGEXP from token_re '%s'",
               SvPV_nolen((SV*)token_re));
     }
-    if (ivars->token_re) { ReREFCNT_dec(((REGEXP*)ivars->token_re)); }
     ivars->token_re = rx;
     (void)ReREFCNT_inc(((REGEXP*)ivars->token_re));
-}
+    SvREFCNT_dec(token_re);
 
-static void
-S_set_pattern_from_token_re(pTHX_ lucy_RegexTokenizer *self, void *token_re) {
-    lucy_RegexTokenizerIVARS *const ivars = lucy_RegexTokenizer_IVARS(self);
-    SV *rv = newRV((SV*)token_re);
-    STRLEN len = 0;
-    char *ptr = SvPVutf8((SV*)rv, len);
-    CFISH_DECREF(ivars->pattern);
-    ivars->pattern = cfish_Str_new_from_trusted_utf8(ptr, len);
-    SvREFCNT_dec(rv);
+    return self;
 }
 
-void
-LUCY_RegexTokenizer_Set_Token_RE_IMP(lucy_RegexTokenizer *self,
-                                     void *token_re) {
-    dTHX;
-    S_set_token_re_but_not_pattern(aTHX_ self, token_re);
-    // Set pattern as a side effect.
-    S_set_pattern_from_token_re(aTHX_ self, token_re);
+static SV*
+S_compile_token_re(pTHX_ cfish_String *pattern) {
+    dSP;
+    ENTER;
+    SAVETMPS;
+    EXTEND(SP, 1);
+    PUSHMARK(SP);
+    XPUSHs((SV*)CFISH_Str_To_Host(pattern));
+    PUTBACK;
+    call_pv("Lucy::Analysis::RegexTokenizer::_compile_token_re", G_SCALAR);
+    SPAGAIN;
+    SV *token_re_sv = POPs;
+    (void)SvREFCNT_inc(token_re_sv);
+    PUTBACK;
+    FREETMPS;
+    LEAVE;
+    return token_re_sv;
 }
 
 void

From fed6ca76b6b8310c1db97bfbbed1b56af47e6194 Mon Sep 17 00:00:00 2001
From: Marvin Humphrey <marvin@rectangular.com>
Date: Tue, 14 Jul 2015 18:57:11 -0700
Subject: [PATCH 2/8] Copy C host-specific code to Go.

Copy the C binding code in anticipation of replacing it with Go-specific
binding code.
---
 common/charmonizer.c    |   2 +-
 common/charmonizer.main |   2 +-
 go/cfext/lucy.c         | 483 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 485 insertions(+), 2 deletions(-)
 create mode 100644 go/cfext/lucy.c

diff --git a/common/charmonizer.c b/common/charmonizer.c
index d3c3dc38a..f0ebfcd26 100644
--- a/common/charmonizer.c
+++ b/common/charmonizer.c
@@ -8134,7 +8134,7 @@ lucy_MakeFile_new(chaz_CLI *cli) {
         self->host_src_dir = "xs";
     }
 	else if (chaz_CLI_defined(cli, "enable-go")) {
-        self->host_src_dir = "../c/src";
+        self->host_src_dir = "cfext";
 	}
     else {
         self->host_src_dir = "src";
diff --git a/common/charmonizer.main b/common/charmonizer.main
index 991593b51..800442e7b 100644
--- a/common/charmonizer.main
+++ b/common/charmonizer.main
@@ -252,7 +252,7 @@ lucy_MakeFile_new(chaz_CLI *cli) {
         self->host_src_dir = "xs";
     }
 	else if (chaz_CLI_defined(cli, "enable-go")) {
-        self->host_src_dir = "../c/src";
+        self->host_src_dir = "cfext";
 	}
     else {
         self->host_src_dir = "src";
diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
new file mode 100644
index 000000000..d1044dfe3
--- /dev/null
+++ b/go/cfext/lucy.c
@@ -0,0 +1,483 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#define C_LUCY_REGEXTOKENIZER
+#define C_LUCY_DOC
+#define C_LUCY_DOCREADER
+#define C_LUCY_DEFAULTDOCREADER
+#define C_LUCY_INVERTER
+#define C_LUCY_INVERTERENTRY
+#define CFISH_USE_SHORT_NAMES
+#define LUCY_USE_SHORT_NAMES
+
+
+
+#include <string.h>
+
+#include "charmony.h"
+
+#include "Lucy/Analysis/RegexTokenizer.h"
+#include "Lucy/Document/Doc.h"
+#include "Lucy/Index/DocReader.h"
+#include "Lucy/Index/Inverter.h"
+#include "Clownfish/Blob.h"
+#include "Clownfish/String.h"
+#include "Clownfish/Err.h"
+#include "Clownfish/Hash.h"
+#include "Clownfish/HashIterator.h"
+#include "Clownfish/Num.h"
+#include "Clownfish/Vector.h"
+#include "Clownfish/Class.h"
+#include "Clownfish/Util/Memory.h"
+#include "Clownfish/Util/StringHelper.h"
+#include "Lucy/Analysis/Token.h"
+#include "Lucy/Analysis/Inversion.h"
+#include "Lucy/Document/HitDoc.h"
+#include "Lucy/Index/Segment.h"
+#include "Lucy/Plan/FieldType.h"
+#include "Lucy/Plan/Schema.h"
+#include "Lucy/Store/InStream.h"
+#include "Lucy/Store/OutStream.h"
+#include "Lucy/Util/Freezer.h"
+
+#if defined(CHY_HAS_PCRE_H)
+
+#include <pcre.h>
+
+static uint32_t
+S_count_code_points(const char *string, size_t len);
+
+bool
+RegexTokenizer_is_available(void) {
+    return true;
+}
+
+RegexTokenizer*
+RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
+    Analyzer_init((Analyzer*)self);
+    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
+
+    char *pattern_buf = NULL;
+    const char *pattern_ptr;
+    if (pattern) {
+        ivars->pattern = Str_Clone(pattern);
+        pattern_buf = Str_To_Utf8(ivars->pattern);
+        pattern_ptr = pattern_buf;
+    }
+    else {
+        pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*";
+        ivars->pattern
+            = Str_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr));
+    }
+
+    int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
+#ifdef PCRE_BSR_UNICODE
+    // Available since PCRE 7.4
+    options |= PCRE_BSR_UNICODE;
+#endif
+#ifdef PCRE_NEWLINE_LF
+    // Available since PCRE 6.7
+    options |= PCRE_NEWLINE_LF;
+#endif
+    const char *err_ptr;
+    int err_offset;
+    pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL);
+    if (pattern_buf) {
+        FREEMEM(pattern_buf);
+    }
+    if (!re) {
+        THROW(ERR, "%s", err_ptr);
+    }
+
+    // TODO: Check whether pcre_study improves performance
+
+    ivars->token_re = re;
+
+    return self;
+}
+
+void
+RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
+    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
+    DECREF(ivars->pattern);
+    pcre *re = (pcre*)ivars->token_re;
+    if (re) {
+        pcre_free(re);
+    }
+    SUPER_DESTROY(self, REGEXTOKENIZER);
+}
+
+void
+RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
+                                 size_t string_len, Inversion *inversion) {
+    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
+    pcre      *re          = (pcre*)ivars->token_re;
+    int        byte_offset = 0;
+    uint32_t   cp_offset   = 0; // Code points
+    int        options     = PCRE_NO_UTF8_CHECK;
+    int        ovector[3];
+
+    int return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
+                                options, ovector, 3);
+    while (return_code >= 0) {
+        const char *match     = string + ovector[0];
+        size_t      match_len = ovector[1] - ovector[0];
+
+        uint32_t cp_before  = S_count_code_points(string + byte_offset,
+                                                  ovector[0] - byte_offset);
+        uint32_t cp_start   = cp_offset + cp_before;
+        uint32_t cp_matched = S_count_code_points(match, match_len);
+        uint32_t cp_end     = cp_start + cp_matched;
+
+        // Add a token to the new inversion.
+        Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1);
+        Inversion_Append(inversion, token);
+
+        byte_offset = ovector[1];
+        cp_offset   = cp_end;
+        return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
+                                options, ovector, 3);
+    }
+
+    if (return_code != PCRE_ERROR_NOMATCH) {
+        THROW(ERR, "pcre_exec failed: %d", return_code);
+    }
+}
+
+static uint32_t
+S_count_code_points(const char *string, size_t len) {
+    uint32_t num_code_points = 0;
+    size_t i = 0;
+
+    while (i < len) {
+        i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
+        ++num_code_points;
+    }
+
+    if (i != len) {
+        THROW(ERR, "Match between code point boundaries in '%s'", string);
+    }
+
+    return num_code_points;
+}
+
+#else // CHY_HAS_PCRE_H
+
+bool
+RegexTokenizer_is_available(void) {
+    return false;
+}
+
+RegexTokenizer*
+RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
+    UNUSED_VAR(self);
+    UNUSED_VAR(pattern);
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+    UNREACHABLE_RETURN(RegexTokenizer*);
+}
+
+void
+RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) {
+    UNUSED_VAR(self);
+    UNUSED_VAR(token_re);
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+}
+
+void
+RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
+    UNUSED_VAR(self);
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+}
+
+void
+RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
+                                 size_t string_len, Inversion *inversion) {
+    UNUSED_VAR(self);
+    UNUSED_VAR(string);
+    UNUSED_VAR(string_len);
+    UNUSED_VAR(inversion);
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+}
+
+#endif // CHY_HAS_PCRE_H
+
+/********************************** Doc ********************************/
+
+Doc*
+Doc_init(Doc *self, void *fields, int32_t doc_id) {
+    DocIVARS *const ivars = Doc_IVARS(self);
+    Hash *hash;
+
+    if (fields) {
+        hash = (Hash *)INCREF(CERTIFY(fields, HASH));
+    }
+    else {
+        hash = Hash_new(0);
+    }
+    ivars->fields = hash;
+    ivars->doc_id = doc_id;
+
+    return self;
+}
+
+void
+Doc_Set_Fields_IMP(Doc *self, void *fields) {
+    DocIVARS *const ivars = Doc_IVARS(self);
+    DECREF(ivars->fields);
+    ivars->fields = CERTIFY(fields, HASH);
+}
+
+uint32_t
+Doc_Get_Size_IMP(Doc *self) {
+    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
+    return Hash_Get_Size(hash);
+}
+
+void
+Doc_Store_IMP(Doc *self, String *field, Obj *value) {
+    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
+    Hash_Store(hash, field, INCREF(value));
+}
+
+void
+Doc_Serialize_IMP(Doc *self, OutStream *outstream) {
+    DocIVARS *const ivars = Doc_IVARS(self);
+    Hash *hash = (Hash*)ivars->fields;
+    Freezer_serialize_hash(hash, outstream);
+    OutStream_Write_C32(outstream, ivars->doc_id);
+}
+
+Doc*
+Doc_Deserialize_IMP(Doc *self, InStream *instream) {
+    DocIVARS *const ivars = Doc_IVARS(self);
+    ivars->fields = Freezer_read_hash(instream);
+    ivars->doc_id = InStream_Read_C32(instream);
+    return self;
+}
+
+Obj*
+Doc_Extract_IMP(Doc *self, String *field) {
+    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
+    return INCREF(Hash_Fetch(hash, field));
+}
+
+Hash*
+Doc_Dump_IMP(Doc *self) {
+    UNUSED_VAR(self);
+    THROW(ERR, "TODO");
+    UNREACHABLE_RETURN(Hash*);
+}
+
+Doc*
+Doc_Load_IMP(Doc *self, Obj *dump) {
+    UNUSED_VAR(self);
+    UNUSED_VAR(dump);
+    THROW(ERR, "TODO");
+    UNREACHABLE_RETURN(Doc*);
+}
+
+bool
+Doc_Equals_IMP(Doc *self, Obj *other) {
+    if ((Doc*)other == self)   { return true;  }
+    if (!Obj_is_a(other, DOC)) { return false; }
+    DocIVARS *const ivars = Doc_IVARS(self);
+    DocIVARS *const ovars = Doc_IVARS((Doc*)other);
+    return Hash_Equals((Hash*)ivars->fields, (Obj*)ovars->fields);
+}
+
+void
+Doc_Destroy_IMP(Doc *self) {
+    DocIVARS *const ivars = Doc_IVARS(self);
+    DECREF(ivars->fields);
+    SUPER_DESTROY(self, DOC);
+}
+
+
+/**************************** DocReader *****************************/
+
+HitDoc*
+DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) {
+    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
+    Schema   *const schema = ivars->schema;
+    InStream *const dat_in = ivars->dat_in;
+    InStream *const ix_in  = ivars->ix_in;
+    Hash     *const fields = Hash_new(1);
+    int64_t   start;
+    uint32_t  num_fields;
+    uint32_t  field_name_cap = 31;
+    char     *field_name = (char*)MALLOCATE(field_name_cap + 1);
+
+    // Get data file pointer from index, read number of fields.
+    InStream_Seek(ix_in, (int64_t)doc_id * 8);
+    start = InStream_Read_U64(ix_in);
+    InStream_Seek(dat_in, start);
+    num_fields = InStream_Read_C32(dat_in);
+
+    // Decode stored data and build up the doc field by field.
+    while (num_fields--) {
+        uint32_t        field_name_len;
+        Obj       *value;
+        FieldType *type;
+
+        // Read field name.
+        field_name_len = InStream_Read_C32(dat_in);
+        if (field_name_len > field_name_cap) {
+            field_name_cap = field_name_len;
+            field_name     = (char*)REALLOCATE(field_name,
+                                                    field_name_cap + 1);
+        }
+        InStream_Read_Bytes(dat_in, field_name, field_name_len);
+
+        // Find the Field's FieldType.
+        String *field_name_str = SSTR_WRAP_UTF8(field_name, field_name_len);
+        type = Schema_Fetch_Type(schema, field_name_str);
+
+        // Read the field value.
+        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
+            case FType_TEXT: {
+                    uint32_t value_len = InStream_Read_C32(dat_in);
+                    char *buf = (char*)MALLOCATE(value_len + 1);
+                    InStream_Read_Bytes(dat_in, buf, value_len);
+                    buf[value_len] = '\0';
+                    value = (Obj*)Str_new_steal_utf8(buf, value_len);
+                    break;
+                }
+            case FType_BLOB: {
+                    uint32_t value_len = InStream_Read_C32(dat_in);
+                    char *buf = (char*)MALLOCATE(value_len);
+                    InStream_Read_Bytes(dat_in, buf, value_len);
+                    value = (Obj*)Blob_new_steal(buf, value_len);
+                    break;
+                }
+            case FType_FLOAT32:
+                value = (Obj*)Float_new(InStream_Read_F32(dat_in));
+                break;
+            case FType_FLOAT64:
+                value = (Obj*)Float_new(InStream_Read_F64(dat_in));
+                break;
+            case FType_INT32:
+                value = (Obj*)Int_new((int32_t)InStream_Read_C32(dat_in));
+                break;
+            case FType_INT64:
+                value = (Obj*)Int_new((int64_t)InStream_Read_C64(dat_in));
+                break;
+            default:
+                value = NULL;
+                THROW(ERR, "Unrecognized type: %o", type);
+        }
+
+        // Store the value.
+        Hash_Store_Utf8(fields, field_name, field_name_len, value);
+    }
+    FREEMEM(field_name);
+
+    HitDoc *retval = HitDoc_new(fields, doc_id, 0.0);
+    DECREF(fields);
+    return retval;
+}
+
+/**************************** Inverter *****************************/
+
+static InverterEntry*
+S_fetch_entry(InverterIVARS *ivars, String *field) {
+    Schema *const schema = ivars->schema;
+    int32_t field_num = Seg_Field_Num(ivars->segment, field);
+    if (!field_num) {
+        // This field seems not to be in the segment yet.  Try to find it in
+        // the Schema.
+        if (Schema_Fetch_Type(schema, field)) {
+            // The field is in the Schema.  Get a field num from the Segment.
+            field_num = Seg_Add_Field(ivars->segment, field);
+        }
+        else {
+            // We've truly failed to find the field.  The user must
+            // not have spec'd it.
+            THROW(ERR, "Unknown field name: '%o'", field);
+        }
+    }
+
+    InverterEntry *entry
+        = (InverterEntry*)Vec_Fetch(ivars->entry_pool, field_num);
+    if (!entry) {
+        entry = InvEntry_new(schema, (String*)field, field_num);
+        Vec_Store(ivars->entry_pool, field_num, (Obj*)entry);
+    }
+    return entry;
+}
+
+void
+Inverter_Invert_Doc_IMP(Inverter *self, Doc *doc) {
+    InverterIVARS *const ivars = Inverter_IVARS(self);
+    Hash *const fields = (Hash*)Doc_Get_Fields(doc);
+
+    // Prepare for the new doc.
+    Inverter_Set_Doc(self, doc);
+
+    // Extract and invert the doc's fields.
+    HashIterator *iter = HashIter_new(fields);
+    while (HashIter_Next(iter)) {
+        String *field = HashIter_Get_Key(iter);
+        Obj    *obj   = HashIter_Get_Value(iter);
+
+        InverterEntry *inventry = S_fetch_entry(ivars, field);
+        InverterEntryIVARS *inventry_ivars = InvEntry_IVARS(inventry);
+        FieldType *type = inventry_ivars->type;
+
+        // Get the field value.
+        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
+            case FType_TEXT: {
+                    CERTIFY(obj, STRING);
+                    break;
+                }
+            case FType_BLOB: {
+                    CERTIFY(obj, BLOB);
+                    break;
+                }
+            case FType_INT32:
+            case FType_INT64: {
+                    CERTIFY(obj, INTEGER);
+                    break;
+                }
+            case FType_FLOAT32:
+            case FType_FLOAT64: {
+                    CERTIFY(obj, FLOAT);
+                    break;
+                }
+            default:
+                THROW(ERR, "Unrecognized type: %o", type);
+        }
+
+        if (inventry_ivars->value != obj) {
+            DECREF(inventry_ivars->value);
+            inventry_ivars->value = INCREF(obj);
+        }
+
+        Inverter_Add_Field(self, inventry);
+    }
+    DECREF(iter);
+}
+
+

From dab9a88d5c654a2539e4a81db4d74f88cc976cf3 Mon Sep 17 00:00:00 2001
From: Marvin Humphrey <marvin@rectangular.com>
Date: Thu, 16 Jul 2015 20:32:45 -0700
Subject: [PATCH 3/8] Make it possible to reference Go objects from C.

This patch is a variant on sample code written by Nick Wellnhofer.
---
 go/lucy/registry.go      | 135 +++++++++++++++++++++++++++++++++++++++
 go/lucy/registry_test.go |  84 ++++++++++++++++++++++++
 2 files changed, 219 insertions(+)
 create mode 100644 go/lucy/registry.go
 create mode 100644 go/lucy/registry_test.go

diff --git a/go/lucy/registry.go b/go/lucy/registry.go
new file mode 100644
index 000000000..86719316b
--- /dev/null
+++ b/go/lucy/registry.go
@@ -0,0 +1,135 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package lucy
+
+import "sync"
+
+type indexInt uintptr
+
+type objRegistry struct {
+	// Use pointer to array to guarantee atomic update for lock-free reads.
+	// Assume that loads and stores of the pointer are atomic.
+	entries *[]interface{}
+	freeListHead indexInt
+	mutex sync.Mutex
+}
+
+func newObjRegistry(size uintptr) *objRegistry {
+	entries := make([]interface{}, size)
+
+	// Each empty entry points to the index of the next empty entry.  Index 0
+	// is unused.  The last slot is set to a terminating sentinel value of 0.
+	entries[0] = indexInt(0) // unused
+	for i := uintptr(1); i < size - 1; i++ {
+		entries[i] = indexInt(i + 1)
+	}
+	entries[size-1] = indexInt(0)
+
+	reg := &objRegistry{}
+	reg.entries = &entries
+	reg.freeListHead = indexInt(1)
+
+	return reg
+}
+
+func (reg *objRegistry) store(obj interface{}) uintptr {
+	reg.mutex.Lock()
+
+	// Find the index of the next empty slot.
+	index := uintptr(reg.freeListHead)
+
+	entries := reg.entries
+
+	if (index != 0) {
+		// A slot is available.  It contains the index of the next available
+		// slot; put that index into the freeListHead.
+		reg.freeListHead = (*entries)[index].(indexInt)
+	} else {
+		// The sentinel value was encountered, indicating that we are out of
+		// space and must grow the entries array.
+
+		// The list head was 0, a slot we don't want to use.  Figure out what
+		// slot we're going to use instead.  If the current size of the
+		// entries array is 8, and will soon be 16, use slot 8.
+		index = uintptr(len(*entries))
+
+		// Duplicate the array and copy in the existing entries data.
+		newSize := index * 2
+		newEntries := make([]interface{}, newSize)
+		copy(newEntries, *entries)
+
+		// Set up each new empty slot to point at another new empty slot, up
+		// to the final slot which will get the sentinel value 0.
+		for i := index + 1; i < newSize - 1; i++ {
+			newEntries[i] = indexInt(i + 1)
+		}
+		newEntries[newSize - 1] = indexInt(0)
+		entries = &newEntries
+		reg.entries = entries
+
+		// Set the freeListHead to one greater than the slot we're using this
+		// time -- i.e. if the current size is 8, the new size is 16, and the
+		// slot we use for the supplied value is 8, then the new list head
+		// will be 9.
+		reg.freeListHead = indexInt(index + 1)
+	}
+
+	// Store the supplied value in the slot.
+	(*entries)[index] = obj
+
+	reg.mutex.Unlock()
+
+	return index
+}
+
+func (reg *objRegistry) fetch(index uintptr) interface{} {
+
+	// Ignore an out of range request.
+	if index >= uintptr(len(*reg.entries)) {
+		return nil
+	}
+	entry := (*reg.entries)[index]
+	if _, ok := entry.(indexInt); ok {
+		// Return nil if the slot is empty.
+		return nil
+	}
+	return entry
+}
+
+func (reg *objRegistry) delete(index uintptr) {
+	reg.mutex.Lock()
+
+	// Overwrite the value at the supplied index with the freeListHead.  For
+	// example, if you are storing strings and the entries array consists of
+	// {0, "A", "B", "C", 5, 6, 7, 0}, with freeListHead at 4, then deleting
+	// index 2 (string value "B") will result in the following state:
+	// {0, "A", 4, "C", 5, 6, 7, 0} and freeListHead at 2.
+	//
+	// Some potential errors are ignored:
+	// *   Index is out of range (at or beyond the size of the array).
+	// *   Slot is empty.
+	if index < uintptr(len(*reg.entries)) {
+		_, isIndexInt := (*reg.entries)[index].(indexInt)
+		if !isIndexInt {
+			(*reg.entries)[index] = reg.freeListHead
+			reg.freeListHead = indexInt(index)
+		}
+	}
+
+	reg.mutex.Unlock()
+}
+
diff --git a/go/lucy/registry_test.go b/go/lucy/registry_test.go
new file mode 100644
index 000000000..ea4f9da61
--- /dev/null
+++ b/go/lucy/registry_test.go
@@ -0,0 +1,84 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package lucy
+
+import "testing"
+import "math/rand"
+
+func TestRegistrySingle(t *testing.T) {
+	reg := newObjRegistry(4)
+	index := reg.store(42)
+	if intVal, ok := reg.fetch(index).(int); !ok || intVal != 42 {
+		t.Error("Failed to store/fetch int")
+	}
+	reg.delete(index)
+	if reg.fetch(index) != nil {
+		t.Error("Failed to delete int")
+	}
+}
+
+func TestRegistryMany(t *testing.T) {
+	reg := newObjRegistry(4)
+	stored := make(map[int]uintptr)
+	deleted := make(map[int]uintptr)
+	for i := 0; i < 1000; i++ {
+		if i > 0 && rand.Intn(10) == 0 {
+			// Delete a random earlier element ~10% of the time (Intn needs n > 0).
+			goner := rand.Intn(i)
+			if index, ok := stored[goner]; ok {
+				reg.delete(index)
+				delete(stored, goner)
+				deleted[goner] = index
+			}
+		}
+		stored[i] = reg.store(i)
+	}
+	for expected, index := range stored {
+		got, ok := reg.fetch(index).(int)
+		if !ok {
+			t.Errorf("Failed to fetch stored value %d at index %d", expected, index)
+		} else if got != expected {
+			t.Errorf("Expected %d got %d", expected, got)
+		}
+	}
+	for i := 0; i < len(*reg.entries); i++ {
+		got, ok := reg.fetch(uintptr(i)).(int)
+		if ok {
+			if _, wasDeleted := deleted[got]; wasDeleted {
+				t.Errorf("Deleted item %d still present at index %d", got, i)
+			}
+		}
+	}
+}
+
+func TestRegistryStringSlice(t *testing.T) {
+	reg := newObjRegistry(4)
+	s := make([]int, 2)
+	index := reg.store(&s)
+	s2 := reg.fetch(index).(*[]int)
+	(*s2)[1] = 1000
+	if s[1] != 1000 {
+		t.Error("Not the same slice")
+	}
+}
+
+func TestRegistryRange(t *testing.T) {
+	reg := newObjRegistry(4)
+	if reg.fetch(uintptr(10)) != nil {
+		t.Error("Out of range index should return nil")
+	}
+}

From 44fc440fdc419b655fb4c482afb63b9020138011 Mon Sep 17 00:00:00 2001
From: Marvin Humphrey <marvin@rectangular.com>
Date: Sun, 19 Jul 2015 12:57:13 -0700
Subject: [PATCH 4/8] Port RegexTokenizer stubs to CGO.

---
 go/cfext/lucy.c | 157 +++---------------------------------------------
 go/lucy/lucy.go |  48 +++++++++++++++
 2 files changed, 56 insertions(+), 149 deletions(-)

diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
index d1044dfe3..5773f16fe 100644
--- a/go/cfext/lucy.c
+++ b/go/cfext/lucy.c
@@ -55,175 +55,34 @@
 #include "Lucy/Store/OutStream.h"
 #include "Lucy/Util/Freezer.h"
 
-#if defined(CHY_HAS_PCRE_H)
-
-#include <pcre.h>
-
-static uint32_t
-S_count_code_points(const char *string, size_t len);
-
 bool
 RegexTokenizer_is_available(void) {
-    return true;
+    return false;
 }
 
 RegexTokenizer*
-RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
-    Analyzer_init((Analyzer*)self);
-    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
-
-    char *pattern_buf = NULL;
-    const char *pattern_ptr;
-    if (pattern) {
-        ivars->pattern = Str_Clone(pattern);
-        pattern_buf = Str_To_Utf8(ivars->pattern);
-        pattern_ptr = pattern_buf;
-    }
-    else {
-        pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*";
-        ivars->pattern
-            = Str_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr));
-    }
-
-    int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
-#ifdef PCRE_BSR_UNICODE
-    // Available since PCRE 7.4
-    options |= PCRE_BSR_UNICODE;
-#endif
-#ifdef PCRE_NEWLINE_LF
-    // Available since PCRE 6.7
-    options |= PCRE_NEWLINE_LF;
-#endif
-    const char *err_ptr;
-    int err_offset;
-    pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL);
-    if (pattern_buf) {
-        FREEMEM(pattern_buf);
-    }
-    if (!re) {
-        THROW(ERR, "%s", err_ptr);
-    }
-
-    // TODO: Check whether pcre_study improves performance
-
-    ivars->token_re = re;
-
-    return self;
-}
-
-void
-RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
-    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
-    DECREF(ivars->pattern);
-    pcre *re = (pcre*)ivars->token_re;
-    if (re) {
-        pcre_free(re);
-    }
-    SUPER_DESTROY(self, REGEXTOKENIZER);
-}
-
-void
-RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
-                                 size_t string_len, Inversion *inversion) {
-    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
-    pcre      *re          = (pcre*)ivars->token_re;
-    int        byte_offset = 0;
-    uint32_t   cp_offset   = 0; // Code points
-    int        options     = PCRE_NO_UTF8_CHECK;
-    int        ovector[3];
-
-    int return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
-                                options, ovector, 3);
-    while (return_code >= 0) {
-        const char *match     = string + ovector[0];
-        size_t      match_len = ovector[1] - ovector[0];
-
-        uint32_t cp_before  = S_count_code_points(string + byte_offset,
-                                                  ovector[0] - byte_offset);
-        uint32_t cp_start   = cp_offset + cp_before;
-        uint32_t cp_matched = S_count_code_points(match, match_len);
-        uint32_t cp_end     = cp_start + cp_matched;
-
-        // Add a token to the new inversion.
-        Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1);
-        Inversion_Append(inversion, token);
-
-        byte_offset = ovector[1];
-        cp_offset   = cp_end;
-        return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
-                                options, ovector, 3);
-    }
-
-    if (return_code != PCRE_ERROR_NOMATCH) {
-        THROW(ERR, "pcre_exec failed: %d", return_code);
-    }
-}
-
-static uint32_t
-S_count_code_points(const char *string, size_t len) {
-    uint32_t num_code_points = 0;
-    size_t i = 0;
-
-    while (i < len) {
-        i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])];
-        ++num_code_points;
-    }
-
-    if (i != len) {
-        THROW(ERR, "Match between code point boundaries in '%s'", string);
-    }
-
-    return num_code_points;
-}
-
-#else // CHY_HAS_PCRE_H
-
-bool
-RegexTokenizer_is_available(void) {
-    return false;
-}
+(*GOLUCY_RegexTokenizer_init_BRIDGE)(RegexTokenizer *self, String *pattern);
 
 RegexTokenizer*
 RegexTokenizer_init(RegexTokenizer *self, String *pattern) {
-    UNUSED_VAR(self);
-    UNUSED_VAR(pattern);
-    THROW(ERR,
-          "RegexTokenizer is not available because Lucy was compiled"
-          " without PCRE.");
-    UNREACHABLE_RETURN(RegexTokenizer*);
+    return GOLUCY_RegexTokenizer_init_BRIDGE(self, pattern);
 }
 
-void
-RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) {
-    UNUSED_VAR(self);
-    UNUSED_VAR(token_re);
-    THROW(ERR,
-          "RegexTokenizer is not available because Lucy was compiled"
-          " without PCRE.");
-}
+RegexTokenizer_Destroy_t GOLUCY_RegexTokenizer_Destroy_BRIDGE;
 
 void
 RegexTokenizer_Destroy_IMP(RegexTokenizer *self) {
-    UNUSED_VAR(self);
-    THROW(ERR,
-          "RegexTokenizer is not available because Lucy was compiled"
-          " without PCRE.");
+    GOLUCY_RegexTokenizer_Destroy_BRIDGE(self);
 }
 
+RegexTokenizer_Tokenize_Utf8_t GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE;
+
 void
 RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
                                  size_t string_len, Inversion *inversion) {
-    UNUSED_VAR(self);
-    UNUSED_VAR(string);
-    UNUSED_VAR(string_len);
-    UNUSED_VAR(inversion);
-    THROW(ERR,
-          "RegexTokenizer is not available because Lucy was compiled"
-          " without PCRE.");
+    GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE(self, string, string_len, inversion);
 }
 
-#endif // CHY_HAS_PCRE_H
-
 /********************************** Doc ********************************/
 
 Doc*
diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 908599af3..13bdafa96 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -17,11 +17,59 @@
 package lucy
 
 /*
+#define C_LUCY_REGEXTOKENIZER
+
 #include "lucy_parcel.h"
+#include "Lucy/Analysis/RegexTokenizer.h"
+
+extern lucy_RegexTokenizer*
+GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern);
+extern lucy_RegexTokenizer*
+(*GOLUCY_RegexTokenizer_init_BRIDGE)(lucy_RegexTokenizer *self,
+									 cfish_String *pattern);
+extern void
+GOLUCY_RegexTokenizer_Destroy(lucy_RegexTokenizer *self);
+extern void
+(*GOLUCY_RegexTokenizer_Destroy_BRIDGE)(lucy_RegexTokenizer *self);
+extern void
+GOLUCY_RegexTokenizer_Tokenize_Utf8(lucy_RegexTokenizer *self, char *str,
+									size_t string_len, lucy_Inversion *inversion);
+extern void
+(*GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE)(lucy_RegexTokenizer *self, const char *str,
+											  size_t string_len, lucy_Inversion *inversion);
+
+
+// C symbols linked into a Go-built package archive are not visible to
+// external C code -- but internal code *can* see symbols from outside.
+// This allows us to fake up symbol export by assigning values only known
+// internally to external symbols during Go package initialization.
+static CFISH_INLINE void
+GOLUCY_glue_exported_symbols() {
+	GOLUCY_RegexTokenizer_init_BRIDGE = GOLUCY_RegexTokenizer_init;
+	GOLUCY_RegexTokenizer_Destroy_BRIDGE = GOLUCY_RegexTokenizer_Destroy;
+	GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE
+		= (LUCY_RegexTokenizer_Tokenize_Utf8_t)GOLUCY_RegexTokenizer_Tokenize_Utf8;
+}
+
 */
 import "C"
 import _ "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
 
 func init() {
+	C.GOLUCY_glue_exported_symbols()
 	C.lucy_bootstrap_parcel()
 }
+
+//export GOLUCY_RegexTokenizer_init
+func GOLUCY_RegexTokenizer_init(rt *C.lucy_RegexTokenizer, pattern *C.cfish_String) *C.lucy_RegexTokenizer {
+	return nil
+}
+
+//export GOLUCY_RegexTokenizer_Destroy
+func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) {
+}
+
+//export GOLUCY_RegexTokenizer_Tokenize_Utf8
+func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char,
+	stringLen C.size_t, inversion *C.lucy_Inversion) {
+}

From 8f634425b5390423d7c4013b28710e4ddc92bf0b Mon Sep 17 00:00:00 2001
From: Marvin Humphrey <marvin@rectangular.com>
Date: Sun, 19 Jul 2015 12:57:13 -0700
Subject: [PATCH 5/8] Port Doc code to CGO.

---
 go/cfext/lucy.c |  67 ++++++++++-------------
 go/lucy/lucy.go | 141 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 169 insertions(+), 39 deletions(-)

diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
index 5773f16fe..e2719bc2d 100644
--- a/go/cfext/lucy.c
+++ b/go/cfext/lucy.c
@@ -86,61 +86,53 @@ RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
 /********************************** Doc ********************************/
 
 Doc*
-Doc_init(Doc *self, void *fields, int32_t doc_id) {
-    DocIVARS *const ivars = Doc_IVARS(self);
-    Hash *hash;
-
-    if (fields) {
-        hash = (Hash *)INCREF(CERTIFY(fields, HASH));
-    }
-    else {
-        hash = Hash_new(0);
-    }
-    ivars->fields = hash;
-    ivars->doc_id = doc_id;
+(*GOLUCY_Doc_init_BRIDGE)(Doc *self, void *fields, int32_t doc_id);
 
-    return self;
+Doc*
+Doc_init(Doc *self, void *fields, int32_t doc_id) {
+    return GOLUCY_Doc_init_BRIDGE(self, fields, doc_id);
 }
 
+Doc_Set_Fields_t GOLUCY_Doc_Set_Fields_BRIDGE;
+
 void
 Doc_Set_Fields_IMP(Doc *self, void *fields) {
-    DocIVARS *const ivars = Doc_IVARS(self);
-    DECREF(ivars->fields);
-    ivars->fields = CERTIFY(fields, HASH);
+    GOLUCY_Doc_Set_Fields_BRIDGE(self, fields);
 }
 
+Doc_Get_Size_t GOLUCY_Doc_Get_Size_BRIDGE;
+
 uint32_t
 Doc_Get_Size_IMP(Doc *self) {
-    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
-    return Hash_Get_Size(hash);
+    return GOLUCY_Doc_Get_Size_BRIDGE(self);
 }
 
+Doc_Store_t GOLUCY_Doc_Store_BRIDGE;
+
 void
 Doc_Store_IMP(Doc *self, String *field, Obj *value) {
-    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
-    Hash_Store(hash, field, INCREF(value));
+    GOLUCY_Doc_Store_BRIDGE(self, field, value);
 }
 
+Doc_Serialize_t GOLUCY_Doc_Serialize_BRIDGE;
+
 void
 Doc_Serialize_IMP(Doc *self, OutStream *outstream) {
-    DocIVARS *const ivars = Doc_IVARS(self);
-    Hash *hash = (Hash*)ivars->fields;
-    Freezer_serialize_hash(hash, outstream);
-    OutStream_Write_C32(outstream, ivars->doc_id);
+    GOLUCY_Doc_Serialize_BRIDGE(self, outstream);
 }
 
+Doc_Deserialize_t GOLUCY_Doc_Deserialize_BRIDGE;
+
 Doc*
 Doc_Deserialize_IMP(Doc *self, InStream *instream) {
-    DocIVARS *const ivars = Doc_IVARS(self);
-    ivars->fields = Freezer_read_hash(instream);
-    ivars->doc_id = InStream_Read_C32(instream);
-    return self;
+    return GOLUCY_Doc_Deserialize_BRIDGE(self, instream);
 }
 
+Doc_Extract_t GOLUCY_Doc_Extract_BRIDGE;
+
 Obj*
 Doc_Extract_IMP(Doc *self, String *field) {
-    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
-    return INCREF(Hash_Fetch(hash, field));
+    return GOLUCY_Doc_Extract_BRIDGE(self, field);
 }
 
 Hash*
@@ -158,23 +150,20 @@ Doc_Load_IMP(Doc *self, Obj *dump) {
     UNREACHABLE_RETURN(Doc*);
 }
 
+Doc_Equals_t GOLUCY_Doc_Equals_BRIDGE;
+
 bool
 Doc_Equals_IMP(Doc *self, Obj *other) {
-    if ((Doc*)other == self)   { return true;  }
-    if (!Obj_is_a(other, DOC)) { return false; }
-    DocIVARS *const ivars = Doc_IVARS(self);
-    DocIVARS *const ovars = Doc_IVARS((Doc*)other);
-    return Hash_Equals((Hash*)ivars->fields, (Obj*)ovars->fields);
+    return GOLUCY_Doc_Equals_BRIDGE(self, other);
 }
 
+Doc_Destroy_t GOLUCY_Doc_Destroy_BRIDGE;
+
 void
 Doc_Destroy_IMP(Doc *self) {
-    DocIVARS *const ivars = Doc_IVARS(self);
-    DECREF(ivars->fields);
-    SUPER_DESTROY(self, DOC);
+    GOLUCY_Doc_Destroy_BRIDGE(self);
 }
 
-
 /**************************** DocReader *****************************/
 
 HitDoc*
diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 13bdafa96..7d5579887 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -17,10 +17,17 @@
 package lucy
 
 /*
+#define C_LUCY_DOC
 #define C_LUCY_REGEXTOKENIZER
 
 #include "lucy_parcel.h"
 #include "Lucy/Analysis/RegexTokenizer.h"
+#include "Lucy/Document/Doc.h"
+
+#include "Clownfish/Hash.h"
+#include "Lucy/Store/InStream.h"
+#include "Lucy/Store/OutStream.h"
+#include "Lucy/Util/Freezer.h"
 
 extern lucy_RegexTokenizer*
 GOLUCY_RegexTokenizer_init(lucy_RegexTokenizer *self, cfish_String *pattern);
@@ -38,6 +45,44 @@ extern void
 (*GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE)(lucy_RegexTokenizer *self, const char *str,
 											  size_t string_len, lucy_Inversion *inversion);
 
+extern lucy_Doc*
+GOLUCY_Doc_init(lucy_Doc *doc, void *fields, int32_t doc_id);
+extern lucy_Doc*
+(*GOLUCY_Doc_init_BRIDGE)(lucy_Doc *doc, void *fields, int32_t doc_id);
+extern void
+GOLUCY_Doc_Set_Fields(lucy_Doc *self, void *fields);
+extern void
+(*GOLUCY_Doc_Set_Fields_BRIDGE)(lucy_Doc *self, void *fields);
+extern uint32_t
+GOLUCY_Doc_Get_Size(lucy_Doc *self);
+extern uint32_t
+(*GOLUCY_Doc_Get_Size_BRIDGE)(lucy_Doc *self);
+extern void
+GOLUCY_Doc_Store(lucy_Doc *self, cfish_String *field, cfish_Obj *value);
+extern void
+(*GOLUCY_Doc_Store_BRIDGE)(lucy_Doc *self, cfish_String *field, cfish_Obj *value);
+extern void
+GOLUCY_Doc_Serialize(lucy_Doc *self, lucy_OutStream *outstream);
+extern void
+(*GOLUCY_Doc_Serialize_BRIDGE)(lucy_Doc *self, lucy_OutStream *outstream);
+extern lucy_Doc*
+GOLUCY_Doc_Deserialize(lucy_Doc *self, lucy_InStream *instream);
+extern lucy_Doc*
+(*GOLUCY_Doc_Deserialize_BRIDGE)(lucy_Doc *self, lucy_InStream *instream);
+extern cfish_Obj*
+GOLUCY_Doc_Extract(lucy_Doc *self, cfish_String *field);
+extern cfish_Obj*
+(*GOLUCY_Doc_Extract_BRIDGE)(lucy_Doc *self, cfish_String *field);
+extern bool
+GOLUCY_Doc_Equals(lucy_Doc *self, cfish_Obj *other);
+extern bool
+(*GOLUCY_Doc_Equals_BRIDGE)(lucy_Doc *self, cfish_Obj *other);
+extern void
+GOLUCY_Doc_Destroy(lucy_Doc *self);
+extern void
+(*GOLUCY_Doc_Destroy_BRIDGE)(lucy_Doc *self);
+
+
 
 // C symbols linked into a Go-built package archive are not visible to
 // external C code -- but internal code *can* see symbols from outside.
@@ -49,10 +94,20 @@ GOLUCY_glue_exported_symbols() {
 	GOLUCY_RegexTokenizer_Destroy_BRIDGE = GOLUCY_RegexTokenizer_Destroy;
 	GOLUCY_RegexTokenizer_Tokenize_Utf8_BRIDGE
 		= (LUCY_RegexTokenizer_Tokenize_Utf8_t)GOLUCY_RegexTokenizer_Tokenize_Utf8;
+	GOLUCY_Doc_init_BRIDGE = GOLUCY_Doc_init;
+	GOLUCY_Doc_Set_Fields_BRIDGE = GOLUCY_Doc_Set_Fields;
+	GOLUCY_Doc_Get_Size_BRIDGE = GOLUCY_Doc_Get_Size;
+	GOLUCY_Doc_Store_BRIDGE = GOLUCY_Doc_Store;
+	GOLUCY_Doc_Serialize_BRIDGE = GOLUCY_Doc_Serialize;
+	GOLUCY_Doc_Deserialize_BRIDGE = GOLUCY_Doc_Deserialize;
+	GOLUCY_Doc_Extract_BRIDGE = GOLUCY_Doc_Extract;
+	GOLUCY_Doc_Equals_BRIDGE = GOLUCY_Doc_Equals;
+	GOLUCY_Doc_Destroy_BRIDGE = GOLUCY_Doc_Destroy;
 }
 
 */
 import "C"
+import "unsafe"
 import _ "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
 
 func init() {
@@ -73,3 +128,89 @@ func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) {
 func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char,
 	stringLen C.size_t, inversion *C.lucy_Inversion) {
 }
+
+func NewDoc(docID int32) Doc {
+	retvalCF := C.lucy_Doc_new(nil, C.int32_t(docID))
+	return WRAPDoc(unsafe.Pointer(retvalCF))
+}
+
+//export GOLUCY_Doc_init
+func GOLUCY_Doc_init(d *C.lucy_Doc, fields unsafe.Pointer, docID C.int32_t) *C.lucy_Doc {
+	ivars := C.lucy_Doc_IVARS(d)
+	if fields != nil {
+		ivars.fields = unsafe.Pointer(C.cfish_inc_refcount(fields))
+	} else {
+		ivars.fields = unsafe.Pointer(C.cfish_Hash_new(0))
+	}
+	ivars.doc_id = docID
+	return d
+}
+
+//export GOLUCY_Doc_Set_Fields
+func GOLUCY_Doc_Set_Fields(d *C.lucy_Doc, fields unsafe.Pointer) {
+	ivars := C.lucy_Doc_IVARS(d)
+	temp := ivars.fields
+	ivars.fields = unsafe.Pointer(C.cfish_inc_refcount(fields))
+	C.cfish_decref(temp)
+}
+
+//export GOLUCY_Doc_Get_Size
+func GOLUCY_Doc_Get_Size(d *C.lucy_Doc) C.uint32_t {
+	ivars := C.lucy_Doc_IVARS(d)
+	hash := ((*C.cfish_Hash)(ivars.fields))
+	return C.uint32_t(C.CFISH_Hash_Get_Size(hash))
+}
+
+// GOLUCY_Doc_Store stores value under field, like the C original's INCREF.
+//export GOLUCY_Doc_Store
+func GOLUCY_Doc_Store(d *C.lucy_Doc, field *C.cfish_String, value *C.cfish_Obj) {
+	hash := (*C.cfish_Hash)(C.lucy_Doc_IVARS(d).fields)
+	C.CFISH_Hash_Store(hash, field, C.cfish_incref(unsafe.Pointer(value)))
+}
+
+//export GOLUCY_Doc_Serialize
+func GOLUCY_Doc_Serialize(d *C.lucy_Doc, outstream *C.lucy_OutStream) {
+	ivars := C.lucy_Doc_IVARS(d)
+	hash := (*C.cfish_Hash)(ivars.fields)
+	C.lucy_Freezer_serialize_hash(hash, outstream)
+	C.LUCY_OutStream_Write_C32(outstream, C.uint32_t(ivars.doc_id))
+}
+
+//export GOLUCY_Doc_Deserialize
+func GOLUCY_Doc_Deserialize(d *C.lucy_Doc, instream *C.lucy_InStream) *C.lucy_Doc {
+	ivars := C.lucy_Doc_IVARS(d)
+	ivars.fields = unsafe.Pointer(C.lucy_Freezer_read_hash(instream))
+	ivars.doc_id = C.int32_t(C.LUCY_InStream_Read_C32(instream))
+	return d
+}
+
+// GOLUCY_Doc_Extract fetches field's value; incref (as C INCREF) allows nil.
+//export GOLUCY_Doc_Extract
+func GOLUCY_Doc_Extract(d *C.lucy_Doc, field *C.cfish_String) *C.cfish_Obj {
+	hash := (*C.cfish_Hash)(C.lucy_Doc_IVARS(d).fields)
+	val := C.CFISH_Hash_Fetch(hash, field)
+	return C.cfish_incref(unsafe.Pointer(val))
+}
+
+//export GOLUCY_Doc_Equals
+func GOLUCY_Doc_Equals(d *C.lucy_Doc, other *C.cfish_Obj) C.bool {
+	twin := (*C.lucy_Doc)(unsafe.Pointer(other))
+	if twin == d {
+		return true
+	}
+	if !C.cfish_Obj_is_a(other, C.LUCY_DOC) {
+		return false
+	}
+	ivars := C.lucy_Doc_IVARS(d)
+	ovars := C.lucy_Doc_IVARS(twin)
+	hash := (*C.cfish_Hash)(ivars.fields)
+	otherHash := (*C.cfish_Obj)(ovars.fields)
+	return C.CFISH_Hash_Equals(hash, otherHash)
+}
+
+//export GOLUCY_Doc_Destroy
+func GOLUCY_Doc_Destroy(d *C.lucy_Doc) {
+	ivars := C.lucy_Doc_IVARS(d)
+	C.cfish_decref(unsafe.Pointer(ivars.fields))
+	C.cfish_super_destroy(unsafe.Pointer(d), C.LUCY_DOC)
+}

From acd74d27985e635606c775a44d7c5c718583d76f Mon Sep 17 00:00:00 2001
From: Marvin Humphrey <marvin@rectangular.com>
Date: Sat, 18 Jul 2015 14:49:30 -0700
Subject: [PATCH 6/8] Port Inverter and InverterEntry to CGO.

---
 go/cfext/lucy.c |  76 +-----------------------------------
 go/lucy/lucy.go | 101 +++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 102 insertions(+), 75 deletions(-)

diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
index e2719bc2d..f10a23f93 100644
--- a/go/cfext/lucy.c
+++ b/go/cfext/lucy.c
@@ -249,83 +249,11 @@ DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) {
 
 /**************************** Inverter *****************************/
 
-static InverterEntry*
-S_fetch_entry(InverterIVARS *ivars, String *field) {
-    Schema *const schema = ivars->schema;
-    int32_t field_num = Seg_Field_Num(ivars->segment, field);
-    if (!field_num) {
-        // This field seems not to be in the segment yet.  Try to find it in
-        // the Schema.
-        if (Schema_Fetch_Type(schema, field)) {
-            // The field is in the Schema.  Get a field num from the Segment.
-            field_num = Seg_Add_Field(ivars->segment, field);
-        }
-        else {
-            // We've truly failed to find the field.  The user must
-            // not have spec'd it.
-            THROW(ERR, "Unknown field name: '%o'", field);
-        }
-    }
-
-    InverterEntry *entry
-        = (InverterEntry*)Vec_Fetch(ivars->entry_pool, field_num);
-    if (!entry) {
-        entry = InvEntry_new(schema, (String*)field, field_num);
-        Vec_Store(ivars->entry_pool, field_num, (Obj*)entry);
-    }
-    return entry;
-}
+Inverter_Invert_Doc_t GOLUCY_Inverter_Invert_Doc_BRIDGE;
 
 void
 Inverter_Invert_Doc_IMP(Inverter *self, Doc *doc) {
-    InverterIVARS *const ivars = Inverter_IVARS(self);
-    Hash *const fields = (Hash*)Doc_Get_Fields(doc);
-
-    // Prepare for the new doc.
-    Inverter_Set_Doc(self, doc);
-
-    // Extract and invert the doc's fields.
-    HashIterator *iter = HashIter_new(fields);
-    while (HashIter_Next(iter)) {
-        String *field = HashIter_Get_Key(iter);
-        Obj    *obj   = HashIter_Get_Value(iter);
-
-        InverterEntry *inventry = S_fetch_entry(ivars, field);
-        InverterEntryIVARS *inventry_ivars = InvEntry_IVARS(inventry);
-        FieldType *type = inventry_ivars->type;
-
-        // Get the field value.
-        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
-            case FType_TEXT: {
-                    CERTIFY(obj, STRING);
-                    break;
-                }
-            case FType_BLOB: {
-                    CERTIFY(obj, BLOB);
-                    break;
-                }
-            case FType_INT32:
-            case FType_INT64: {
-                    CERTIFY(obj, INTEGER);
-                    break;
-                }
-            case FType_FLOAT32:
-            case FType_FLOAT64: {
-                    CERTIFY(obj, FLOAT);
-                    break;
-                }
-            default:
-                THROW(ERR, "Unrecognized type: %o", type);
-        }
-
-        if (inventry_ivars->value != obj) {
-            DECREF(inventry_ivars->value);
-            inventry_ivars->value = INCREF(obj);
-        }
-
-        Inverter_Add_Field(self, inventry);
-    }
-    DECREF(iter);
+    GOLUCY_Inverter_Invert_Doc_BRIDGE(self, doc);
 }
 
 
diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 7d5579887..664a2001b 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -19,12 +19,20 @@ package lucy
 /*
 #define C_LUCY_DOC
 #define C_LUCY_REGEXTOKENIZER
+#define C_LUCY_INVERTER
+#define C_LUCY_INVERTERENTRY
 
 #include "lucy_parcel.h"
 #include "Lucy/Analysis/RegexTokenizer.h"
 #include "Lucy/Document/Doc.h"
+#include "Lucy/Index/Inverter.h"
 
 #include "Clownfish/Hash.h"
+#include "Clownfish/HashIterator.h"
+#include "Clownfish/Vector.h"
+#include "Lucy/Plan/FieldType.h"
+#include "Lucy/Plan/Schema.h"
+#include "Lucy/Index/Segment.h"
 #include "Lucy/Store/InStream.h"
 #include "Lucy/Store/OutStream.h"
 #include "Lucy/Util/Freezer.h"
@@ -82,6 +90,10 @@ GOLUCY_Doc_Destroy(lucy_Doc *self);
 extern void
 (*GOLUCY_Doc_Destroy_BRIDGE)(lucy_Doc *self);
 
+extern void
+GOLUCY_Inverter_Invert_Doc(lucy_Inverter *self, lucy_Doc *doc);
+extern void
+(*GOLUCY_Inverter_Invert_Doc_BRIDGE)(lucy_Inverter *self, lucy_Doc *doc);
 
 
 // C symbols linked into a Go-built package archive are not visible to
@@ -103,12 +115,14 @@ GOLUCY_glue_exported_symbols() {
 	GOLUCY_Doc_Extract_BRIDGE = GOLUCY_Doc_Extract;
 	GOLUCY_Doc_Equals_BRIDGE = GOLUCY_Doc_Equals;
 	GOLUCY_Doc_Destroy_BRIDGE = GOLUCY_Doc_Destroy;
+	GOLUCY_Inverter_Invert_Doc_BRIDGE = GOLUCY_Inverter_Invert_Doc;
 }
 
 */
 import "C"
 import "unsafe"
-import _ "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
+import "fmt"
+import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
 
 func init() {
 	C.GOLUCY_glue_exported_symbols()
@@ -214,3 +228,88 @@ func GOLUCY_Doc_Destroy(d *C.lucy_Doc) {
 	C.cfish_decref(unsafe.Pointer(ivars.fields))
 	C.cfish_super_destroy(unsafe.Pointer(d), C.LUCY_DOC)
 }
+
+// fetchEntry returns the InverterEntry pooled under the field's number,
+// creating and pooling one on first use.  Panics if the field is unknown.
+func fetchEntry(ivars *C.lucy_InverterIVARS, field *C.cfish_String) *C.lucy_InverterEntry {
+	schema := ivars.schema
+	fieldNum := C.LUCY_Seg_Field_Num(ivars.segment, field)
+	if fieldNum == 0 {
+		// Not in the segment yet.  Try to find the field in the Schema.
+		if C.LUCY_Schema_Fetch_Type(schema, field) != nil {
+			// The field is in the Schema.  Get a field num from the Segment.
+			fieldNum = C.LUCY_Seg_Add_Field(ivars.segment, field)
+		} else {
+			// Truly unknown field: the user must not have spec'd it.
+			fieldGo := clownfish.CFStringToGo(unsafe.Pointer(field))
+			err := clownfish.NewErr("Unknown field name: '" + fieldGo + "'")
+			panic(err)
+		}
+	}
+	entry := C.CFISH_Vec_Fetch(ivars.entry_pool, C.size_t(fieldNum))
+	if entry == nil {
+		newEntry := C.lucy_InvEntry_new(schema, field, fieldNum)
+		// Pool the new entry itself (not the nil fetch result) for reuse.
+		C.CFISH_Vec_Store(ivars.entry_pool, C.size_t(fieldNum), (*C.cfish_Obj)(unsafe.Pointer(newEntry)))
+		return newEntry
+	}
+	return (*C.lucy_InverterEntry)(unsafe.Pointer(entry))
+}
+
+// GOLUCY_Inverter_Invert_Doc walks the doc's fields, checks each value
+// against the class expected for the field's primitive type, and adds the
+// matching InverterEntry to the inverter.  Panics with a Clownfish Err on a
+// nil value, an unknown field, or a value of the wrong class.
+//export GOLUCY_Inverter_Invert_Doc
+func GOLUCY_Inverter_Invert_Doc(inverter *C.lucy_Inverter, doc *C.lucy_Doc) {
+	ivars := C.lucy_Inverter_IVARS(inverter)
+	fields := (*C.cfish_Hash)(C.LUCY_Doc_Get_Fields(doc))
+
+	// Prepare for the new doc.
+	C.LUCY_Inverter_Set_Doc(inverter, doc)
+
+	// Extract and invert the doc's fields.  Deferred so panics don't leak iter.
+	iter := C.cfish_HashIter_new(fields)
+	defer C.cfish_dec_refcount(unsafe.Pointer(iter))
+	for C.CFISH_HashIter_Next(iter) {
+		field := C.CFISH_HashIter_Get_Key(iter)
+		obj := C.CFISH_HashIter_Get_Value(iter)
+		if obj == nil {
+			mess := "Invalid nil value for field " + clownfish.CFStringToGo(unsafe.Pointer(field))
+			panic(clownfish.NewErr(mess))
+		}
+
+		inventry := fetchEntry(ivars, field)
+		inventryIvars := C.lucy_InvEntry_IVARS(inventry)
+		fieldType := inventryIvars._type
+
+		// Pick the class the value must have for this field's primitive type.
+		var expectedType *C.cfish_Class
+		switch C.LUCY_FType_Primitive_ID(fieldType) & C.lucy_FType_PRIMITIVE_ID_MASK {
+		case C.lucy_FType_TEXT:
+			expectedType = C.CFISH_STRING
+		case C.lucy_FType_BLOB:
+			expectedType = C.CFISH_BLOB
+		case C.lucy_FType_INT32, C.lucy_FType_INT64:
+			expectedType = C.CFISH_INTEGER
+		case C.lucy_FType_FLOAT32, C.lucy_FType_FLOAT64:
+			expectedType = C.CFISH_FLOAT
+		default:
+			panic(clownfish.NewErr("Internal Lucy error: bad type id for field " +
+				clownfish.CFStringToGo(unsafe.Pointer(field))))
+		}
+		if !C.cfish_Obj_is_a(obj, expectedType) {
+			className := C.cfish_Obj_get_class_name((*C.cfish_Obj)(unsafe.Pointer(fieldType)))
+			mess := fmt.Sprintf("Invalid type for field '%s': '%s'",
+				clownfish.CFStringToGo(unsafe.Pointer(field)),
+				clownfish.CFStringToGo(unsafe.Pointer(className)))
+			panic(clownfish.NewErr(mess))
+		}
+		if inventryIvars.value != obj {
+			C.cfish_decref(unsafe.Pointer(inventryIvars.value))
+			inventryIvars.value = C.cfish_inc_refcount(unsafe.Pointer(obj))
+		}
+
+		C.LUCY_Inverter_Add_Field(inverter, inventry)
+	}
+}

From 7749e595b2a6c32af57b904d1a00b066691ace37 Mon Sep 17 00:00:00 2001
From: Marvin Humphrey <marvin@rectangular.com>
Date: Sun, 19 Jul 2015 12:57:13 -0700
Subject: [PATCH 7/8] Port DefDocReader code to CGO.

---
 go/cfext/lucy.c | 80 ++---------------------------------------
 go/lucy/lucy.go | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 77 deletions(-)

diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
index f10a23f93..9e9b840f4 100644
--- a/go/cfext/lucy.c
+++ b/go/cfext/lucy.c
@@ -166,85 +166,11 @@ Doc_Destroy_IMP(Doc *self) {
 
 /**************************** DocReader *****************************/
 
+DefDocReader_Fetch_Doc_t GOLUCY_DefDocReader_Fetch_Doc_BRIDGE;
+
 HitDoc*
 DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) {
-    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
-    Schema   *const schema = ivars->schema;
-    InStream *const dat_in = ivars->dat_in;
-    InStream *const ix_in  = ivars->ix_in;
-    Hash     *const fields = Hash_new(1);
-    int64_t   start;
-    uint32_t  num_fields;
-    uint32_t  field_name_cap = 31;
-    char     *field_name = (char*)MALLOCATE(field_name_cap + 1);
-
-    // Get data file pointer from index, read number of fields.
-    InStream_Seek(ix_in, (int64_t)doc_id * 8);
-    start = InStream_Read_U64(ix_in);
-    InStream_Seek(dat_in, start);
-    num_fields = InStream_Read_C32(dat_in);
-
-    // Decode stored data and build up the doc field by field.
-    while (num_fields--) {
-        uint32_t        field_name_len;
-        Obj       *value;
-        FieldType *type;
-
-        // Read field name.
-        field_name_len = InStream_Read_C32(dat_in);
-        if (field_name_len > field_name_cap) {
-            field_name_cap = field_name_len;
-            field_name     = (char*)REALLOCATE(field_name,
-                                                    field_name_cap + 1);
-        }
-        InStream_Read_Bytes(dat_in, field_name, field_name_len);
-
-        // Find the Field's FieldType.
-        String *field_name_str = SSTR_WRAP_UTF8(field_name, field_name_len);
-        type = Schema_Fetch_Type(schema, field_name_str);
-
-        // Read the field value.
-        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
-            case FType_TEXT: {
-                    uint32_t value_len = InStream_Read_C32(dat_in);
-                    char *buf = (char*)MALLOCATE(value_len + 1);
-                    InStream_Read_Bytes(dat_in, buf, value_len);
-                    buf[value_len] = '\0';
-                    value = (Obj*)Str_new_steal_utf8(buf, value_len);
-                    break;
-                }
-            case FType_BLOB: {
-                    uint32_t value_len = InStream_Read_C32(dat_in);
-                    char *buf = (char*)MALLOCATE(value_len);
-                    InStream_Read_Bytes(dat_in, buf, value_len);
-                    value = (Obj*)Blob_new_steal(buf, value_len);
-                    break;
-                }
-            case FType_FLOAT32:
-                value = (Obj*)Float_new(InStream_Read_F32(dat_in));
-                break;
-            case FType_FLOAT64:
-                value = (Obj*)Float_new(InStream_Read_F64(dat_in));
-                break;
-            case FType_INT32:
-                value = (Obj*)Int_new((int32_t)InStream_Read_C32(dat_in));
-                break;
-            case FType_INT64:
-                value = (Obj*)Int_new((int64_t)InStream_Read_C64(dat_in));
-                break;
-            default:
-                value = NULL;
-                THROW(ERR, "Unrecognized type: %o", type);
-        }
-
-        // Store the value.
-        Hash_Store_Utf8(fields, field_name, field_name_len, value);
-    }
-    FREEMEM(field_name);
-
-    HitDoc *retval = HitDoc_new(fields, doc_id, 0.0);
-    DECREF(fields);
-    return retval;
+    return GOLUCY_DefDocReader_Fetch_Doc_BRIDGE(self, doc_id);
 }
 
 /**************************** Inverter *****************************/
diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 664a2001b..556235e4c 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -17,19 +17,28 @@
 package lucy
 
 /*
+
+#include <stdlib.h>
+
 #define C_LUCY_DOC
 #define C_LUCY_REGEXTOKENIZER
+#define C_LUCY_DEFAULTDOCREADER
 #define C_LUCY_INVERTER
 #define C_LUCY_INVERTERENTRY
 
 #include "lucy_parcel.h"
 #include "Lucy/Analysis/RegexTokenizer.h"
 #include "Lucy/Document/Doc.h"
+#include "Lucy/Index/DocReader.h"
 #include "Lucy/Index/Inverter.h"
 
+#include "Clownfish/String.h"
+#include "Clownfish/Blob.h"
+#include "Clownfish/Num.h"
 #include "Clownfish/Hash.h"
 #include "Clownfish/HashIterator.h"
 #include "Clownfish/Vector.h"
+#include "Lucy/Document/HitDoc.h"
 #include "Lucy/Plan/FieldType.h"
 #include "Lucy/Plan/Schema.h"
 #include "Lucy/Index/Segment.h"
@@ -90,6 +99,11 @@ GOLUCY_Doc_Destroy(lucy_Doc *self);
 extern void
 (*GOLUCY_Doc_Destroy_BRIDGE)(lucy_Doc *self);
 
+extern lucy_HitDoc*
+GOLUCY_DefDocReader_Fetch_Doc(lucy_DefaultDocReader *self, int32_t doc_id);
+extern lucy_HitDoc*
+(*GOLUCY_DefDocReader_Fetch_Doc_BRIDGE)(lucy_DefaultDocReader *self, int32_t doc_id);
+
 extern void
 GOLUCY_Inverter_Invert_Doc(lucy_Inverter *self, lucy_Doc *doc);
 extern void
@@ -115,9 +129,16 @@ GOLUCY_glue_exported_symbols() {
 	GOLUCY_Doc_Extract_BRIDGE = GOLUCY_Doc_Extract;
 	GOLUCY_Doc_Equals_BRIDGE = GOLUCY_Doc_Equals;
 	GOLUCY_Doc_Destroy_BRIDGE = GOLUCY_Doc_Destroy;
+	GOLUCY_DefDocReader_Fetch_Doc_BRIDGE = GOLUCY_DefDocReader_Fetch_Doc;
 	GOLUCY_Inverter_Invert_Doc_BRIDGE = GOLUCY_Inverter_Invert_Doc;
 }
 
+
+static void
+null_terminate_string(char *string, size_t len) {
+	string[len] = '\0';
+}
+
 */
 import "C"
 import "unsafe"
@@ -256,6 +277,79 @@ func fetchEntry(ivars *C.lucy_InverterIVARS, field *C.cfish_String) *C.lucy_Inve
 	return (*C.lucy_InverterEntry)(unsafe.Pointer(entry))
 }
 
+// GOLUCY_DefDocReader_Fetch_Doc builds a HitDoc from the stored fields of docID.
+//
+//export GOLUCY_DefDocReader_Fetch_Doc
+func GOLUCY_DefDocReader_Fetch_Doc(ddr *C.lucy_DefaultDocReader, docID C.int32_t) *C.lucy_HitDoc {
+	ivars := C.lucy_DefDocReader_IVARS(ddr)
+	schema := ivars.schema
+	datInstream := ivars.dat_in
+	ixInstream := ivars.ix_in
+	fields := C.cfish_Hash_new(1)
+	fieldNameCap := C.size_t(31)
+	var fieldName *C.char = ((*C.char)(C.malloc(fieldNameCap + 1)))
+
+	// Get data file pointer from index, read number of fields.  Widen docID
+	// before scaling: an int32 multiply would wrap for doc ids >= 1<<28.
+	C.LUCY_InStream_Seek(ixInstream, C.int64_t(docID)*8)
+	start := C.LUCY_InStream_Read_U64(ixInstream)
+	C.LUCY_InStream_Seek(datInstream, C.int64_t(start))
+	numFields := uint32(C.LUCY_InStream_Read_C32(datInstream))
+
+	// Decode stored data and build up the doc field by field.
+	for i := uint32(0); i < numFields; i++ {
+		// Read field name.
+		fieldNameLen := C.size_t(C.LUCY_InStream_Read_C32(datInstream))
+		if fieldNameLen > fieldNameCap {
+			fieldNameCap = fieldNameLen
+			fieldName = ((*C.char)(C.realloc(unsafe.Pointer(fieldName), fieldNameCap+1)))
+		}
+		C.LUCY_InStream_Read_Bytes(datInstream, fieldName, fieldNameLen)
+
+		// Find the Field's FieldType.  TODO: Creating and destroying a new
+		// string each time is inefficient.  The solution should be to add a
+		// private Schema_Fetch_Type_Utf8 method which takes char* and size_t.
+		fieldNameStr := C.cfish_Str_new_from_utf8(fieldName, fieldNameLen)
+		fieldType := C.LUCY_Schema_Fetch_Type(schema, fieldNameStr)
+		C.cfish_dec_refcount(unsafe.Pointer(fieldNameStr))
+
+		// Read the field value.
+		var value *C.cfish_Obj
+		switch C.LUCY_FType_Primitive_ID(fieldType) & C.lucy_FType_PRIMITIVE_ID_MASK {
+		case C.lucy_FType_TEXT:
+			valueLen := C.size_t(C.LUCY_InStream_Read_C32(datInstream))
+			buf := ((*C.char)(C.malloc(valueLen + 1)))
+			C.LUCY_InStream_Read_Bytes(datInstream, buf, valueLen)
+			C.null_terminate_string(buf, valueLen)
+			value = ((*C.cfish_Obj)(C.cfish_Str_new_steal_utf8(buf, valueLen)))
+		case C.lucy_FType_BLOB:
+			valueLen := C.size_t(C.LUCY_InStream_Read_C32(datInstream))
+			buf := ((*C.char)(C.malloc(valueLen)))
+			C.LUCY_InStream_Read_Bytes(datInstream, buf, valueLen)
+			value = ((*C.cfish_Obj)(C.cfish_Blob_new_steal(buf, valueLen)))
+		case C.lucy_FType_FLOAT32:
+			value = ((*C.cfish_Obj)(C.cfish_Float_new(C.double(C.LUCY_InStream_Read_F32(datInstream)))))
+		case C.lucy_FType_FLOAT64:
+			value = ((*C.cfish_Obj)(C.cfish_Float_new(C.LUCY_InStream_Read_F64(datInstream))))
+		case C.lucy_FType_INT32:
+			// Widen via int32 so a negative value (assumed stored as its uint32 bit pattern) keeps its sign.
+			value = ((*C.cfish_Obj)(C.cfish_Int_new(C.int64_t(C.int32_t(C.LUCY_InStream_Read_C32(datInstream))))))
+		case C.lucy_FType_INT64:
+			value = ((*C.cfish_Obj)(C.cfish_Int_new(C.int64_t(C.LUCY_InStream_Read_C64(datInstream)))))
+		default:
+			panic(clownfish.NewErr("Internal Lucy error: bad type id for field " + C.GoStringN(fieldName, C.int(fieldNameLen))))
+		}
+
+		// Store the value.
+		C.CFISH_Hash_Store_Utf8(fields, fieldName, fieldNameLen, value)
+	}
+	C.free(unsafe.Pointer(fieldName))
+
+	retval := C.lucy_HitDoc_new(unsafe.Pointer(fields), docID, 0.0)
+	C.cfish_dec_refcount(unsafe.Pointer(fields))
+	return retval
+}
+
 //export GOLUCY_Inverter_Invert_Doc
 func GOLUCY_Inverter_Invert_Doc(inverter *C.lucy_Inverter, doc *C.lucy_Doc) {
 	ivars := C.lucy_Inverter_IVARS(inverter)

From 5f00a21335c3304c074223b2dba4567a62d9c97a Mon Sep 17 00:00:00 2001
From: Marvin Humphrey <marvin@rectangular.com>
Date: Mon, 20 Jul 2015 12:41:34 -0700
Subject: [PATCH 8/8] Port RegexTokenizer to Go and CGO.

Use Go's regular expression engine, the `regexp` package.  Store Go
`regexp` objects using the registry which allows them to be referenced
by integer from C.
---
 go/lucy/lucy.go      | 81 +++++++++++++++++++++++++++++++++++++++++++-
 go/lucy/lucy_test.go | 10 ++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/go/lucy/lucy.go b/go/lucy/lucy.go
index 556235e4c..bc2e9f817 100644
--- a/go/lucy/lucy.go
+++ b/go/lucy/lucy.go
@@ -38,6 +38,11 @@ package lucy
 #include "Clownfish/Hash.h"
 #include "Clownfish/HashIterator.h"
 #include "Clownfish/Vector.h"
+#include "Clownfish/Err.h"
+#include "Clownfish/Util/StringHelper.h"
+#include "Lucy/Analysis/Analyzer.h"
+#include "Lucy/Analysis/Inversion.h"
+#include "Lucy/Analysis/Token.h"
 #include "Lucy/Document/HitDoc.h"
 #include "Lucy/Plan/FieldType.h"
 #include "Lucy/Plan/Schema.h"
@@ -133,6 +138,35 @@ GOLUCY_glue_exported_symbols() {
 	GOLUCY_Inverter_Invert_Doc_BRIDGE = GOLUCY_Inverter_Invert_Doc;
 }
 
+// Count the code points in the `len` bytes at `string`, advancing by the
+// UTF8_COUNT table entry for each byte visited; throws on overshooting `len`.
+static uint32_t
+S_count_code_points(const char *string, size_t len) {
+    const uint8_t *bytes = (const uint8_t*)string;
+    uint32_t count = 0;
+    size_t pos;
+
+    for (pos = 0; pos < len; ++count) {
+        pos += cfish_StrHelp_UTF8_COUNT[bytes[pos]];
+    }
+    if (pos != len) {
+        CFISH_THROW(CFISH_ERR, "Match between code point boundaries in '%s'", string);
+    }
+    return count;
+}
+
+// Appends a Token for bytes [start, end) of str to inversion, passing it offsets
+// counted in code points.  Returns the code point count through the match end.
+static int
+push_token(const char *str, int start, int end, int last_end,
+           int cp_count, lucy_Inversion *inversion) {
+	const int match_bytes = end - start;
+	const char *const token_text = str + start;
+	const int first_cp = cp_count + S_count_code_points(str + last_end, start - last_end);
+	const int limit_cp = first_cp + S_count_code_points(token_text, match_bytes);
+	LUCY_Inversion_Append(inversion, lucy_Token_new(token_text, match_bytes, first_cp, limit_cp, 1.0f, 1));
+	return limit_cp;
+}
 
 static void
 null_terminate_string(char *string, size_t len) {
@@ -143,25 +177,70 @@ null_terminate_string(char *string, size_t len) {
 import "C"
 import "unsafe"
 import "fmt"
+import "regexp"
 import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
 
+var registry *objRegistry
+
 func init() {
 	C.GOLUCY_glue_exported_symbols()
 	C.lucy_bootstrap_parcel()
+	registry = newObjRegistry(16) // lets C refer to Go objects (e.g. compiled regexps) by integer id
 }
 
 //export GOLUCY_RegexTokenizer_init
 func GOLUCY_RegexTokenizer_init(rt *C.lucy_RegexTokenizer, pattern *C.cfish_String) *C.lucy_RegexTokenizer {
-	return nil
+	// Initialize rt, compile the pattern with Go's regexp package, and park the
+	// compiled regex in the registry so that C can refer to it by integer id.
+	C.lucy_Analyzer_init(((*C.lucy_Analyzer)(unsafe.Pointer(rt))))
+	ivars := C.lucy_RegexTokenizer_IVARS(rt)
+
+	// A nil pattern selects the default, so never dispatch a method on it.
+	patternGo := "\\w+(?:['\\x{2019}]\\w+)*"
+	if pattern != nil {
+		patternGo = clownfish.CFStringToGo(unsafe.Pointer(pattern))
+	}
+	cPattern := C.CString(patternGo)
+	ivars.pattern = C.cfish_Str_new_from_utf8(cPattern, C.size_t(len(patternGo)))
+	C.free(unsafe.Pointer(cPattern))
+	rx, err := regexp.Compile(patternGo)
+	if err != nil {
+		panic(err)
+	}
+	ivars.token_re = unsafe.Pointer(registry.store(rx))
+	return rt
 }
 
 //export GOLUCY_RegexTokenizer_Destroy
 func GOLUCY_RegexTokenizer_Destroy(rt *C.lucy_RegexTokenizer) {
+	ivars := C.lucy_RegexTokenizer_IVARS(rt)
+	C.cfish_decref(unsafe.Pointer(ivars.pattern)) // release the pattern String that init stored
+	registry.delete(uintptr(ivars.token_re))      // forget the compiled regex kept under this id
+	C.cfish_super_destroy(unsafe.Pointer(rt), C.LUCY_REGEXTOKENIZER)
 }
 
 //export GOLUCY_RegexTokenizer_Tokenize_Utf8
 func GOLUCY_RegexTokenizer_Tokenize_Utf8(rt *C.lucy_RegexTokenizer, str *C.char,
 	stringLen C.size_t, inversion *C.lucy_Inversion) {
+	// Fetch the compiled regex registered under the id stored in token_re.
+	ivars := C.lucy_RegexTokenizer_IVARS(rt)
+	id := uintptr(ivars.token_re)
+	re, isRegexp := registry.fetch(id).(*regexp.Regexp)
+	if !isRegexp {
+		panic(clownfish.NewErr(fmt.Sprintf(
+			"Failed to Fetch *RegExp with id %d and pattern %s",
+			id, clownfish.CFStringToGo(unsafe.Pointer(ivars.pattern)))))
+	}
+
+	// Match against a Go copy of the text; push_token takes byte offsets and
+	// returns the running code point count through the end of each match.
+	text := C.GoBytes(unsafe.Pointer(str), C.int(stringLen))
+	prevEnd, cpTotal := 0, 0
+	for _, span := range re.FindAllIndex(text, int(stringLen)) {
+		matchStart, matchEnd := C.int(span[0]), C.int(span[1])
+		cpTotal = int(C.push_token(str, matchStart, matchEnd, C.int(prevEnd), C.int(cpTotal), inversion))
+		prevEnd = span[1]
+	}
 }
 
 func NewDoc(docID int32) Doc {
diff --git a/go/lucy/lucy_test.go b/go/lucy/lucy_test.go
index 94e4f0aa0..82ba87899 100644
--- a/go/lucy/lucy_test.go
+++ b/go/lucy/lucy_test.go
@@ -18,6 +18,7 @@ package lucy
 
 import "git-wip-us.apache.org/repos/asf/lucy-clownfish.git/runtime/go/clownfish"
 import "testing"
+import "reflect"
 
 func TestStuff(t *testing.T) {
 	NewSchema()
@@ -29,3 +30,12 @@ func TestOpenIndexer(t *testing.T) {
 		t.Error("Didn't catch exception opening indexer")
 	}
 }
+
+// TestRegex checks that Split with the pattern \S+ yields the three words.
+func TestRegex(t *testing.T) {
+	want := []interface{}{"foo", "bar", "baz"}
+	got := NewRegexTokenizer("\\S+").Split("foo bar baz")
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("Expected %v, got %v", want, got)
+	}
+}