From 77714363ee3dcf8df35354c0f15f461039b785ae Mon Sep 17 00:00:00 2001
From: Avery Pennarun <apenwarr@gmail.com>
Date: Fri, 11 Jun 2010 06:28:08 -0400
Subject: [PATCH] wvcsv_splitline: keep string lengths around so the caller can
 use them.

Turns out strlen() was about 7% of the runtime of csv2sort.  Returning each
column's length from wvcsv_splitline() lets callers skip those calls, which
cuts our 500k record sorting time from about 2.8s down to about 2.6s.
---
 _wvcsv.cc |  5 +++--
 wvcsv.cc  | 37 ++++++++++++++++++++++++++++---------
 wvcsv.h   |  8 +++++---
 3 files changed, 36 insertions(+), 14 deletions(-)
diff --git a/_wvcsv.cc b/_wvcsv.cc
index 7857720..f301cbf 100644
--- a/_wvcsv.cc
+++ b/_wvcsv.cc
@@ -39,11 +39,12 @@ static PyObject *pywvcsv_quote(PyObject *self, PyObject *args)
 static PyObject *pywvcsv_splitline(PyObject *self, PyObject *args)
 {
     char *line;
-    std::vector<char *>v;
+    std::vector<char *> v;
+    std::vector<size_t> lengths;
 
     if (!PyArg_ParseTuple(args, "et", "utf-8", &line))
 	return NULL;
-    wvcsv_splitline(v, line, strlen(line), true);
+    wvcsv_splitline(v, lengths, line, strlen(line), true);
     Py_ssize_t l = v.size();
     PyObject *r = PyTuple_New(l);
     for (Py_ssize_t i = 0; i < l; ++i)
diff --git a/wvcsv.cc b/wvcsv.cc
index 13880c3..8253caf 100644
--- a/wvcsv.cc
+++ b/wvcsv.cc
@@ -78,13 +78,17 @@ char *wvcsv_readline(WvStream &stream, WvBuf &buf)
 }
 
 
-char *wvcsv_dequote(const char *in, char *out)
+char *wvcsv_dequote(const char *in, char *out, size_t *length)
 {
     if (!*in)
+    {
+	if (length) *length = 0;
 	return NULL; // unquoted blankness is a null string
+    }
     else if (!strcmp(in, "\"\""))
     {
 	*out = 0;
+	if (length) *length = 0;
 	return out; // quoted blankness is an empty string
     }
     else
@@ -113,23 +117,26 @@ char *wvcsv_dequote(const char *in, char *out)
 		*optr++ = *cptr;
 	}
 	*optr = 0;
+	if (length) *length = optr - out;
 	return out;
     }
 }
 
 
-inline char *dequote_or_not(char *tofutz, bool dequote)
+inline char *dequote_or_not(char *tofutz, bool dequote, size_t *length) // *length is left untouched here unless dequote is set
 {
-    return dequote ? wvcsv_dequote(tofutz, tofutz) : tofutz;
+    return dequote ? wvcsv_dequote(tofutz, tofutz, length) : tofutz; // in == out: dequoted text is written over tofutz itself
 }
 
-void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen,
+void wvcsv_splitline(std::vector<char*> &l, std::vector<size_t> &ll,
+		     char *s, size_t slen,
 		     bool dequote_values)
 {
     bool inquote = false;
     int istart = 0;
     
     l.clear();
+    ll.clear();
     
     for (size_t i = 0; i < slen; i++)
     {
@@ -139,18 +146,27 @@ void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen,
 	{
 	    // end of a column
 	    s[i] = 0;
-	    l.push_back(dequote_or_not(s+istart, dequote_values));
+	    size_t length = i - istart;
+	    char *p = dequote_or_not(s+istart, dequote_values, &length);
+	    l.push_back(p);
+	    ll.push_back(length);
+	    //assert(!p || strlen(p) == length);
 	    istart = i + 1;
 	}
     }
-    l.push_back(dequote_or_not(s+istart, dequote_values));
+    
+    size_t xlength = slen - istart;
+    l.push_back(dequote_or_not(s+istart, dequote_values, &xlength));
+    ll.push_back(xlength);
+    assert(l.size() == ll.size());
 }
 
 
 void wvcsv_splitline_slow(WvStringList &l, char *s, size_t slen)
 {
     std::vector<char*> v;
-    wvcsv_splitline(v, s, slen);
+    std::vector<size_t> lv;
+    wvcsv_splitline(v, lv, s, slen);
     
     for (std::vector<char*>::iterator i = v.begin(); i < v.end(); i++)
 	l.append(*i);
@@ -175,7 +191,10 @@ WvCsvIter::WvCsvIter(WvStream &_stream, bool expect_TABLE, bool expect_headers,
 	// pointers into it
 	headerline = wvcsv_readline(stream, buf);
 	if (headerline)
-	    wvcsv_splitline(headers, headerline.edit(), headerline.len());
+	{
+	    std::vector<size_t> lv;
+	    wvcsv_splitline(headers, lv, headerline.edit(), headerline.len());
+	}
 	else
 	    err.set("CSV header line missing");
     }
@@ -189,6 +208,6 @@ bool WvCsvIter::next()
     if (!err.isok()) return false;
     char *line = wvcsv_readline(stream, buf);
     if (!line || !line[0] || !strcmp(line, "\r")) return false;
-    wvcsv_splitline(cols, line, strlen(line), dequote);
+    wvcsv_splitline(cols, lengths, line, strlen(line), dequote);
     return true;
 }
diff --git a/wvcsv.h b/wvcsv.h
index 4c65c3c..fc0ac1a 100644
--- a/wvcsv.h
+++ b/wvcsv.h
@@ -43,7 +43,7 @@ WvString wvcsv_quote(WvStringParm s);
  * 
  * Use wvcsv_readline() and wvcsv_splitline() instead.
  */
-char *wvcsv_dequote(const char *in, char *out);
+char *wvcsv_dequote(const char *in, char *out, size_t *length = NULL);
 
 /**
  * Given one "line" of a CSV file, split and decode the columns into l.
@@ -54,7 +54,8 @@ char *wvcsv_dequote(const char *in, char *out);
  * WARNING: a "line" of a CSV file may contain newlines.  You should read the
  * file using wvcsv_readline() if you don't want to screw up.
  */
-void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen,
+void wvcsv_splitline(std::vector<char*> &l, std::vector<size_t> &ll,
+		     char *s, size_t slen,
 		     bool dequote_values = true);
 
 /**
@@ -93,9 +94,10 @@ class WvCsvIter
     WvStream &stream;
     WvDynBuf buf;
     WvString headerline;
-    mutable std::vector<char*> cols;
     bool dequote;
+    mutable std::vector<char*> cols;
 public:
+    mutable std::vector<size_t> lengths;
     WvString firstline;
     WvError err;
     std::vector<char*> headers;