From 77714363ee3dcf8df35354c0f15f461039b785ae Mon Sep 17 00:00:00 2001 From: Avery Pennarun Date: Fri, 11 Jun 2010 06:28:08 -0400 Subject: [PATCH] wvcsv_splitline: keep string lengths around so the caller can use them. Turns out strlen() was about 7% of the runtime of csv2sort. This cuts our 500k record sorting time from about 2.8s down to about 2.6s. --- _wvcsv.cc | 5 +++-- wvcsv.cc | 37 ++++++++++++++++++++++++++++--------- wvcsv.h | 8 +++++--- 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/_wvcsv.cc b/_wvcsv.cc index 7857720..f301cbf 100644 --- a/_wvcsv.cc +++ b/_wvcsv.cc @@ -39,11 +39,12 @@ static PyObject *pywvcsv_quote(PyObject *self, PyObject *args) static PyObject *pywvcsv_splitline(PyObject *self, PyObject *args) { char *line; - std::vector<char*>v; + std::vector<char*> v; + std::vector<size_t> lengths; if (!PyArg_ParseTuple(args, "et", "utf-8", &line)) return NULL; - wvcsv_splitline(v, line, strlen(line), true); + wvcsv_splitline(v, lengths, line, strlen(line), true); Py_ssize_t l = v.size(); PyObject *r = PyTuple_New(l); for (Py_ssize_t i = 0; i < l; ++i) diff --git a/wvcsv.cc b/wvcsv.cc index 13880c3..8253caf 100644 --- a/wvcsv.cc +++ b/wvcsv.cc @@ -78,13 +78,17 @@ char *wvcsv_readline(WvStream &stream, WvBuf &buf) } -char *wvcsv_dequote(const char *in, char *out) +char *wvcsv_dequote(const char *in, char *out, size_t *length) { if (!*in) + { + if (length) *length = 0; return NULL; // unquoted blankness is a null string + } else if (!strcmp(in, "\"\"")) { *out = 0; + if (length) *length = 0; return out; // quoted blankness is an empty string } else @@ -113,23 +117,26 @@ char *wvcsv_dequote(const char *in, char *out) *optr++ = *cptr; } *optr = 0; + if (length) *length = optr - out; return out; } } -inline char *dequote_or_not(char *tofutz, bool dequote) +inline char *dequote_or_not(char *tofutz, bool dequote, size_t *length) { - return dequote ? wvcsv_dequote(tofutz, tofutz) : tofutz; + return dequote ?
wvcsv_dequote(tofutz, tofutz, length) : tofutz; } -void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen, +void wvcsv_splitline(std::vector<char*> &l, std::vector<size_t> &ll, + char *s, size_t slen, bool dequote_values) { bool inquote = false; int istart = 0; l.clear(); + ll.clear(); for (size_t i = 0; i < slen; i++) { @@ -139,18 +146,27 @@ void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen, { // end of a column s[i] = 0; - l.push_back(dequote_or_not(s+istart, dequote_values)); + size_t length = i - istart; + char *p = dequote_or_not(s+istart, dequote_values, &length); + l.push_back(p); + ll.push_back(length); + //assert(!p || strlen(p) == length); istart = i + 1; } } - l.push_back(dequote_or_not(s+istart, dequote_values)); + + size_t xlength = slen - istart; + l.push_back(dequote_or_not(s+istart, dequote_values, &xlength)); + ll.push_back(xlength); + assert(l.size() == ll.size()); } void wvcsv_splitline_slow(WvStringList &l, char *s, size_t slen) { std::vector<char*> v; - wvcsv_splitline(v, s, slen); + std::vector<size_t> lv; + wvcsv_splitline(v, lv, s, slen); for (std::vector<char*>::iterator i = v.begin(); i < v.end(); i++) l.append(*i); @@ -175,7 +191,10 @@ WvCsvIter::WvCsvIter(WvStream &_stream, bool expect_TABLE, bool expect_headers, // pointers into it headerline = wvcsv_readline(stream, buf); if (headerline) - wvcsv_splitline(headers, headerline.edit(), headerline.len()); + { + std::vector<size_t> lv; + wvcsv_splitline(headers, lv, headerline.edit(), headerline.len()); + } else err.set("CSV header line missing"); } @@ -189,6 +208,6 @@ bool WvCsvIter::next() if (!err.isok()) return false; char *line = wvcsv_readline(stream, buf); if (!line || !line[0] || !strcmp(line, "\r")) return false; - wvcsv_splitline(cols, line, strlen(line), dequote); + wvcsv_splitline(cols, lengths, line, strlen(line), dequote); return true; } diff --git a/wvcsv.h b/wvcsv.h index 4c65c3c..fc0ac1a 100644 --- a/wvcsv.h +++ b/wvcsv.h @@ -43,7 +43,7 @@ WvString wvcsv_quote(WvStringParm s); * * Use wvcsv_readline() and
wvcsv_splitline() instead. */ -char *wvcsv_dequote(const char *in, char *out); +char *wvcsv_dequote(const char *in, char *out, size_t *length = NULL); /** * Given one "line" of a CSV file, split and decode the columns into l. @@ -54,7 +54,8 @@ char *wvcsv_dequote(const char *in, char *out); * WARNING: a "line" of a CSV file may contain newlines. You should read the * file using wvcsv_readline() if you don't want to screw up. */ -void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen, +void wvcsv_splitline(std::vector<char*> &l, std::vector<size_t> &ll, + char *s, size_t slen, bool dequote_values = true); /** @@ -93,9 +94,10 @@ class WvCsvIter WvStream &stream; WvDynBuf buf; WvString headerline; - mutable std::vector<char*> cols; bool dequote; + mutable std::vector<char*> cols; public: + mutable std::vector<size_t> lengths; WvString firstline; WvError err; std::vector<char*> headers;