Skip to content

Commit

Permalink
wvcsv_splitline: keep string lengths around so the caller can use them.
Browse files Browse the repository at this point in the history
Turns out strlen() was about 7% of the runtime of csv2sort.  This cuts our
500k record sorting time from about 2.8s down to about 2.6s.
  • Loading branch information
apenwarr committed Jun 26, 2010
1 parent 8b6601c commit 7771436
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 14 deletions.
5 changes: 3 additions & 2 deletions _wvcsv.cc
Expand Up @@ -39,11 +39,12 @@ static PyObject *pywvcsv_quote(PyObject *self, PyObject *args)
static PyObject *pywvcsv_splitline(PyObject *self, PyObject *args)
{
char *line;
std::vector<char *>v;
std::vector<char *> v;
std::vector<size_t> lengths;

if (!PyArg_ParseTuple(args, "et", "utf-8", &line))
return NULL;
wvcsv_splitline(v, line, strlen(line), true);
wvcsv_splitline(v, lengths, line, strlen(line), true);
Py_ssize_t l = v.size();
PyObject *r = PyTuple_New(l);
for (Py_ssize_t i = 0; i < l; ++i)
Expand Down
37 changes: 28 additions & 9 deletions wvcsv.cc
Expand Up @@ -78,13 +78,17 @@ char *wvcsv_readline(WvStream &stream, WvBuf &buf)
}


char *wvcsv_dequote(const char *in, char *out)
char *wvcsv_dequote(const char *in, char *out, size_t *length)
{
if (!*in)
{
if (length) *length = 0;
return NULL; // unquoted blankness is a null string
}
else if (!strcmp(in, "\"\""))
{
*out = 0;
if (length) *length = 0;
return out; // quoted blankness is an empty string
}
else
Expand Down Expand Up @@ -113,23 +117,26 @@ char *wvcsv_dequote(const char *in, char *out)
*optr++ = *cptr;
}
*optr = 0;
if (length) *length = optr - out;
return out;
}
}


inline char *dequote_or_not(char *tofutz, bool dequote)
inline char *dequote_or_not(char *tofutz, bool dequote, size_t *length)
{
return dequote ? wvcsv_dequote(tofutz, tofutz) : tofutz;
return dequote ? wvcsv_dequote(tofutz, tofutz, length) : tofutz;
}

void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen,
void wvcsv_splitline(std::vector<char*> &l, std::vector<size_t> &ll,
char *s, size_t slen,
bool dequote_values)
{
bool inquote = false;
int istart = 0;

l.clear();
ll.clear();

for (size_t i = 0; i < slen; i++)
{
Expand All @@ -139,18 +146,27 @@ void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen,
{
// end of a column
s[i] = 0;
l.push_back(dequote_or_not(s+istart, dequote_values));
size_t length = i - istart;
char *p = dequote_or_not(s+istart, dequote_values, &length);
l.push_back(p);
ll.push_back(length);
//assert(!p || strlen(p) == length);
istart = i + 1;
}
}
l.push_back(dequote_or_not(s+istart, dequote_values));

size_t xlength = slen - istart;
l.push_back(dequote_or_not(s+istart, dequote_values, &xlength));
ll.push_back(xlength);
assert(l.size() == ll.size());
}


void wvcsv_splitline_slow(WvStringList &l, char *s, size_t slen)
{
std::vector<char*> v;
wvcsv_splitline(v, s, slen);
std::vector<size_t> lv;
wvcsv_splitline(v, lv, s, slen);

for (std::vector<char*>::iterator i = v.begin(); i < v.end(); i++)
l.append(*i);
Expand All @@ -175,7 +191,10 @@ WvCsvIter::WvCsvIter(WvStream &_stream, bool expect_TABLE, bool expect_headers,
// pointers into it
headerline = wvcsv_readline(stream, buf);
if (headerline)
wvcsv_splitline(headers, headerline.edit(), headerline.len());
{
std::vector<size_t> lv;
wvcsv_splitline(headers, lv, headerline.edit(), headerline.len());
}
else
err.set("CSV header line missing");
}
Expand All @@ -189,6 +208,6 @@ bool WvCsvIter::next()
if (!err.isok()) return false;
char *line = wvcsv_readline(stream, buf);
if (!line || !line[0] || !strcmp(line, "\r")) return false;
wvcsv_splitline(cols, line, strlen(line), dequote);
wvcsv_splitline(cols, lengths, line, strlen(line), dequote);
return true;
}
8 changes: 5 additions & 3 deletions wvcsv.h
Expand Up @@ -43,7 +43,7 @@ WvString wvcsv_quote(WvStringParm s);
*
* Use wvcsv_readline() and wvcsv_splitline() instead.
*/
char *wvcsv_dequote(const char *in, char *out);
char *wvcsv_dequote(const char *in, char *out, size_t *length = NULL);

/**
* Given one "line" of a CSV file, split and decode the columns into l.
Expand All @@ -54,7 +54,8 @@ char *wvcsv_dequote(const char *in, char *out);
* WARNING: a "line" of a CSV file may contain newlines. You should read the
* file using wvcsv_readline() if you don't want to screw up.
*/
void wvcsv_splitline(std::vector<char*> &l, char *s, size_t slen,
void wvcsv_splitline(std::vector<char*> &l, std::vector<size_t> &ll,
char *s, size_t slen,
bool dequote_values = true);

/**
Expand Down Expand Up @@ -93,9 +94,10 @@ class WvCsvIter
WvStream &stream;
WvDynBuf buf;
WvString headerline;
mutable std::vector<char*> cols;
bool dequote;
mutable std::vector<char*> cols;
public:
mutable std::vector<size_t> lengths;
WvString firstline;
WvError err;
std::vector<char*> headers;
Expand Down

0 comments on commit 7771436

Please sign in to comment.