Skip to content
This repository has been archived by the owner on Dec 21, 2023. It is now read-only.

More CSV Parsing Improvements #1266

Merged
merged 5 commits into from Jan 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
30 changes: 26 additions & 4 deletions src/flexible_type/flexible_type_spirit_parser.cpp
Expand Up @@ -34,7 +34,12 @@ qi::real_parser< double, strict_real_policies<double> > real;

template <typename Iterator, typename SpaceType>
struct flexible_type_parser_impl: qi::grammar<Iterator, flexible_type(), SpaceType> {
flexible_type_parser_impl(std::string delimiter = ",", char escape_char = '\\') :
flexible_type_parser_impl(std::string delimiter = ",",
bool use_escape_char = true,
char escape_char = '\\',
const std::unordered_set<std::string>& na_val = {},
const std::unordered_set<std::string>& true_val = {},
const std::unordered_set<std::string>& false_val = {}) :
flexible_type_parser_impl::base_type(root_parser), delimiter(delimiter) {
using qi::long_long;
using qi::double_;
Expand All @@ -54,25 +59,37 @@ struct flexible_type_parser_impl: qi::grammar<Iterator, flexible_type(), SpaceTy
*/
parser_impl::parser_config recursive_element_string_parser;
recursive_element_string_parser.restrictions = ",{}[]";
recursive_element_string_parser.use_escape_char = use_escape_char;
recursive_element_string_parser.escape_char = escape_char;
recursive_element_string_parser.double_quote = true;
recursive_element_string_parser.na_val = na_val;
recursive_element_string_parser.true_val = true_val;
recursive_element_string_parser.false_val= false_val;

/*
* A parser which parses strings, and stops at all unquoted delimiters
* AND spaces.
*/
parser_impl::parser_config dictionary_element_string_parser;
dictionary_element_string_parser.restrictions = " ,\t{}[]:;";
dictionary_element_string_parser.use_escape_char = use_escape_char;
dictionary_element_string_parser.escape_char = escape_char;
dictionary_element_string_parser.double_quote = true;
dictionary_element_string_parser.na_val = na_val;
dictionary_element_string_parser.true_val = true_val;
dictionary_element_string_parser.false_val = false_val;

parser_impl::parser_config root_flex_string;
// when the delimiter is just one character, using the restrictions is faster.
if (delimiter.length() <= 1) root_flex_string.restrictions = delimiter;
else root_flex_string.delimiter = delimiter;

root_flex_string.use_escape_char = use_escape_char;
root_flex_string.escape_char = escape_char;
root_flex_string.double_quote = true;
root_flex_string.na_val = na_val;
root_flex_string.true_val = true_val;
root_flex_string.false_val = false_val;

string =
(parser_impl::restricted_string(root_flex_string)[_val = _1]);
Expand Down Expand Up @@ -184,11 +201,16 @@ struct flexible_type_parser_impl: qi::grammar<Iterator, flexible_type(), SpaceTy



/**
 * Builds the two underlying grammars from one shared configuration.
 *
 * `parser` is instantiated with `decltype(space)` and `non_space_parser`
 * with `decltype(qi::eoi)`. Both must see identical settings, otherwise the
 * NA/true/false substitutions would depend on which grammar is used.
 *
 * \param separator       Delimiter string handed to both grammars.
 * \param use_escape_char Whether escape_char is honoured.
 * \param escape_char     The escape character.
 * \param na_val          Strings substituted by a missing value.
 * \param true_val        Strings substituted by integer 1.
 * \param false_val       Strings substituted by integer 0.
 */
flexible_type_parser::flexible_type_parser(std::string separator,
                                           bool use_escape_char,
                                           char escape_char,
                                           const std::unordered_set<std::string>& na_val,
                                           const std::unordered_set<std::string>& true_val,
                                           const std::unordered_set<std::string>& false_val):
    parser(new flexible_type_parser_impl<const char*, decltype(space)>(
        separator, use_escape_char, escape_char, na_val, true_val, false_val)),
    // The substitution sets are forwarded here too; leaving them out would
    // silently fall back to the empty defaults for this grammar only.
    non_space_parser(new flexible_type_parser_impl<const char*, decltype(qi::eoi)>(
        separator, use_escape_char, escape_char, na_val, true_val, false_val)),
    m_delimiter_has_space(delimiter_has_space(parser->delimiter))
{ }

Expand Down
7 changes: 6 additions & 1 deletion src/flexible_type/flexible_type_spirit_parser.hpp
Expand Up @@ -34,7 +34,12 @@ struct flexible_type_parser_impl;
*/
class flexible_type_parser {
public:
flexible_type_parser(std::string delimiter = ",", char escape_char = '\\');
flexible_type_parser(std::string delimiter = ",",
bool use_escape_char = true,
char escape_char = '\\',
const std::unordered_set<std::string>& na_val = {},
const std::unordered_set<std::string>& true_val = {},
const std::unordered_set<std::string>& false_val = {});
/**
* Parses a generalized flexible type from a string. The *str pointer will be
* updated to point to the character after the last character parsed.
Expand Down
11 changes: 6 additions & 5 deletions src/flexible_type/string_escape.cpp
Expand Up @@ -171,16 +171,16 @@ size_t write_utf8(size_t code_point, char* c) {
return 0;
}

size_t unescape_string(char* cal, size_t length, char escape_char,
char quote_char, bool double_quote) {
size_t unescape_string(char* cal, size_t length, bool use_escape_char,
char escape_char, char quote_char, bool double_quote) {
// to avoid allocating a new string, we do this entirely in-place
// This works because for all the escapes we have here, the output string
// is shorter than the input.
size_t in = 0;
size_t out = 0;

while(in != length) {
if (cal[in] == escape_char && in + 1 < length) {
if ((use_escape_char && cal[in] == escape_char) && in + 1 < length) {
char echar = cal[in + 1];
switch (echar) {
case '\'':
Expand Down Expand Up @@ -283,10 +283,11 @@ size_t unescape_string(char* cal, size_t length, char escape_char,
}


/**
 * Unescapes `cal` in place by delegating to the raw-buffer overload, then
 * shrinks the string to the length that overload reports.
 */
void unescape_string(std::string& cal, bool use_escape_char, char escape_char,
                     char quote_char, bool double_quote) {
  char* const buffer = &(cal[0]);
  const size_t unescaped_length =
      unescape_string(buffer, cal.length(), use_escape_char, escape_char,
                      quote_char, double_quote);
  cal.resize(unescaped_length);
}

Expand Down
4 changes: 2 additions & 2 deletions src/flexible_type/string_escape.hpp
Expand Up @@ -11,14 +11,14 @@ namespace turi {
/**
* Unescapes a string inplace
*/
void unescape_string(std::string& cal, char escape_char,
void unescape_string(std::string& cal, bool use_escape_char, char escape_char,
char quote_char, bool double_quote);

/**
* Unescapes a string inplace, returning the new length
*/
size_t unescape_string(char* cal,
size_t length, char escape_char,
size_t length, bool use_escape_char, char escape_char,
char quote_char, bool double_quote);
/**
* Escapes a string from val into output.
Expand Down
44 changes: 33 additions & 11 deletions src/flexible_type/string_parser.hpp
Expand Up @@ -17,24 +17,30 @@

namespace parser_impl {

/**
 * \internal
 * The string parsing configuration.
 *
 */
struct parser_config {
  /// If any of these characters occurs outside of a quoted string,
  /// the string will be terminated.
  std::string restrictions;
  /// If the delimiter string is seen anywhere outside of a quoted string,
  /// the string will be terminated.
  std::string delimiter;
  /// Whether the escape character should be used at all.
  bool use_escape_char = true;
  /// The character to use for an escape character.
  char escape_char = '\\';
  /** Whether double quotes inside of a quote are treated as a single quote.
   * i.e. """hello""" => \"hello\"
   */
  bool double_quote = true;

  /// Strings which are mapped to a missing (UNDEFINED) value.
  std::unordered_set<std::string> na_val;
  /// Strings which are mapped to the integer 1.
  std::unordered_set<std::string> true_val;
  /// Strings which are mapped to the integer 0.
  std::unordered_set<std::string> false_val;
};

BOOST_SPIRIT_TERMINAL_EX(restricted_string);
Expand Down Expand Up @@ -113,12 +119,22 @@ struct string_parser
bool has_delimiter = false;
char delimiter_first_char;
bool delimiter_is_singlechar = false;
std::unordered_map<std::string, turi::flexible_type> map_vals; // handle na_val, true_val, false_val

string_parser(){}
/**
 * Builds a string parser from a configuration.
 *
 * Caches the delimiter properties and precomputes `map_vals`, the lookup
 * table from substitution strings to their replacement values:
 * every `na_val` entry maps to an UNDEFINED flexible_type, every `true_val`
 * entry to 1 and every `false_val` entry to 0. The sets are processed in
 * that order, so a string listed in several sets keeps the value of the
 * last set that contains it.
 */
string_parser(parser_config config):config(config) {
  // Within this body `config` names the constructor parameter, which holds
  // the same values as the member it was copied into.
  has_delimiter = config.delimiter.length() > 0;
  delimiter_is_singlechar = config.delimiter.length() == 1;
  if (has_delimiter) delimiter_first_char = config.delimiter[0];
  // Iterate by const reference so that no string is copied per element.
  for (const auto& s: config.na_val) {
    map_vals[s] = turi::flexible_type(turi::flex_type_enum::UNDEFINED);
  }
  for (const auto& s: config.true_val) {
    map_vals[s] = 1;
  }
  for (const auto& s: config.false_val) {
    map_vals[s] = 0;
  }
}

enum class tokenizer_state {
Expand All @@ -136,7 +152,7 @@ struct string_parser
}
return true;
}
#define PUSH_CHAR(c) ret.add_char(c); escape_sequence = (c == config.escape_char);
#define PUSH_CHAR(c) ret.add_char(c); escape_sequence = config.use_escape_char && (c == config.escape_char);

// insert a character into the field buffer. resizing it if necessary

Expand Down Expand Up @@ -229,10 +245,16 @@ struct string_parser
if (!quote_char) boost::algorithm::trim_right(final_str);
else if (quote_char) {
// if was quoted field, we unescape the contents
turi::unescape_string(final_str, config.escape_char,
quote_char, config.double_quote);
turi::unescape_string(final_str, config.use_escape_char,
config.escape_char,
quote_char, config.double_quote);
}
auto map_val_iter = map_vals.find(final_str);
if (map_val_iter != map_vals.end()) {
attr = map_val_iter->second;
} else {
attr = std::move(final_str);
}
attr = std::move(final_str);
first = cur;
}
return true;
Expand Down
45 changes: 22 additions & 23 deletions src/logger/logger.cpp
Expand Up @@ -78,24 +78,6 @@ bool file_logger::set_log_file(std::string file) {
}



#define RESET 0
#define BRIGHT 1
#define DIM 2
#define UNDERLINE 3
#define BLINK 4
#define REVERSE 7
#define HIDDEN 8

#define BLACK 0
#define RED 1
#define GREEN 2
#define YELLOW 3
#define BLUE 4
#define MAGENTA 5
#define CYAN 6
#define WHITE 7

void textcolor(FILE* handle, int attr, int fg)
{
char command[13];
Expand All @@ -104,6 +86,14 @@ void textcolor(FILE* handle, int attr, int fg)
fprintf(handle, "%s", command);
}

/**
 * Returns the control command for the terminal which selects a text
 * attribute and a foreground colour: ESC "[<attr>;<fg + 30>m".
 *
 * \param attr Attribute code placed first in the sequence.
 * \param fg   Foreground colour index; 30 is added before formatting.
 * \return The formatted sequence, or an empty string if formatting fails.
 */
std::string textcolor(int attr, int fg)
{
  // Worst case: ESC + '[' + 11 chars + ';' + 11 chars + 'm' + NUL = 27 bytes.
  char command[32];
  /* Command is the control command to the terminal */
  const int written =
      snprintf(command, sizeof(command), "%c[%d;%dm", 0x1B, attr, fg + 30);
  if (written < 0) return std::string();
  return std::string(command);
}

void reset_color(FILE* handle)
{
char command[20];
Expand All @@ -113,6 +103,15 @@ void reset_color(FILE* handle)
}


/**
 * Returns the control command for the terminal which resets all text
 * attributes: ESC "[0m".
 */
std::string reset_color()
{
  static const char kResetSequence[] = "\x1B[0m";
  return std::string(kResetSequence);
}



void file_logger::_log(int lineloglevel,const char* file,const char* function,
int line,const char* fmt, va_list ap ) {
Expand Down Expand Up @@ -219,19 +218,19 @@ void file_logger::_lograw(int lineloglevel, const char* buf, int len) {

pthread_mutex_lock(&mut);
if (lineloglevel == LOG_FATAL) {
textcolor(stderr, BRIGHT, RED);
textcolor(stderr, TEXTCOLOR_BRIGHT, TEXTCOLOR_RED);
}
else if (lineloglevel == LOG_ERROR) {
textcolor(log_to_stderr ? stderr : stdout, BRIGHT, RED);
textcolor(log_to_stderr ? stderr : stdout, TEXTCOLOR_BRIGHT, TEXTCOLOR_RED);
}
else if (lineloglevel == LOG_WARNING) {
textcolor(log_to_stderr ? stderr : stdout, BRIGHT, MAGENTA);
textcolor(log_to_stderr ? stderr : stdout, TEXTCOLOR_BRIGHT, TEXTCOLOR_MAGENTA);
}
else if (lineloglevel == LOG_DEBUG) {
textcolor(log_to_stderr ? stderr : stdout, BRIGHT, YELLOW);
textcolor(log_to_stderr ? stderr : stdout, TEXTCOLOR_BRIGHT, TEXTCOLOR_YELLOW);
}
else if (lineloglevel == LOG_EMPH) {
textcolor(log_to_stderr ? stderr : stdout, BRIGHT, GREEN);
textcolor(log_to_stderr ? stderr : stdout, TEXTCOLOR_BRIGHT, TEXTCOLOR_GREEN);
}
#endif
if(lineloglevel >= LOG_FATAL) {
Expand Down
20 changes: 20 additions & 0 deletions src/logger/logger.hpp
Expand Up @@ -689,8 +689,28 @@ struct log_stream_dispatch<false> {
}
};


/*
 * Text attribute codes, used as the `attr` argument of textcolor().
 * The values follow the SGR parameter numbering (4 = underline, 5 = blink;
 * 3 would select italic).
 */
#define TEXTCOLOR_RESET 0
#define TEXTCOLOR_BRIGHT 1
#define TEXTCOLOR_DIM 2
#define TEXTCOLOR_UNDERLINE 4
#define TEXTCOLOR_BLINK 5
#define TEXTCOLOR_REVERSE 7
#define TEXTCOLOR_HIDDEN 8

/*
 * Foreground colour indices, used as the `fg` argument of textcolor(),
 * which adds 30 to them when building the control sequence.
 */
#define TEXTCOLOR_BLACK 0
#define TEXTCOLOR_RED 1
#define TEXTCOLOR_GREEN 2
#define TEXTCOLOR_YELLOW 3
#define TEXTCOLOR_BLUE 4
#define TEXTCOLOR_MAGENTA 5
#define TEXTCOLOR_CYAN 6
#define TEXTCOLOR_WHITE 7

/// Prints a terminal control command for attribute `attr` and foreground
/// colour `fg` to `handle`.
/// NOTE(review): presumably the same sequence the string overload returns — confirm.
void textcolor(FILE* handle, int attr, int fg);
/// Returns the terminal control command ESC "[<attr>;<fg + 30>m" as a string.
std::string textcolor(int attr, int fg);
/// NOTE(review): presumably prints the reset command to `handle`;
/// the body is not fully visible here — confirm.
void reset_color(FILE* handle);
/// Returns the terminal reset command ESC "[0m" as a string.
std::string reset_color();

#endif