Skip to content
Permalink
Browse files
[fix](broker-scan-node) Remove trailing spaces in broker_scanner. Mak…
…e it consistent with hive and trino behavior. (#9190)

Hive and Trino/Presto automatically trim trailing spaces, but Doris does not.
This causes query results to differ from Hive's.

Add a new session variable "trim_tailing_spaces_for_external_table_query".
If it is set to true, trailing spaces are trimmed from each column value when reading CSV data in the broker scan node.
  • Loading branch information
Jibing-Li committed May 20, 2022
1 parent defdae1 commit 5fa6e892beb8b51ff18b8a183fd5c92b568f5eae
Showing 4 changed files with 43 additions and 7 deletions.
@@ -339,19 +339,20 @@ void BrokerScanner::split_line(const Slice& line) {
delete[] ptr;
} else {
const char* value = line.data;
size_t start = 0; // point to the start pos of next col value.
size_t curpos = 0; // point to the start pos of separator matching sequence.
size_t p1 = 0; // point to the current pos of separator matching sequence.
size_t start = 0; // point to the start pos of next col value.
size_t curpos = 0; // point to the start pos of separator matching sequence.
size_t p1 = 0; // point to the current pos of separator matching sequence.
size_t non_space = 0; // point to the last pos of non_space charactor.

// Separator: AAAA
//
// curpos
// p1
//
// AAAA
// 1000AAAA2000AAAA
// ▲ ▲
// Start │
// p1
// curpos

while (curpos < line.size) {
if (*(value + curpos + p1) != _value_separator[p1]) {
@@ -362,16 +363,30 @@ void BrokerScanner::split_line(const Slice& line) {
p1++;
if (p1 == _value_separator_length) {
// Match a separator
_split_values.emplace_back(value + start, curpos - start);
non_space = curpos;
// Trim trailing spaces to be consistent with Hive's and Trino's behavior.
if (_state->trim_tailing_spaces_for_external_table_query()) {
while (non_space > start && *(value + non_space - 1) == ' ') {
non_space--;
}
}
_split_values.emplace_back(value + start, non_space - start);
start = curpos + _value_separator_length;
curpos = start;
p1 = 0;
non_space = 0;
}
}
}

CHECK(curpos == line.size) << curpos << " vs " << line.size;
_split_values.emplace_back(value + start, curpos - start);
non_space = curpos;
if (_state->trim_tailing_spaces_for_external_table_query()) {
while (non_space > start && *(value + non_space - 1) == ' ') {
non_space--;
}
}
_split_values.emplace_back(value + start, non_space - start);
}
}

@@ -326,6 +326,10 @@ class RuntimeState {

// Reports the enable_vectorized_engine flag stored in this state's query options.
bool enable_vectorized_exec() const {
    const bool vectorized = _query_options.enable_vectorized_engine;
    return vectorized;
}

// Reports the trim_tailing_spaces_for_external_table_query flag stored in
// this state's query options.
bool trim_tailing_spaces_for_external_table_query() const {
    const bool trim_enabled = _query_options.trim_tailing_spaces_for_external_table_query;
    return trim_enabled;
}

// Reports the return_object_data_as_binary flag stored in this state's query options.
bool return_object_data_as_binary() const {
    const bool as_binary = _query_options.return_object_data_as_binary;
    return as_binary;
}
@@ -180,6 +180,8 @@ public class SessionVariable implements Serializable, Writable {

public static final String ENABLE_PROJECTION = "enable_projection";

public static final String TRIM_TAILING_SPACES_FOR_EXTERNAL_TABLE_QUERY = "trim_tailing_spaces_for_external_table_query";

// session origin value
public Map<Field, String> sessionOriginValue = new HashMap<Field, String>();
// check stmt is or not [select /*+ SET_VAR(...)*/ ...]
@@ -439,6 +441,9 @@ public class SessionVariable implements Serializable, Writable {
@VariableMgr.VarAttr(name = ENABLE_PROJECTION)
private boolean enableProjection = true;

@VariableMgr.VarAttr(name = TRIM_TAILING_SPACES_FOR_EXTERNAL_TABLE_QUERY, needForward = true)
public boolean trimTailingSpacesForExternalTableQuery = false;

/**
 * @return the current value of the {@code blockEncryptionMode} field
 */
public String getBlockEncryptionMode() {
    final String mode = this.blockEncryptionMode;
    return mode;
}
@@ -895,6 +900,14 @@ public boolean isEnableProjection() {
return enableProjection;
}

/**
 * @return the current value of the {@code trimTailingSpacesForExternalTableQuery} field
 */
public boolean isTrimTailingSpacesForExternalTableQuery() {
    final boolean trimEnabled = this.trimTailingSpacesForExternalTableQuery;
    return trimEnabled;
}

/**
 * Stores the given flag in the {@code trimTailingSpacesForExternalTableQuery} field.
 *
 * @param trimTailingSpacesForExternalTableQuery the new value for the field
 */
public void setTrimTailingSpacesForExternalTableQuery(boolean trimTailingSpacesForExternalTableQuery) {
    final boolean requested = trimTailingSpacesForExternalTableQuery;
    this.trimTailingSpacesForExternalTableQuery = requested;
}

// Serialize to thrift object
// used for rest api
public TQueryOptions toThrift() {
@@ -912,6 +925,7 @@ public TQueryOptions toThrift() {
tResult.setCodegenLevel(codegenLevel);
tResult.setEnableVectorizedEngine(enableVectorizedEngine);
tResult.setReturnObjectDataAsBinary(returnObjectDataAsBinary);
tResult.setTrimTailingSpacesForExternalTableQuery(trimTailingSpacesForExternalTableQuery);

tResult.setBatchSize(batchSize);
tResult.setDisableStreamPreaggregations(disableStreamPreaggregations);
@@ -160,6 +160,9 @@ struct TQueryOptions {
// show bitmap data in result, if use this in mysql cli may make the terminal
// output corrupted character
43: optional bool return_object_data_as_binary = false

// trim trailing spaces while querying external tables and during stream load
44: optional bool trim_tailing_spaces_for_external_table_query = false
}


0 comments on commit 5fa6e89

Please sign in to comment.