-
Notifications
You must be signed in to change notification settings - Fork 3.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-12657: [C++] Adding String hex to numeric conversion #11064
Changes from 3 commits
59dcbde
4cb862b
d03b7eb
69972dd
b76caf4
111f0c7
09497a9
e380c1a
bbecb6a
495c734
425b1cb
f0879a5
8c70a5f
a1d207e
1440d5a
a45fc3f
5ead375
e9251b0
858ac57
a49048b
f12c18e
882e8b4
5d38723
2588e17
c83db7e
5c5af6c
a8953de
4390a64
cf0e5e4
b1cfa7d
303b7f4
fd47183
02343c8
5876e3f
4cb77a2
67b5bd2
6dc272a
6c7c4f0
9064fa0
080a86b
f40856a
85d8175
e396d4f
57e76e8
97135bc
a081a05
170a24f
7a23a07
e5db0fc
9dd8b6a
31f80e5
4666073
b0d89db
4b5ed4e
bb1ef85
9aee524
0c41e0b
946bdcf
4fe6fae
66d7dd4
04515de
56411f5
42d10c3
3bbec3f
3db4854
fa7cff6
bae7e2b
db5b848
c091e6d
e8ab3ae
1049dde
74f020d
293f856
9122149
f2cb977
dfaa415
0610998
52904d6
376cb45
87b2fcd
1cbc4a2
f3d3c68
0b6f531
672149b
8875d5c
f1d6811
925b2a7
68ec4db
a538072
400d886
9cac060
6053936
68a6844
7aee4f0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -273,6 +273,96 @@ inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) { | |
#undef PARSE_UNSIGNED_ITERATION | ||
#undef PARSE_UNSIGNED_ITERATION_LAST | ||
|
||
#define PARSE_HEX_ITERATION(C_TYPE) \ | ||
if (length > 0) { \ | ||
char val = *s; \ | ||
s++; \ | ||
result = static_cast<C_TYPE>(result << 4); \ | ||
length--; \ | ||
if (val >= '0' && val <= '9'){ \ | ||
result = static_cast<C_TYPE>(result | (val -'0')); \ | ||
} else if (val >= 'A' && val <= 'F'){ \ | ||
result = static_cast<C_TYPE>(result | (val -'A' + 10)); \ | ||
} else if (val >= 'a' && val <= 'f'){ \ | ||
result = static_cast<C_TYPE>(result | (val -'a' + 10)); \ | ||
} else { \ | ||
/* Non-digit */ \ | ||
return false; \ | ||
} \ | ||
} else { \ | ||
break; \ | ||
} | ||
|
||
|
||
inline bool ParseHex(const char* s, size_t length, uint8_t* out) { | ||
uint8_t result = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of having multiple functions for each type, I suggest to use templates, and, given that each function is conceptually doing the same operation but different number of times based on the output data type width, use a for-loop instead. template <typename T>
inline bool ParseHex(...) {
...
for (int i = 0; i < sizeof(T) * 2; ++i) {
ParseHexIteration<T>(result);
}
...
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As you prefer, but I would be fine with keeping this a macro in this case. Note there is a control flow inside the macro so you cannot turn it into such a trivial loop. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I turned it into a loop, it works fine |
||
|
||
do { | ||
PARSE_HEX_ITERATION(uint8_t); | ||
PARSE_HEX_ITERATION(uint8_t); | ||
} while (false); | ||
*out = result; | ||
return true; | ||
} | ||
|
||
inline bool ParseHex(const char* s, size_t length, uint16_t* out) { | ||
uint16_t result = 0; | ||
do { | ||
PARSE_HEX_ITERATION(uint16_t); | ||
PARSE_HEX_ITERATION(uint16_t); | ||
PARSE_HEX_ITERATION(uint16_t); | ||
PARSE_HEX_ITERATION(uint16_t); | ||
} while (false); | ||
*out = result; | ||
return true; | ||
} | ||
|
||
inline bool ParseHex(const char* s, size_t length, uint32_t* out) { | ||
uint32_t result = 0; | ||
do { | ||
PARSE_HEX_ITERATION(uint32_t); | ||
PARSE_HEX_ITERATION(uint32_t); | ||
PARSE_HEX_ITERATION(uint32_t); | ||
PARSE_HEX_ITERATION(uint32_t); | ||
|
||
PARSE_HEX_ITERATION(uint32_t); | ||
PARSE_HEX_ITERATION(uint32_t); | ||
PARSE_HEX_ITERATION(uint32_t); | ||
PARSE_HEX_ITERATION(uint32_t); | ||
} while (false); | ||
*out = result; | ||
return true; | ||
} | ||
|
||
inline bool ParseHex(const char* s, size_t length, uint64_t* out) { | ||
uint64_t result = 0; | ||
do { | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
|
||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
|
||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
|
||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
PARSE_HEX_ITERATION(uint64_t); | ||
} while (false); | ||
*out = result; | ||
return true; | ||
} | ||
|
||
#undef PARSE_HEX_ITERATION | ||
|
||
template <class ARROW_TYPE> | ||
struct StringToUnsignedIntConverterMixin { | ||
using value_type = typename ARROW_TYPE::c_type; | ||
|
@@ -281,6 +371,19 @@ struct StringToUnsignedIntConverterMixin { | |
if (ARROW_PREDICT_FALSE(length == 0)) { | ||
return false; | ||
} | ||
// If its starts with 0x then its hex | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "it", "it's" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
if (*s == '0' && *(s + 1) == 'x'){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add support for case-insensitive There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
length -= 2; | ||
s += 2; | ||
// lets make sure that the length of the string is not too big | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: You probably not need to decrease There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I opted not to do this, since length is used both at StringToSignedIntConverterMixin and at ParseHex. Getting rid of length would then involve keeping track of the original pointer of |
||
if (!ARROW_PREDICT_TRUE(sizeof(value_type)*2 >= length)) { | ||
return false; | ||
} | ||
if (!ARROW_PREDICT_TRUE(ParseHex(s, length, out))) { | ||
return false; | ||
} | ||
return true; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can simplify these checks and boolean result as return ARROW_PREDICT_TRUE((sizeof(value_type) * 2 >= length) && ParseHex(s, length, out)); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
// Skip leading zeros | ||
while (length > 0 && *s == '0') { | ||
length--; | ||
|
@@ -336,6 +439,21 @@ struct StringToSignedIntConverterMixin { | |
return false; | ||
} | ||
} | ||
|
||
// If its starts with 0x then its hex | ||
if (*s == '0' && *(s + 1) == 'x'){ | ||
length -= 2; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If there is a negative sign, then it is not a well-formed hex string. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch. Fixed this |
||
s += 2; | ||
// lets make sure that the length of the string is not too big | ||
if (!ARROW_PREDICT_TRUE(sizeof(unsigned_value)*2 >= length)) { | ||
return false; | ||
} | ||
if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) { | ||
return false; | ||
} | ||
*out = static_cast<value_type>(unsigned_value); | ||
return true; | ||
} | ||
// Skip leading zeros | ||
while (length > 0 && *s == '0') { | ||
length--; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -120,6 +120,14 @@ TEST(StringConversion, ToInt8) { | |
AssertConversionFails<Int8Type>("-"); | ||
AssertConversionFails<Int8Type>("0.0"); | ||
AssertConversionFails<Int8Type>("e"); | ||
|
||
// Hex | ||
AssertConversion<Int8Type>("0x0", 0); | ||
AssertConversion<Int8Type>("0x1A", 26); | ||
AssertConversion<Int8Type>("0xb", 11); | ||
AssertConversion<Int8Type>("0x7F", 127); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What happens if I pass There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added unit test for this case |
||
AssertConversionFails<Int8Type>("0x100"); | ||
AssertConversionFails<Int8Type>("0x1g"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you check that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added unit test for this case |
||
} | ||
|
||
TEST(StringConversion, ToUInt8) { | ||
|
@@ -138,6 +146,14 @@ TEST(StringConversion, ToUInt8) { | |
AssertConversionFails<UInt8Type>("-"); | ||
AssertConversionFails<UInt8Type>("0.0"); | ||
AssertConversionFails<UInt8Type>("e"); | ||
|
||
// Hex | ||
AssertConversion<UInt8Type>("0x0", 0); | ||
AssertConversion<UInt8Type>("0x1A", 26); | ||
AssertConversion<UInt8Type>("0xb", 11); | ||
AssertConversion<UInt8Type>("0x7F", 127); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same question re. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added unit test for this case |
||
AssertConversionFails<UInt8Type>("0x100"); | ||
AssertConversionFails<UInt8Type>("0x1g"); | ||
} | ||
|
||
TEST(StringConversion, ToInt16) { | ||
|
@@ -155,6 +171,14 @@ TEST(StringConversion, ToInt16) { | |
AssertConversionFails<Int16Type>("-"); | ||
AssertConversionFails<Int16Type>("0.0"); | ||
AssertConversionFails<Int16Type>("e"); | ||
|
||
// Hex | ||
AssertConversion<Int16Type>("0x0", 0); | ||
AssertConversion<Int16Type>("0x1aA", 426); | ||
AssertConversion<Int16Type>("0xb", 11); | ||
AssertConversion<Int16Type>("0x7ffF", 32767); | ||
AssertConversionFails<Int16Type>("0x10000"); | ||
AssertConversionFails<Int16Type>("0x1g"); | ||
} | ||
|
||
TEST(StringConversion, ToUInt16) { | ||
|
@@ -172,6 +196,14 @@ TEST(StringConversion, ToUInt16) { | |
AssertConversionFails<UInt16Type>("-"); | ||
AssertConversionFails<UInt16Type>("0.0"); | ||
AssertConversionFails<UInt16Type>("e"); | ||
|
||
// Hex | ||
AssertConversion<UInt16Type>("0x0", 0); | ||
AssertConversion<UInt16Type>("0x1aA", 426); | ||
AssertConversion<UInt16Type>("0xb", 11); | ||
AssertConversion<UInt16Type>("0x7ffF", 32767); | ||
AssertConversionFails<UInt16Type>("0x10000"); | ||
AssertConversionFails<UInt16Type>("0x1g"); | ||
} | ||
|
||
TEST(StringConversion, ToInt32) { | ||
|
@@ -189,6 +221,17 @@ TEST(StringConversion, ToInt32) { | |
AssertConversionFails<Int32Type>("-"); | ||
AssertConversionFails<Int32Type>("0.0"); | ||
AssertConversionFails<Int32Type>("e"); | ||
|
||
// Hex | ||
AssertConversion<Int32Type>("0x0", 0); | ||
AssertConversion<Int32Type>("0x123ABC", 1194684); | ||
AssertConversion<Int32Type>("0xA4B35", 674613); | ||
AssertConversion<Int32Type>("0x7FFFFFFF", 2147483647); | ||
AssertConversion<Int32Type>("0x123abc", 1194684); | ||
AssertConversion<Int32Type>("0xA4b35", 674613); | ||
AssertConversion<Int32Type>("0x7FFFfFfF", 2147483647); | ||
AssertConversionFails<Int32Type>("0x23512ak"); | ||
|
||
} | ||
|
||
TEST(StringConversion, ToUInt32) { | ||
|
@@ -206,6 +249,16 @@ TEST(StringConversion, ToUInt32) { | |
AssertConversionFails<UInt32Type>("-"); | ||
AssertConversionFails<UInt32Type>("0.0"); | ||
AssertConversionFails<UInt32Type>("e"); | ||
|
||
// Hex | ||
AssertConversion<UInt32Type>("0x0", 0); | ||
AssertConversion<UInt32Type>("0x123ABC", 1194684); | ||
AssertConversion<UInt32Type>("0xA4B35", 674613); | ||
AssertConversion<UInt32Type>("0x7FFFFFFF", 2147483647); | ||
AssertConversion<UInt32Type>("0x123abc", 1194684); | ||
AssertConversion<UInt32Type>("0xA4b35", 674613); | ||
AssertConversion<UInt32Type>("0x7FFFfFfF", 2147483647); | ||
AssertConversionFails<UInt32Type>("0x23512ak"); | ||
} | ||
|
||
TEST(StringConversion, ToInt64) { | ||
|
@@ -223,6 +276,14 @@ TEST(StringConversion, ToInt64) { | |
AssertConversionFails<Int64Type>("-"); | ||
AssertConversionFails<Int64Type>("0.0"); | ||
AssertConversionFails<Int64Type>("e"); | ||
|
||
// Hex | ||
AssertConversion<Int64Type>("0x0", 0); | ||
AssertConversion<Int64Type>("0x5415a123ABC123cb", 6058926048274359243); | ||
AssertConversion<Int64Type>("0xA4B35", 674613); | ||
AssertConversion<Int64Type>("0x7FFFFFFFFFFFFFFf", 9223372036854775807); | ||
AssertConversionFails<Int64Type>("0x12345678901234567"); | ||
AssertConversionFails<Int64Type>("0x23512ak"); | ||
} | ||
|
||
TEST(StringConversion, ToUInt64) { | ||
|
@@ -237,6 +298,14 @@ TEST(StringConversion, ToUInt64) { | |
AssertConversionFails<UInt64Type>("-"); | ||
AssertConversionFails<UInt64Type>("0.0"); | ||
AssertConversionFails<UInt64Type>("e"); | ||
|
||
// Hex | ||
AssertConversion<UInt64Type>("0x0", 0); | ||
AssertConversion<UInt64Type>("0x5415a123ABC123cb", 6058926048274359243); | ||
AssertConversion<UInt64Type>("0xA4B35", 674613); | ||
AssertConversion<UInt64Type>("0x7FFFFFFFFFFFFFFf", 9223372036854775807); | ||
AssertConversionFails<UInt64Type>("0x12345678901234567"); | ||
AssertConversionFails<UInt64Type>("0x23512ak"); | ||
} | ||
|
||
TEST(StringConversion, ToDate32) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suggest to make this a function. The use of macros is undesired in the general case.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with this recommendation. I was following the pattern of how
ParseUnsigned
andPARSE_UNSIGNED_ITERATION
is done in the same file. Since I am new to the codebase I decided to follow the patterns I was seeing.