Skip to content

Commit

Permalink
ARROW-5648: [C++] Avoid using codecvt
Browse files Browse the repository at this point in the history
Some antiquated C++ build chains lack the standard <codecvt> header.
Use a small vendored UTF8 implementation instead.

Author: Antoine Pitrou <antoine@python.org>

Closes #4616 from pitrou/ARROW-5648-simple-utf8 and squashes the following commits:

54b1b2f <Antoine Pitrou> ARROW-5648:  Avoid using codecvt
  • Loading branch information
pitrou authored and wesm committed Jun 19, 2019
1 parent d54425d commit eb23ea9
Show file tree
Hide file tree
Showing 7 changed files with 815 additions and 17 deletions.
28 changes: 28 additions & 0 deletions LICENSE.txt
Expand Up @@ -773,6 +773,34 @@ THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

--------------------------------------------------------------------------------

The files in cpp/src/arrow/vendored/utf8cpp/ have the following license

Copyright 2006 Nemanja Trifunovic

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------

This project includes code from Apache Kudu.

* cpp/cmake_modules/CompilerInfo.cmake is based on Kudu's cmake_modules/CompilerInfo.cmake
Expand Down
34 changes: 17 additions & 17 deletions cpp/src/arrow/util/io-util.cc
Expand Up @@ -49,13 +49,6 @@
#define ARROW_WRITE_SHMODE S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH
#endif

// For filename conversion
#if defined(_WIN32)
#include <codecvt>
#include <locale>
#include <stdexcept>
#endif

#include <boost/filesystem.hpp>

// ----------------------------------------------------------------------
Expand Down Expand Up @@ -94,6 +87,11 @@
#include "arrow/util/io-util.h"
#include "arrow/util/logging.h"

// For filename conversion
#if defined(_WIN32)
#include "arrow/util/utf8.h"
#endif

namespace arrow {
namespace io {

Expand Down Expand Up @@ -184,17 +182,11 @@ namespace bfs = ::boost::filesystem;

namespace {

#if _WIN32
using NativePathCodeCvt = std::codecvt_utf8_utf16<wchar_t>;
#endif

Status StringToNative(const std::string& s, NativePathString* out) {
#if _WIN32
try {
*out = std::wstring_convert<NativePathCodeCvt>{}.from_bytes(s);
} catch (std::range_error& e) {
return Status::Invalid(e.what());
}
std::wstring ws;
RETURN_NOT_OK(::arrow::util::UTF8ToWideString(s, &ws));
*out = std::move(ws);
#else
*out = s;
#endif
Expand Down Expand Up @@ -291,7 +283,15 @@ const NativePathString& PlatformFilename::ToNative() const {

std::string PlatformFilename::ToString() const {
#if _WIN32
return impl_->path.generic_string(NativePathCodeCvt());
std::wstring ws = impl_->path.generic_wstring();
std::string s;
Status st = ::arrow::util::WideStringToUTF8(ws, &s);
if (!st.ok()) {
std::stringstream ss;
ss << "<Unrepresentable filename: " << st.ToString() << ">";
return ss.str();
}
return s;
#else
return impl_->path.generic_string();
#endif
Expand Down
52 changes: 52 additions & 0 deletions cpp/src/arrow/util/utf8-util-test.cc
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <cstdint>
#include <random>
#include <string>
#include <vector>
Expand Down Expand Up @@ -281,5 +282,56 @@ TEST(SkipUTF8BOM, Basics) {
CheckTruncated("\xef\xbb");
}

TEST(UTF8ToWideString, Basics) {
  // Decoding `utf8` must succeed and yield exactly `wide`.
  auto AssertDecodesTo = [](const std::string& utf8, const std::wstring& wide) -> void {
    std::wstring decoded;
    ASSERT_OK(UTF8ToWideString(utf8, &decoded));
    ASSERT_EQ(decoded, wide);
  };

  // Decoding `utf8` must be reported as an Invalid status.
  auto AssertRejected = [](const std::string& utf8) -> void {
    std::wstring decoded;
    ASSERT_RAISES(Invalid, UTF8ToWideString(utf8, &decoded));
  };

  // Empty and pure-ASCII inputs
  AssertDecodesTo("", L"");
  AssertDecodesTo("foo", L"foo");
  // Two-byte sequences
  AssertDecodesTo("h\xc3\xa9h\xc3\xa9", L"h\u00e9h\u00e9");
  // Four-byte sequences, up to the last code point
  AssertDecodesTo("\xf0\x9f\x98\x80", L"\U0001F600");
  AssertDecodesTo("\xf4\x8f\xbf\xbf", L"\U0010FFFF");
  // An embedded NUL must not terminate the conversion
  AssertDecodesTo({0, 'x'}, {0, L'x'});

  // Inputs expected to be rejected
  AssertRejected("\xff");
  AssertRejected("h\xc3");
}

TEST(WideStringToUTF8, Basics) {
  // Encoding `wide` must succeed and yield exactly `utf8`.
  auto AssertEncodesTo = [](const std::wstring& wide, const std::string& utf8) -> void {
    std::string encoded;
    ASSERT_OK(WideStringToUTF8(wide, &encoded));
    ASSERT_EQ(encoded, utf8);
  };

  // Encoding `wide` must be reported as an Invalid status.
  auto AssertRejected = [](const std::wstring& wide) -> void {
    std::string encoded;
    ASSERT_RAISES(Invalid, WideStringToUTF8(wide, &encoded));
  };

  // Empty and pure-ASCII inputs
  AssertEncodesTo(L"", "");
  AssertEncodesTo(L"foo", "foo");
  // Code points needing two UTF8 bytes
  AssertEncodesTo(L"h\u00e9h\u00e9", "h\xc3\xa9h\xc3\xa9");
  // Code points needing four UTF8 bytes, up to the last code point
  AssertEncodesTo(L"\U0001F600", "\xf0\x9f\x98\x80");
  AssertEncodesTo(L"\U0010FFFF", "\xf4\x8f\xbf\xbf");
  // An embedded NUL must not terminate the conversion
  AssertEncodesTo({0, L'x'}, {0, 'x'});

  // Lone surrogate
  AssertRejected({0xD800});
  AssertRejected({0xDFFF});
  // Invalid code point (only expressible when wchar_t is wider than 16 bits)
#if WCHAR_MAX > 0xFFFF
  AssertRejected({0x110000});
#endif
}

} // namespace util
} // namespace arrow
50 changes: 50 additions & 0 deletions cpp/src/arrow/util/utf8.cc
Expand Up @@ -15,10 +15,15 @@
// specific language governing permissions and limitations
// under the License.

#include <cstdint>
#include <iterator>
#include <mutex>
#include <stdexcept>
#include <utility>

#include "arrow/util/logging.h"
#include "arrow/util/utf8.h"
#include "arrow/vendored/utf8cpp/checked.h"

namespace arrow {
namespace util {
Expand Down Expand Up @@ -105,5 +110,50 @@ Status SkipUTF8BOM(const uint8_t* data, int64_t size, const uint8_t** out) {
return Status::OK();
}

namespace {

// Some platforms (such as old MinGWs) don't have the <codecvt> header,
// so call into a vendored utf8 implementation instead.

// Decode the UTF8 bytes of `source` into a freshly built wide string.
// Decoding errors surface as exceptions from the vendored utf8 routines;
// callers are expected to translate them.
std::wstring UTF8ToWideStringInternal(const std::string& source) {
  std::wstring wide;
  auto sink = std::back_inserter(wide);
#if WCHAR_MAX > 0xFFFF
  // wchar_t is wider than 16 bits: emit one UTF32 unit per code point.
  ::utf8::utf8to32(source.begin(), source.end(), sink);
#else
  // 16-bit wchar_t: emit UTF16 code units.
  ::utf8::utf8to16(source.begin(), source.end(), sink);
#endif
  return wide;
}

// Encode the wide characters of `source` into a freshly built UTF8 string.
// Encoding errors surface as exceptions from the vendored utf8 routines;
// callers are expected to translate them.
std::string WideStringToUTF8Internal(const std::wstring& source) {
  std::string utf8_bytes;
  auto sink = std::back_inserter(utf8_bytes);
#if WCHAR_MAX > 0xFFFF
  // wchar_t is wider than 16 bits: the input is treated as UTF32.
  ::utf8::utf32to8(source.begin(), source.end(), sink);
#else
  // 16-bit wchar_t: the input is treated as UTF16.
  ::utf8::utf16to8(source.begin(), source.end(), sink);
#endif
  return utf8_bytes;
}

} // namespace

// Convert the UTF8-encoded `source` to a wide string stored in `*out`.
//
// The conversion helper reports failures by throwing; any std::exception is
// translated into Status::Invalid carrying the exception's message, so no
// such exception escapes to the caller.  `*out` is assigned only once the
// whole conversion has succeeded.
Status UTF8ToWideString(const std::string& source, std::wstring* out) {
  try {
    *out = UTF8ToWideStringInternal(source);
  } catch (const std::exception& e) {
    // Caught by const reference: the handler only reads the message.
    return Status::Invalid(e.what());
  }
  return Status::OK();
}

// Convert the wide string `source` to a UTF8-encoded string stored in `*out`.
//
// The conversion helper reports failures by throwing; any std::exception is
// translated into Status::Invalid carrying the exception's message, so no
// such exception escapes to the caller.  `*out` is assigned only once the
// whole conversion has succeeded.
Status WideStringToUTF8(const std::wstring& source, std::string* out) {
  try {
    *out = WideStringToUTF8Internal(source);
  } catch (const std::exception& e) {
    // Caught by const reference: the handler only reads the message.
    return Status::Invalid(e.what());
  }
  return Status::OK();
}

} // namespace util
} // namespace arrow
8 changes: 8 additions & 0 deletions cpp/src/arrow/util/utf8.h
Expand Up @@ -22,6 +22,7 @@
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>

#include "arrow/status.h"
#include "arrow/util/macros.h"
Expand All @@ -31,6 +32,13 @@
namespace arrow {
namespace util {

// Convert a UTF8-encoded string to a wstring.  The result is encoded as
// either UTF16 or UTF32, depending on the wchar_t width.
// Returns an Invalid status if the conversion fails (for example on
// malformed or truncated UTF8 input).
ARROW_EXPORT Status UTF8ToWideString(const std::string& source, std::wstring* out);

// Convert a wstring (UTF16 or UTF32, depending on the wchar_t width) to a
// UTF8-encoded string.
// Returns an Invalid status if the conversion fails (for example on a lone
// surrogate).
ARROW_EXPORT Status WideStringToUTF8(const std::wstring& source, std::string* out);

namespace internal {

// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
Expand Down

0 comments on commit eb23ea9

Please sign in to comment.