diff --git a/src/libAtomVM/bitstring.c b/src/libAtomVM/bitstring.c
index 37c0e9dcb3..28ef723143 100644
--- a/src/libAtomVM/bitstring.c
+++ b/src/libAtomVM/bitstring.c
@@ -76,3 +76,115 @@ bool bitstring_insert_any_integer(uint8_t *dst, avm_int_t offset, avm_int64_t va
     }
     return true;
 }
+
+// A Unicode scalar value is a code point in 0..0x10FFFF that is not a
+// surrogate (U+D800..U+DFFF). Surrogates only exist as UTF-16 code units
+// and must never be encoded as characters.
+static bool bitstring_is_unicode_scalar(avm_int_t c)
+{
+    return c >= 0 && c <= 0x10FFFF && (c < 0xD800 || c > 0xDFFF);
+}
+
+bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size)
+{
+    size_t sz = 0;
+    if (!bitstring_is_unicode_scalar(c)) {
+        return false;
+    }
+    // buf may be NULL: in that case only the encoded size is computed
+    if (c < 0x80) {
+        if (buf) {
+            *buf++ = c;
+        }
+        sz++;
+    } else if (c < 0x800) {
+        if (buf) {
+            *buf++ = (c >> 6) | 0xC0;
+            *buf++ = (c & 0x3F) | 0x80;
+        }
+        sz += 2;
+    } else if (c < 0x10000) {
+        if (buf) {
+            *buf++ = (c >> 12) | 0xE0;
+            *buf++ = ((c >> 6) & 0x3F) | 0x80;
+            *buf++ = (c & 0x3F) | 0x80;
+        }
+        sz += 3;
+    } else {
+        if (buf) {
+            *buf++ = (c >> 18) | 0xF0;
+            *buf++ = ((c >> 12) & 0x3F) | 0x80;
+            *buf++ = ((c >> 6) & 0x3F) | 0x80;
+            *buf++ = (c & 0x3F) | 0x80;
+        }
+        sz += 4;
+    }
+    // out_size is optional, as documented in bitstring.h
+    if (out_size) {
+        *out_size = sz;
+    }
+    return true;
+}
+
+bool bitstring_utf16_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags, size_t *out_size)
+{
+    size_t sz = 0;
+    if (!bitstring_is_unicode_scalar(c)) {
+        return false;
+    }
+    if (c < 0x10000) {
+        // Basic Multilingual Plane: a single 16-bit code unit
+        if (buf) {
+            if (bs_flags & LittleEndianIntegerMask) {
+                *buf++ = c & 0xFF;
+                *buf++ = c >> 8;
+            } else {
+                *buf++ = c >> 8;
+                *buf++ = c & 0xFF;
+            }
+        }
+        sz += 2;
+    } else {
+        // Supplementary planes: surrogate pair built from (c - 0x10000),
+        // 0xD800 | high 10 bits followed by 0xDC00 | low 10 bits
+        if (buf) {
+            c -= 0x10000;
+            if (bs_flags & LittleEndianIntegerMask) {
+                *buf++ = ((c >> 10) & 0xFF);
+                *buf++ = (c >> 18) | 0xD8;
+                *buf++ = c & 0xFF;
+                *buf++ = ((c >> 8) & 0x03) | 0xDC;
+            } else {
+                *buf++ = (c >> 18) | 0xD8;
+                *buf++ = ((c >> 10) & 0xFF);
+                *buf++ = ((c >> 8) & 0x03) | 0xDC;
+                *buf++ = c & 0xFF;
+            }
+        }
+        sz += 4;
+    }
+    if (out_size) {
+        *out_size = sz;
+    }
+    return true;
+}
+
+bool bitstring_utf32_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags)
+{
+    if (!bitstring_is_unicode_scalar(c)) {
+        return false;
+    }
+    // Always 4 bytes; unlike the UTF-8/UTF-16 encoders, buf must not be NULL
+    if (bs_flags & LittleEndianIntegerMask) {
+        *buf++ = c & 0xFF;
+        *buf++ = (c >> 8) & 0xFF;
+        *buf++ = (c >> 16) & 0xFF;
+        *buf++ = c >> 24;
+    } else {
+        *buf++ = c >> 24;
+        *buf++ = (c >> 16) & 0xFF;
+        *buf++ = (c >> 8) & 0xFF;
+        *buf++ = c & 0xFF;
+    }
+    return true;
+}
diff --git a/src/libAtomVM/bitstring.h b/src/libAtomVM/bitstring.h
index 4265166ac7..6848f1fc99 100644
--- a/src/libAtomVM/bitstring.h
+++ b/src/libAtomVM/bitstring.h
@@ -301,6 +301,119 @@ static inline bool bitstring_insert_integer(term dst_bin, size_t offset, avm_int
     return bitstring_insert_any_integer((uint8_t *) term_binary_data(dst_bin), offset, value, n, bs_flags);
 }
 
+/**
+ * @brief Encode a character to UTF-8.
+ *
+ * @param c character to encode
+ * @param buf the buffer to encode the character to or NULL to only compute the
+ * size.
+ * @param out_size the size in bytes, on output (if not NULL)
+ * @return \c true if encoding was successful, \c false if c is not a valid
+ * unicode character (negative, above 0x10FFFF or a surrogate)
+ */
+bool bitstring_utf8_encode(avm_int_t c, uint8_t *buf, size_t *out_size);
+
+/**
+ * @brief Encode a character to UTF-16.
+ *
+ * @param c character to encode
+ * @param buf the buffer to encode the character to or NULL to only compute the
+ * size.
+ * @param bs_flags flags to encode the character (undefined/little/big/native)
+ * @param out_size the size in bytes, on output (if not NULL)
+ * @return \c true if encoding was successful, \c false if c is not a valid
+ * unicode character (negative, above 0x10FFFF or a surrogate)
+ */
+bool bitstring_utf16_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags, size_t *out_size);
+
+/**
+ * @brief Encode a character to UTF-32.
+ *
+ * @param c character to encode
+ * @param buf the buffer to encode the character to (must not be NULL, 4 bytes are written)
+ * @param bs_flags flags to encode the character (undefined/little/big/native)
+ * @return \c true if encoding was successful, \c false if c is not a valid
+ * unicode character (negative, above 0x10FFFF or a surrogate)
+ */
+bool bitstring_utf32_encode(avm_int_t c, uint8_t *buf, enum BitstringFlags bs_flags);
+
+/**
+ * @brief Compute the size of a character when UTF-8 encoded. 
+ * + * @param c character to encode + * @param out_size the size in bytes, on output + * @return \c true if encoding was successful, \c false if c is not a valid + * unicode character + */ +static inline bool bitstring_utf8_size(avm_int_t c, size_t *out_size) +{ + return bitstring_utf8_encode(c, NULL, out_size); +} + +/** + * @brief Compute the size of a unicode character when UTF-16 encoded. + * + * @param c character to encode + * @param out_size the size in bytes, on output + * @return \c true if encoding was successful, \c false if c is not a valid + * unicode character + */ +static inline bool bitstring_utf16_size(avm_int_t c, size_t *out_size) { + return bitstring_utf16_encode(c, NULL, 0, out_size); +} + +/** + * @brief Insert a character in UTF-8 format + * + * @param dst_bin binary to insert to + * @param offset offset, in bits, to where to insert the character (must be byte-aligned: the low 3 bits are dropped) + * @param c character to encode + * @param out_size the size in bytes, on output + * @return \c true if encoding was successful, \c false if c is not a valid + * unicode character + */ +static inline bool bitstring_insert_utf8(term dst_bin, size_t offset, avm_int_t c, size_t *out_size) +{ + // size was verified by a bs_utf8_size instruction call + uint8_t *dst = (uint8_t *) term_binary_data(dst_bin) + (offset >> 3); + return bitstring_utf8_encode(c, dst, out_size); +} + +/** + * @brief Insert a character in UTF-16 format + * + * @param dst_bin binary to insert to + * @param offset offset, in bits, to where to insert the character (must be byte-aligned: the low 3 bits are dropped) + * @param c character to encode + * @param bs_flags flags to encode the character (undefined/little/big/native) + * @param out_size the size in bytes, on output + * @return \c true if encoding was successful, \c false if c is not a valid + * unicode character + */ +static inline bool bitstring_insert_utf16(term dst_bin, size_t offset, avm_int_t c, enum BitstringFlags bs_flags, size_t *out_size) +{ + // size was verified by a bs_utf16_size instruction call + uint8_t *dst = 
(uint8_t *) term_binary_data(dst_bin) + (offset >> 3); + return bitstring_utf16_encode(c, dst, bs_flags, out_size); +} + +/** + * @brief Insert a character in UTF-32 format + * + * @param dst_bin binary to insert to + * @param offset offset, in bits, to where to insert the character (must be byte-aligned: the low 3 bits are dropped) + * @param c character to encode + * @param bs_flags flags to encode the character (undefined/little/big/native) + * @note no size is returned: a successful call always writes 4 bytes + * @return \c true if encoding was successful, \c false if c is not a valid + * unicode character + */ +static inline bool bitstring_insert_utf32(term dst_bin, size_t offset, avm_int_t c, enum BitstringFlags bs_flags) +{ + uint8_t *dst = (uint8_t *) term_binary_data(dst_bin) + (offset >> 3); + return bitstring_utf32_encode(c, dst, bs_flags); +} + #ifdef __cplusplus } #endif diff --git a/src/libAtomVM/opcodes.h b/src/libAtomVM/opcodes.h index b20f5793ae..e6d236e1f9 100644 --- a/src/libAtomVM/opcodes.h +++ b/src/libAtomVM/opcodes.h @@ -115,6 +115,11 @@ #define OP_BS_APPEND 134 #define OP_TRIM 136 #define OP_BS_INIT_BITS 137 +#define OP_BS_UTF8_SIZE 144 +#define OP_BS_PUT_UTF8 145 +#define OP_BS_UTF16_SIZE 146 +#define OP_BS_PUT_UTF16 147 +#define OP_BS_PUT_UTF32 148 #define OP_RECV_MARK 150 #define OP_RECV_SET 151 #define OP_GC_BIF3 152 diff --git a/src/libAtomVM/opcodesswitch.h b/src/libAtomVM/opcodesswitch.h index a33a101ad5..e6ccd0ef99 100644 --- a/src/libAtomVM/opcodesswitch.h +++ b/src/libAtomVM/opcodesswitch.h @@ -24,6 +24,7 @@ #include #include "bif.h" +#include "bitstring.h" #include "debug.h" #include "defaultatoms.h" #include "exportedfunction.h" @@ -3553,6 +3554,176 @@ static bool maybe_call_native(Context *ctx, AtomString module_name, AtomString f break; } + case OP_BS_UTF8_SIZE: { + int next_off = 1; + uint32_t fail; + DECODE_LABEL(fail, code, i, next_off) + term src; + DECODE_COMPACT_TERM(src, code, i, next_off) + dreg_t dreg; + dreg_type_t dreg_type; + DECODE_DEST_REGISTER(dreg, dreg_type, code, i, next_off); + 
#ifdef IMPL_CODE_LOADER + TRACE("bs_utf8_size/3"); + #endif + #ifdef IMPL_EXECUTE_LOOP + VERIFY_IS_INTEGER(src, "bs_utf8_size/3"); + avm_int_t src_value = term_to_int(src); + TRACE("bs_utf8_size/3 fail=%i src=0x%lx dreg=%c%i\n", fail, (long) src_value, T_DEST_REG(dreg_type, dreg)); + size_t utf8_size; + if (UNLIKELY(!bitstring_utf8_size(src_value, &utf8_size))) { /* src_value rejected by the encoder: badarg is raised; the fail label is only traced, never jumped to */ + RAISE_ERROR(BADARG_ATOM); + } + WRITE_REGISTER(dreg_type, dreg, term_from_int(utf8_size)); /* result: encoded size in bytes */ + #endif + + NEXT_INSTRUCTION(next_off); + break; + } + + case OP_BS_PUT_UTF8: { /* operands, in decode order: fail label, flags literal, source term */ + int next_off = 1; + uint32_t fail; + DECODE_LABEL(fail, code, i, next_off) + uint32_t flags; + DECODE_LITERAL(flags, code, i, next_off) + term src; + DECODE_COMPACT_TERM(src, code, i, next_off) + #ifdef IMPL_CODE_LOADER + TRACE("bs_put_utf8/3 flags=%x\n", (int) flags); + if (flags != 0) { /* the loader aborts on any non-zero flags */ + fprintf(stderr, "bs_put_utf8/3 : unsupported flags %x\n", (int) flags); + AVM_ABORT(); + } + #endif + #ifdef IMPL_EXECUTE_LOOP + VERIFY_IS_INTEGER(src, "bs_put_utf8/3"); + avm_int_t src_value = term_to_int(src); + TRACE("bs_put_utf8/3 flags=%x, src=0x%lx\n", (int) flags, (long) src_value); + if (UNLIKELY(!term_is_binary(ctx->bs))) { + TRACE("bs_put_utf8/3: Bad state. ctx->bs is not a binary.\n"); + RAISE_ERROR(BADARG_ATOM); + } + if (ctx->bs_offset % 8 != 0) { /* bs_offset is counted in bits; only byte-aligned writes are handled */ + TRACE("bs_put_utf8/3: Unsupported bit syntax operation. 
Writing strings must be byte-aligend.\n");
+                        RAISE_ERROR(UNSUPPORTED_ATOM);
+                    }
+                    size_t byte_size;
+                    bool result = bitstring_insert_utf8(ctx->bs, ctx->bs_offset, src_value, &byte_size);
+                    if (UNLIKELY(!result)) {
+                        TRACE("bs_put_utf8/3: Failed to insert character as utf8 into binary: %i\n", result);
+                        RAISE_ERROR(BADARG_ATOM);
+                    }
+                    ctx->bs_offset += byte_size * 8; // bs_offset is in bits
+                #endif
+                NEXT_INSTRUCTION(next_off);
+                break;
+            }
+
+            case OP_BS_UTF16_SIZE: { // operands, in decode order: fail label, source term, destination register
+                int next_off = 1;
+                uint32_t fail;
+                DECODE_LABEL(fail, code, i, next_off)
+                term src;
+                DECODE_COMPACT_TERM(src, code, i, next_off)
+                dreg_t dreg;
+                dreg_type_t dreg_type;
+                DECODE_DEST_REGISTER(dreg, dreg_type, code, i, next_off);
+                #ifdef IMPL_CODE_LOADER
+                    TRACE("bs_utf16_size/3");
+                #endif
+                #ifdef IMPL_EXECUTE_LOOP
+                    VERIFY_IS_INTEGER(src, "bs_utf16_size/3");
+                    avm_int_t src_value = term_to_int(src);
+                    TRACE("bs_utf16_size/3 fail=%i src=0x%lx dreg=%c%i\n", fail, (long) src_value, T_DEST_REG(dreg_type, dreg));
+                    size_t utf16_size;
+                    if (UNLIKELY(!bitstring_utf16_size(src_value, &utf16_size))) { // rejected by the encoder: badarg, the fail label is only traced
+                        RAISE_ERROR(BADARG_ATOM);
+                    }
+                    WRITE_REGISTER(dreg_type, dreg, term_from_int(utf16_size)); // encoded size in bytes (2 or 4)
+                #endif
+
+                NEXT_INSTRUCTION(next_off);
+                break;
+            }
+
+            case OP_BS_PUT_UTF16: { // operands, in decode order: fail label, flags literal, source term
+                int next_off = 1;
+                uint32_t fail;
+                DECODE_LABEL(fail, code, i, next_off)
+                uint32_t flags;
+                DECODE_LITERAL(flags, code, i, next_off)
+                term src;
+                DECODE_COMPACT_TERM(src, code, i, next_off)
+                #ifdef IMPL_CODE_LOADER
+                    TRACE("bs_put_utf16/3 flags=%x\n", (int) flags);
+                    if (flags != 0 && flags != LittleEndianInteger && flags != NativeEndianInteger) {
+                        fprintf(stderr, "bs_put_utf16/3 : unsupported flags %x\n", (int) flags);
+                        AVM_ABORT();
+                    }
+                #endif
+                #ifdef IMPL_EXECUTE_LOOP
+                    VERIFY_IS_INTEGER(src, "bs_put_utf16/3");
+                    avm_int_t src_value = term_to_int(src);
+                    TRACE("bs_put_utf16/3 flags=%x, src=0x%lx\n", (int) flags, (long) src_value); // %lx needs a long argument, as in the sibling handlers
+                    if (UNLIKELY(!term_is_binary(ctx->bs))) {
+                        TRACE("bs_put_utf16: Bad state. ctx->bs is not a binary.\n");
+                        RAISE_ERROR(BADARG_ATOM);
+                    }
+                    if (ctx->bs_offset % 8 != 0) { // bs_offset is in bits; only byte-aligned writes are handled
+                        TRACE("bs_put_utf16: Unsupported bit syntax operation. Writing strings must be byte-aligend.\n");
+                        RAISE_ERROR(UNSUPPORTED_ATOM);
+                    }
+                    size_t byte_size;
+                    bool result = bitstring_insert_utf16(ctx->bs, ctx->bs_offset, src_value, flags, &byte_size);
+                    if (UNLIKELY(!result)) {
+                        TRACE("bs_put_utf16/3: Failed to insert character as utf16 into binary: %i\n", result);
+                        RAISE_ERROR(BADARG_ATOM);
+                    }
+                    ctx->bs_offset += byte_size * 8; // advance by the encoded size, in bits
+                #endif
+                NEXT_INSTRUCTION(next_off);
+                break;
+            }
+
+            case OP_BS_PUT_UTF32: { // operands, in decode order: fail label, flags literal, source term
+                int next_off = 1;
+                uint32_t fail;
+                DECODE_LABEL(fail, code, i, next_off)
+                uint32_t flags;
+                DECODE_LITERAL(flags, code, i, next_off)
+                term src;
+                DECODE_COMPACT_TERM(src, code, i, next_off)
+                #ifdef IMPL_CODE_LOADER
+                    TRACE("bs_put_utf32/3 flags=%x\n", (int) flags);
+                    if (flags != 0 && flags != LittleEndianInteger && flags != NativeEndianInteger) {
+                        fprintf(stderr, "bs_put_utf32/3 : unsupported flags %x\n", (int) flags);
+                        AVM_ABORT();
+                    }
+                #endif
+                #ifdef IMPL_EXECUTE_LOOP
+                    VERIFY_IS_INTEGER(src, "bs_put_utf32/3");
+                    avm_int_t src_value = term_to_int(src);
+                    TRACE("bs_put_utf32/3 flags=%x, src=0x%lx\n", (int) flags, (long) src_value);
+                    if (UNLIKELY(!term_is_binary(ctx->bs))) {
+                        TRACE("bs_put_utf32/3: Bad state. ctx->bs is not a binary.\n");
+                        RAISE_ERROR(BADARG_ATOM);
+                    }
+                    if (ctx->bs_offset % 8 != 0) {
+                        TRACE("bs_put_utf32/3: Unsupported bit syntax operation. 
Writing strings must be byte-aligend.\n"); + RAISE_ERROR(UNSUPPORTED_ATOM); + } + bool result = bitstring_insert_utf32(ctx->bs, ctx->bs_offset, src_value, flags); /* false when src_value is rejected by the encoder */ + if (UNLIKELY(!result)) { + TRACE("bs_put_utf32/3: Failed to insert integer into binary: %i\n", result); + RAISE_ERROR(BADARG_ATOM); + } + ctx->bs_offset += 4 * 8; /* a UTF-32 character always takes 4 bytes; bs_offset is in bits */ + #endif + NEXT_INSTRUCTION(next_off); + break; + } + + case OP_BS_APPEND: { int next_off = 1; uint32_t fail; diff --git a/tests/erlang_tests/CMakeLists.txt b/tests/erlang_tests/CMakeLists.txt index 2abf6e410a..4a363ea8d5 100644 --- a/tests/erlang_tests/CMakeLists.txt +++ b/tests/erlang_tests/CMakeLists.txt @@ -316,6 +316,7 @@ compile_erlang(test_ordering_1) compile_erlang(test_bs) compile_erlang(test_bs_int) compile_erlang(test_bs_int_unaligned) +compile_erlang(test_bs_utf) compile_erlang(test_catch) compile_erlang(test_gc) compile_erlang(test_raise) @@ -722,6 +723,7 @@ add_custom_target(erlang_test_modules DEPENDS test_bs.beam test_bs_int.beam test_bs_int_unaligned.beam + test_bs_utf.beam test_catch.beam test_gc.beam test_raise.beam diff --git a/tests/erlang_tests/test_bs_utf.erl b/tests/erlang_tests/test_bs_utf.erl new file mode 100644 index 0000000000..ffc5926e8f --- /dev/null +++ b/tests/erlang_tests/test_bs_utf.erl @@ -0,0 +1,140 @@ +% +% This file is part of AtomVM. +% +% Copyright 2022 Paul Guyot +% +% Licensed under the Apache License, Version 2.0 (the "License"); +% you may not use this file except in compliance with the License. +% You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +% See the License for the specific language governing permissions and +% limitations under the License. +% +% SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later +% + +-module(test_bs_utf). 
+ +-export([start/0, id/1]). + +start() -> + ok = test_put_utf8(), + ok = test_put_utf16(), + ok = test_put_utf32(), + 0. + +test_put_utf8() -> + X0 = ?MODULE:id(16#10437), + B0 = <>, + <<240, 144, 144, 183>> = B0, + X1 = ?MODULE:id($暑), + B1 = <>, + <<230, 154, 145>> = B1, + X2 = ?MODULE:id($é), + B2 = <>, + <<195, 169>> = B2, + X3 = ?MODULE:id($e), + B3 = <>, + <<101>> = B3, + XF1 = ?MODULE:id([]), + ok = assert_badarg(fun() -> <> end), + XF2 = ?MODULE:id(-1), + ok = assert_badarg(fun() -> <> end), + XF3 = ?MODULE:id(16#110000), + ok = assert_badarg(fun() -> <> end), + ok. + +test_put_utf16() -> + X0 = ?MODULE:id(16#10437), + B0 = <>, + <<216, 1, 220, 55>> = B0, + X1 = ?MODULE:id($暑), + B1 = <>, + <<102, 145>> = B1, + X2 = ?MODULE:id($é), + B2 = <>, + <<0, 233>> = B2, + X3 = ?MODULE:id($e), + B3 = <>, + <<0, 101>> = B3, + X4 = ?MODULE:id($a), + B4 = <>, + <<0, 97>> = B4, + B5 = <>, + <<97, 0>> = B5, + BugOTP17713 = + case erlang:system_info(machine) of + "BEAM" -> + Version = [ + list_to_integer(N) + || N <- string:tokens(erlang:system_info(version), ".") + ], + Version < [12, 2]; + _ -> + false + end, + if + BugOTP17713 -> + ok; + true -> + B6 = <>, + <<97:16/native>> = B6 + end, + B7 = <>, + <<216, 1, 220, 55>> = B7, + B8 = <>, + <<1, 216, 55, 220>> = B8, + XF1 = ?MODULE:id([]), + ok = assert_badarg(fun() -> <> end), + XF2 = ?MODULE:id(-1), + ok = assert_badarg(fun() -> <> end), + XF3 = ?MODULE:id(16#110000), + ok = assert_badarg(fun() -> <> end), + ok. 
+ +test_put_utf32() -> + X0 = ?MODULE:id(16#10437), + B0 = <>, + <<0, 1, 4, 55>> = B0, + X1 = ?MODULE:id($暑), + B1 = <>, + <<0, 0, 102, 145>> = B1, + X2 = ?MODULE:id($é), + B2 = <>, + <<0, 0, 0, 233>> = B2, + X3 = ?MODULE:id($e), + B3 = <>, + <<0, 0, 0, 101>> = B3, + X4 = ?MODULE:id($a), + B4 = <>, + <<0, 0, 0, 97>> = B4, + B5 = <>, + <<97, 0, 0, 0>> = B5, + B6 = <>, + <<97:32/native>> = B6, + B7 = <>, + <<0, 1, 4, 55>> = B7, + B8 = <>, + <<55, 4, 1, 0>> = B8, + XF1 = ?MODULE:id([]), + ok = assert_badarg(fun() -> <> end), + XF2 = ?MODULE:id(-1), + ok = assert_badarg(fun() -> <> end), + XF3 = ?MODULE:id(16#110000), + ok = assert_badarg(fun() -> <> end), + ok. + +id(I) -> I. + +assert_badarg(F) -> + try + R = F(), + {fail_no_ex, R} + catch + error:badarg -> ok + end. diff --git a/tests/test.c b/tests/test.c index 5da602098b..781b444329 100644 --- a/tests/test.c +++ b/tests/test.c @@ -348,6 +348,7 @@ struct Test tests[] = { TEST_CASE(test_bs), TEST_CASE(test_bs_int), TEST_CASE(test_bs_int_unaligned), + TEST_CASE(test_bs_utf), TEST_CASE(test_catch), TEST_CASE(test_gc), TEST_CASE_EXPECTED(test_raise, 7),