Skip to content

Commit

Permalink
finish BYTES fixes, Fix #61
Browse files Browse the repository at this point in the history
Note: most of the fixes for #61 happened over the previous
few commits.
  • Loading branch information
brodieG committed Aug 5, 2017
1 parent 954cb1e commit 594d2a6
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
8 changes: 7 additions & 1 deletion src/strsub.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,18 @@ static inline int char_offset(unsigned const char * char_ptr, int is_bytes) {
* windows 1252 locales where each character can be represented by a byte
* and could potentially lead to copying of entire vectors. We'll have to
* consider a mode where we let known 255 element encodings through...
*
* Note that strings with CE_BYTES encoding are returned as is, without
* translation. Unfortunately this means that every caller that uses the output
* of this function needs to check whether the original encoding was bytes or
* not.
*
* @param string a CHARSXP
*/
static inline unsigned const char * as_utf8_char(SEXP string) {
const char * char_val;

cetype_t char_enc = getCharCE(string);
if(is_utf8_enc(char_enc)) {
if(is_utf8_enc(char_enc) || char_enc == CE_BYTES) {
char_val = CHAR(string);
} else {
char_val = translateCharUTF8(string);
Expand Down
13 changes: 9 additions & 4 deletions tests/unitizer/cstringr.R
Original file line number Diff line number Diff line change
Expand Up @@ -290,19 +290,24 @@ unitizer_sect("UTF8 corner cases, in UTF-8", {
Map(vetr:::char_offsets, crit.2)
Map(vetr:::char_offsets, crit.3)
Map(vetr:::char_offsets, crit.4)
invisible(Sys.setlocale('LC_CTYPE', old.locale))
})

unitizer_sect("UTF-8 corner cases - other encodings", {
# Some latin-1 codes

lat.1.1 <- c(
lat.1.1 <- lat.1.2 <- c(
"ni\xF1a",
"hello",
"\xB5 \xB6 \xBF \xC9 \xF4"
)
Encoding(lat.1.1) <- "latin1"
Encoding(lat.1.2) <- "bytes"

Map(vetr:::char_offsets, lat.1.1)
lapply(lat.1.1, vetr:::char_offsets)
lapply(lat.1.2, vetr:::char_offsets)

# What about bytes encoding
invisible(Sys.setlocale('LC_CTYPE', old.locale))
vetr:::strsub(lat.1.1, 3L, mark=FALSE)
vetr:::strsub(lat.1.2, 3L, mark=FALSE)
})

0 comments on commit 594d2a6

Please sign in to comment.