Skip to content

Commit b3d6128

Browse files
committed
Fix memory leak in grapheme clusters
[Bug #20150] String#grapheme_cluters and String#each_grapheme_cluster leaks memory because if the string is not UTF-8, then the created regex will not be freed. For example: str = "hello world".encode(Encoding::UTF_32LE) 10.times do 1_000.times do str.grapheme_clusters end puts `ps -o rss= -p #{$$}` end Before: 26000 42256 59008 75792 92528 109232 125936 142672 159392 176160 After: 9264 9504 9808 10000 10128 10224 10352 10544 10704 10896
1 parent 8f4eda5 commit b3d6128

File tree

2 files changed

+75
-34
lines changed

2 files changed

+75
-34
lines changed

string.c

Lines changed: 64 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9344,64 +9344,79 @@ static regex_t *
93449344
get_reg_grapheme_cluster(rb_encoding *enc)
93459345
{
93469346
int encidx = rb_enc_to_index(enc);
9347-
regex_t *reg_grapheme_cluster = NULL;
9348-
static regex_t *reg_grapheme_cluster_utf8 = NULL;
93499347

9350-
/* synchronize */
9351-
if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9352-
reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9353-
}
9354-
if (!reg_grapheme_cluster) {
9355-
const OnigUChar source_ascii[] = "\\X";
9356-
OnigErrorInfo einfo;
9357-
const OnigUChar *source = source_ascii;
9358-
size_t source_len = sizeof(source_ascii) - 1;
9359-
switch (encidx) {
9348+
const OnigUChar source_ascii[] = "\\X";
9349+
const OnigUChar *source = source_ascii;
9350+
size_t source_len = sizeof(source_ascii) - 1;
9351+
9352+
switch (encidx) {
93609353
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
93619354
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
93629355
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
93639356
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
93649357
#define CASE_UTF(e) \
9365-
case ENCINDEX_UTF_##e: { \
9366-
static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9367-
source = source_UTF_##e; \
9368-
source_len = sizeof(source_UTF_##e); \
9369-
break; \
9370-
}
9371-
CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9358+
case ENCINDEX_UTF_##e: { \
9359+
static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9360+
source = source_UTF_##e; \
9361+
source_len = sizeof(source_UTF_##e); \
9362+
break; \
9363+
}
9364+
CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
93729365
#undef CASE_UTF
93739366
#undef CHARS_16BE
93749367
#undef CHARS_16LE
93759368
#undef CHARS_32BE
93769369
#undef CHARS_32LE
9377-
}
9378-
int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9379-
ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9380-
if (r) {
9381-
UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9382-
onig_error_code_to_str(message, r, &einfo);
9383-
rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9384-
}
9385-
if (encidx == rb_utf8_encindex()) {
9386-
reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9387-
}
93889370
}
9371+
9372+
regex_t *reg_grapheme_cluster;
9373+
OnigErrorInfo einfo;
9374+
int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9375+
ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9376+
if (r) {
9377+
UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9378+
onig_error_code_to_str(message, r, &einfo);
9379+
rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9380+
}
9381+
93899382
return reg_grapheme_cluster;
93909383
}
93919384

9385+
static regex_t *
9386+
get_cached_reg_grapheme_cluster(rb_encoding *enc)
9387+
{
9388+
int encidx = rb_enc_to_index(enc);
9389+
static regex_t *reg_grapheme_cluster_utf8 = NULL;
9390+
9391+
if (encidx == rb_utf8_encindex()) {
9392+
if (!reg_grapheme_cluster_utf8) {
9393+
reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9394+
}
9395+
9396+
return reg_grapheme_cluster_utf8;
9397+
}
9398+
9399+
return NULL;
9400+
}
9401+
93929402
static VALUE
93939403
rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
93949404
{
93959405
size_t grapheme_cluster_count = 0;
9396-
regex_t *reg_grapheme_cluster = NULL;
93979406
rb_encoding *enc = get_encoding(str);
93989407
const char *ptr, *end;
93999408

94009409
if (!rb_enc_unicode_p(enc)) {
94019410
return rb_str_length(str);
94029411
}
94039412

9404-
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9413+
bool cached_reg_grapheme_cluster = true;
9414+
regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9415+
if (!reg_grapheme_cluster) {
9416+
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9417+
cached_reg_grapheme_cluster = false;
9418+
}
9419+
94059420
ptr = RSTRING_PTR(str);
94069421
end = RSTRING_END(str);
94079422

@@ -9414,14 +9429,17 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
94149429
ptr += len;
94159430
}
94169431

9432+
if (!cached_reg_grapheme_cluster) {
9433+
onig_free(reg_grapheme_cluster);
9434+
}
9435+
94179436
return SIZET2NUM(grapheme_cluster_count);
94189437
}
94199438

94209439
static VALUE
94219440
rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
94229441
{
94239442
VALUE orig = str;
9424-
regex_t *reg_grapheme_cluster = NULL;
94259443
rb_encoding *enc = get_encoding(str);
94269444
const char *ptr0, *ptr, *end;
94279445

@@ -9430,7 +9448,14 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
94309448
}
94319449

94329450
if (!ary) str = rb_str_new_frozen(str);
9433-
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9451+
9452+
bool cached_reg_grapheme_cluster = true;
9453+
regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9454+
if (!reg_grapheme_cluster) {
9455+
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9456+
cached_reg_grapheme_cluster = false;
9457+
}
9458+
94349459
ptr0 = ptr = RSTRING_PTR(str);
94359460
end = RSTRING_END(str);
94369461

@@ -9442,6 +9467,11 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
94429467
ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
94439468
ptr += len;
94449469
}
9470+
9471+
if (!cached_reg_grapheme_cluster) {
9472+
onig_free(reg_grapheme_cluster);
9473+
}
9474+
94459475
RB_GC_GUARD(str);
94469476
if (ary)
94479477
return ary;

test/ruby/test_string.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,6 +1108,17 @@ def test_grapheme_clusters
11081108
assert_equal("C", res[2])
11091109
end
11101110

1111+
def test_grapheme_clusters_memory_leak
1112+
assert_no_memory_leak([], "", "#{<<~"begin;"}\n#{<<~'end;'}", "[Bug #todo]", rss: true)
1113+
begin;
1114+
str = "hello world".encode(Encoding::UTF_32LE)
1115+
1116+
10_000.times do
1117+
str.grapheme_clusters
1118+
end
1119+
end;
1120+
end
1121+
11111122
def test_each_line
11121123
verbose, $VERBOSE = $VERBOSE, nil
11131124

0 commit comments

Comments
 (0)