Skip to content

Commit

Permalink
charset.c: add CHARSET_NO_CANONIFY flag to charset_extract
Browse files Browse the repository at this point in the history
  • Loading branch information
rsto committed Oct 25, 2016
1 parent a4991f7 commit 0c30c3d
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 1 deletion.
6 changes: 6 additions & 0 deletions cunit/charset.testc
Expand Up @@ -910,6 +910,12 @@ static void test_extract(void)

/* &nonesuch; is most definitely not defined */
TESTCASE("A&nonesuch;B", "us-ascii", ENCODING_NONE, "HTML", "A"UTF8_REPLACEMENT"B");

/* HTML: skip html but don't canonify */
flags |= CHARSET_NO_CANONIFY;
TESTCASE("<b>Photo</b> <em>booth</em>",
"us-ascii", ENCODING_NONE, "HTML",
"Photo booth");
}
#undef TESTCASE

Expand Down
4 changes: 3 additions & 1 deletion lib/charset.c
Expand Up @@ -2429,7 +2429,9 @@ EXPORTED int charset_extract(void (*cb)(const struct buf *, void *),
utf8 = charset_lookupname("utf-8");
tobuffer = buffer_init();
input = convert_init(utf8, 0/*to_uni*/, 0, tobuffer);
input = canon_init(flags, input);
if (!(flags & CHARSET_NO_CANONIFY)) {
input = canon_init(flags, input);
}

if (!strcmpsafe(subtype, "HTML")) {
if ((flags & CHARSET_SKIPHTML)) {
Expand Down
1 change: 1 addition & 0 deletions lib/charset.h
Expand Up @@ -54,6 +54,7 @@
/* Single-bit option flags; each occupies its own bit so they can be OR'ed
 * together and tested with '&'. */
#define CHARSET_SNIPPET (1<<4)
#define CHARSET_UNFOLD_SKIPWS (1<<5)
#define CHARSET_MIME_UTF8 (1<<6)
/* When set in the flags passed to charset_extract(), the canon_init() stage
 * is not added to the conversion pipeline. */
#define CHARSET_NO_CANONIFY (1<<7)

#define CHARSET_UNKNOWN_CHARSET (NULL)

Expand Down

0 comments on commit 0c30c3d

Please sign in to comment.