Skip to content

Commit

Permalink
Better handle badly-written encoding names
Browse files Browse the repository at this point in the history
Make encoding name comparison more permissive, finding names that are
very likely to refer to the same encoding.
For example, "utf8" now matches "UTF-8", and "iso8859_1" matches
"ISO-8859-1".

This makes encodings_get_idx_from_charset() and
encodings_get_from_charset() more permissive, and makes it possible to
normalize an encoding name.
It is used to better handle user-provided encodings (e.g. one found by
a regex search) by normalizing it to the Geany name.

git-svn-id: https://geany.svn.sourceforge.net/svnroot/geany/trunk@5666 ea778897-0a13-0410-b9d1-a72fbfd435f5
  • Loading branch information
b4n committed Mar 31, 2011
1 parent b32bb79 commit 907a792
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 3 deletions.
6 changes: 6 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
* src/encodings.c:
Update regex used to find encodings for it to allow the encoding to
be quoted, adding support for XML (closes #3183506).
* src/encodings.c:
Implement charset name normalization in order to better deal with
badly-written encoding names (i.e. names found by regex search).
This also makes encodings_get_idx_from_charset() and
encodings_get_from_charset() more permissive regarding the passed-in
encoding name.


2011-03-31 Nick Treleaven <nick(dot)treleaven(at)btinternet(dot)com>
Expand Down
70 changes: 67 additions & 3 deletions src/encodings.c
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,56 @@ static void init_encodings(void)
}


/* Permissive comparison of two encoding names.
 *
 * Letters are compared ASCII case-insensitively, and any non-alphanumeric
 * character is treated as a separator.  A separator may be missing from one of
 * the two names only where the name switches between letters and digits, so
 * "utf8" matches "UTF-8" and "iso8859_1" matches "ISO-8859-1".
 *
 * Returns TRUE if both names are considered equal, FALSE otherwise. */
static gboolean encodings_charset_equals(const gchar *a, const gchar *b)
{
	gboolean prev_is_letter = FALSE;	/* class of the last matched character */
	gboolean boundary_pending = FALSE;	/* a separator was seen in only one name */

	while (*a != '\0' && *b != '\0')
	{
		const gboolean cur_is_letter = g_ascii_isalpha(*a);
		const gboolean same_alnum = (cur_is_letter || g_ascii_isdigit(*a)) &&
			g_ascii_toupper(*a) == g_ascii_toupper(*b);

		if (same_alnum)
		{
			/* a one-sided separator is only acceptable if the name without it
			 * has an implicit one here, i.e. a letter <-> digit transition */
			if (boundary_pending && prev_is_letter == cur_is_letter)
				return FALSE;

			prev_is_letter = cur_is_letter;
			boundary_pending = FALSE;
			a++;
			b++;
		}
		else
		{
			const gboolean skip_a = ! g_ascii_isalnum(*a);
			const gboolean skip_b = ! g_ascii_isalnum(*b);

			/* two different alphanumeric characters: the names really differ */
			if (! skip_a && ! skip_b)
				return FALSE;

			if (skip_a)
				a++;
			if (skip_b)
				b++;
			/* exactly one side had a separator: remember to check for the
			 * implicit one on the next matched character */
			if (skip_a != skip_b)
				boundary_pending = TRUE;
		}
	}
	/* equal only if both names were consumed entirely */
	return *a == *b;
}


GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
{
gint i;
Expand All @@ -157,7 +207,7 @@ GeanyEncodingIndex encodings_get_idx_from_charset(const gchar *charset)
i = 0;
while (i < GEANY_ENCODINGS_MAX)
{
if (strcmp(charset, encodings[i].charset) == 0)
if (encodings_charset_equals(charset, encodings[i].charset))
return i;

++i;
Expand All @@ -176,7 +226,7 @@ const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
i = 0;
while (i < GEANY_ENCODINGS_MAX)
{
if (strcmp(charset, encodings[i].charset) == 0)
if (encodings_charset_equals(charset, encodings[i].charset))
return &encodings[i];

++i;
Expand All @@ -186,6 +236,18 @@ const GeanyEncoding *encodings_get_from_charset(const gchar *charset)
}


/* Looks up @charset with encodings_get_from_charset() and returns the charset
 * name stored in the matching entry, or NULL if no entry matches.
 * The returned string belongs to the matched entry and is not a copy. */
static const gchar *encodings_normalize_charset(const gchar *charset)
{
	const GeanyEncoding *match = encodings_get_from_charset(charset);

	return (match != NULL) ? match->charset : NULL;
}


const GeanyEncoding *encodings_get_from_index(gint idx)
{
g_return_val_if_fail(idx >= 0 && idx < GEANY_ENCODINGS_MAX, NULL);
Expand Down Expand Up @@ -556,7 +618,9 @@ gchar *encodings_convert_to_utf8(const gchar *buffer, gsize size, gchar **used_e
if (check_regex)
{
check_regex = FALSE;
charset = regex_charset;
charset = encodings_normalize_charset(regex_charset);
if (! charset) /* we found a regex encoding that we can't normalize, try it as is */
charset = regex_charset;
i = -2; /* keep i below the start value to have it again at -1 on the next loop run */
}
else if (check_locale)
Expand Down

0 comments on commit 907a792

Please sign in to comment.