This repository has been archived by the owner on Nov 8, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 56
/
PhonebookIndex.cpp
205 lines (184 loc) · 6.94 KB
/
PhonebookIndex.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
/*
* Copyright 2010, The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <ctype.h>
#include <string.h>
#include <unicode/ucol.h>
#include <unicode/uiter.h>
#include <unicode/ustring.h>
#include <unicode/utypes.h>
#include "PhonebookIndex.h"
#include "PhoneticStringUtils.h"
// Smallest `size` that GetPhonebookIndex() accepts; anything smaller is rejected as an error.
// NOTE(review): the unit is documented as bytes, but `size` is also handed unchanged to
// unorm_next() as the destination capacity, which ICU counts in UChars -- confirm the unit.
#define MIN_OUTPUT_SIZE 6 // Minimum required size for the output buffer (in bytes)
namespace android {
// Explicit character -> phone book index overrides, stored as a flat list of pairs:
// each even position holds the source code unit (the key) and the odd position right
// after it holds the replacement that becomes the index label.
// The Hangul keys are conjoining Jamo (U+11xx); their values are the matching
// Hangul Compatibility Jamo (U+31xx). The per-line comment names the value.
// IMPORTANT! Keep the keys below SORTED in ascending order and keep the element count even.
// map_character() does a binary search over the even positions of this array.
static UChar DEFAULT_CHAR_MAP[] = {
0x00C6, 'A', // AE ligature
0x00DF, 'S', // Eszett (sharp s)
0x1100, 0x3131, // HANGUL LETTER KIYEOK
0x1101, 0x3132, // HANGUL LETTER SSANGKIYEOK
0x1102, 0x3134, // HANGUL LETTER NIEUN
0x1103, 0x3137, // HANGUL LETTER TIKEUT
0x1104, 0x3138, // HANGUL LETTER SSANGTIKEUT
0x1105, 0x3139, // HANGUL LETTER RIEUL
0x1106, 0x3141, // HANGUL LETTER MIEUM
0x1107, 0x3142, // HANGUL LETTER PIEUP
0x1108, 0x3143, // HANGUL LETTER SSANGPIEUP
0x1109, 0x3145, // HANGUL LETTER SIOS
0x110A, 0x3146, // HANGUL LETTER SSANGSIOS
0x110B, 0x3147, // HANGUL LETTER IEUNG
0x110C, 0x3148, // HANGUL LETTER CIEUC
0x110D, 0x3149, // HANGUL LETTER SSANGCIEUC
0x110E, 0x314A, // HANGUL LETTER CHIEUCH
0x110F, 0x314B, // HANGUL LETTER KHIEUKH
0x1110, 0x314C, // HANGUL LETTER THIEUTH
0x1111, 0x314D, // HANGUL LETTER PHIEUPH
0x1112, 0x314E, // HANGUL LETTER HIEUH
0x111A, 0x3140, // HANGUL LETTER RIEUL-HIEUH
0x1121, 0x3144, // HANGUL LETTER PIEUP-SIOS
0x1161, 0x314F, // HANGUL LETTER A
0x1162, 0x3150, // HANGUL LETTER AE
0x1163, 0x3151, // HANGUL LETTER YA
0x1164, 0x3152, // HANGUL LETTER YAE
0x1165, 0x3153, // HANGUL LETTER EO
0x1166, 0x3154, // HANGUL LETTER E
0x1167, 0x3155, // HANGUL LETTER YEO
0x1168, 0x3156, // HANGUL LETTER YE
0x1169, 0x3157, // HANGUL LETTER O
0x116A, 0x3158, // HANGUL LETTER WA
0x116B, 0x3159, // HANGUL LETTER WAE
0x116C, 0x315A, // HANGUL LETTER OE
0x116D, 0x315B, // HANGUL LETTER YO
0x116E, 0x315C, // HANGUL LETTER U
0x116F, 0x315D, // HANGUL LETTER WEO
0x1170, 0x315E, // HANGUL LETTER WE
0x1171, 0x315F, // HANGUL LETTER WI
0x1172, 0x3160, // HANGUL LETTER YU
0x1173, 0x3161, // HANGUL LETTER EU
0x1174, 0x3162, // HANGUL LETTER YI
0x1175, 0x3163, // HANGUL LETTER I
0x11AA, 0x3133, // HANGUL LETTER KIYEOK-SIOS
0x11AC, 0x3135, // HANGUL LETTER NIEUN-CIEUC
0x11AD, 0x3136, // HANGUL LETTER NIEUN-HIEUH
0x11B0, 0x313A, // HANGUL LETTER RIEUL-KIYEOK
0x11B1, 0x313B, // HANGUL LETTER RIEUL-MIEUM
0x11B3, 0x313D, // HANGUL LETTER RIEUL-SIOS
0x11B4, 0x313E, // HANGUL LETTER RIEUL-THIEUTH
0x11B5, 0x313F, // HANGUL LETTER RIEUL-PHIEUPH
};
/**
 * Looks up a character in a sorted key/value pair table using binary search.
 *
 * @param c        the character to look up.
 * @param char_map flat array of pairs: key at each even index, replacement at the
 *                 following odd index; keys must be in ascending order.
 * @param length   total number of elements in char_map (keys plus replacements).
 * @return the replacement stored next to the matching key, or 0 if c is not a key.
 */
static UChar map_character(UChar c, UChar * char_map, int32_t length) {
    int lo = 0;
    int hi = length;

    while (lo < hi) {
        // Halve the range, then clear the low bit so the probe lands on a key slot.
        const int key_pos = ((lo + hi) >> 1) & ~0x1;
        const UChar key = char_map[key_pos];

        if (key > c) {
            hi = key_pos;
            continue;
        }
        if (key < c) {
            lo = key_pos + 2;  // step over this key and its replacement
            continue;
        }
        return char_map[key_pos + 1];
    }

    return 0;
}
/**
 * Tells whether a UTF-16 code unit lies inside one of the CJK blocks listed below.
 *
 * @param c the code unit to classify.
 * @return true if c falls in any of the ranges (bounds inclusive), false otherwise.
 */
static bool is_CJK(UChar c) {
    if (c >= 0x2e80 && c <= 0x2eff) {  // CJK_RADICALS_SUPPLEMENT
        return true;
    }
    if (c >= 0x3000 && c <= 0x303f) {  // CJK_SYMBOLS_AND_PUNCTUATION
        return true;
    }
    if (c >= 0x3300 && c <= 0x33ff) {  // CJK_COMPATIBILITY
        return true;
    }
    if (c >= 0x3400 && c <= 0x4dbf) {  // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
        return true;
    }
    if (c >= 0x4e00 && c <= 0x9fff) {  // CJK_UNIFIED_IDEOGRAPHS
        return true;
    }
    if (c >= 0xf900 && c <= 0xfaff) {  // CJK_COMPATIBILITY_IDEOGRAPHS
        return true;
    }
    if (c >= 0xfe30 && c <= 0xfe4f) {  // CJK_COMPATIBILITY_FORMS
        return true;
    }
    return false;
}
/**
 * Computes the phone book section ("index") label for the text read from iter.
 *
 * The leading character is NFD-normalized into out and then classified:
 *   - not a letter: ASCII digits '0'..'9' give "#"; anything else gives no label;
 *   - a key of DEFAULT_CHAR_MAP (after upper-casing): gives the mapped replacement;
 *   - Hiragana U+3041..U+309F (after GetNormalizedCodePoint): folded onto the lead
 *     kana of its row, or no label for the code points past the "wa" row;
 *   - U+30A0..U+30FF still present after GetNormalizedCodePoint: no label;
 *   - a character accepted by is_CJK(): U+4ED6 when locale starts with "ja",
 *     otherwise no label;
 *   - any other letter: the value returned by GetNormalizedCodePoint for the
 *     upper-cased character.
 *
 * @param iter    text source, read through unorm_next(). NULL is reported as an error.
 * @param locale  locale name; only a leading "ja" is significant. NULL is treated as
 *                a non-Japanese locale.
 * @param out     output buffer. It doubles as scratch space for the normalization, so
 *                only the first <return value> UChars are meaningful afterwards.
 *                NULL is reported as an error.
 * @param size    capacity of out; must be at least MIN_OUTPUT_SIZE.
 *                NOTE(review): forwarded to unorm_next() as its destination capacity --
 *                confirm callers pass a UChar count rather than a byte count.
 * @param isError receives TRUE for invalid arguments or a normalization failure and
 *                FALSE otherwise. May be NULL if the caller does not need the flag.
 * @return 1 if out[0] holds the label, 0 if there is no label or an error occurred.
 */
int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size,
        UBool * isError)
{
    // Let callers that do not need the flag pass NULL instead of crashing on the store.
    UBool ignoredError = FALSE;
    if (isError == NULL) {
        isError = &ignoredError;
    }

    if (iter == NULL || out == NULL || size < MIN_OUTPUT_SIZE) {
        *isError = TRUE;
        return 0;
    }
    *isError = FALSE;

    // Normalize the first character to remove accents using the NFD normalization
    UErrorCode errorCode = U_ZERO_ERROR;
    int32_t len = unorm_next(iter, out, size, UNORM_NFD,
            0 /* options */, TRUE /* normalize */, NULL, &errorCode);
    if (U_FAILURE(errorCode)) {
        *isError = TRUE;
        return 0;
    }
    if (len == 0) {  // Empty input string
        return 0;
    }

    UChar c = out[0];
    if (!u_isalpha(c)) {
        // Digits go into a # section. Everything else goes into the empty section.
        // The unicode function u_isdigit would also identify other characters as digits
        // (arabic), but if we caught them here we'd risk having the same section before and
        // after alpha-letters, which might break the assumption that each section exists
        // only once.
        if (c >= '0' && c <= '9') {
            out[0] = '#';
            return 1;
        }
        return 0;
    }

    c = u_toupper(c);

    // Check for explicitly mapped characters
    UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar));
    if (c_mapped != 0) {
        out[0] = c_mapped;
        return 1;
    }

    // Convert Kanas to Hiragana.
    // out[1] is valid as soon as the normalized segment has two code units; the previous
    // "len > 2" test skipped it for segments of exactly two units.
    UChar next = len > 1 ? out[1] : 0;
    c = android::GetNormalizedCodePoint(c, next, NULL);

    // Traditional grouping of Hiragana characters: each row collapses onto its lead kana.
    if (0x3041 <= c && c <= 0x309F) {
        if (c < 0x304B) {
            c = 0x3042;  // a
        } else if (c < 0x3055) {
            c = 0x304B;  // ka
        } else if (c < 0x305F) {
            c = 0x3055;  // sa
        } else if (c < 0x306A) {
            c = 0x305F;  // ta
        } else if (c < 0x306F) {
            c = 0x306A;  // na
        } else if (c < 0x307E) {
            c = 0x306F;  // ha
        } else if (c < 0x3083) {
            c = 0x307E;  // ma
        } else if (c < 0x3089) {
            c = 0x3084;  // ya
        } else if (c < 0x308E) {
            c = 0x3089;  // ra
        } else if (c < 0x3094) {
            c = 0x308F;  // wa
        } else {
            return 0;    // Others are not readable
        }
        out[0] = c;
        return 1;
    }
    if (0x30A0 <= c && c <= 0x30FF) {
        // Dot, onbiki, iteration marks are not readable
        return 0;
    }

    if (is_CJK(c)) {
        // strncmp() on a NULL pointer is undefined, so a missing locale counts as non-Japanese.
        if (locale != NULL && strncmp(locale, "ja", 2) == 0) {
            // Japanese word meaning "misc" or "other"
            out[0] = 0x4ED6;
            return 1;
        }
        return 0;
    }

    out[0] = c;
    return 1;
}
} // namespace android