/
ucnv.h
1817 lines (1726 loc) · 73.6 KB
/
ucnv.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
**********************************************************************
* Copyright (C) 1999-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* ucnv.h:
* External APIs for the ICU's codeset conversion library
* Bertrand A. Damiba
*
* Modification History:
*
* Date Name Description
* 04/04/99 helena Fixed internal header inclusion.
* 05/11/00 helena Added setFallback and usesFallback APIs.
* 06/29/2000 helena Major rewrite of the callback APIs.
* 12/07/2000 srl Update of documentation
*/
/**
* \file
* \brief C API: Character conversion
*
* <h2>Character Conversion C API</h2>
*
* <p>This API is used to convert codepage or character encoded data to and
* from UTF-16. You can open a converter with {@link ucnv_open() }. With that
* converter, you can get its properties, set options, convert your data and
* close the converter.</p>
*
* <p>Since many software programs recognize different converter names for
* different types of converters, there are other functions in this API to
* iterate over the converter aliases. The functions {@link ucnv_getAvailableName() },
* {@link ucnv_getAlias() } and {@link ucnv_getStandardName() } are some of the
* more frequently used alias functions to get this information.</p>
*
* <p>When a converter encounters an illegal, irregular, invalid or unmappable character
* its default behavior is to use a substitution character to replace the
* bad byte sequence. This behavior can be changed by using {@link ucnv_setFromUCallBack() }
* or {@link ucnv_setToUCallBack() } on the converter. The header ucnv_err.h defines
* many other callback actions that can be used instead of a character substitution.</p>
*
* <p>More information about this API can be found in our
* <a href="http://oss.software.ibm.com/icu/userguide/conversion.html">User's
* Guide</a>.</p>
*/
#ifndef UCNV_H
#define UCNV_H
#include "unicode/ucnv_err.h"
#include "unicode/uenum.h"
#ifndef __USET_H__
/**
* USet is the C API type for Unicode sets.
* It is forward-declared here to avoid including the header file if related
* conversion APIs are not used.
* See unicode/uset.h
*
* @see ucnv_getUnicodeSet
* @stable ICU 2.6
*/
struct USet;
/** @stable ICU 2.6 */
typedef struct USet USet;
#endif
#if !UCONFIG_NO_CONVERSION
U_CDECL_BEGIN
/** Maximum length of a converter name including the terminating NULL @stable ICU 2.0 */
#define UCNV_MAX_CONVERTER_NAME_LENGTH 60
/** Maximum length of a converter name including path and terminating NULL @stable ICU 2.0 */
#define UCNV_MAX_FULL_FILE_NAME_LENGTH (600+UCNV_MAX_CONVERTER_NAME_LENGTH)
/** Shift in for EBCDIC_STATEFUL and iso2022 states @stable ICU 2.0 */
#define UCNV_SI 0x0F
/** Shift out for EBCDIC_STATEFUL and iso2022 states @stable ICU 2.0 */
#define UCNV_SO 0x0E
/**
* Enum for specifying basic types of converters.
* Enumerators up to and including UCNV_LMBCS_1 carry explicit values;
* the ones that follow are numbered implicitly, in declaration order.
* @see ucnv_getType
* @stable ICU 2.0
*/
typedef enum {
UCNV_UNSUPPORTED_CONVERTER = -1,
UCNV_SBCS = 0,
UCNV_DBCS = 1,
UCNV_MBCS = 2,
UCNV_LATIN_1 = 3,
UCNV_UTF8 = 4,
UCNV_UTF16_BigEndian = 5,
UCNV_UTF16_LittleEndian = 6,
UCNV_UTF32_BigEndian = 7,
UCNV_UTF32_LittleEndian = 8,
UCNV_EBCDIC_STATEFUL = 9,
UCNV_ISO_2022 = 10,
UCNV_LMBCS_1 = 11,
UCNV_LMBCS_2,
UCNV_LMBCS_3,
UCNV_LMBCS_4,
UCNV_LMBCS_5,
UCNV_LMBCS_6,
UCNV_LMBCS_8,
UCNV_LMBCS_11,
UCNV_LMBCS_16,
UCNV_LMBCS_17,
UCNV_LMBCS_18,
UCNV_LMBCS_19,
/* Alias for the last LMBCS enumerator; it shares UCNV_LMBCS_19's value, so UCNV_HZ continues from there. */
UCNV_LMBCS_LAST = UCNV_LMBCS_19,
UCNV_HZ,
UCNV_SCSU,
UCNV_ISCII,
UCNV_US_ASCII,
UCNV_UTF7,
UCNV_BOCU1,
UCNV_UTF16,
UCNV_UTF32,
UCNV_CESU8,
UCNV_IMAP_MAILBOX,
/* Number of converter types for which we have conversion routines. Must remain the last enumerator. */
UCNV_NUMBER_OF_SUPPORTED_CONVERTER_TYPES
} UConverterType;
/**
* Enum for specifying which platform a converter ID refers to.
* The use of platform/CCSID is not recommended. See ucnv_openCCSID().
*
* @see ucnv_getPlatform
* @see ucnv_openCCSID
* @see ucnv_getCCSID
* @stable ICU 2.0
*/
typedef enum {
UCNV_UNKNOWN = -1,
UCNV_IBM = 0
} UConverterPlatform;
/**
* Function pointer for error callback in the codepage to unicode direction.
* Called when an error has occurred in conversion to unicode, or on open/close of the callback (see reason).
*
* NOTE(review): the final, unnamed UErrorCode* parameter is not described in this
* header; presumably it is the ICU error code in/out parameter - confirm against ucnv_err.h.
*
* @param context Pointer to the callback's private data
* @param args Information about the conversion in progress
* @param codeUnits Points to 'length' bytes of the concerned codepage sequence
* @param length Size (in bytes) of the concerned codepage sequence
* @param reason Defines the reason the callback was invoked
* @see ucnv_setToUCallBack
* @see UConverterToUnicodeArgs
* @stable ICU 2.0
*/
typedef void (U_EXPORT2 *UConverterToUCallback) (
const void* context,
UConverterToUnicodeArgs *args,
const char *codeUnits,
int32_t length,
UConverterCallbackReason reason,
UErrorCode *);
/**
* Function pointer for error callback in the unicode to codepage direction.
* Called when an error has occurred in conversion from unicode, or on open/close of the callback (see reason).
*
* NOTE(review): the final, unnamed UErrorCode* parameter is not described in this
* header; presumably it is the ICU error code in/out parameter - confirm against ucnv_err.h.
*
* @param context Pointer to the callback's private data
* @param args Information about the conversion in progress
* @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
* @param length Number of UChars in the concerned Unicode sequence (see codeUnits)
* @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
* @param reason Defines the reason the callback was invoked
* @see ucnv_setFromUCallBack
* @stable ICU 2.0
*/
typedef void (U_EXPORT2 *UConverterFromUCallback) (
const void* context,
UConverterFromUnicodeArgs *args,
const UChar* codeUnits,
int32_t length,
UChar32 codePoint,
UConverterCallbackReason reason,
UErrorCode *);
U_CDECL_END
/**
* Character that separates converter names from options and options from each other.
* @see ucnv_open
* @stable ICU 2.0
*/
#define UCNV_OPTION_SEP_CHAR ','
/**
* String version of UCNV_OPTION_SEP_CHAR.
* @see ucnv_open
* @stable ICU 2.0
*/
#define UCNV_OPTION_SEP_STRING ","
/**
* Character that separates a converter option from its value.
* @see ucnv_open
* @stable ICU 2.0
*/
#define UCNV_VALUE_SEP_CHAR '='
/**
* String version of UCNV_VALUE_SEP_CHAR.
* @see ucnv_open
* @stable ICU 2.0
*/
#define UCNV_VALUE_SEP_STRING "="
/**
* Converter option for specifying a locale.
* For example, ucnv_open("SCSU,locale=ja", &errorCode);
* See convrtrs.txt.
*
* @see ucnv_open
* @stable ICU 2.0
*/
#define UCNV_LOCALE_OPTION_STRING ",locale="
/**
* Converter option for specifying a version selector (0..9) for some converters.
* For example, ucnv_open("UTF-7,version=1", &errorCode);
* See convrtrs.txt.
*
* @see ucnv_open
* @stable ICU 2.4
*/
#define UCNV_VERSION_OPTION_STRING ",version="
/**
* Converter option for EBCDIC SBCS or mixed-SBCS/DBCS (stateful) codepages.
* Swaps Unicode mappings for EBCDIC LF and NL codes, as used on
* S/390 (z/OS) Unix System Services (Open Edition).
* For example, ucnv_open("ibm-1047,swaplfnl", &errorCode);
* See convrtrs.txt.
*
* @see ucnv_open
* @stable ICU 2.4
*/
#define UCNV_SWAP_LFNL_OPTION_STRING ",swaplfnl"
/**
* Do a fuzzy compare of two converter/alias names. The comparison
* is case-insensitive. It also ignores the characters '-', '_', and
* ' ' (dash, underscore, and space). Thus the strings "UTF-8",
* "utf_8", and "Utf 8" are exactly equivalent.
*
* @param name1 a converter name or alias, zero-terminated
* @param name2 a converter name or alias, zero-terminated
* @return 0 if the names match, or a negative value if name1
* lexically precedes name2, or a positive value if name1
* lexically follows name2.
* @stable ICU 2.0
*/
U_STABLE int U_EXPORT2
ucnv_compareNames(const char *name1, const char *name2);
/**
* Creates a UConverter object with the names specified as a C string.
* The actual name will be resolved with the alias file
* using a case-insensitive string comparison that ignores
* the delimiters '-', '_', and ' ' (dash, underscore, and space).
* E.g., the names "UTF8", "utf-8", and "Utf 8" are all equivalent.
* If <code>NULL</code> is passed for the converter name, it will create one with the
* ucnv_getDefaultName() return value.
*
* <p>A converter name for ICU 1.5 and above may contain options
* like a locale specification to control the specific behavior of
* the newly instantiated converter.
* The meaning of the options depends on the particular converter.
* If an option is not defined for or recognized by a given converter, then it is ignored.</p>
*
* <p>Options are appended to the converter name string, with a
* <code>UCNV_OPTION_SEP_CHAR</code> between the name and the first option and
* also between adjacent options.</p>
*
* <p>If the alias is ambiguous, then the preferred converter is used
* and the status is set to U_AMBIGUOUS_ALIAS_WARNING.</p>
*
* <p>The conversion behavior and names can vary between platforms. ICU may
* convert some characters differently from other platforms. Details on this topic
* are in the <a href="http://oss.software.ibm.com/icu/userguide/conversion.html">User's
* Guide</a>.</p>
*
* @param converterName Name of the uconv table, may have options appended
* @param err outgoing error status <TT>U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR</TT>
* @return the created Unicode converter object, or <TT>NULL</TT> if an error occurred
* @see ucnv_openU
* @see ucnv_openCCSID
* @see ucnv_close
* @stable ICU 2.0
*/
U_STABLE UConverter* U_EXPORT2
ucnv_open(const char *converterName, UErrorCode *err);
/**
* Creates a Unicode converter with the names specified as unicode string.
* The name should be limited to the ASCII-7 alphanumerics range.
* The actual name will be resolved with the alias file
* using a case-insensitive string comparison that ignores
* the delimiters '-', '_', and ' ' (dash, underscore, and space).
* E.g., the names "UTF8", "utf-8", and "Utf 8" are all equivalent.
* If <TT>NULL</TT> is passed for the converter name, it will create
* one with the ucnv_getDefaultName() return value.
* If the alias is ambiguous, then the preferred converter is used
* and the status is set to U_AMBIGUOUS_ALIAS_WARNING.
* @param name : name of the uconv table in a zero terminated
* Unicode string
* @param err outgoing error status <TT>U_MEMORY_ALLOCATION_ERROR,
* U_FILE_ACCESS_ERROR</TT>
* @return the created Unicode converter object, or <TT>NULL</TT> if an
* error occurred
* @see ucnv_open
* @see ucnv_openCCSID
* @see ucnv_close
* @see ucnv_getDefaultName
* @stable ICU 2.0
*/
U_STABLE UConverter* U_EXPORT2
ucnv_openU(const UChar *name,
UErrorCode *err);
/**
* Creates a UConverter object from a CCSID number and platform pair.
* Note that the usefulness of this function is limited to platforms with numeric
* encoding IDs. Only IBM and Microsoft platforms use numeric (16-bit) identifiers for
* encodings.
*
* In addition, IBM CCSIDs and Unicode conversion tables are not 1:1 related.
* For many IBM CCSIDs there are multiple (up to six) Unicode conversion tables, and
* for some Unicode conversion tables there are multiple CCSIDs.
* Some "alternate" Unicode conversion tables are provided by the
* IBM CDRA conversion table registry.
* The most prominent example of a systematic modification of conversion tables that is
* not provided in the form of conversion table files in the repository is
* that S/390 Unix System Services swaps the codes for Line Feed and New Line in all
* EBCDIC codepages, which requires such a swap in the Unicode conversion tables as well.
*
* Only IBM default conversion tables are accessible with ucnv_openCCSID().
* ucnv_getCCSID() will return the same CCSID for all conversion tables that are associated
* with that CCSID.
*
* Currently, the only "platform" supported in the ICU converter API is UCNV_IBM.
*
* In summary, the use of CCSIDs and the associated API functions is not recommended.
*
* In order to open a converter with the default IBM CDRA Unicode conversion table,
* you can use this function or use the prefix "ibm-":
* \code
* char name[20];
* sprintf(name, "ibm-%hu", ccsid);
* cnv=ucnv_open(name, &errorCode);
* \endcode
*
* In order to open a converter with the IBM S/390 Unix System Services variant
* of a Unicode/EBCDIC conversion table,
* you can use the prefix "ibm-" together with the option string UCNV_SWAP_LFNL_OPTION_STRING:
* \code
* char name[20];
* sprintf(name, "ibm-%hu" UCNV_SWAP_LFNL_OPTION_STRING, ccsid);
* cnv=ucnv_open(name, &errorCode);
* \endcode
*
* In order to open a converter from a Microsoft codepage number, use the prefix "cp":
* \code
* char name[20];
* sprintf(name, "cp%hu", codepageID);
* cnv=ucnv_open(name, &errorCode);
* \endcode
*
* If the alias is ambiguous, then the preferred converter is used
* and the status is set to U_AMBIGUOUS_ALIAS_WARNING.
*
* @param codepage codepage number to create
* @param platform the platform in which the codepage number exists
* @param err error status <TT>U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR</TT>
* @return the created Unicode converter object, or <TT>NULL</TT> if an error
* occurred.
* @see ucnv_open
* @see ucnv_openU
* @see ucnv_close
* @see ucnv_getCCSID
* @see ucnv_getPlatform
* @see UConverterPlatform
* @stable ICU 2.0
*/
U_STABLE UConverter* U_EXPORT2
ucnv_openCCSID(int32_t codepage,
UConverterPlatform platform,
UErrorCode * err);
/**
* <p>Creates a UConverter object specified from a packageName and a converterName.</p>
*
* <p>The packageName and converterName must point to an ICU udata object, as defined by
* <code> udata_open( packageName, "cnv", converterName, err) </code> or equivalent.
* Typically, packageName will refer to a (.dat) file, or to a package registered with
* udata_setAppData().</p>
*
* <p>The name will NOT be looked up in the alias mechanism, nor will the converter be
* stored in the converter cache or the alias table. The only way to open further converters
* is to call this function multiple times, or to use the ucnv_safeClone() function to clone a
* 'master' converter.</p>
*
* <p>A future version of ICU may add alias table lookups and/or caching
* to this function.</p>
*
* <p>Example Use:
* <code>cnv = ucnv_openPackage("myapp", "myconverter", &err);</code>
* </p>
*
* @param packageName name of the package (equivalent to 'path' in udata_open() call)
* @param converterName name of the data item to be used, without suffix.
* @param err outgoing error status <TT>U_MEMORY_ALLOCATION_ERROR, U_FILE_ACCESS_ERROR</TT>
* @return the created Unicode converter object, or <TT>NULL</TT> if an error occurred
* @see udata_open
* @see ucnv_open
* @see ucnv_safeClone
* @see ucnv_close
* @stable ICU 2.2
*/
U_STABLE UConverter* U_EXPORT2
ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode *err);
/**
* Thread safe cloning operation
* @param cnv converter to be cloned
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
* If buffer is not large enough, new memory will be allocated.
* Clients can use the U_CNV_SAFECLONE_BUFFERSIZE. This will probably be enough to avoid memory allocations.
* @param pBufferSize pointer to size of allocated space.
* If *pBufferSize == 0, a sufficient size for use in cloning will
* be returned ('pre-flighting')
* If *pBufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
* @return pointer to the new clone
* @stable ICU 2.0
*/
U_STABLE UConverter * U_EXPORT2
ucnv_safeClone(const UConverter *cnv,
void *stackBuffer,
int32_t *pBufferSize,
UErrorCode *status);
/**
* \def U_CNV_SAFECLONE_BUFFERSIZE
* Definition of a buffer size that is designed to be large enough for
* converters to be cloned with ucnv_safeClone().
* @stable ICU 2.0
*/
#define U_CNV_SAFECLONE_BUFFERSIZE 1024
/**
* Deletes the unicode converter and releases resources associated
* with just this instance.
* Does not free up shared converter tables.
*
* @param converter the converter object to be deleted
* @see ucnv_open
* @see ucnv_openU
* @see ucnv_openCCSID
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_close(UConverter * converter);
/**
* Fills in the output parameter, subChars, with the substitution characters
* as multiple bytes.
*
* @param converter the Unicode converter
* @param subChars the substitution characters
* @param len on input the capacity of subChars, on output the number
* of bytes copied to it
* @param err the outgoing error status code.
* If the substitution character array is too small, an
* <TT>U_INDEX_OUTOFBOUNDS_ERROR</TT> will be returned.
* @see ucnv_setSubstChars
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_getSubstChars(const UConverter *converter,
char *subChars,
int8_t *len,
UErrorCode *err);
/**
* Sets the substitution chars when converting from unicode to a codepage. The
* substitution is specified as a string of 1-4 bytes, and may contain
* a <TT>NULL</TT> byte.
* @param converter the Unicode converter
* @param subChars the substitution character byte sequence we want set
* @param len the number of bytes in subChars
* @param err the error status code. <TT>U_INDEX_OUTOFBOUNDS_ERROR </TT> if
* len is bigger than the maximum number of bytes allowed in subchars
* @see ucnv_getSubstChars
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_setSubstChars(UConverter *converter,
const char *subChars,
int8_t len,
UErrorCode *err);
/**
* Fills in the output parameter, errBytes, with the error characters from the
* last failing conversion.
*
* @param converter the Unicode converter
* @param errBytes the codepage bytes which were in error
* @param len on input the capacity of errBytes, on output the number of
* bytes which were copied to it
* @param err the error status code.
* If the errBytes array (capacity given by len) is too small, an
* <TT>U_INDEX_OUTOFBOUNDS_ERROR</TT> will be returned.
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_getInvalidChars(const UConverter *converter,
char *errBytes,
int8_t *len,
UErrorCode *err);
/**
* Fills in the output parameter, errUChars, with the error characters from the
* last failing conversion.
*
* @param converter the Unicode converter
* @param errUChars the UChars which were in error
* @param len on input the capacity of errUChars, on output the number of
* UChars which were copied to it
* @param err the error status code.
* If the errUChars array (capacity given by len) is too small, an
* <TT>U_INDEX_OUTOFBOUNDS_ERROR</TT> will be returned.
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_getInvalidUChars(const UConverter *converter,
UChar *errUChars,
int8_t *len,
UErrorCode *err);
/**
* Resets the state of a converter to the default state. This is used
* in the case of an error, to restart a conversion from a known default state.
* It will also empty the internal output buffers.
* @param converter the Unicode converter
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_reset(UConverter *converter);
/**
* Resets the to-Unicode part of a converter state to the default state.
* This is used in the case of an error to restart a conversion to
* Unicode to a known default state. It will also empty the internal
* output buffers used for the conversion to Unicode codepoints.
* @param converter the Unicode converter
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_resetToUnicode(UConverter *converter);
/**
* Resets the from-Unicode part of a converter state to the default state.
* This is used in the case of an error to restart a conversion from
* Unicode to a known default state. It will also empty the internal output
* buffers used for the conversion from Unicode codepoints.
* @param converter the Unicode converter
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_resetFromUnicode(UConverter *converter);
/**
* Returns the maximum number of bytes that are output per UChar in conversion
* from Unicode using this converter.
* The returned number can be used with UCNV_GET_MAX_BYTES_FOR_STRING
* to calculate the size of a target buffer for conversion from Unicode.
*
* Note: Before ICU 2.8, this function did not return reliable numbers for
* some stateful converters (EBCDIC_STATEFUL, ISO-2022) and LMBCS.
*
* This number may not be the same as the maximum number of bytes per
* "conversion unit". In other words, it may not be the intuitively expected
* number of bytes per character that would be published for a charset,
* and may not fulfill any other purpose than the allocation of an output
* buffer of guaranteed sufficient size for a given input length and converter.
*
* Examples for special cases that are taken into account:
* - Supplementary code points may convert to more bytes than BMP code points.
* This function returns bytes per UChar (UTF-16 code unit), not per
* Unicode code point, for efficient buffer allocation.
* - State-shifting output (SI/SO, escapes, etc.) from stateful converters.
* - When m input UChars are converted to n output bytes, then the maximum n/m
* (bytes per UChar) is taken into account.
*
* The number returned here does not take into account
* (see UCNV_GET_MAX_BYTES_FOR_STRING):
* - callbacks which output more than one charset character sequence per call,
* like escape callbacks
* - initial and final non-character bytes that are output by some converters
* (automatic BOMs, initial escape sequence, final SI, etc.)
*
* Examples for returned values:
* - SBCS charsets: 1
* - Shift-JIS: 2
* - UTF-16: 2 (2 per BMP, 4 per surrogate _pair_, BOM not counted)
* - UTF-8: 3 (3 per BMP, 4 per surrogate _pair_)
* - EBCDIC_STATEFUL (EBCDIC mixed SBCS/DBCS): 3 (SO + DBCS)
* - ISO-2022: 3 (always outputs UTF-8)
* - ISO-2022-JP: 6 (4-byte escape sequences + DBCS)
* - ISO-2022-CN: 8 (4-byte designator sequences + 2-byte SS2/SS3 + DBCS)
*
* @param converter The Unicode converter.
* @return The maximum number of bytes per UChar that are output by ucnv_fromUnicode(),
* to be used together with UCNV_GET_MAX_BYTES_FOR_STRING for buffer allocation.
*
* @see UCNV_GET_MAX_BYTES_FOR_STRING
* @see ucnv_getMinCharSize
* @stable ICU 2.0
*/
U_STABLE int8_t U_EXPORT2
ucnv_getMaxCharSize(const UConverter *converter);
#ifndef U_HIDE_DRAFT_API
/**
* Computes a target buffer size, in bytes, for a conversion from Unicode to a
* charset. The result is guaranteed to be large enough for that conversion.
*
* Covered: the initial and final non-character bytes that some converters
* output.
* Not covered: callbacks which output more than one charset character sequence
* per call, such as escape callbacks. The default (substitution) callback
* outputs only one charset character sequence.
*
* @param length Number of UChars to be converted.
* @param maxCharSize Value returned by ucnv_getMaxCharSize() for the converter
* that will be used.
* @return Buffer size sufficient to hold the output bytes of converting
* length UChars with the converter that returned maxCharSize.
*
* @see ucnv_getMaxCharSize
* @draft ICU 2.8
*/
#define UCNV_GET_MAX_BYTES_FOR_STRING(length, maxCharSize) \
((int32_t)(maxCharSize)*(10+(int32_t)(length)))
#endif /*U_HIDE_DRAFT_API*/
/**
* Returns the minimum byte length for characters in this codepage.
* This is usually either 1 or 2.
* @param converter the Unicode converter
* @return the minimum number of bytes allowed by this particular converter
* @see ucnv_getMaxCharSize
* @stable ICU 2.0
*/
U_STABLE int8_t U_EXPORT2
ucnv_getMinCharSize(const UConverter *converter);
/**
* Returns the display name of the converter passed in based on the Locale
* passed in. If the locale contains no display name, the internal ASCII
* name will be filled in.
*
* @param converter the Unicode converter.
* @param displayLocale the specific Locale for which the name is to be localized
* @param displayName user provided buffer to be filled in
* @param displayNameCapacity size of displayName Buffer
* @param err error status code
* @return displayNameLength number of UChar needed in displayName
* @see ucnv_getName
* @stable ICU 2.0
*/
U_STABLE int32_t U_EXPORT2
ucnv_getDisplayName(const UConverter *converter,
const char *displayLocale,
UChar *displayName,
int32_t displayNameCapacity,
UErrorCode *err);
/**
* Gets the internal, canonical name of the converter (zero-terminated).
* The lifetime of the returned string will be that of the converter
* passed to this function.
* @param converter the Unicode converter
* @param err UErrorCode status
* @return the internal name of the converter
* @see ucnv_getDisplayName
* @stable ICU 2.0
*/
U_STABLE const char * U_EXPORT2
ucnv_getName(const UConverter *converter, UErrorCode *err);
/**
* Gets a codepage number associated with the converter. This is not guaranteed
* to be the one used to create the converter. Some converters do not represent
* platform registered codepages and return zero for the codepage number.
* The error code fill-in parameter indicates if the codepage number
* is available.
* Does not check if the converter is <TT>NULL</TT> or if converter's data
* table is <TT>NULL</TT>.
*
* Important: The use of CCSIDs is not recommended because it is limited
* to only two platforms in principle and only one (UCNV_IBM) in the current
* ICU converter API.
* Also, CCSIDs are insufficient to identify IBM Unicode conversion tables precisely.
* For more details see ucnv_openCCSID().
*
* @param converter the Unicode converter
* @param err the error status code.
* @return If any error occurs, -1 will be returned; otherwise, the codepage number
* will be returned
* @see ucnv_openCCSID
* @see ucnv_getPlatform
* @stable ICU 2.0
*/
U_STABLE int32_t U_EXPORT2
ucnv_getCCSID(const UConverter *converter,
UErrorCode *err);
/**
* Gets a codepage platform associated with the converter. Currently,
* only <TT>UCNV_IBM</TT> will be returned.
* Does not test if the converter is <TT>NULL</TT> or if converter's data
* table is <TT>NULL</TT>.
* @param converter the Unicode converter
* @param err the error status code.
* @return The codepage platform
* @stable ICU 2.0
*/
U_STABLE UConverterPlatform U_EXPORT2
ucnv_getPlatform(const UConverter *converter,
UErrorCode *err);
/**
* Gets the type of the converter
* e.g. SBCS, MBCS, DBCS, UTF8, UTF16_BE, UTF16_LE, ISO_2022,
* EBCDIC_STATEFUL, LATIN_1
* @param converter a valid, opened converter
* @return the type of the converter
* @stable ICU 2.0
*/
U_STABLE UConverterType U_EXPORT2
ucnv_getType(const UConverter * converter);
/**
* Gets the "starter" (lead) bytes for converters of type MBCS.
* Will fill in an <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> if converter passed in
* is not MBCS. Fills in an array of type UBool, with the value of the byte
* as offset to the array. For example, if (starters[0x20] == TRUE) at return,
* it means that the byte 0x20 is a starter byte in this converter.
* Context pointers are always owned by the caller.
*
* @param converter a valid, opened converter of type MBCS
* @param starters an array of size 256 to be filled in
* @param err error status, <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> if the
* converter is not a type which can return starters.
* @see ucnv_getType
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_getStarters(const UConverter* converter,
UBool starters[256],
UErrorCode* err);
/**
* Selectors for Unicode sets that can be returned by ucnv_getUnicodeSet().
* @see ucnv_getUnicodeSet
* @stable ICU 2.6
*/
typedef enum UConverterUnicodeSet {
/** Select the set of roundtrippable Unicode code points. @stable ICU 2.6 */
UCNV_ROUNDTRIP_SET,
/** Number of UConverterUnicodeSet selectors. @stable ICU 2.6 */
UCNV_SET_COUNT
} UConverterUnicodeSet;
/**
* Returns the set of Unicode code points that can be converted by an ICU converter.
*
* The current implementation returns only one kind of set (UCNV_ROUNDTRIP_SET):
* The set of all Unicode code points that can be roundtrip-converted
* (converted without any data loss) with the converter.
* This set will not include code points that have fallback mappings
* or are only the result of reverse fallback mappings.
* See UTR #22 "Character Mapping Markup Language"
* at http://www.unicode.org/reports/tr22/
*
* This is useful for example for
* - checking that a string or document can be roundtrip-converted with a converter,
* without/before actually performing the conversion
* - testing if a converter can be used for typical text for a certain locale,
* by comparing its roundtrip set with the set of ExemplarCharacters from
* ICU's locale data or other sources
*
* In the future, there may be more UConverterUnicodeSet choices to select
* sets with different properties.
*
* @param cnv The converter for which a set is requested.
* @param setFillIn A valid USet *. It will be cleared by this function before
* the converter's specific set is filled into the USet.
* @param whichSet A UConverterUnicodeSet selector;
* currently UCNV_ROUNDTRIP_SET is the only supported value.
* @param pErrorCode ICU error code in/out parameter.
* Must fulfill U_SUCCESS before the function call.
*
* @see UConverterUnicodeSet
* @see uset_open
* @see uset_close
* @stable ICU 2.6
*/
U_STABLE void U_EXPORT2
ucnv_getUnicodeSet(const UConverter *cnv,
USet *setFillIn,
UConverterUnicodeSet whichSet,
UErrorCode *pErrorCode);
/**
* Gets the current callback function used by the converter when an illegal
* or invalid codepage sequence is found.
* Context pointers are always owned by the caller.
*
* @param converter the unicode converter
* @param action fillin: returns the callback function pointer
* @param context fillin: returns the callback's private void* context
* @see ucnv_setToUCallBack
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_getToUCallBack (const UConverter * converter,
UConverterToUCallback *action,
const void **context);
/**
* Gets the current callback function used by the converter when an illegal
* or invalid Unicode sequence is found.
* Context pointers are always owned by the caller.
*
* @param converter the unicode converter
* @param action fillin: returns the callback function pointer
* @param context fillin: returns the callback's private void* context
* @see ucnv_setFromUCallBack
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_getFromUCallBack (const UConverter * converter,
UConverterFromUCallback *action,
const void **context);
/**
* Changes the callback function used by the converter when
* an illegal or invalid codepage sequence is found.
* Context pointers are always owned by the caller.
* Predefined actions and contexts can be found in the ucnv_err.h header.
*
* @param converter the unicode converter
* @param newAction the new callback function
* @param newContext the new toUnicode callback context pointer. This can be NULL.
* @param oldAction fillin: returns the old callback function pointer. This can be NULL.
* @param oldContext fillin: returns the old callback's private void* context. This can be NULL.
* @param err The error code status
* @see ucnv_getToUCallBack
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_setToUCallBack (UConverter * converter,
UConverterToUCallback newAction,
const void* newContext,
UConverterToUCallback *oldAction,
const void** oldContext,
UErrorCode * err);
/**
* Changes the current callback function used by the converter when
* an illegal or invalid Unicode sequence is found.
* Context pointers are always owned by the caller.
* Predefined actions and contexts can be found in the ucnv_err.h header.
*
* @param converter the unicode converter
* @param newAction the new callback function
* @param newContext the new fromUnicode callback context pointer. This can be NULL.
* @param oldAction fillin: returns the old callback function pointer. This can be NULL.
* @param oldContext fillin: returns the old callback's private void* context. This can be NULL.
* @param err The error code status
* @see ucnv_getFromUCallBack
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_setFromUCallBack (UConverter * converter,
UConverterFromUCallback newAction,
const void *newContext,
UConverterFromUCallback *oldAction,
const void **oldContext,
UErrorCode * err);
/**
* Converts an array of Unicode characters to an array of codepage
* characters. This function is optimized for converting a continuous
* stream of data in buffer-sized chunks, where the entire source and
* target do not fit in available buffers.
*
* The source pointer is an in/out parameter. It starts out pointing where the
* conversion is to begin, and ends up pointing after the last UChar consumed.
*
* Target similarly starts out pointing at the first available byte in the output
* buffer, and ends up pointing after the last byte written to the output.
*
* The converter always attempts to consume the entire source buffer, unless
* (1.) the target buffer is full, or (2.) a failing error is returned from the
* current callback function. When a successful error status has been
* returned, it means that all of the source buffer has been
* consumed. At that point, the caller should reset the source and
* sourceLimit pointers to point to the next chunk.
*
* At the end of the stream (flush==TRUE), the input is completely consumed
* when *source==sourceLimit and no error code is set.
* The converter object is then automatically reset by this function.
* (This means that a converter need not be reset explicitly between data
* streams if it finishes the previous stream without errors.)
*
* This is a <I>stateful</I> conversion. Additionally, even when all source data has
* been consumed, some data may be in the converter's internal state.
* Call this function repeatedly, updating the target pointers with
* the next empty chunk of target in case of a
* <TT>U_BUFFER_OVERFLOW_ERROR</TT>, and updating the source pointers
* with the next chunk of source when a successful error status is
* returned, until there are no more chunks of source data.
* @param converter the Unicode converter
* @param target I/O parameter. Input : Points to the beginning of the buffer to copy
* codepage characters to. Output : points to after the last codepage character copied
* to <TT>target</TT>.
* @param targetLimit the pointer just after the last of the <TT>target</TT> buffer
* @param source I/O parameter, pointer to pointer to the source Unicode character buffer.
* @param sourceLimit the pointer just after the last of the source buffer
* @param offsets if NULL is passed, nothing will happen to it, otherwise it needs to have the same number
* of allocated cells as <TT>target</TT>. Will fill in offsets from target to source pointer
* e.g. if <TT>offsets[3]</TT> is equal to 6, it means that <TT>target[3]</TT> was a result of transcoding <TT>source[6]</TT>.
* For output data carried across calls, and other data without a specific source character
* (such as from escape sequences or callbacks) -1 will be placed for offsets.
* @param flush set to <TT>TRUE</TT> if the current source buffer is the last available
* chunk of the source, <TT>FALSE</TT> otherwise. Note that if a failing status is returned,
* this function may have to be called multiple times with flush set to <TT>TRUE</TT> until
* the source buffer is consumed.
* @param err the error status. <TT>U_ILLEGAL_ARGUMENT_ERROR</TT> will be set if the
* converter is <TT>NULL</TT>.
* <code>U_BUFFER_OVERFLOW_ERROR</code> will be set if the target is full and there is
* still data to be written to the target.
* @see ucnv_fromUChars
* @see ucnv_convert
* @see ucnv_getMinCharSize
* @see ucnv_setToUCallBack
* @see ucnv_setFromUCallBack
* @stable ICU 2.0
*/
U_STABLE void U_EXPORT2
ucnv_fromUnicode (UConverter * converter,
char **target,
const char *targetLimit,
const UChar ** source,
const UChar * sourceLimit,
int32_t* offsets,
UBool flush,
UErrorCode * err);
/**
* Converts a buffer of codepage bytes into an array of Unicode UChars.
* This function is optimized for converting a continuous
* stream of data in buffer-sized chunks, where the entire source and
* target do not fit in available buffers.
*
* The source pointer is an in/out parameter. It starts out pointing where the
* conversion is to begin, and ends up pointing after the last byte of source consumed.
*
* Target similarly starts out pointing at the first available UChar in the output
* buffer, and ends up pointing after the last UChar written to the output.
* It does NOT necessarily keep UChar sequences together.
*
* The converter always attempts to consume the entire source buffer, unless
* (1.) the target buffer is full, or (2.) a failing error is returned from the
* current callback function. When a successful error status has been
* returned, it means that all of the source buffer has been
* consumed. At that point, the caller should reset the source and
* sourceLimit pointers to point to the next chunk.
*
* At the end of the stream (flush==TRUE), the input is completely consumed