/
xfsm_api.h
2602 lines (2193 loc) · 108 KB
/
xfsm_api.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* $Id: xfsm_api.h $ */
/* Copyright (c) 2009 by the Palo Alto Research Center.
All rights reserved */
/*********************************************************************
**
** XFSM_API.H
** Lauri Karttunen
** Palo Alto Research Center
** July 2009
**
*********************************************************************/
/* This header file documents the data structures, constants and
function prototypes that are made available in libxfst. libxfst
is a C or C++ programmers interface to the operations that are
provided on the xfst application's command line. It does not
contain the features that are in fst, the more powerful "big
sister" of xfst, such as size and speed optimizations and
pattern matching. libxfst is a subset of a larger libcfsm library.
This is the only header file needed for the xfst library. The order
of presentation of structure and function definitions is fixed by
the needs of a C compiler that must have a definition for every
constant and data type before it is used in another definition. A
human reader might find it useful to skip the beginning and jump
right into the section on FUNCTION PROTOTYPES. */
#ifndef XFSM_API
#define XFSM_API
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdio.h>
#ifndef _WIN32
#include <stdint.h>
#else
#include "win_stdint.h"
#endif
#ifdef _WIN32
#include <limits.h>
#define UINTMAX_MAX UINT_MAX
#define INT32_MAX _I32_MAX
#endif
/*****************************************************
* DATA STRUCTURES and DEFINITIONS
*****************************************************/
#ifndef BIT_DEFINED
#define BIT_DEFINED
typedef uint32_t bit;
#endif
#ifndef BYTE_DEFINED
#define BYTE_DEFINED
typedef unsigned char byte;
#endif
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
#ifndef EPSILON
#define EPSILON 0 /* The symbol ID of the epsilon symbol. */
#endif
#ifndef OTHER
#define OTHER 1 /* The symbol ID for an unknown symbol. */
#endif
/********************
* CONSTANTS
*******************/
enum symbol_pair {UPPER=0, LOWER=1, BOTH_SIDES=2};
enum visit_marks {NOT_VISITED=0, IN_PROCESS=1, DONE=2};
enum escape_p {DONT_ESCAPE=0, ESCAPE=1};
enum obey_flags_p {DONT_OBEY=0, OBEY=1};
enum keep_p {DONT_KEEP=0, KEEP=1};
enum char_encoding {CHAR_ENC_UNKNOWN=0, CHAR_ENC_UTF_8=1,
CHAR_ENC_ISO_8859_1=2};
enum alph_types {BINARY_VECTOR=0, LABEL_VECTOR=1};
enum watch_rm {DONT_WATCH_RM=0, WATCH_RM=1};
enum record_byte_pos {DONT_RECORD, RECORD};
enum flag_action {NO_ACTION=0, CLEAR_SETTING=1, POSITIVE_SETTING=2,
NEGATIVE_SETTING=3, UNIFY_TEST=4, DISALLOW_TEST=5,
REQUIRE_TEST=6, FAIL_ACTION=7, INSERT_SUBNET=8,
SET_TO_ATTR=9, EQUAL_ATTR_TEST=10, LIST_MEMBER=11,
APPLY_TRANSDUCER=12, APPLY_FUNCTION=13,
EXCLUDE_LIST=14};
enum data_types {Unknown=0, Network=1, Alphabet=2, Integer=3, Other=4};
enum bytes_or_chars {NUM_BYTES, NUM_CHARS};
/**************
* SIZES
**************/
#define int16 short
#define uint16 unsigned short
#define int32 int
#define uint32 unsigned int
typedef long LONG;
typedef unsigned long ULONG;
typedef unsigned int UTF32;
typedef uint32 id_type;
#define MAX_LV 24 /* -- maximum number of bits in a label ID */
#define ID_EOS ((unsigned) (1 << MAX_LV) -1)
/* Id symbol denoting the end of a symbol ID sequence. */
#define ID_NO_SYMBOL ((unsigned) (1 << MAX_LV) -1)
/* Special Id symbol meaning "no symbol" or "uninitialized id
value" */
/************************
* CFSM LIBRARY VERSION *
************************/
char *libcfsm_version(void);
/* Returns an unmodifiable string constant such as
"libcfsm-2.12.10 (svn 24693) 2008-08-14"
where 2.12.10 is the release number for the cfsm library,
(svn 24693) identifies the subversion repository update for
the library and 2008-08-14 is its creation date. */
char *libcfsm_build(void);
/* Returns the latest subversion repository update number
as an unmodifiable string, for example "24693". */
/*************
* HEAP
*************/
typedef struct HEAP HEAPtype, *HEAPptr;
/*************
* HASH TABLE
*************/
typedef struct HASH_TABLE HASH_TABLEtype, *HASH_TABLEptr;
/****************
* STACK
****************/
typedef struct STACK STACKtype, *STACKptr;
/****************
* INPUT BUFFER
****************/
typedef struct INPUT_BUFFER IN_BUFERtype, *IN_BUFFERptr;
/******************
* UNICODE STRINGS
*****************/
typedef UTF32 FAT_CHAR;
typedef FAT_CHAR *FAT_STR;
/* Unicode strings are represented as null-terminated sequences of
32 bit integers, called "fat strings." Standard C strings are
called "thin strings." This API allows the user to work with
ordinary C strings either in UTF-8 or LATIN-1 mode. The
conversion from thin strings to fat strings and, vice versa, is
handled automatically. Nevertheless, a few functions are provided
for the unlikely case that they are needed. */
FAT_STR make_fat_string(int length);
/* Makes a new fat string of the given length. */
FAT_STR fat_strcpy(FAT_STR s1, FAT_STR s2);
/* Analogue of strcpy. Copies s2 to s1 including the terminating 0.
Returns s1. The caller must verify that s1 has enough space. */
FAT_STR copy_fat_string(FAT_STR fs);
/* Returns a copy of fs. */
FAT_STR fat_strcat(FAT_STR s1, FAT_STR s2);
/* Analogue of strcat. Appends s2 to the end of s1 including the
terminating 0. Returns s1. The caller must verify that s1 has
enough space for the result. */
char *fat_to_thin_str(FAT_STR fat_str, char *thin_str, int with_esc);
/* Converts a fat string to a thin string conforming to the current
character encoding mode. If thin_str is NULL, a new string is
allocated. If thin_str is non-NULL, it is up to the caller to
verify that it is long enough for the result. */
int fprint_fat_string (FILE *outfile, FAT_STR fs);
/* Prints fs into outfile. See also the functions below for writing
fat strings to a string buffer or a page. */
/******************
* STRING BUFFER
******************/
typedef struct STRING_BUFFER {
int char_size;
int length;
int pos;
int lines;
void *string;
} STRING_BUFFERtype, *STRING_BUFFERptr;
/* String buffers are used to store both types of strings, fat
strings or thin strings. The char_size is either 4 or 1 bytes.
Some of the apply functions store the output into a string
buffer. */
#define STRING_BUFFER_char_size(X) (X)->char_size
#define STRING_BUFFER_length(X) (X)->length
#define STRING_BUFFER_pos(X) (X)->pos
#define STRING_BUFFER_lines(X) (X)->lines
#define STRING_BUFFER_string(X) (X)->string
STRING_BUFFERptr make_string_buffer(int length);
/* Makes a new thin string buffer */
STRING_BUFFERptr make_fat_str_buffer(int length);
/* Makes a new fat string buffer. */
void free_string_buffer(STRING_BUFFERptr str_buf);
/* Frees a string buffer of either type. */
void initialize_string_buffer(STRING_BUFFERptr str_buf);
/* Initializes a string buffer of either type. */
int append_string_to_buffer(const char *str, STRING_BUFFERptr str_buf);
/* Appends a thin string to a thin string buffer. Returns
the length of the string in the buffer. */
int append_label_to_buffer(id_type id, int escape_p, STRING_BUFFERptr str_buf);
/* Appends the label of the id to a thin string buffer. Returns the
length of the string in the buffer. If escape_p is ESCAPE,
special symbols such as newline symbols and tabs are escaped using
standard Unix conventions. */
int append_fat_str_to_buffer(FAT_STR fs, STRING_BUFFERptr str_buf);
/* Appends a fat string to a fat string buffer. Returns the
length of the fat string in the buffer. */
void assure_buffer_space(int length, STRING_BUFFERptr str_buf);
/* Makes sure that the string buffer has length amount of space
left. */
int print_string_buffer(STRING_BUFFERptr str_buf, FILE *stream);
/* Prints the string of the string buffer into the stream. If the
string is a fat string, it will be printed as a a UTF-8 or
Latin-1 encoded C string depending on the character encoding
mode. Returns the length of the printed string. */
/******************
* LAB_VECTOR
*****************/
typedef struct LAB_VECTOR {
int length;
int pos;
id_type *array;
} LAB_VECTORtype, *LAB_VECTORptr;
#define LAB_VECTOR_length(X) (X)->length
#define LAB_VECTOR_pos(X) (X)->pos
#define LAB_VECTOR_array(X) (X)->array
/* Label vectors are used to store sequences of label IDs */
LAB_VECTORptr make_lab_vector(int length);
/* Creates a label vector of the given length. */
void reclaim_lab_vector(LAB_VECTORptr lab_vect);
int append_to_lab_vector(id_type lab, LAB_VECTORptr lab_vect);
/* Appends a label to the next position in the label vector and
increments the position counter. The length of the vector is
adjusted if needed. Returns the new value of the position counter. */
int set_lab_vector_element_at(id_type lab, int pos, LAB_VECTORptr lab_vect);
/* Stores the label in the given position in the label vector. The
length of the label vector is adjusted if needed. Returns the
value of the position counter. */
int lab_vector_element_at(id_type *lab, int pos, LAB_VECTORptr lab_vect);
/* Sets *lab to the label in the given position. Returns 0 on success,
1 if the position is outside the filled part of the vector. */
void increment_lab_vector(LAB_VECTORptr lab_vect);
/* Increments the position counter of the label vector. */
int decrement_lab_vector(LAB_VECTORptr lab_vect);
/* Decrements the position counter of the label vector.
Return 0 on success, 1 if the new position would be less than 0. */
void reset_lab_vector(LAB_VECTORptr lab_vect);
/* Resets the position counter of lab_vect to 0. */
typedef struct LAB_VECTOR_TABLE LAB_VECTOR_TABLEtype, *LAB_VECTOR_TABLEptr;
/******************
* VECTOR *
******************/
typedef struct VECTOR {
int length;
int pos;
void **array;
} VECTORtype, *VECTORptr;
/* Vectors are for storing sequences of objects of any kind. */
#define VECTOR_length(X) (X)->length
#define VECTOR_pos(X) (X)->pos
#define VECTOR_array(X) (X)->array
typedef struct VECTOR_ENUMERATOR VECT_ENUMtype, *VECT_ENUMptr;
typedef struct VECTOR_TABLE VECTOR_TABLEtype, *VECTOR_TABLEptr;
VECTORptr make_vector(int length);
/* Makes a vector of the given length. */
void free_vector(VECTORptr vector);
int append_to_vector(void *object, VECTORptr vector);
/* Appends object to the next position in the vector and increments
the position counter. Allocates more vector space if needed.
Returns the new value of the position counter. */
int vector_element_at(void **element, int pos, VECTORptr *vector);
/* Sets *element to the object in the given position. Returns 0
on success, 1 if pos is greater or equal to the value of the
vector's position counter. */
int set_vector_element_at(void *element, int pos, VECTORptr vector);
/* Stores the element in the given position in the vector. The
length of the vector is adjusted if needed. Returns the value of
the position counter. */
void reset_vector(VECTORptr vector);
/* Sets the position counter of vector to 0. */
typedef struct LAB_RING LAB_RINGtype, *LAB_RINGptr;
/*************************
* LABEL
***************************/
typedef struct TUPLE {
id_type *labels; /* a pair of label IDs */
id_type inverse; /* it is equal to ID_NO_SYMBOL for labels
of arity 1 and for tuples whose inverse
has not been computed; otherwise it contains
the unique ID of the inverse tuple. */
} TUPLEtype, *TUPLEptr;
#define TUPLE_labels(X) (X)->labels
typedef struct FLAG_DIACRITIC {
int action;
id_type attribute;
id_type value;
} FLAG_DIACRtype, *FLAG_DIACRptr;
#define FLAG_DIACR_action(X) (X)->action
#define FLAG_DIACR_attribute(X) (X)->attribute
#define FLAG_DIACR_value(X) (X)->value
typedef struct LABEL {
id_type id; /* Unique ID for a symbol or symbol pair. */
id_type other_id; /* Place to store some related label ID. */
void *data; /* a cache for storing some information about
the label such as type or print name */
FLAG_DIACRptr flag; /* NULL for labels that are not flag diacritics. */
unsigned short arity; /* 1 for atomic label, 2 for pairs */
unsigned short expands_other; /* TRUE if covered by OTHER */
unsigned short consumes_input; /* Not an epsilon transition */
unsigned short convertible; /* TRUE if case conversion makes sense. */
unsigned short closing_xml_tag; /* Is of the form </....> */
unsigned short data_type; /* The type of data stored in the data field:
0 = CFSMUnknown, 1 = Network, 2 = Alphabet,
3 = Integer, 4 = Other */
union {
FAT_STR name; /* Name of an atomic label. */
TUPLEptr tuple; /* The tuple of an fstpair. */
} content;
} LABELtype, *LABELptr;
/* There are two types of labels. Atomic labels have arity 1 and tuple labels
have arity 2. The content field of an atomic label is its name represented
as a fat string. The content field of a tuple label consists of a pair of
label IDs for its stomic components. */
#define LABEL_id(X) (X)->id
#define LABEL_other_id(X) (X)->other_id
#define LABEL_arity(X) (X)->arity
#define LABEL_data(X) (X)->data
#define LABEL_flag(X) (X)->flag
#define LABEL_name(X) (X)->content.name
#define LABEL_tuple(X) (X)->content.tuple
#define LABEL_convertible(X) (X)->convertible
#define LABEL_expands_other(X) (X)->expands_other
#define LABEL_consumes_input(X)(X)->consumes_input
#define LABEL_closing_xml_tag(X)(X)->closing_xml_tag
#define LABEL_data_type(X) (X)->data_type
int print_label(id_type id, FILE *stream, int escape_p);
/* Prints label correspondig to the ID to a stream. If escape_p is DONT_ESCAPE,
the label is printed literally. If escape_p is ESCAPE, special symbols such
as newline symbols and tabs are printed in double quotes. In UTF8 mode
(default), non-ASCII symbols are printed as UTF8 strings, in Latin-1 mode
symbols outside the Latin-1 region are printed in the format "\uXXXX" where
XXXX is the hex value of the Unicode code point. */
#define fstpair_upper(X) TUPLE_labels(LABEL_tuple(X))[UPPER]
#define fstpair_lower(X) TUPLE_labels(LABEL_tuple(X))[LOWER]
/*************************
* LABEL ID MAP
***************************/
typedef struct LABEL_ID_MAP LABEL_ID_MAPtype, *LABEL_ID_MAPptr;
/* A data structure structure for associating label names (fat
strings) and the corresponding integer IDs. It contains a hash
table that maps label names to their IDs and an array of
labels. The label for an ID, for example 321, is located at the
position 321 in the label array. The maximum number of label IDs
used to be 65535, it is now 16777214 (2^24 -2). When the default
label map is initialized, all the printable ASCII characters get
a label and an ID that is the same as the integer value of the
character. For example, the symbol 'A' has the ID 65. Labels for
other symbols and symbol pairs are created on demand. Special
symbols that have fixed label IDs include EPSILON (ID 0) and
OTHER (ID 1), the unknown symbol. Because symbol names are
recorded as fat strings, they do not depend on the character
encoding mode (utf-8 or iso-8859-1). */
id_type single_to_id(const char *name);
/* Converts the name string into a fat string and returns the
corresponding symbol ID for the atomic label. Creates a new label
and a new label ID, if the label does not already exist. Returns
the label ID. In UTF-8 mode, it is assumed that the name is a
UTF8-coded string. Generates a warning message if the string is
not a valid UTF8-string. In Latin-1 mode the name is processed as
a Latin-1 string. Any Unicode character may be represented in the
format "\uXXXX" where X is a hex character. For example,
single_to_id("\u20AC") returns an ID for the Euro symbol. */
id_type pair_to_id(const char *upper, const char *lower);
/* Returns the ID of the tuple label with upper and lower as the two
components. The names are processed as either UTF8 strings or
Latin-1 strings depending on the mode. If the names are equal
strings, the result is a single label because A:A is treated as
equivalent to A. */
id_type id_pair_to_id(id_type upper_id, id_type lower_id);
/* Returns the ID of the tuple label upper_id and lower_id as the
two components. The names are processed as either UTF8 strings or
Latin-1 strings depending on the mode. If upper_id and lower id are
identical, the result is identical to them as well. */
LABELptr id_to_label(id_type id);
/* Returns the label corresponding to the id. */
id_type upper_id(id_type id);
/* Returns the upper id of a tuple label or the id itself if id
refers to an atomic label. */
id_type lower_id(id_type id);
/* Returns the lower id of a tuple label or the id itself if id
is an atomic label. */
/*******************
* RANGE
*******************/
typedef struct RANGE_RECORD RANGEtype, *RANGEptr;
/*******************
* MATCH_TABLE
*******************/
typedef struct MATCH_TABLE MATCH_TABLEtype, *MATCH_TABLEptr;
/**********************
* ALPHABET
*********************/
typedef struct ALPHABET {
int len; /* # of ALPH_items positions in use */
int max; /* actual size of ALPH_items */
id_type *items; /* Label IDs (LABEL VECTOR), 0s and 1s (BINARY_VECTOR)*/
bit type:8; /* 0 = BINARY_VECTOR, 1 = LABEL_VECTOR */
bit in_use:8; /* 0 = not in use, 1 = in use */
} ALPHABETtype, *ALPHABETptr;
/* An alphabet is a list of label IDs. The list can be represented
in two ways, as a binary vector or as list of label IDs. A binary
vector is a list of zeros and ones where ones indicate that the
ID corresponding to the position in the vector is a member of the
alphabet. For example, if the position 65 in the vector contains 1,
then symbol with the ID 65 (the letter 'A') is a member of the
alphabet. The sigma alphabet of a network is kept in binary
format for quick membership checking. The label alphabet of a
network is maintained as a label vector. */
#define ALPH_type(X) (X)->type
#define ALPH_items(X) (X)->items
#define ALPH_item(X,Y) (X)->items[(Y)]
#define ALPH_len(X) (X)->len
#define ALPH_max(X) (X)->max
#define ALPH_in_use(X) (X)->in_use
ALPHABETptr make_alph(int len, int type);
/* Returns an alphabet of the specified length and type (either LABEL_VECTOR
or BINARY_VECTOR). */
ALPHABETptr copy_alphabet(ALPHABETptr alph);
/* Returns a copy of the alphabet. */
void free_alph(ALPHABETptr alph);
/* Reclaims the alphabet. */
void print_alph(ALPHABETptr alph, FILE *stream);
/* Prints the alphabet into the stream. */
/**********************
* ALPHABET ITERATOR
*********************/
typedef struct ALPH_ITERATOR {
int pos;
int len;
int type;
id_type *items;
} ALPH_ITtype, *ALPH_ITptr;
/* An alphabet iterator returns the members of an alphabet one after
an another. The alphabet may be in BINARY or LABEL format. */
#define ALPH_IT_pos(X) (X)->pos
#define ALPH_IT_type(X) (X)->type
#define ALPH_IT_items(X) (X)->items
#define ALPH_IT_len(X) (X)->len
ALPH_ITptr start_alph_iterator(ALPH_ITptr alph_it, ALPHABETptr alph);
/* Returns an initialized alphabet iterator for the given alphabet.
If the first argument is NULL, a new alphabet iterator is created. */
id_type next_alph_id(ALPH_ITptr alph_it);
/* Returns the next member of the alphabet. If there are no more unseen
IDs in the iterator, the return value is ID_NO_SYMBOL (16777215). */
void reset_alph_iterator(ALPH_ITptr alph_it);
/* Resets the alphabet iterator to the first symbol ID. */
void free_alph_iterator(ALPH_ITptr alph_it);
/* Frees the alphabet iterator only, not the alphabet that it iterates on.
You need not call this function if the iterator has been statically
allocated. */
/***********************
* ARC_VECTOR
***********************/
typedef struct ARC_VECTOR AVtype, *AVptr;
/*********************
* CH_NODE
*********************/
typedef struct CH_NODE CH_NODEtype, *CH_NODEptr;
/*********************
* PARSE_TABLE object *
*********************/
typedef struct PARSE_TABLE PARSE_TBLtype, *PARSE_TBL;
/*******************
* PROPERTY LIST *
*******************/
/* The property list of a network is a list of attribute-value pairs.
The attributes are represented as fat strings, the values may
be diffent types of objects including strings, integers and lists.
The property list is used to store the regular expression that
was used to define the network. It also stores the defined list
symbols that the network refers to. For example, the following
sequence of commands
define_regex_list("Vowel", "a e i o u");
define_regex_list("VoicelessStop", "k p t");
define_regex_net("Test","\"@L.VoicelessStop@\" \"@L.Vowel@\"");
save(net("Test"), "test.net")
causes the test net to be saved with the following property list:
DEFINITION: "%"@L.VoicelessStop@%" %"@L.Vowel@%""
DEFINED_LISTS: ( ( "VoicelessStop" "k" "p" "t" )
( "Vowel" "a" "e" "i" "o" "u" ) )
when the saved net is loaded from a file, the list definitions are
restored from the property list. */
typedef struct IO_SYMBOL IO_SYMBOLtype, *IO_SYMBOLptr;
typedef struct IO_SYMBOL_PACKAGE IO_SYMBOL_PACKAGEtype, *IO_SYMBOL_PACKAGEptr;
typedef struct BYTE_BLOCK BYTE_BLOCKtype, *BYTE_BLOCKptr;
typedef struct SEQUENCE SEQUENCEtype, *SEQUENCEptr;
typedef struct OBJECT OBJECTtype, *OBJECTptr;
typedef struct PROP {
FAT_STR attribute;
OBJECTptr value;
struct PROP *next;
} PROPtype, *PROPptr;
#define PROP_attr(X) (X)->attribute
#define PROP_val(X) (X)->value
#define next_prop(X) (X)->next
/*****************
* ARC
*****************/
typedef struct ARC {
struct STATE *destination; /* Destination state */
bit type_bit : 1;
bit userflag1 : 1;
bit visit_mark : 2;
bit big_arc_flag : 1;
bit userflag2 : 2;
bit in_use: 1;
id_type label : MAX_LV; /* Label ID */
struct ARC *next; /* Pointer to the next arc */
} ARCtype, *ARCptr;
/* An arc consists of a pointer to a destination state, a label ID,
a pointer to the next arc and various bit flags. Arcs are
allocated from a global arc heap. When an arc is freed, it
is put on the freelist of the heap. The in_use bit is 1 when
the arc is in use, 0 when it has been freed. */
#define ARC_type_bit(X) (X)->type_bit
#define ARC_userflag1(X) (X)->userflag1
#define ARC_visit_mark(X) (X)->visit_mark
#define ARC_big_arc_flag(X) (X)->big_arc_flag
#define ARC_userflag2(X) (X)->userflag2
#define ARC_in_use(X) (X)->in_use
#define ARC_label(X) (X)->label
#define ARC_destination(X) (X)->destination
#define ARC_next(X) (X)->next
typedef struct BIG_ARC {
struct STATE *destination;
bit type_bit : 1;
bit userflag1 : 1;
bit visit_mark : 2;
bit big_arc_flag : 1;
bit userflag2: 2;
bit in_use: 1;
id_type label: MAX_LV;
struct BIG_ARC *next;
void *user_pointer;
} BIG_ARCtype, *BIG_ARCptr;
/* A "big arc" is an arc with an extra user_pointer field. If the
big_arc_flag is 1 instead of 0, the arc has that extra field. A
common use for big arcs is to record for each symbol in a
network its starting byte position in a text file. */
#define ARC_user_pointer(X) (X)->user_pointer
void free_arc(ARCptr arc);
/* Returns the arc to the global arc or big-arc heap. */
/******************
* STATE
******************/
typedef struct STATE {
union {
struct ARC *set; /* First arc of the state */
struct ARC_VECTOR *vector; /* Vector of destination states */
} arc;
bit type_bit : 1;
bit final : 1;
bit deterministic : 1;
bit vector_p : 1; /* 0 = normal state, 1 = vectorized state */
bit visit_mark : 8;
bit userflag2 : 2;
bit is_virtual: 1;
bit in_use : 1;
struct STATE *next; /* Pointer to the next state */
void *client_cell;
} STATEtype, *STATEptr;
/* A state contains a set of arcs leading other other states, a
pointer to the next state, a set of bit flags, a client_cell
pointer that is used by various algorithms to store temporary
information. The arc set of a state may be in one of two
formats. The standard format that most cfsm algorithms expect is
a list of arcs. In this case, state->arc points to the first arc
of the state and each arc points to its successor, or to NULL in
the case of the last arc. Alternatively, the arc set is represented
by a structure that contains a vector of destination states. The
vector format takes up more space than the standard format but
it is faster to process because it provides random access to
destination states by label IDs. States are allocated from a
global state heap. The in_use flag is 1 when a state is in use
and 0 when a state is returned to the heap. */
void free_state(STATEptr state);
/* Returns the state to the global state heap. */
#define STATE_type_bit(X) (X)->type_bit
#define STATE_visit_mark(X) (X)->visit_mark
#define STATE_final(X) (X)->final
#define STATE_deterministic(X) (X)->deterministic
#define STATE_vector_p(X) (X)->vector_p
#define STATE_arc_set(X) (X)->arc.set
#define STATE_arc_vector(X) (X)->arc.vector
#define STATE_userflag2(X) (X)->userflag2
#define STATE_is_virtual(X) (X)->is_virtual
#define STATE_in_use(X) (X)->in_use
/*********************
* NETWORK
*********************/
typedef struct NETWORK {
ALPHABETptr labels; /* Label alphabet as LABEL_VECTOR */
ALPHABETptr sigma; /* Sigma alphabet as BINARY_VECTOR */
struct {
bit deterministic:1;
bit pruned:1;
bit completed:1;
bit minimized:1;
bit epsilon_free:1;
bit sorted_states:1;
bit loop_free:1;
bit twol_net:1;
bit visit_marks_dirty:1;
bit names_matter:1;
bit shared_arc_lists:1;
bit has_arc_user_pointer:1;
bit closed_sigma:1;
bit start_state_final:1;
bit lower_bound_checked:1;
bit compacted:1;
bit obsolete2:1;
bit obsolete3:1;
bit mark:1;
bit u_flag_diacr:1;
bit l_flag_diacr:1;
bit obsolete4:1;
bit obsolete5:1;
bit sorted_arcs:1;
bit reduced_labelset:1;
bit obsolete6:1;
bit is_virtual:1;
bit is_arc_optimized:1;
bit in_use:1;
bit has_arc_vectors:1;
bit linear_bounded_upper:1;
bit linear_bounded_lower:1;
bit upper_bound_checked:1;
} flags;
int16 arc_label_arity;
id_type defined_as;
id_type range_len;
LABEL_ID_MAPptr label_map;
RANGEptr uprange_map;
RANGEptr downrange_map;
MATCH_TABLEptr upmatch_table;
MATCH_TABLEptr downmatch_table;
ALPHABETptr recode_key;
ALPHABETptr decode_key;
ALPHABETptr unreduce_key;
union {
STATEptr state;
unsigned char *loc;
} start;
union {
STATEptr states; /* First state of a standard network */
void *block; /* Arc block of a compacted network */
} body;
HEAPptr arc_vector_heap;
int arc_vector_len;
PROPptr networkprops; /* First network property */
PARSE_TBL upper_parse_table;
PARSE_TBL lower_parse_table;
uintptr_t num_states; /* Number of states */
uintptr_t num_arcs; /* Number of arcs */
uintptr_t block_size;
ALPHABETptr flag_register;
void *client_cell;
void *mmap_handle;
size_t mmap_size;
} NETtype, *NETptr;
/* A network is a large data structure. Many of the fields such as
parse and match tables are used to cache data that is used by the
apply routines. The body of a standard network is a pointer to
the first state of a state list. Each state points to its
successor on the list. The body of a compacted network is a
pointer to the block of memory encoding the arcs and states in a
space-efficient way. Most cfsm algorithms work only on standard
networks. A network can be optimized in several ways either to
save space or to increase the application speed. If the has_arc_vectors
flag is 1, some of the states of the network are in the vectorized
format. Other optimization flags are arc_optimized, reduced_labelset,
and shared_arc_lists. A network has a unique start_state. The start
state does not have to be the first state of the list pointed to
by body.arcs. Networks are allocated from a global heap. The in_use
flag indicates whether a network is in use or whether it has been
freed. The num_arcs field contains the number of arcs; The num_states
field keeps track of the number of states in the network. */
#define NET_deterministic(X) (X)->flags.deterministic
#define NET_pruned(X) (X)->flags.pruned
#define NET_completed(X) (X)->flags.completed
#define NET_minimized(X) (X)->flags.minimized
#define NET_epsilon_free(X) (X)->flags.epsilon_free
#define NET_sorted_states(X) (X)->flags.sorted_states
#define NET_loop_free(X) (X)->flags.loop_free
#define NET_visit_marks_dirty(X) (X)->flags.visit_marks_dirty
#define NET_names_matter(X) (X)->flags.names_matter
#define NET_shared_arc_lists(X) (X)->flags.shared_arc_lists
#define NET_has_arc_user_pointer(X) (X)->flags.has_arc_user_pointer
#define NET_closed_sigma(X) (X)->flags.closed_sigma
#define NET_start_state_final(X) (X)->flags.start_state_final
#define NET_twol_net(X) (X)->flags.twol_net
#define NET_compacted(X) (X)->flags.compacted
#define NET_mark(X) (X)->flags.mark
#define NET_u_flag_diacr(X) (X)->flags.u_flag_diacr
#define NET_l_flag_diacr(X) (X)->flags.l_flag_diacr
#define NET_sorted_arcs(X) (X)->flags.sorted_arcs
#define NET_reduced_labelset(X) (X)->flags.reduced_labelset
#define NET_Kaplan_compressed(X) (X)->flags.Kaplan_compressed
#define NET_is_virtual(X) (X)->flags.is_virtual
#define NET_optimized(X) (X)->flags.is_arc_optimized
#define NET_in_use(X) (X)->flags.in_use
#define NET_has_arc_vectors(X) (X)->flags.has_arc_vectors
#define NET_linear_bounded_upper(X) (X)->flags.linear_bounded_upper
#define NET_linear_bounded_lower(X) (X)->flags.linear_bounded_lower
#define NET_lower_bound_checked(X) (X)->flags.lower_bound_checked
#define NET_upper_bound_checked(X) (X)->flags.upper_bound_checked
#define NET_arc_label_arity(X) (X)->arc_label_arity
#define NET_num_arcs(X) (X)->num_arcs
#define NET_num_states(X) (X)->num_states
#define NET_labels(X) (X)->labels
#define NET_sigma(X) (X)->sigma
#define NET_recode_key(X) (X)->recode_key
#define NET_decode_key(X) (X)->decode_key
#define NET_unreduce_key(X) (X)->unreduce_key
#define NET_start_state(X) (X)->start.state
#define NET_start_loc(X) (X)->start.loc
#define NET_states(X) (X)->body.states
#define NET_arc_block(X) (X)->body.block
#define NET_arc_vector_heap(X) (X)->arc_vector_heap
#define NET_arc_vector_len(X) (X)->arc_vector_len
#define NET_range_len(X) (X)->range_len
#define NET_label_map(X) (X)->label_map
#define NET_properties(X) (X)->networkprops
#define NET_upper_parse_table(X) (X)->upper_parse_table
#define NET_lower_parse_table(X) (X)->lower_parse_table
#define NET_uprange_map(X) (X)->uprange_map
#define NET_downrange_map(X) (X)->downrange_map
#define NET_upmatch_table(X) (X)->upmatch_table
#define NET_downmatch_table(X) (X)->downmatch_table
#define NET_mmap(X) (X)->mmap_handle
#define NET_mmap_size(X) (X)->mmap_size
NETptr make_empty_net(void);
/* Returns a skeleton network structure with an empty sigma and
label_alphabets but without an initial state. Use either
null_net() or epsilon_net() to create a minimal network
with an initial state. */
NETptr copy_net(NETptr net);
/* Returns a copy of the network. */
int minimize_net(NETptr net);
/* Destructively minimizes the network using Hopcroft's algorithm.
Returns 0 on success and 1 on error. As a prelimnary step
to minimization, the network is first pruned, epsilons are
removed and the network is determinized. Minimization can
only be done on standard networks, not on networks that have
been compacted or vectorized. */
void free_network(NETptr net);
/* Returns the network to the global network heap. */
void print_net(NETptr net, FILE *stream);
/* Prints the states and arcs of the network into the stream. */
STATEptr add_state_to_net(NETptr net, int final_p);
/* Adds a new state to the network. If final_p is non-zero, the
state is final. Returns the new state on success, NULL on failure. */
ARCptr add_arc_to_state(NETptr net, STATEptr start, id_type id, STATEptr dest,
void *user_pointer, int big_arc_p);
/* Creates a new arc from start to dest with the label id unless it
would duplicate an existing arc. Does not add a looping EPSILON
arc. The start and dest states must already exist in the
network. The network must be a standard network, not vectorized
or optimized. Updates the sigma and label alphabets of the
network. If big_arc_p is non-zero, the new arc will have a
user_pointer field. Returns the arc on success, NULL on failure. */
int read_net_properties(NETptr net, char *file);
/* Reads a list of attribute value pairs from the file and adds them
to the networks property list. For example,
NETWORKNAME: "Number-to-numeral converter"
LARGEST_NUMBER: 99999
If file is NULL, the input is obtained from stdin. */
int write_net_properties(NETptr net, char *file);
/* Writes the networks property list into a file or to stdout if
file is NULL. */
int add_string_property(NETptr net, char *attribute, char *value);
/* Adds the attribute:value pair to the network's property list.
Any previous value for the attribute is freed and replaced by
the new value. Returns 0 on success, 1 on error. Both the
attribute and the value are copied and converted to fat
strings, they can be freed by the calling function if they
have been malloc'd. */
char *get_string_property(NETptr net, char *attribute);
/* Returns the value of the attribute on the property list of the
net, or NULL if it is not found. The value is a freshly allocated
C string. It should be freed by the calling function when it
is not needed anymore. */
int remove_string_property(NETptr net, char *attribute);
/* Removes the attribute and its value from the property list of the
network. Returns 0 on success, 1 or error. */
/*****************
* NET VECTOR *
*****************/
typedef struct NET_VECTOR {
int len;
NETptr *nets;
} NVtype, *NVptr;
/* A data structure for storing one or more networks. The positions
in a net vector are counted starting from 0. Thus NV_net(nv, 0)
refers to the first network in the net vector nv. */
#define NV_len(X) (X)->len
#define NV_nets(X) (X)->nets
#define NV_net(X,Y) (X)->nets[(Y)]
NVptr make_nv(int len);
/* Retuns a net vector of the specified length. */
NVptr net2nv(NETptr net);
/* Wraps the net inside a net vector of length 1 and returns the vector. */
NETptr nv_get(NVptr nv, int pos);
/* Returns the net in the given position in the net vector nv.
Safer than the NV_net(nv,pos) macro because it makes sure that
0 >= pos < NV_len(nv). Returns NULL on error. */
void nv_push(NETptr net, NVptr nv);
/* Pushes the net into the beginning of the net vector increasing its
length by 1. */
void nv_add(NETptr net, NVptr nv);
/* Appends the net into the end of the net vector increasing its length
by 1. */
void free_nv_only(NVptr nv);
/* Frees the net vector but not any of the nets it contains. */
void free_nv_and_nets(NVptr nv);
/* Frees both networks contained in the vector and the net vector itself. */
typedef struct LOCATION_IN_NET LOCATIONtype, *LOCATIONptr;
typedef struct LODATION_PATH LOCATION_PATHtype, *LOCATION_PATHptr;
typedef struct IO_SEQUENCE IO_SEQtype, *IO_SEQptr;
typedef struct IO_SEQUENCE_TABLE IO_SEQ_TABLEtype, *IO_SEQ_TABLEptr;
/**************************
* STANDARD FILE HEADER *