forked from fumiyas/hyperestraier-encore
-
Notifications
You must be signed in to change notification settings - Fork 0
/
estraier.c
10477 lines (9890 loc) · 329 KB
/
estraier.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*************************************************************************************************
* Implementation of the core API
* Copyright (C) 2004-2007 Mikio Hirabayashi
* This file is part of Hyper Estraier.
* Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
* the GNU Lesser General Public License as published by the Free Software Foundation; either
* version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
* that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
* You should have received a copy of the GNU Lesser General Public License along with Hyper
* Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
* Boston, MA 02111-1307 USA.
*************************************************************************************************/
#if defined(_MYVISTA)
#include <vista.h>
#endif
#include "estraier.h"
#include "myconf.h"
#define ESTNUMBUFSIZ 32 /* size of a buffer for a number */
#define ESTPATHBUFSIZ 4096 /* size of a buffer for a path */
#define ESTIOBUFSIZ 8192 /* size of a buffer for I/O */
#define ESTALLOCUNIT 1024 /* unit number of memory allocation */
#define ESTMINIBNUM 31 /* bucket number of map for attributes */
#define ESTSCANWNUM 256 /* number of words for scaning check */
#define ESTSIGNUM 64 /* number of signals */
#define ESTREGSUBMAX 32 /* maximum number of substrings for regex */
#define ESTMETADBNAME "_meta" /* name of the meta database */
#define ESTKEYIDXNUM "_idxnum" /* key for the number of inverted indexes */
#define ESTKEYDSEQ "_dseq" /* key for the sequence for document IDs */
#define ESTKEYDNUM "_dnum" /* key for the number of documents */
#define ESTKEYMETA "_meta" /* key for meta data */
#define ESTIDXDBNAME "_idx" /* name of the inverted index */
#define ESTIDXDBLRM 109 /* records in a leaf node of the inverted index */
#define ESTIDXDBLRMA 17 /* records in a leaf node of the index in APN mode */
#define ESTIDXDBNIM 160 /* records in a non-leaf node of the inverted index */
#define ESTIDXDBLCN 16 /* number of leaf cache of the inverted index */
#define ESTIDXDBNCN 16 /* number of non-leaf cache of the inverted index */
#define ESTIDXDBRLCN 128 /* number of leaf cache of the index reader */
#define ESTIDXDBRLCNA 32 /* number of leaf cache of the reader in APN mode */
#define ESTIDXDBRNCN 256 /* number of non-leaf cache of the index reader */
#define ESTIDXDBFBP 512 /* size of free block pool of the inverted index */
#define ESTIDXDBMIN (1048576*512) /* minimum size of a database file */
#define ESTIDXDBMAX (1048576*1536) /* maximum size of a database file */
#define ESTFWMDBNAME "_fwm" /* name of the database for forward matching */
#define ESTFWMDBLRM 251 /* records in a leaf node of forward matching DB */
#define ESTFWMDBNIM 110 /* records in a non-leaf node of forward matching DB */
#define ESTFWMDBLCN 32 /* number of leaf cache of forward matching DB */
#define ESTFWMDBNCN 16 /* number of non-leaf cache of forward matching DB */
#define ESTFWMDBFBP 128 /* size of free block pool of forward matching DB */
#define ESTAUXDBNAME "_aux" /* name of the auxiliary index */
#define ESTAUXDBLRM 23 /* records in a leaf node of the auxiliary index */
#define ESTAUXDBNIM 160 /* records in a non-leaf node of the auxiliary index */
#define ESTAUXDBLCN 16 /* number of leaf cache of the auxiliary index */
#define ESTAUXDBNCN 16 /* number of non-leaf cache of the auxiliary index */
#define ESTAUXDBRLCN 256 /* number of leaf cache of the auxiliary reader */
#define ESTAUXDBRNCN 64 /* number of non-leaf cache of the auxiliary reader */
#define ESTAUXDBFBP 256 /* size of free block pool of the auxiliary index */
#define ESTXFMDBNAME "_xfm" /* name of the database for auxiliary forward matching */
#define ESTXFMDBLRM 111 /* records in a leaf node of xfm DB */
#define ESTXFMDBNIM 110 /* records in a non-leaf node of xfm DB */
#define ESTXFMDBLCN 32 /* number of leaf cache of xfm DB */
#define ESTXFMDBNCN 16 /* number of non-leaf cache of xfm DB */
#define ESTXFMDBFBP 128 /* size of free block pool of xfm DB */
#define ESTATTRDBNAME "_attr" /* name of the database for attributes */
#define ESTATTRDBBNUM 212987 /* bucket number of the database for attributes */
#define ESTATTRDBDNUM 3 /* division number of the database for attributes */
#define ESTATTRDBALN -5 /* alignment of the database for attributes */
#define ESTATTRDBFBP 64 /* size of free block pool of the attribute DB */
#define ESTTEXTDBNAME "_text" /* name of the database of texts */
#define ESTTEXTDBBNUM 61417 /* bucket number of the database for texts */
#define ESTTEXTDBDNUM 7 /* division number of the database for texts */
#define ESTTEXTDBALN -5 /* alignment of the database for texts */
#define ESTTEXTDBFBP 128 /* size of free block pool of the text DB */
#define ESTKWDDBNAME "_kwd" /* name of the database of keywords */
#define ESTKWDDBBNUM 163819 /* bucket number of the database for keywords */
#define ESTKWDDBDNUM 3 /* division number of the database for keywords */
#define ESTKWDDBALN -5 /* alignment of the database for keywords */
#define ESTKWDDBFBP 64 /* size of free block pool of the keyword DB */
#define ESTLISTDBNAME "_list" /* name of the database of document list */
#define ESTLISTDBLRM 99 /* records in a leaf node of document list DB */
#define ESTLISTDBNIM 200 /* records in a non-leaf node of document list DB */
#define ESTLISTDBLCN 64 /* number of leaf cache of document list DB */
#define ESTLISTDBNCN 16 /* number of non-leaf cache of document list DB */
#define ESTLISTDBFBP 128 /* size of free block pool of document list DB */
#define ESTAISEQPREF "__seq_" /* prefix of the database for sequencial access */
#define ESTAISTRPREF "__str_" /* prefix of the database for string narrowing */
#define ESTAINUMPREF "__num_" /* prefix of the database for number narrowing */
#define ESTAIBDIAM 0.8 /* diameter of the bucket number */
#define ESTAIDXLRM 99 /* records in a leaf node of narrowing index */
#define ESTAIDXNIM 120 /* records in a non-leaf node of narrowing index */
#define ESTAIDXLCN 1024 /* number of leaf cache of narrowing index */
#define ESTAIDXNCN 256 /* number of non-leaf cache of narrowing index */
#define ESTAIDXDPFBP 32 /* size of free block pool of sequencial DB */
#define ESTAIDXVLFBP 128 /* size of free block pool of narrowing DB */
#define ESTAIKBUFSIZ 8192 /* size of a buffer for a key */
#define ESTAISNUMMIN 256 /* minimum number of scores to use narrowing index */
#define ESTDBSBRAT 0.3 /* ratio of bucket numbers of large mode */
#define ESTDBSDRAT 0.4 /* ratio of the division number of large mode */
#define ESTDBLBRAT 3.0 /* ratio of bucket numbers of large mode */
#define ESTDBLDRAT 1.0 /* ratio of the division number of large mode */
#define ESTDBHBRAT 5.0 /* ratio of bucket numbers of huge mode */
#define ESTDBHDRAT 2.0 /* ratio of the division number of huge mode */
#define ESTDBH2RAT 1.4 /* ratio of huge mode second */
#define ESTDBH3RAT 2.0 /* ratio of huge mode third */
#define ESTVLCRDNUM 2 /* division number of usual Villa databases */
#define ESTVLCRDNAUX 7 /* division number of the auxiliary index */
#define ESTIDXCCBNUM 524288 /* bucket number of cache for the inverted index */
#define ESTAUXCCBNUM 65521 /* bucket number of cache for the auxiliary index */
#define ESTIDXCCMAX (1048576*64) /* max size of the cache */
#define ESTOUTCCBNUM 131072 /* bucket number of cache for deleted documents */
#define ESTKEYCCMNUM 65536 /* bucket number of cache for keys for TF-IDF */
#define ESTATTRCCMNUM 8192 /* number of cache for attributes */
#define ESTTEXTCCMNUM 1024 /* number of cache for texts */
#define ESTRESCCMNUM 256 /* number of cache for results */
#define ESTCCIRSLOT 256 /* slot timing for interruption */
#define ESTCCCBFREQ 10000 /* frequency of callback for flushing words */
#define ESTDIRMODE 00755 /* permission of a creating directory */
#define ESTICCHECKSIZ 32768 /* size of checking character code */
#define ESTICMISSMAX 256 /* allowance number of missing characters */
#define ESTICALLWRAT 0.001 /* allowance ratio of missing characters */
#define ESTOCPOINT 16 /* point per occurrence */
#define ESTJHASHNUM 251 /* hash number for a junction */
#define ESTWORDMAXLEN 48 /* maximum length of a word */
#define ESTWORDAVGLEN 8 /* average length of a word */
#define ESTATTRALW 1.5 /* allowance ratio of attribute narrowing */
#define ESTKEYSCALW 3 /* allowance ratio of TF-IDF for keywords */
#define ESTMEMIRATIO 1.1 /* incremental ratio of memory allocation */
#define ESTSCOREUNIT 1000 /* unit of standard deviation of scoring */
#define ESTAUXMIN 32 /* minimum hits to adopt the auxiliary index */
#define ESTAUXEXRAT 16 /* ratio of hits of keywords expansion */
#define ESTWILDMAX 256 /* maximum number of expansion of wild cards */
#define ESTECLKNUM 32 /* number of keywords to eclipse candidates */
#define ESTSMLRKNUM 16 /* number of keywords to get candidates */
#define ESTSMLRUNUM 1024 /* number of adopted documents for a keyword */
#define ESTSMLRMNUM 4096 /* maximum number of candidates to be checked */
#define ESTSMLRNMIN 0.5 /* the minimum value for narrowing */
/* Set a buffer for a variable length number.
   `EST_len' receives the number of bytes written into `EST_buf'.
   `EST_buf' is the destination buffer; it is written as signed bytes.
   `EST_num' is the number to encode; it is evaluated once into a local int.
   The number is emitted in base 128, least significant digit first.  Every digit except the
   last is stored as `-digit - 1' (a negative byte); the last digit is stored as is (a
   non-negative byte), which marks the end.  Zero is stored as the single byte 0.
   A number less than zero writes nothing and leaves `EST_len' as 0.
   `EST_len' and `EST_buf' are expanded several times, so they should be free of side effects. */
#define EST_SET_VNUMBUF(EST_len, EST_buf, EST_num) \
do { \
int _EST_num = (EST_num); \
div_t EST_d; \
if(_EST_num == 0){ \
((signed char *)(EST_buf))[0] = 0; \
(EST_len) = 1; \
} else { \
(EST_len) = 0; \
while(_EST_num > 0){ \
EST_d = div(_EST_num, 128); \
_EST_num = EST_d.quot; \
if(_EST_num > 0){ \
((signed char *)(EST_buf))[(EST_len)] = -EST_d.rem - 1; \
} else { \
((signed char *)(EST_buf))[(EST_len)] = EST_d.rem; \
} \
(EST_len)++; \
} \
} \
} while(FALSE)
/* Read a variable length number from a buffer.
   `EST_buf' is the buffer to decode; it is read as signed bytes.
   `EST_num' receives the decoded number.
   `EST_step' receives the number of bytes consumed.
   A negative byte is a continuation digit holding `-digit - 1'; the first non-negative byte is
   the last digit and ends the scan.  Digits are weighted by increasing powers of 128.
   The scan has no length limit: it stops only at a non-negative byte. */
#define EST_READ_VNUMBUF(EST_buf, EST_num, EST_step) \
do { \
int _EST_i, _EST_base; \
(EST_num) = 0; \
_EST_base = 1; \
for(_EST_i = 0; TRUE; _EST_i++){ \
if(((signed char *)(EST_buf))[_EST_i] >= 0){ \
(EST_num) += ((signed char *)(EST_buf))[_EST_i] * _EST_base; \
break; \
} \
(EST_num) += _EST_base * (((signed char *)(EST_buf))[_EST_i] + 1) * -1; \
_EST_base *= 128; \
} \
EST_step = _EST_i + 1; \
} while(FALSE)
/* An attribute database handle paired with the data type it stores. */
typedef struct { /* type of structure for an attribute database */
void *db; /* handle of the database (untyped pointer) */
int type; /* data type */
} ESTATTRIDX;
/* Character categories; values are numbered from 0 in the order listed. */
enum { /* enumeration for character categories */
ESTSPACECHR, /* space characters */
ESTDELIMCHR, /* delimiter characters */
ESTWESTALPH, /* west alphabets */
ESTEASTALPH, /* east alphabets */
ESTHIRAGANA, /* east alphabets: hiragana */
ESTKATAKANA, /* east alphabets: katakana */
ESTHANGUL, /* east alphabets: hangul */
ESTKANJI /* east alphabets: kanji */
};
/* Database flags; each value is a distinct single bit. */
enum { /* enumeration for flags for databases */
ESTDFPERFNG = 1 << 10, /* use perfect N-gram analyzer */
ESTDFCHRCAT = 1 << 11, /* use character category analyzer */
ESTDFZLIB = 1 << 15, /* compress records with ZLIB */
ESTDFLZO = 1 << 16, /* compress records with LZO */
ESTDFBZIP = 1 << 17, /* compress records with BZIP2 */
ESTDFSCVOID = 1 << 20, /* store scores as void */
ESTDFSCINT = 1 << 21, /* store scores as integer */
ESTDFSCASIS = 1 << 22 /* refrain from adjustment of scores */
};
enum { /* enumeration for phrase formats */
ESTPMUSUAL, /* usual phrase */
ESTPMSIMPLE, /* simplified phrase */
ESTPMROUGH, /* rough phrase */
ESTPMUNION, /* union phrase */
ESTPMISECT /* intersection phrase */
};
/* Operator codes for attribute conditions.  Presumably these are the values held by the
   `cop' ("canonical operator") member of ESTCATTR -- confirm against the code that sets it. */
enum {
COP_ESTOPSTREQ, /* string is equal */
COP_ESTOPSTRNE, /* string is not equal */
COP_ESTOPSTRINC, /* string is included in */
COP_ESTOPSTRBW, /* string begins with */
COP_ESTOPSTREW, /* string ends with */
COP_ESTOPSTRAND, /* string includes all tokens in */
COP_ESTOPSTROR, /* string includes at least one token in */
COP_ESTOPSTROREQ, /* string is equal at least one token in */
COP_ESTOPSTRRX, /* string matches regular expressions of */
COP_ESTOPNUMEQ, /* number or date is equal */
COP_ESTOPNUMNE, /* number or date is not equal */
COP_ESTOPNUMGT, /* number or date is greater than */
COP_ESTOPNUMGE, /* number or date is greater than or equal to */
COP_ESTOPNUMLT, /* number or date is less than */
COP_ESTOPNUMLE, /* number or date is less than or equal to */
COP_ESTOPNUMBT, /* number or date is between two tokens of */
COP_ESTOPDUMMY /* dummy operator */
};
/* One search hit: a document ID with its score and an optional sort key. */
typedef struct { /* type of structure for a hitting object */
int id; /* ID of a document */
int score; /* score tuned by TF-IDF */
char *value; /* value of an attribute for sorting */
} ESTSCORE;
/* One attribute condition broken into its name, operator, and value parts. */
typedef struct { /* type of structure for a conditional attribute */
char *name; /* name */
int nsiz; /* size of the name */
CBLIST *nlist; /* list of plural names */
char *oper; /* operator */
char *val; /* value */
int vsiz; /* size of the value */
int cop; /* canonical operator */
int sign; /* positive or negative */
char *sval; /* value of small cases */
int ssiz; /* size of the small value */
void *regex; /* compiled regular expressions */
time_t num; /* numeric value */
} ESTCATTR;
typedef struct { /* type of structure for a keyword and its score */
const char *word; /* face of keyword */
int wsiz; /* size of the keyword */
int pt; /* score tuned by TF-IDF */
} ESTKEYSC;
/* Same members as ESTSCORE plus the index of the database the hit belongs to. */
typedef struct { /* type of structure for a meta hitting object */
int db; /* index of a container database */
int id; /* ID of a document */
int score; /* score tuned by TF-IDF */
char *value; /* value of an attribute for sorting */
} ESTMETASCORE;
/* private function prototypes: forward declarations of helpers with internal linkage (static) */
static void est_set_ecode(int *ecp, int value, int line);
static char *est_hex_encode(const char *str);
static char *est_hex_decode(const char *str);
static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode);
static void est_normalize_text(unsigned char *utext, int size, int *sp);
static void est_canonicalize_text(unsigned char *utext, int size, int funcspc);
static int est_char_category(int c);
static int est_char_category_perfng(int c);
static int est_char_category_chrcat(int c);
static char *est_make_snippet(const char *str, int len, const CBLIST *words,
int wwidth, int hwidth, int awidth);
static int est_check_cjk_only(const char *str);
static char *est_phrase_from_simple(const char *sphrase);
static char *est_phrase_from_rough(const char *rphrase);
static char *est_phrase_from_union(const char *uphrase);
static char *est_phrase_from_isect(const char *iphrase);
static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
int size, int awsiz, CBDATUM *res, const CBLIST *rwords);
static int est_str_fwmatch_wide(const unsigned char *haystack, int hsiz,
const unsigned char *needle, int nsiz);
static char *est_strstr_sparse(const char *haystack, const char *needle);
static int est_idx_rec_last_id(const char *vbuf, int vsiz, int smode);
static void est_encode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int lid, int smode);
static void est_decode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int smode);
static ESTIDX *est_idx_open(const char *name, int omode, int dnum);
static int est_idx_close(ESTIDX *idx);
static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum,
int fbpsiz);
static void est_idx_increment(ESTIDX *idx);
static int est_idx_dnum(ESTIDX *idx);
static int est_idx_add(ESTIDX *idx, const char *word, int wsiz,
const char *vbuf, int vsiz, int smode);
static int est_idx_put_one(ESTIDX *idx, int inum, const char *word, int wsiz,
const char *vbuf, int vsiz);
static int est_idx_out(ESTIDX *idx, const char *word, int wsiz);
static char *est_idx_scan(ESTIDX *idx, const char *word, int wsiz, int *sp, int smode);
static const char *est_idx_get_one(ESTIDX *idx, int inum, const char *word, int wsiz, int *sp);
static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz);
static int est_idx_num(ESTIDX *idx);
static double est_idx_size(ESTIDX *idx);
static int est_idx_size_current(ESTIDX *idx);
static int est_idx_memflush(ESTIDX *idx);
static int est_idx_sync(ESTIDX *idx);
static int est_idx_optimize(ESTIDX *idx);
static void est_idx_set_current(ESTIDX *idx);
static int est_crput(CURIA *curia, int zmode, int id, const char *vbuf, int vsiz, int dmode);
static int est_crout(CURIA *curia, int id);
static char *est_crget(CURIA *curia, int flags, int id, int *sp);
static int est_aidx_seq_put(DEPOT *db, int id, const char *vbuf, int vsiz);
static int est_aidx_seq_out(DEPOT *db, int id);
static char *est_aidx_seq_get(DEPOT *db, int id, int *sp);
static int est_aidx_seq_narrow(DEPOT *db, const CBLIST *pdocs, int cop, int sign,
const char *oval, int osiz, const char *sval, int ssiz,
const void *regex, int onum, ESTSCORE *scores, int snum,
int limit, int *restp);
static int est_aidx_numcmp(const char *aptr, int asiz, const char *bptr, int bsiz);
static int est_aidx_attr_put(VILLA *db, int id, const char *vbuf, int vsiz);
static int est_aidx_attr_out(VILLA *db, int id, const char *vbuf, int vsiz);
static int est_aidx_attr_narrow(VILLA *db, const CBLIST *pdocs, int cop, int sign,
const char *oval, int osiz, const char *sval, int ssiz,
const void *regex, int onum, ESTSCORE *scores, int snum);
static int est_int_compare(const void *ap, const void *bp);
static int est_short_compare(const void *ap, const void *bp);
static void est_inodes_delete(void *arg);
static void est_inodes_delete_informer(const char *msg, void *opaque);
static int est_db_write_meta(ESTDB *db);
static void est_db_inform(ESTDB *db, const char *info);
static void est_db_prepare_meta(ESTDB *db);
static int est_db_score_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond, int *scp);
static int est_pidx_uri_to_id(ESTDB *db, const char *uri);
static CBLIST *est_phrase_terms(const char *phrase);
static int est_score_compare_by_id_asc(const void *ap, const void *bp);
static int est_score_compare_by_id_desc(const void *ap, const void *bp);
static int est_score_compare_by_score_asc(const void *ap, const void *bp);
static int est_score_compare_by_score_desc(const void *ap, const void *bp);
static int est_score_compare_by_str_asc(const void *ap, const void *bp);
static int est_score_compare_by_str_desc(const void *ap, const void *bp);
static int est_score_compare_by_num_asc(const void *ap, const void *bp);
static int est_score_compare_by_num_desc(const void *ap, const void *bp);
static int est_metascore_compare_by_id_asc(const void *ap, const void *bp);
static int est_metascore_compare_by_id_desc(const void *ap, const void *bp);
static int est_metascore_compare_by_score_asc(const void *ap, const void *bp);
static int est_metascore_compare_by_score_desc(const void *ap, const void *bp);
static int est_metascore_compare_by_str_asc(const void *ap, const void *bp);
static int est_metascore_compare_by_str_desc(const void *ap, const void *bp);
static int est_metascore_compare_by_num_asc(const void *ap, const void *bp);
static int est_metascore_compare_by_num_desc(const void *ap, const void *bp);
static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add);
static void est_expand_word_bw(ESTDB *db, const char *word, CBLIST *list);
static void est_expand_word_ew(ESTDB *db, const char *word, CBLIST *list);
static void est_expand_word_rx(ESTDB *db, const char *word, CBLIST *list);
static void est_expand_keyword_bw(ESTDB *db, const char *word, CBLIST *list);
static void est_expand_keyword_ew(ESTDB *db, const char *word, CBLIST *list);
static void est_expand_keyword_rx(ESTDB *db, const char *word, CBLIST *list);
static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
void (*xpn)(const char *, CBLIST *),
int *nump, CBMAP *hints, int add, int auxmin, CBMAP *auxwords);
static const ESTSCORE *est_rescc_get(ESTDB *db, const char *word, int size, int *nump);
static void est_rescc_put(ESTDB *db, const char *word, int size, ESTSCORE *scores, int num);
static ESTSCORE *est_search_keywords(ESTDB *db, const char *word, int min, int *nump);
static void est_weight_keywords(ESTDB *db, const char *word, ESTSCORE *scores, int snum);
static ESTSCORE *est_search_rank(ESTDB *db, const char *name, int top, int *nump);
static ESTSCORE *est_search_aidx_attr(ESTDB *db, const char *expr, int *nump);
static ESTSCORE *est_search_pidxs(ESTDB *db, ESTCOND *cond, ESTSCORE *scores, int *nump,
CBMAP *ordattrs);
static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, int ign,
const char *order, const char *distinct, ESTSCORE *scores, int snum,
int limit, int *restp, CBMAP *ordattrs);
static ESTCATTR *est_make_cattr_list(const CBLIST *attrs, int *nump);
static void est_free_cattr_list(ESTCATTR *list, int anum);
static int est_eclipse_scores(ESTDB *db, ESTSCORE *scores, int snum, int num,
int vnum, int tfidf, double limit, CBMAP *shadows);
static int est_match_attr(const char *tval, int tsiz, int cop, int sign,
const char *oval, int osiz, const char *sval, int ssiz,
const void *regex, int onum);
static int est_check_strand(const char *tval, const char *oval);
static int est_check_stror(const char *tval, const char *oval);
static int est_check_stroreq(const char *tval, const char *oval);
static int est_check_numbt(const char *tval, const char *oval);
static int est_keysc_compare(const void *ap, const void *bp);
static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
int knum, int unum, int mnum, int tfidf,
double nmin, int auxmin, CBMAP *auxwords);
static CBMAP *est_phrase_vector(const char *phrase);
static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf);
static int est_url_sameness(const char *aurl, const char *burl);
static void est_random_fclose(void);
static int est_signal_dispatch(int signum);
/*************************************************************************************************
* common settings
*************************************************************************************************/
/* version of Hyper Estraier, taken from the `_EST_VERSION' macro */
const char *est_version = _EST_VERSION;
/*************************************************************************************************
* API for document
*************************************************************************************************/
/* Create a document object.
   The return value is a new document object whose ID is -1 and whose attribute map, text list,
   and keyword map are all unset (NULL). */
ESTDOC *est_doc_new(void){
  ESTDOC *newdoc;
  CB_MALLOC(newdoc, sizeof(*newdoc));
  newdoc->kwords = NULL;
  newdoc->dtexts = NULL;
  newdoc->attrs = NULL;
  newdoc->id = -1;
  return newdoc;
}
/* Create a document object made from draft data.
   `draft' specifies a string of draft data, split into lines at each newline.
   The lines up to the first blank line form the header: a line beginning with `%' is a control
   command (keyword vector or substitute score), and any other line containing `=' is an
   attribute given as "name=value".  Every line after the blank line is text; a line beginning
   with a tab is hidden text.
   The return value is a new document object. */
ESTDOC *est_doc_new_from_draft(const char *draft){
  ESTDOC *doc;
  CBLIST *rows;
  const char *row;
  char *key, *val, *next;
  int idx;
  assert(draft);
  doc = est_doc_new();
  rows = cbsplit(draft, -1, "\n");
  idx = 0;
  /* header part: runs until a blank line has been consumed or the lines are exhausted */
  while(idx < CB_LISTNUM(rows)){
    row = CB_LISTVAL(rows, idx);
    idx++;
    while(*row > '\0' && *row <= ' '){
      row++;
    }
    if(*row == '\0') break;
    if(*row != '%'){
      /* attribute line: cut at the first `=' into name and value */
      val = strchr(row, '=');
      if(val){
        *val = '\0';
        val++;
        est_doc_add_attr(doc, row, val);
      }
      continue;
    }
    if(cbstrfwmatch(row, ESTDCNTLVECTOR)){
      /* keyword vector: tab separated pairs of a keyword and its value */
      if(!doc->kwords) doc->kwords = cbmapopenex(ESTMINIBNUM);
      key = strchr(row, '\t');
      if(key) key++;
      while(key){
        val = strchr(key, '\t');
        if(!val) break;
        val++;
        next = strchr(val, '\t');
        if(next){
          *next = '\0';
          next++;
        }
        if(key[0] != '\0' && val[0] != '\0'){
          cbmapput(doc->kwords, key, val - key - 1, val, -1, TRUE);
        }
        key = next;
      }
    } else if(cbstrfwmatch(row, ESTDCNTLSCORE)){
      /* substitute score: the number follows the first tab */
      key = strchr(row, '\t');
      if(key) est_doc_set_score(doc, atoi(key + 1));
    }
  }
  /* body part: every remaining line is a sentence of text or hidden text */
  for(; idx < CB_LISTNUM(rows); idx++){
    row = CB_LISTVAL(rows, idx);
    if(*row == '\t'){
      est_doc_add_hidden_text(doc, row + 1);
    } else {
      est_doc_add_text(doc, row);
    }
  }
  CB_LISTCLOSE(rows);
  return doc;
}
/* Destroy a document object.
   `doc' specifies a document object.  Its attribute map, text list, and keyword map are
   released if they exist, and then the object itself is freed. */
void est_doc_delete(ESTDOC *doc){
  assert(doc);
  if(doc->attrs != NULL){
    cbmapclose(doc->attrs);
  }
  if(doc->dtexts != NULL){
    CB_LISTCLOSE(doc->dtexts);
  }
  if(doc->kwords != NULL){
    cbmapclose(doc->kwords);
  }
  free(doc);
}
/* Add an attribute to a document object.
   `doc' specifies a document object.
   `name' specifies the name of an attribute.  An empty name or a name beginning with `%' is
   ignored.
   `value' specifies the value of the attribute.  If it is `NULL', the attribute is removed.
   Control characters in the value are replaced by spaces, and spaces are squeezed unless the
   attribute is the URI attribute. */
void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value){
  char *copy, *cp;
  int nlen;
  assert(doc && name);
  if(name[0] == '\0' || name[0] == '%') return;
  if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
  if(!value){
    cbmapout(doc->attrs, name, -1);
    return;
  }
  copy = cbmemdup(value, -1);
  for(cp = copy; *cp != '\0'; cp++){
    if(*cp > 0 && *cp < ' ') *cp = ' ';
  }
  /* the URI attribute keeps its spaces exactly as given */
  if(strcmp(name, ESTDATTRURI) != 0) cbstrsqzspc(copy);
  nlen = strlen(name);
  if(nlen > 0) cbmapput(doc->attrs, name, nlen, copy, -1, TRUE);
  free(copy);
}
/* Add a sentence of text to a document object.
   `doc' specifies a document object.
   `text' specifies a sentence of text.  Leading bytes in the range 1 to ' ' are skipped; if
   nothing is left, the call does nothing.
   The text is normalized, control characters are replaced by spaces, and spaces are squeezed.
   A sentence that ends up empty is not added. */
void est_doc_add_text(ESTDOC *doc, const char *text){
  unsigned char *wide;
  char *sentence, *cp;
  int wsiz;
  assert(doc && text);
  for(; *text > '\0' && *text <= ' '; text++){
    ;
  }
  if(*text == '\0') return;
  if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
  wide = (unsigned char *)est_uconv_in(text, strlen(text), &wsiz);
  est_normalize_text(wide, wsiz, &wsiz);
  sentence = est_uconv_out((char *)wide, wsiz, NULL);
  free(wide);
  for(cp = sentence; *cp != '\0'; cp++){
    if(*cp > 0 && *cp < ' ') *cp = ' ';
  }
  cbstrsqzspc(sentence);
  if(sentence[0] == '\0'){
    free(sentence);
  } else {
    /* the buffer is handed over to the list, so it is not freed here */
    CB_LISTPUSHBUF(doc->dtexts, sentence, strlen(sentence));
  }
}
/* Add a hidden sentence to a document object.
   `doc' specifies a document object.
   `text' specifies a hidden sentence.  Leading bytes in the range 1 to ' ' are skipped; if
   nothing is left, the call does nothing.
   The text is cleaned up in the same way as a usual sentence and appended to the attribute whose
   name is the empty string, separated from earlier hidden text by one space. */
void est_doc_add_hidden_text(ESTDOC *doc, const char *text){
  unsigned char *wide;
  char *sentence, *cp;
  int wsiz;
  assert(doc && text);
  for(; *text > '\0' && *text <= ' '; text++){
    ;
  }
  if(*text == '\0') return;
  wide = (unsigned char *)est_uconv_in(text, strlen(text), &wsiz);
  est_normalize_text(wide, wsiz, &wsiz);
  sentence = est_uconv_out((char *)wide, wsiz, NULL);
  free(wide);
  for(cp = sentence; *cp != '\0'; cp++){
    if(*cp > 0 && *cp < ' ') *cp = ' ';
  }
  cbstrsqzspc(sentence);
  if(sentence[0] != '\0'){
    if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
    if(cbmapget(doc->attrs, "", 0, NULL)) cbmapputcat(doc->attrs, "", 0, " ", 1);
    cbmapputcat(doc->attrs, "", 0, sentence, -1);
  }
  free(sentence);
}
/* Attach keywords to a document object.
   `doc' specifies a document object.
   `kwords' specifies a map object of keywords.  The map is duplicated, so the caller keeps
   ownership of the one passed in.  Any keywords attached before are released. */
void est_doc_set_keywords(ESTDOC *doc, CBMAP *kwords){
  CBMAP *copy;
  assert(doc && kwords);
  /* duplicate before releasing the old map: `kwords' may be the very map the document already
     holds (est_doc_keywords returns it directly), and closing it first would make the
     duplication read freed memory */
  copy = cbmapdup(kwords);
  if(doc->kwords) cbmapclose(doc->kwords);
  doc->kwords = copy;
}
/* Set the substitute score of a document object.
   `doc' specifies a document object.
   `score' specifies the substitute score.  It is stored in decimal form under the attribute
   whose name is a single tab.  A negative value removes the stored score. */
void est_doc_set_score(ESTDOC *doc, int score){
  char digits[ESTNUMBUFSIZ];
  assert(doc);
  if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
  if(score < 0){
    cbmapout(doc->attrs, "\t", 1);
    return;
  }
  sprintf(digits, "%d", score);
  cbmapput(doc->attrs, "\t", 1, digits, -1, TRUE);
}
/* Get the ID number of a document object.
   `doc' specifies a document object.
   The return value is the value of the `id' member, which est_doc_new initializes to -1. */
int est_doc_id(ESTDOC *doc){
  int id;
  assert(doc);
  id = doc->id;
  return id;
}
/* Get a list of attribute names of a document object.
   `doc' specifies a document object.
   The return value is a new list object of the attribute names, sorted with cblistsort.  Names
   that are empty or begin with a tab are left out; the functions above use those for hidden
   text and the substitute score.  The list is empty if the document has no attribute map. */
CBLIST *est_doc_attr_names(ESTDOC *doc){
  CBLIST *result;
  const char *key;
  int klen;
  assert(doc);
  CB_LISTOPEN(result);
  if(doc->attrs){
    cbmapiterinit(doc->attrs);
    for(key = cbmapiternext(doc->attrs, &klen); key != NULL;
        key = cbmapiternext(doc->attrs, &klen)){
      if(klen < 1 || key[0] == '\t') continue;
      CB_LISTPUSH(result, key, klen);
    }
    cblistsort(result);
  }
  return result;
}
/* Get the value of an attribute of a document object.
   `doc' specifies a document object.
   `name' specifies the name of an attribute.
   The return value is the result of looking the name up in the attribute map, or `NULL' if the
   document has no attribute map or the name is empty. */
const char *est_doc_attr(ESTDOC *doc, const char *name){
  assert(doc && name);
  if(doc->attrs && name[0] != '\0'){
    return cbmapget(doc->attrs, name, -1, NULL);
  }
  return NULL;
}
/* Get a list of sentences of the text of a document object.
   `doc' specifies a document object.
   The return value is the text list held by the document itself; an empty list is created and
   stored in the document first if it has none. */
const CBLIST *est_doc_texts(ESTDOC *doc){
  assert(doc);
  if(doc->dtexts) return doc->dtexts;
  CB_LISTOPEN(doc->dtexts);
  return doc->dtexts;
}
/* Concatenate sentences of the text of a document object.
   `doc' specifies a document object.
   The return value is the sentences joined with a single space between them, or an empty
   string if the document has no text list.  The result comes from cbmemdup or cbdatumtomalloc,
   so the caller is expected to release it with `free'. */
char *est_doc_cat_texts(ESTDOC *doc){
  CBDATUM *datum;
  const char *elem;
  int i, size;
  /* checked here like in every other document function, since `doc' is dereferenced below */
  assert(doc);
  if(!doc->dtexts) return cbmemdup("", 0);
  CB_DATUMOPEN(datum);
  for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
    elem = CB_LISTVAL2(doc->dtexts, i, size);
    if(i > 0) CB_DATUMCAT(datum, " ", 1);
    CB_DATUMCAT(datum, elem, size);
  }
  return cbdatumtomalloc(datum, NULL);
}
/* Get attached keywords of a document object.
   `doc' specifies a document object.
   The return value is the keyword map held by the document itself (not a copy), or `NULL' if
   no keywords are attached. */
CBMAP *est_doc_keywords(ESTDOC *doc){
  CBMAP *kwords;
  assert(doc);
  kwords = doc->kwords;
  return kwords;
}
/* Get the substitute score of a document object.
   `doc' specifies a document object.
   The return value is the score stored under the attribute whose name is a single tab,
   converted with `atoi', or -1 if no score is stored. */
int est_doc_score(ESTDOC *doc){
  const char *stored;
  assert(doc);
  if(!doc->attrs) return -1;
  stored = cbmapget(doc->attrs, "\t", 1, NULL);
  return stored ? atoi(stored) : -1;
}
/* Dump draft data of a document object.
   `doc' specifies a document object.
   The return value is a string built in this order: one "name=value" line per attribute, the
   keyword vector line if there are keywords, the score line if a score is stored, a blank
   line, one line per sentence of text, and finally the hidden text on a line beginning with a
   tab.  The string is produced by cbdatumtomalloc. */
char *est_doc_dump_draft(ESTDOC *doc){
  CBLIST *names;
  CBDATUM *out;
  const char *key, *val;
  int idx, klen, vlen;
  assert(doc);
  CB_DATUMOPEN(out);
  /* attribute lines, in the order given by est_doc_attr_names */
  if(doc->attrs){
    names = est_doc_attr_names(doc);
    idx = 0;
    while(idx < CB_LISTNUM(names)){
      key = CB_LISTVAL2(names, idx, klen);
      val = cbmapget(doc->attrs, key, klen, &vlen);
      CB_DATUMCAT(out, key, klen);
      CB_DATUMCAT(out, "=", 1);
      CB_DATUMCAT(out, val, vlen);
      CB_DATUMCAT(out, "\n", 1);
      idx++;
    }
    CB_LISTCLOSE(names);
  }
  /* keyword vector: the control name followed by tab separated keyword and value pairs */
  if(doc->kwords && cbmaprnum(doc->kwords) > 0){
    CB_DATUMCAT(out, ESTDCNTLVECTOR, strlen(ESTDCNTLVECTOR));
    cbmapiterinit(doc->kwords);
    for(key = cbmapiternext(doc->kwords, &klen); key != NULL;
        key = cbmapiternext(doc->kwords, &klen)){
      CB_MAPITERVAL(val, key, vlen);
      CB_DATUMCAT(out, "\t", 1);
      CB_DATUMCAT(out, key, klen);
      CB_DATUMCAT(out, "\t", 1);
      CB_DATUMCAT(out, val, vlen);
    }
    CB_DATUMCAT(out, "\n", 1);
  }
  /* substitute score, kept under the attribute whose name is a single tab */
  if(doc->attrs){
    val = cbmapget(doc->attrs, "\t", 1, &vlen);
    if(val){
      CB_DATUMCAT(out, ESTDCNTLSCORE, strlen(ESTDCNTLSCORE));
      CB_DATUMCAT(out, "\t", 1);
      CB_DATUMCAT(out, val, vlen);
      CB_DATUMCAT(out, "\n", 1);
    }
  }
  /* blank line closing the header part */
  CB_DATUMCAT(out, "\n", 1);
  if(doc->dtexts){
    idx = 0;
    while(idx < CB_LISTNUM(doc->dtexts)){
      key = CB_LISTVAL2(doc->dtexts, idx, klen);
      CB_DATUMCAT(out, key, klen);
      CB_DATUMCAT(out, "\n", 1);
      idx++;
    }
  }
  /* hidden text, kept under the attribute whose name is the empty string */
  if(doc->attrs){
    val = cbmapget(doc->attrs, "", 0, &vlen);
    if(val){
      CB_DATUMCAT(out, "\t", 1);
      CB_DATUMCAT(out, val, vlen);
      CB_DATUMCAT(out, "\n", 1);
    }
  }
  return cbdatumtomalloc(out, NULL);
}
/* Make a snippet of the body text of a document object.
   The sentences of the document are joined with single spaces into a temporary buffer, and
   that buffer is passed to est_make_snippet together with `words' and the three width
   parameters; the value returned by est_make_snippet is handed back to the caller.
   Side effect: a document without a sentence list gets an empty one. */
char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth){
  CBDATUM *joined;
  const char *sentence;
  char *result;
  int idx, len;
  assert(doc && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
  if(doc->dtexts == NULL){
    CB_LISTOPEN(doc->dtexts);
  }
  CB_DATUMOPEN(joined);
  for(idx = 0; idx < CB_LISTNUM(doc->dtexts); idx++){
    sentence = CB_LISTVAL2(doc->dtexts, idx, len);
    if(idx != 0){
      CB_DATUMCAT(joined, " ", 1);
    }
    CB_DATUMCAT(joined, sentence, len);
  }
  result = est_make_snippet(CB_DATUMPTR(joined), CB_DATUMSIZE(joined),
                            words, wwidth, hwidth, awidth);
  /* the temporary buffer is closed only after est_make_snippet has returned */
  CB_DATUMCLOSE(joined);
  return result;
}
/*************************************************************************************************
* API for search conditions
*************************************************************************************************/
/* Create a condition object.
   The object is allocated with CB_MALLOC and every field receives its default value here.
   est_cond_delete is the matching destructor. */
ESTCOND *est_cond_new(void){
  ESTCOND *cond;
  CB_MALLOC(cond, sizeof(*cond));
  /* search expression, attribute filters and ordering: nothing set yet */
  cond->phrase = NULL;
  cond->attrs = NULL;
  cond->order = NULL;
  cond->distinct = NULL;
  /* retrieval options; gstep 2 is the value est_cond_set_options assigns for ESTCONDUSUAL */
  cond->gstep = 2;
  cond->tfidf = TRUE;
  cond->pmode = ESTPMUSUAL;
  cond->scfb = FALSE;
  cond->opts = 0;
  cond->cbxpn = NULL;
  /* result window; -1 lies outside the range est_cond_set_max accepts --
     presumably it means "no limit", confirm where `max' is consumed */
  cond->max = -1;
  cond->skip = 0;
  /* auxiliary index */
  cond->auxmin = ESTAUXMIN;
  cond->auxwords = NULL;
  /* score buffers */
  cond->scores = NULL;
  cond->snum = 0;
  cond->nscores = NULL;
  cond->nsnum = -1;
  /* eclipse limit stays negative until est_cond_set_eclipse gets a positive value */
  cond->ecllim = -1.0;
  cond->shadows = NULL;
  /* meta search mask */
  cond->mask = 0;
  return cond;
}
/* Destroy a condition object.
   Releases the distinct attribute name, the shadow map, the auxiliary word map, the score
   array, the order expression, the attribute list and the phrase, then the object itself.
   The `nscores' and `cbxpn' fields are not released by this function. */
void est_cond_delete(ESTCOND *cond){
  assert(cond);
  /* free() accepts NULL, so plain string and array fields need no guard */
  free(cond->distinct);
  if(cond->shadows != NULL){
    cbmapclose(cond->shadows);
  }
  if(cond->auxwords != NULL){
    cbmapclose(cond->auxwords);
  }
  free(cond->scores);
  free(cond->order);
  if(cond->attrs != NULL){
    CB_LISTCLOSE(cond->attrs);
  }
  free(cond->phrase);
  free(cond);
}
/* Set the search phrase of a condition object.
   Leading bytes with values from 1 to ' ' (blanks and control characters) are skipped and
   the remainder is stored as a private copy, replacing any phrase set before.  An argument
   that is empty after skipping is stored as the empty string.
   The copy is taken before the previous phrase is freed, so `phrase' may safely point into
   the phrase currently held by `cond'. */
void est_cond_set_phrase(ESTCOND *cond, const char *phrase){
  char *copy;
  assert(cond && phrase);
  while(*phrase > '\0' && *phrase <= ' '){
    phrase++;
  }
  copy = cbmemdup(phrase, -1);
  /* release the old value only after the new one no longer depends on it */
  free(cond->phrase);
  cond->phrase = copy;
}
/* Add an attribute condition to a condition object.
   Leading bytes with values from 1 to ' ' are skipped.  If nothing remains the call does
   nothing; otherwise the remaining expression is appended to the attribute list, which is
   created on first use. */
void est_cond_add_attr(ESTCOND *cond, const char *expr){
  const char *start;
  size_t len;
  assert(cond && expr);
  for(start = expr; *start > '\0' && *start <= ' '; start++){
    /* skip leading blanks and control characters */
  }
  if(*start == '\0') return;
  if(cond->attrs == NULL){
    CB_LISTOPEN(cond->attrs);
  }
  /* measured once here because CB_LISTPUSH is a macro */
  len = strlen(start);
  CB_LISTPUSH(cond->attrs, start, len);
}
/* Set the order expression of a condition object.
   Leading bytes with values from 1 to ' ' are skipped.  If nothing remains, the call does
   nothing and any order set before is kept; otherwise the remainder is stored as a private
   copy, replacing the previous one.
   The copy is taken before the previous expression is freed, so `expr' may safely point
   into the expression currently held by `cond'. */
void est_cond_set_order(ESTCOND *cond, const char *expr){
  char *copy;
  assert(cond && expr);
  while(*expr > '\0' && *expr <= ' '){
    expr++;
  }
  if(*expr == '\0') return;
  copy = cbmemdup(expr, -1);
  /* release the old value only after the new one no longer depends on it */
  free(cond->order);
  cond->order = copy;
}
/* Set the maximum number of documents to retrieve for a condition object.
   `max' must not be negative (checked by assertion); it is stored unchanged. */
void est_cond_set_max(ESTCOND *cond, int max){
  assert(cond && max >= 0);
  cond->max = max;
}
/* Set the number of documents to skip for a condition object.
   `skip' must not be negative (checked by assertion); it is stored unchanged. */
void est_cond_set_skip(ESTCOND *cond, int skip){
  assert(cond && skip >= 0);
  cond->skip = skip;
}
/* Apply retrieval option flags to a condition object.
   `options' is a bit mask.  The flags are tested one after another, so when several flags
   of the same group are given the one tested last wins:
     gstep: ESTCONDSURE (1), ESTCONDUSUAL (2), ESTCONDFAST (3), ESTCONDAGITO (4);
     pmode: ESTCONDSIMPLE, ESTCONDROUGH, ESTCONDUNION, ESTCONDISECT.
   ESTCONDNOIDF turns `tfidf' off and ESTCONDSCFB turns `scfb' on.  The given bits are also
   OR-ed into `opts'; nothing is ever cleared by this function. */
void est_cond_set_options(ESTCOND *cond, int options){
  assert(cond);
  /* gstep selection */
  if(options & ESTCONDSURE){
    cond->gstep = 1;
  }
  if(options & ESTCONDUSUAL){
    cond->gstep = 2;
  }
  if(options & ESTCONDFAST){
    cond->gstep = 3;
  }
  if(options & ESTCONDAGITO){
    cond->gstep = 4;
  }
  /* weighting */
  if(options & ESTCONDNOIDF){
    cond->tfidf = FALSE;
  }
  /* phrase mode */
  if(options & ESTCONDSIMPLE){
    cond->pmode = ESTPMSIMPLE;
  }
  if(options & ESTCONDROUGH){
    cond->pmode = ESTPMROUGH;
  }
  if(options & ESTCONDUNION){
    cond->pmode = ESTPMUNION;
  }
  if(options & ESTCONDISECT){
    cond->pmode = ESTPMISECT;
  }
  /* score feedback */
  if(options & ESTCONDSCFB){
    cond->scfb = TRUE;
  }
  cond->opts |= options;
}
/* Set the `auxmin' field of a condition object, which governs adoption of results from the
   auxiliary index.  `min' is stored unchanged; est_cond_new starts the field at ESTAUXMIN. */
void est_cond_set_auxiliary(ESTCOND *cond, int min){
  assert(cond);
  cond->auxmin = min;
}
/* Set the upper limit of similarity for document eclipse.
   Only a positive `limit' is stored; zero or a negative value leaves the current setting
   untouched (est_cond_new starts it at -1.0). */
void est_cond_set_eclipse(ESTCOND *cond, double limit){
  assert(cond);
  if(limit > 0.0){
    cond->ecllim = limit;
  }
}
/* Set the attribute distinction filter of a condition object.
   Leading bytes with values from 1 to ' ' are skipped.  If nothing remains, the call does
   nothing and any name set before is kept; otherwise the remainder is stored as a private
   copy, replacing the previous one.
   The copy is taken before the previous name is freed, so `name' may safely point into the
   name currently held by `cond'. */
void est_cond_set_distinct(ESTCOND *cond, const char *name){
  char *copy;
  assert(cond && name);
  while(*name > '\0' && *name <= ' '){
    name++;
  }
  if(*name == '\0') return;
  copy = cbmemdup(name, -1);
  /* release the old value only after the new one no longer depends on it */
  free(cond->distinct);
  cond->distinct = copy;
}
/* Set the mask of targets of meta search.
   The value is AND-ed with INT_MAX before it is stored, so the stored mask never has the
   sign bit set. */
void est_cond_set_mask(ESTCOND *cond, int mask){
  int cleaned;
  assert(cond);
  cleaned = mask & INT_MAX;
  cond->mask = cleaned;
}
/*************************************************************************************************
* API for database
*************************************************************************************************/
/* Inode map for duplication check.
   Created lazily by est_db_open, which also registers est_inodes_delete for it through
   cbglobalgc.  est_db_open looks up the inode number of the path being opened (the raw bytes
   of an int are the key) and, unless ESTDBNOLCK is given, fails with ESTEACCES when an entry
   is found.
   NOTE(review): where entries are inserted and removed is not visible near this definition;
   confirm before relying on the exact life cycle. */
CBMAP *est_inodes = NULL;
/* Translate an error code into a message string.
   The returned pointer refers to a string literal.  Any code without an entry of its own
   yields "miscellaneous". */
const char *est_err_msg(int ecode){
  const char *msg = "miscellaneous";
  switch(ecode){
  case ESTENOERR:
    msg = "no error";
    break;
  case ESTEINVAL:
    msg = "invalid argument";
    break;
  case ESTEACCES:
    msg = "access forbidden";
    break;
  case ESTELOCK:
    msg = "lock failure";
    break;
  case ESTEDB:
    msg = "database problem";
    break;
  case ESTEIO:
    msg = "I/O problem";
    break;
  case ESTENOITEM:
    msg = "no such item";
    break;
  default:
    break;
  }
  return msg;
}
/* Open a database. */
ESTDB *est_db_open(const char *name, int omode, int *ecp){
ESTDB *db;
DEPOT *metadb;
ESTIDX *idxdb;
CURIA *attrdb, *textdb, *kwddb;
VILLA *fwmdb, *auxdb, *xfmdb, *listdb;
CBMAP *aidxs;
CBLIST *list;
ESTATTRIDX attridx;
void *aidxdb;
const char *elem;
char path[ESTPATHBUFSIZ], vbuf[ESTNUMBUFSIZ], *dec;
int i, inode, domode, comode, vomode, flags, idxnum, dseq, dnum;
int amode, zmode, smode, vsiz, type, crdnum;
double bdiam, ddiam;
assert(name && ecp);
if(!est_inodes){
est_inodes = cbmapopenex(ESTMINIBNUM);
cbglobalgc(est_inodes, est_inodes_delete);
}
est_set_ecode(ecp, ESTENOERR, __LINE__);
if((omode & ESTDBWRITER) && (omode & ESTDBCREAT) && !est_mkdir(name)){
switch(errno){
case EACCES:
est_set_ecode(ecp, ESTEACCES, __LINE__);
return NULL;
case EEXIST:
break;
default:
est_set_ecode(ecp, ESTEIO, __LINE__);
return NULL;
}
}
if((inode = est_inode(name)) < 1){
est_set_ecode(ecp, ESTEIO, __LINE__);
return NULL;
}
if(cbmapget(est_inodes, (char *)&inode, sizeof(int), NULL) && !(omode & ESTDBNOLCK)){
est_set_ecode(ecp, ESTEACCES, __LINE__);
return NULL;
}
domode = DP_OREADER;
comode = CR_OREADER;
vomode = VL_OREADER;
if(omode & ESTDBWRITER){
domode = DP_OWRITER;
comode = CR_OWRITER;
vomode = VL_OWRITER;
if(ESTUSEBZIP){
vomode |= VL_OXCOMP;
} else if(ESTUSELZO){
vomode |= VL_OYCOMP;
} else if(ESTUSEZLIB){
vomode |= VL_OZCOMP;
}
if(omode & ESTDBCREAT){
domode |= DP_OCREAT;
comode |= CR_OCREAT;
vomode |= VL_OCREAT;
}
if(omode & ESTDBTRUNC){
domode |= DP_OTRUNC;
comode |= CR_OTRUNC;
vomode |= VL_OTRUNC;
}
}