Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

16085 lines (14677 sloc) 423.047 kb
/* vi:set ts=8 sts=4 sw=4:
*
* VIM - Vi IMproved by Bram Moolenaar
*
* Do ":help uganda" in Vim to read copying and usage conditions.
* Do ":help credits" in Vim to see a list of people who contributed.
* See README.txt for an overview of the Vim source code.
*/
/*
* spell.c: code for spell checking
*
* The spell checking mechanism uses a tree (aka trie). Each node in the tree
* has a list of bytes that can appear (siblings). For each byte there is a
* pointer to the node with the byte that follows in the word (child).
*
* A NUL byte is used where the word may end. The bytes are sorted, so that
* binary searching can be used and the NUL bytes are at the start. The
* number of possible bytes is stored before the list of bytes.
*
* The tree uses two arrays: "byts" stores the characters, "idxs" stores
* either the next index or flags. The tree starts at index 0. For example,
* to lookup "vi" this sequence is followed:
* i = 0
* len = byts[i]
* n = where "v" appears in byts[i + 1] to byts[i + len]
* i = idxs[n]
* len = byts[i]
* n = where "i" appears in byts[i + 1] to byts[i + len]
* i = idxs[n]
* len = byts[i]
* find that byts[i + 1] is 0, idxs[i + 1] has flags for "vi".
*
* There are two word trees: one with case-folded words and one with words in
* original case. The second one is only used for keep-case words and is
* usually small.
*
* There is one additional tree for when not all prefixes are applied when
* generating the .spl file. This tree stores all the possible prefixes, as
* if they were words. At each word (prefix) end the prefix nr is stored, the
* following word must support this prefix nr. And the condition nr is
* stored, used to lookup the condition that the word must match with.
*
* Thanks to Olaf Seibert for providing an example implementation of this tree
* and the compression mechanism.
* LZ trie ideas:
* http://www.irb.hr/hr/home/ristov/papers/RistovLZtrieRevision1.pdf
* More papers: http://www-igm.univ-mlv.fr/~laporte/publi_en.html
*
* Matching involves checking the caps type: Onecap ALLCAP KeepCap.
*
* Why doesn't Vim use aspell/ispell/myspell/etc.?
* See ":help develop-spell".
*/
/* Use SPELL_PRINTTREE for debugging: dump the word tree after adding a word.
* Only use it for small word lists!
* Disabled by default: the define sits inside "#if 0". */
#if 0
# define SPELL_PRINTTREE
#endif
/* Use DEBUG_TRIEWALK to print the changes made in suggest_trie_walk() for a
* specific word.
* Disabled by default: the define sits inside "#if 0". */
#if 0
# define DEBUG_TRIEWALK
#endif
/*
 * Use this to adjust the score after finding suggestions, based on the
 * suggested word sounding like the bad word.  This is much faster than doing
 * it for every possible suggestion.
 * Disadvantage: When "the" is typed as "hte" it sounds quite different ("@"
 * vs "ht") and goes down in the list.
 * Used when 'spellsuggest' is set to "best".
 *
 * The result is (3 * word_score + sound_score) / 4.  Both arguments are
 * parenthesized in the expansion, so an expression such as "a + b" can be
 * passed for either of them without changing the meaning.
 */
#define RESCORE(word_score, sound_score) ((3 * (word_score) + (sound_score)) / 4)
/*
 * Do the opposite: based on a maximum end score and a known sound score,
 * compute the maximum word score that can be used.
 *
 * The result is (4 * word_score - sound_score) / 3.  Both arguments are
 * parenthesized in the expansion, so an expression such as "a + b" can be
 * passed for either of them without changing the meaning.
 */
#define MAXSCORE(word_score, sound_score) ((4 * (word_score) - (sound_score)) / 3)
/*
* Vim spell file format: <HEADER>
* <SECTIONS>
* <LWORDTREE>
* <KWORDTREE>
* <PREFIXTREE>
*
* <HEADER>: <fileID> <versionnr>
*
* <fileID> 8 bytes "VIMspell"
* <versionnr> 1 byte VIMSPELLVERSION
*
*
* Sections make it possible to add information to the .spl file without
* making it incompatible with previous versions. There are two kinds of
* sections:
* 1. Not essential for correct spell checking. E.g. for making suggestions.
* These are skipped when not supported.
* 2. Optional information, but essential for spell checking when present.
* E.g. conditions for affixes. When this section is present but not
* supported an error message is given.
*
* <SECTIONS>: <section> ... <sectionend>
*
* <section>: <sectionID> <sectionflags> <sectionlen> (section contents)
*
* <sectionID> 1 byte number from 0 to 254 identifying the section
*
* <sectionflags> 1 byte SNF_REQUIRED: this section is required for correct
* spell checking
*
* <sectionlen> 4 bytes length of section contents, MSB first
*
* <sectionend> 1 byte SN_END
*
*
* sectionID == SN_INFO: <infotext>
* <infotext> N bytes free format text with spell file info (version,
* website, etc)
*
* sectionID == SN_REGION: <regionname> ...
* <regionname> 2 bytes Up to 8 region names: ca, au, etc. Lower case.
* First <regionname> is region 1.
*
* sectionID == SN_CHARFLAGS: <charflagslen> <charflags>
* <folcharslen> <folchars>
* <charflagslen> 1 byte Number of bytes in <charflags> (should be 128).
* <charflags> N bytes List of flags (first one is for character 128):
* 0x01 word character CF_WORD
* 0x02 upper-case character CF_UPPER
* <folcharslen> 2 bytes Number of bytes in <folchars>.
* <folchars> N bytes Folded characters, first one is for character 128.
*
* sectionID == SN_MIDWORD: <midword>
* <midword> N bytes Characters that are word characters only when used
* in the middle of a word.
*
* sectionID == SN_PREFCOND: <prefcondcnt> <prefcond> ...
* <prefcondcnt> 2 bytes Number of <prefcond> items following.
* <prefcond> : <condlen> <condstr>
* <condlen> 1 byte Length of <condstr>.
* <condstr> N bytes Condition for the prefix.
*
* sectionID == SN_REP: <repcount> <rep> ...
* <repcount> 2 bytes number of <rep> items, MSB first.
* <rep> : <repfromlen> <repfrom> <reptolen> <repto>
* <repfromlen> 1 byte length of <repfrom>
* <repfrom> N bytes "from" part of replacement
* <reptolen> 1 byte length of <repto>
* <repto> N bytes "to" part of replacement
*
* sectionID == SN_REPSAL: <repcount> <rep> ...
* just like SN_REP but for soundfolded words
*
* sectionID == SN_SAL: <salflags> <salcount> <sal> ...
* <salflags> 1 byte flags for soundsalike conversion:
* SAL_F0LLOWUP
* SAL_COLLAPSE
* SAL_REM_ACCENTS
* <salcount> 2 bytes number of <sal> items following
* <sal> : <salfromlen> <salfrom> <saltolen> <salto>
* <salfromlen> 1 byte length of <salfrom>
* <salfrom> N bytes "from" part of soundsalike
* <saltolen> 1 byte length of <salto>
* <salto> N bytes "to" part of soundsalike
*
* sectionID == SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
* <sofofromlen> 2 bytes length of <sofofrom>
* <sofofrom> N bytes "from" part of soundfold
* <sofotolen> 2 bytes length of <sofoto>
* <sofoto> N bytes "to" part of soundfold
*
* sectionID == SN_SUGFILE: <timestamp>
* <timestamp> 8 bytes time in seconds that must match with .sug file
*
* sectionID == SN_NOSPLITSUGS: nothing
*
* sectionID == SN_WORDS: <word> ...
* <word> N bytes NUL terminated common word
*
* sectionID == SN_MAP: <mapstr>
* <mapstr> N bytes String with sequences of similar characters,
* separated by slashes.
*
* sectionID == SN_COMPOUND: <compmax> <compminlen> <compsylmax> <compoptions>
* <comppatcount> <comppattern> ... <compflags>
* <compmax> 1 byte Maximum nr of words in compound word.
* <compminlen> 1 byte Minimal word length for compounding.
* <compsylmax> 1 byte Maximum nr of syllables in compound word.
* <compoptions> 2 bytes COMP_ flags.
* <comppatcount> 2 bytes number of <comppattern> following
* <compflags> N bytes Flags from COMPOUNDRULE items, separated by
* slashes.
*
* <comppattern>: <comppatlen> <comppattext>
* <comppatlen> 1 byte length of <comppattext>
* <comppattext> N bytes end or begin chars from CHECKCOMPOUNDPATTERN
*
* sectionID == SN_NOBREAK: (empty, its presence is what matters)
*
* sectionID == SN_SYLLABLE: <syllable>
* <syllable> N bytes String from SYLLABLE item.
*
* <LWORDTREE>: <wordtree>
*
* <KWORDTREE>: <wordtree>
*
* <PREFIXTREE>: <wordtree>
*
*
* <wordtree>: <nodecount> <nodedata> ...
*
* <nodecount> 4 bytes Number of nodes following. MSB first.
*
* <nodedata>: <siblingcount> <sibling> ...
*
* <siblingcount> 1 byte Number of siblings in this node. The siblings
* follow in sorted order.
*
* <sibling>: <byte> [ <nodeidx> <xbyte>
* | <flags> [<flags2>] [<region>] [<affixID>]
* | [<pflags>] <affixID> <prefcondnr> ]
*
* <byte> 1 byte Byte value of the sibling. Special cases:
* BY_NOFLAGS: End of word without flags and for all
* regions.
* For PREFIXTREE <affixID> and
* <prefcondnr> follow.
* BY_FLAGS: End of word, <flags> follow.
* For PREFIXTREE <pflags>, <affixID>
* and <prefcondnr> follow.
* BY_FLAGS2: End of word, <flags> and <flags2>
* follow. Not used in PREFIXTREE.
* BY_INDEX: Child of sibling is shared, <nodeidx>
* and <xbyte> follow.
*
* <nodeidx> 3 bytes Index of child for this sibling, MSB first.
*
* <xbyte> 1 byte byte value of the sibling.
*
* <flags> 1 byte bitmask of:
* WF_ALLCAP word must have only capitals
* WF_ONECAP first char of word must be capital
* WF_KEEPCAP keep-case word
* WF_FIXCAP keep-case word, all caps not allowed
* WF_RARE rare word
* WF_BANNED bad word
* WF_REGION <region> follows
* WF_AFX <affixID> follows
*
* <flags2> 1 byte Bitmask of:
* WF_HAS_AFF >> 8 word includes affix
* WF_NEEDCOMP >> 8 word only valid in compound
* WF_NOSUGGEST >> 8 word not used for suggestions
* WF_COMPROOT >> 8 word already a compound
* WF_NOCOMPBEF >> 8 no compounding before this word
* WF_NOCOMPAFT >> 8 no compounding after this word
*
* <pflags> 1 byte bitmask of:
* WFP_RARE rare prefix
* WFP_NC non-combining prefix
* WFP_UP letter after prefix made upper case
*
* <region> 1 byte Bitmask for regions in which word is valid. When
* omitted it's valid in all regions.
* Lowest bit is for region 1.
*
* <affixID> 1 byte ID of affix that can be used with this word. In
* PREFIXTREE used for the required prefix ID.
*
* <prefcondnr> 2 bytes Prefix condition number, index in the <prefcond> list
* from the SN_PREFCOND section.
*
* All text characters are in 'encoding', but stored as single bytes.
*/
/*
* Vim .sug file format: <SUGHEADER>
* <SUGWORDTREE>
* <SUGTABLE>
*
* <SUGHEADER>: <fileID> <versionnr> <timestamp>
*
* <fileID> 6 bytes "VIMsug"
* <versionnr> 1 byte VIMSUGVERSION
* <timestamp> 8 bytes timestamp that must match with .spl file
*
*
* <SUGWORDTREE>: <wordtree> (see above, no flags or region used)
*
*
* <SUGTABLE>: <sugwcount> <sugline> ...
*
* <sugwcount> 4 bytes number of <sugline> following
*
* <sugline>: <sugnr> ... NUL
*
* <sugnr>: X bytes word number that results in this soundfolded word,
* stored as an offset to the previous number in as
* few bytes as possible (see offset2bytes())
*/
#include "vim.h"
#if defined(FEAT_SPELL) || defined(PROTO)
#ifndef UNIX /* it's in os_unix.h for Unix */
# include <time.h> /* for time_t */
#endif
/* Upper bound assumed for the byte length of a word.  Some places store a
 * word length in a single byte, so this value cannot go above 255. */
#define MAXWLEN 250

/* Index type for the word tree; it needs to be at least 4 bytes wide.  When
 * int is 8 bytes something smaller could be used, but it is unclear what. */
#if SIZEOF_INT <= 3
typedef long idx_T;
#else
typedef int idx_T;
#endif
/* Pieces used for spell file names.  On VMS "_" takes the place of the "."
 * separator that is used everywhere else. */
#ifndef VMS
# define SPL_FNAME_TMPL "%s.%s.spl"
# define SPL_FNAME_ADD ".add."
# define SPL_FNAME_ASCII ".ascii."
#else
# define SPL_FNAME_TMPL "%s_%s.spl"
# define SPL_FNAME_ADD "_add."
# define SPL_FNAME_ASCII "_ascii."
#endif
/* Flags used for a word. Only the lowest byte can be used, the region byte
* comes above it.
* The first eight values are the bits of the <flags> byte in the .spl file. */
#define WF_REGION 0x01 /* region byte follows */
#define WF_ONECAP 0x02 /* word with one capital (or all capitals) */
#define WF_ALLCAP 0x04 /* word must be all capitals */
#define WF_RARE 0x08 /* rare word */
#define WF_BANNED 0x10 /* bad word */
#define WF_AFX 0x20 /* affix ID follows */
#define WF_FIXCAP 0x40 /* keep-case word, allcap not allowed */
#define WF_KEEPCAP 0x80 /* keep-case word */
/* for <flags2>, shifted up one byte to be used in wn_flags; the file stores
* each of these as (value >> 8) */
#define WF_HAS_AFF 0x0100 /* word includes affix */
#define WF_NEEDCOMP 0x0200 /* word only valid in compound */
#define WF_NOSUGGEST 0x0400 /* word not to be suggested */
#define WF_COMPROOT 0x0800 /* already compounded word, COMPOUNDROOT */
#define WF_NOCOMPBEF 0x1000 /* no compounding before this word */
#define WF_NOCOMPAFT 0x2000 /* no compounding after this word */
/* only used for su_badflags; note that this has the same value as WF_AFX */
#define WF_MIXCAP 0x20 /* mix of upper and lower case: macaRONI */
/* all the flags that say something about the case of the word */
#define WF_CAPMASK (WF_ONECAP | WF_ALLCAP | WF_KEEPCAP | WF_FIXCAP)
/* flags for <pflags>: the byte stored for a prefix in the prefix tree */
#define WFP_RARE 0x01 /* rare prefix */
#define WFP_NC 0x02 /* prefix is not combining */
#define WFP_UP 0x04 /* to-upper prefix */
#define WFP_COMPPERMIT 0x08 /* prefix with COMPOUNDPERMITFLAG */
#define WFP_COMPFORBID 0x10 /* prefix with COMPOUNDFORBIDFLAG */
/* Flags for postponed prefixes in "sl_pidxs". Must be above affixID (one
* byte) and prefcondnr (two bytes).
* Each value is the corresponding WFP_ flag shifted left by 24 bits, which
* puts it past those three low bytes. */
#define WF_RAREPFX (WFP_RARE << 24) /* rare postponed prefix */
#define WF_PFX_NC (WFP_NC << 24) /* non-combining postponed prefix */
#define WF_PFX_UP (WFP_UP << 24) /* to-upper postponed prefix */
#define WF_PFX_COMPPERMIT (WFP_COMPPERMIT << 24) /* postponed prefix with
* COMPOUNDPERMITFLAG */
#define WF_PFX_COMPFORBID (WFP_COMPFORBID << 24) /* postponed prefix with
* COMPOUNDFORBIDFLAG */
/* flags for <compoptions> (the two-byte field of the SN_COMPOUND section);
* each one is a separate bit, named after the affix file item in the comment */
#define COMP_CHECKDUP 1 /* CHECKCOMPOUNDDUP */
#define COMP_CHECKREP 2 /* CHECKCOMPOUNDREP */
#define COMP_CHECKCASE 4 /* CHECKCOMPOUNDCASE */
#define COMP_CHECKTRIPLE 8 /* CHECKCOMPOUNDTRIPLE */
/* Special byte values for <byte>. Some are only used in the tree for
* postponed prefixes, some only in the other trees. This is a bit messy...
* These are the values 0 to 3; BY_SPECIAL marks the top of that range. */
#define BY_NOFLAGS 0 /* end of word without flags or region; for
* postponed prefix: no <pflags> */
#define BY_INDEX 1 /* child is shared, index follows */
#define BY_FLAGS 2 /* end of word, <flags> byte follows; for
* postponed prefix: <pflags> follows */
#define BY_FLAGS2 3 /* end of word, <flags> and <flags2> bytes
* follow; never used in prefix tree */
#define BY_SPECIAL BY_FLAGS2 /* highest special byte value */
/* Info from "REP", "REPSAL" and "SAL" entries in ".aff" file used in si_rep,
* si_repsal, sl_rep, and si_sal. Not for sl_sal!
* One replacement: from "ft_from" to "ft_to".
* Both members are string pointers; who owns the memory is not visible from
* this declaration. */
typedef struct fromto_S
{
char_u *ft_from; /* text to be replaced */
char_u *ft_to; /* text it is replaced with */
} fromto_T;
/* Info from "SAL" entries in ".aff" file used in sl_sal.
* The info is split for quick processing by spell_soundfold().
* Note that "sm_oneof" and "sm_rules" point into sm_lead.
* The "_w" members only exist when FEAT_MBYTE is defined. */
typedef struct salitem_S
{
char_u *sm_lead; /* leading letters */
int sm_leadlen; /* length of "sm_lead" */
char_u *sm_oneof; /* letters from () or NULL */
char_u *sm_rules; /* rules like ^, $, priority */
char_u *sm_to; /* replacement. */
#ifdef FEAT_MBYTE
int *sm_lead_w; /* wide character copy of "sm_lead" */
int *sm_oneof_w; /* wide character copy of "sm_oneof" */
int *sm_to_w; /* wide character copy of "sm_to" */
#endif
} salitem_T;
/* Element type of the "sl_sal_first" table: an int when multi-byte support
 * is compiled in, a short otherwise. */
#ifndef FEAT_MBYTE
typedef short salfirst_T;
#else
typedef int salfirst_T;
#endif
/* Values for SP_*ERROR are negative, positive values are used by
 * read_cnt_string().
 * The negative values are parenthesized so that the macros behave as a single
 * operand wherever they are expanded. */
#define SP_TRUNCERROR (-1) /* spell file truncated error */
#define SP_FORMERROR (-2) /* format error in spell file */
#define SP_OTHERERROR (-3) /* other error while reading spell file */
/*
* Structure used to store words and other info for one language, loaded from
* a .spl file.
* The main access is through the tree in "sl_fbyts/sl_fidxs", storing the
* case-folded words. "sl_kbyts/sl_kidxs" is for keep-case words.
*
* The "byts" array stores the possible bytes in each tree node, preceded by
* the number of possible bytes, sorted on byte value:
* <len> <byte1> <byte2> ...
* The "idxs" array stores the index of the child node corresponding to the
* byte in "byts".
* Exception: when the byte is zero, the word may end here and "idxs" holds
* the flags, region mask and affixID for the word. There may be several
* zeros in sequence for alternative flag/region/affixID combinations.
*
* Loaded languages are chained through "sl_next"; "first_lang", declared
* right after the struct, is the head of that list.
*/
typedef struct slang_S slang_T;
struct slang_S
{
slang_T *sl_next; /* next language */
char_u *sl_name; /* language name "en", "en.rare", "nl", etc. */
char_u *sl_fname; /* name of .spl file */
int sl_add; /* TRUE if it's a .add file. */
char_u *sl_fbyts; /* case-folded word bytes */
idx_T *sl_fidxs; /* case-folded word indexes */
char_u *sl_kbyts; /* keep-case word bytes */
idx_T *sl_kidxs; /* keep-case word indexes */
char_u *sl_pbyts; /* prefix tree word bytes */
idx_T *sl_pidxs; /* prefix tree word indexes */
char_u *sl_info; /* infotext string or NULL */
char_u sl_regions[17]; /* table with up to 8 region names (two bytes
each) plus NUL */
char_u *sl_midword; /* MIDWORD string or NULL */
hashtab_T sl_wordcount; /* hashtable with word count, wordcount_T */
int sl_compmax; /* COMPOUNDWORDMAX (default: MAXWLEN) */
int sl_compminlen; /* COMPOUNDMIN (default: 0) */
int sl_compsylmax; /* COMPOUNDSYLMAX (default: MAXWLEN) */
int sl_compoptions; /* COMP_* flags */
garray_T sl_comppat; /* CHECKCOMPOUNDPATTERN items */
regprog_T *sl_compprog; /* COMPOUNDRULE turned into a regexp program
* (NULL when no compounding) */
char_u *sl_comprules; /* all COMPOUNDRULE concatenated (or NULL) */
char_u *sl_compstartflags; /* flags for first compound word */
char_u *sl_compallflags; /* all flags for compound words */
char_u sl_nobreak; /* When TRUE: no spaces between words */
char_u *sl_syllable; /* SYLLABLE repeatable chars or NULL */
garray_T sl_syl_items; /* syllable items */
int sl_prefixcnt; /* number of items in "sl_prefprog" */
regprog_T **sl_prefprog; /* table with regprogs for prefixes */
garray_T sl_rep; /* list of fromto_T entries from REP lines */
short sl_rep_first[256]; /* indexes where byte first appears, -1 if
there is none */
garray_T sl_sal; /* list of salitem_T entries from SAL lines */
salfirst_T sl_sal_first[256]; /* indexes where byte first appears, -1 if
there is none */
int sl_followup; /* SAL followup */
int sl_collapse; /* SAL collapse_result */
int sl_rem_accents; /* SAL remove_accents */
int sl_sofo; /* SOFOFROM and SOFOTO instead of SAL items:
* "sl_sal_first" maps chars, when has_mbyte
* "sl_sal" is a list of wide char lists. */
garray_T sl_repsal; /* list of fromto_T entries from REPSAL lines */
short sl_repsal_first[256]; /* sl_rep_first for REPSAL lines */
int sl_nosplitsugs; /* don't suggest splitting a word */
/* Info from the .sug file. Loaded on demand. */
time_t sl_sugtime; /* timestamp for .sug file */
char_u *sl_sbyts; /* soundfolded word bytes */
idx_T *sl_sidxs; /* soundfolded word indexes */
buf_T *sl_sugbuf; /* buffer with word number table */
int sl_sugloaded; /* TRUE when .sug file was loaded or failed to
load */
int sl_has_map; /* TRUE if there is a MAP line */
#ifdef FEAT_MBYTE
hashtab_T sl_map_hash; /* MAP for multi-byte chars */
int sl_map_array[256]; /* MAP for first 256 chars */
#else
char_u sl_map_array[256]; /* MAP for first 256 chars */
#endif
hashtab_T sl_sounddone; /* table with soundfolded words that have
been handled, see add_sound_suggest() */
};
/* First language that is loaded, start of the linked list of loaded
* languages. */
static slang_T *first_lang = NULL;
/* Flags used in .spl file for soundsalike flags: the bits of the <salflags>
* byte in the SN_SAL section.
* Note: SAL_F0LLOWUP is spelled with the digit zero, not the letter "O". */
#define SAL_F0LLOWUP 1
#define SAL_COLLAPSE 2
#define SAL_REM_ACCENTS 4
/*
* Structure used in "b_langp", filled from 'spelllang'.
* One entry describes one language together with the region to use for it.
*/
typedef struct langp_S
{
slang_T *lp_slang; /* info for this language */
slang_T *lp_sallang; /* language used for sound folding or NULL */
slang_T *lp_replang; /* language used for REP items or NULL */
int lp_region; /* bitmask for region or REGION_ALL */
} langp_T;
/* Pointer to entry "i" of growarray "ga", whose ga_data is treated as an
* array of langp_T. */
#define LANGP_ENTRY(ga, i) (((langp_T *)(ga).ga_data) + (i))
#define REGION_ALL 0xff /* word valid in all regions */
#define VIMSPELLMAGIC "VIMspell" /* string at start of Vim spell file */
#define VIMSPELLMAGICL 8 /* number of bytes in VIMSPELLMAGIC */
#define VIMSPELLVERSION 50 /* value of <versionnr> in a .spl file */
#define VIMSUGMAGIC "VIMsug" /* string at start of Vim .sug file */
#define VIMSUGMAGICL 6 /* number of bytes in VIMSUGMAGIC */
#define VIMSUGVERSION 1 /* value of <versionnr> in a .sug file */
/* Section IDs. Only renumber them when VIMSPELLVERSION changes!
* These are the <sectionID> values stored in the .spl file. */
#define SN_REGION 0 /* <regionname> section */
#define SN_CHARFLAGS 1 /* charflags section */
#define SN_MIDWORD 2 /* <midword> section */
#define SN_PREFCOND 3 /* <prefcond> section */
#define SN_REP 4 /* REP items section */
#define SN_SAL 5 /* SAL items section */
#define SN_SOFO 6 /* soundfolding section */
#define SN_MAP 7 /* MAP items section */
#define SN_COMPOUND 8 /* compound words section */
#define SN_SYLLABLE 9 /* syllable section */
#define SN_NOBREAK 10 /* NOBREAK section */
#define SN_SUGFILE 11 /* timestamp for .sug file */
#define SN_REPSAL 12 /* REPSAL items section */
#define SN_WORDS 13 /* common words */
#define SN_NOSPLITSUGS 14 /* don't split word for suggestions */
#define SN_INFO 15 /* info section */
#define SN_END 255 /* end of sections, the <sectionend> byte */
#define SNF_REQUIRED 1 /* <sectionflags>: required section */
/* Result values.  Lower number is accepted over higher one.
 * SP_BANNED is parenthesized so that the negative value behaves as a single
 * operand wherever the macro is expanded. */
#define SP_BANNED (-1)
#define SP_OK 0
#define SP_RARE 1
#define SP_LOCAL 2
#define SP_BAD 3
/* file used for "zG" and "zW" */
static char_u *int_wordlist = NULL;
/* Item for the "sl_wordcount" hashtable: a counter followed by the word.
* HI2WC() shows that the hash key points at "wc_word". */
typedef struct wordcount_S
{
short_u wc_count; /* nr of times word was seen */
char_u wc_word[1]; /* word, actually longer */
} wordcount_T;
/* Dummy item, used here to compute the offset of "wc_word". */
static wordcount_T dumwc;
/* Byte offset of "wc_word" from the start of a wordcount_T. */
#define WC_KEY_OFF (unsigned)(dumwc.wc_word - (char_u *)&dumwc)
/* Turn a hash item pointer into the wordcount_T that contains its key. */
#define HI2WC(hi) ((wordcount_T *)((hi)->hi_key - WC_KEY_OFF))
/* presumably the highest value kept in "wc_count" - confirm at the users */
#define MAXWORDCOUNT 0xffff
/*
* Information used when looking for suggestions.
* The three word buffers are MAXWLEN bytes each.
*/
typedef struct suginfo_S
{
garray_T su_ga; /* suggestions, contains "suggest_T" */
int su_maxcount; /* max. number of suggestions displayed */
int su_maxscore; /* maximum score for adding to su_ga */
int su_sfmaxscore; /* idem, for when doing soundfold words */
garray_T su_sga; /* like su_ga, sound-folded scoring */
char_u *su_badptr; /* start of bad word in line */
int su_badlen; /* length of detected bad word in line */
int su_badflags; /* caps flags for bad word */
char_u su_badword[MAXWLEN]; /* bad word truncated at su_badlen */
char_u su_fbadword[MAXWLEN]; /* su_badword case-folded */
char_u su_sal_badword[MAXWLEN]; /* su_badword soundfolded */
hashtab_T su_banned; /* table with banned words */
slang_T *su_sallang; /* default language for sound folding */
} suginfo_T;
/* One word suggestion. Used in "su_ga" (and "su_sga") of suginfo_T. */
typedef struct suggest_S
{
char_u *st_word; /* suggested word, allocated string */
int st_wordlen; /* STRLEN(st_word) */
int st_orglen; /* length of replaced text */
int st_score; /* lower is better */
int st_altscore; /* used when st_score compares equal */
int st_salscore; /* st_score is for soundalike */
int st_had_bonus; /* bonus already included in score */
slang_T *st_slang; /* language used for sound folding */
} suggest_T;
/* Entry "i" of growarray "ga", whose ga_data is treated as an array of
* suggest_T. */
#define SUG(ga, i) (((suggest_T *)(ga).ga_data)[i])
/* TRUE if a word appears in the list of banned words.
 * "su" is a pointer to the suggestion info whose "su_banned" table is
 * searched for "word".  Both arguments are parenthesized in the expansion so
 * that any pointer expression can be passed, not only a plain name. */
#define WAS_BANNED(su, word) (!HASHITEM_EMPTY(hash_find(&(su)->su_banned, (word))))
/* How many suggestions are kept when cleaning up.  This has to be more than
 * what gets displayed: rescore_suggestions() may change scores afterwards and
 * wrong suggestions may still be removed later. */
#define SUG_CLEAN_COUNT(su) ((su)->su_maxcount >= 130 ? (su)->su_maxcount + 20 : 150)
/* Number of suggestions at which sorting and cleaning up is done.  Avoids
 * keeping lots of suggestions that are never going to be displayed. */
#define SUG_MAX_COUNT(su) (50 + SUG_CLEAN_COUNT(su))
/* score for various changes */
#define SCORE_SPLIT 149 /* split bad word */
#define SCORE_SPLIT_NO 249 /* split bad word with NOSPLITSUGS */
#define SCORE_ICASE 52 /* slightly different case */
#define SCORE_REGION 200 /* word is for different region */
#define SCORE_RARE 180 /* rare word */
#define SCORE_SWAP 75 /* swap two characters */
#define SCORE_SWAP3 110 /* swap two characters in three */
#define SCORE_REP 65 /* REP replacement */
#define SCORE_SUBST 93 /* substitute a character */
#define SCORE_SIMILAR 33 /* substitute a similar character */
#define SCORE_SUBCOMP 33 /* substitute a composing character */
#define SCORE_DEL 94 /* delete a character */
#define SCORE_DELDUP 66 /* delete a duplicated character */
#define SCORE_DELCOMP 28 /* delete a composing character */
#define SCORE_INS 96 /* insert a character */
#define SCORE_INSDUP 67 /* insert a duplicate character */
#define SCORE_INSCOMP 30 /* insert a composing character */
#define SCORE_NONWORD 103 /* change non-word to word char */
#define SCORE_FILE 30 /* suggestion from a file */
#define SCORE_MAXINIT 350 /* Initial maximum score: higher == slower.
			   * 350 allows for about three changes. */
#define SCORE_COMMON1 30 /* subtracted for words seen before */
#define SCORE_COMMON2 40 /* subtracted for words often seen */
#define SCORE_COMMON3 50 /* subtracted for words very often seen */
#define SCORE_THRES2 10 /* word count threshold for COMMON2 */
#define SCORE_THRES3 100 /* word count threshold for COMMON3 */
/* When trying changed soundfold words it becomes slow when trying more than
 * two changes.  With fewer than two changes it's slightly faster but we miss
 * a few good suggestions.  In rare cases we need to try three or four
 * changes.
 */
#define SCORE_SFMAX1 200 /* maximum score for first try */
#define SCORE_SFMAX2 300 /* maximum score for second try */
#define SCORE_SFMAX3 400 /* maximum score for third try */
/* The product is parenthesized so that SCORE_BIG is a single operand in any
 * expression, e.g. "x / SCORE_BIG" divides by 288, not by 96. */
#define SCORE_BIG (SCORE_INS * 3) /* big difference */
#define SCORE_MAXMAX 999999 /* accept any score */
#define SCORE_LIMITMAX 350 /* for spell_edit_score_limit() */
/* for spell_edit_score_limit() we need to know the minimum value of
 * SCORE_ICASE, SCORE_SWAP, SCORE_DEL, SCORE_SIMILAR and SCORE_INS */
#define SCORE_EDIT_MIN SCORE_SIMILAR
/*
* Structure to store info for word matching.
* It is the first argument of find_word(), find_prefix() and fold_more().
*/
typedef struct matchinf_S
{
langp_T *mi_lp; /* info for language and region */
/* pointers to original text to be checked */
char_u *mi_word; /* start of word being checked */
char_u *mi_end; /* end of matching word so far */
char_u *mi_fend; /* next char to be added to mi_fword */
char_u *mi_cend; /* char after what was used for
mi_capflags */
/* case-folded text */
char_u mi_fword[MAXWLEN + 1]; /* mi_word case-folded */
int mi_fwordlen; /* nr of valid bytes in mi_fword */
/* for when checking word after a prefix */
int mi_prefarridx; /* index in sl_pidxs with list of
affixID/condition */
int mi_prefcnt; /* number of entries at mi_prefarridx */
int mi_prefixlen; /* byte length of prefix */
#ifdef FEAT_MBYTE
int mi_cprefixlen; /* byte length of prefix in original
case */
#else
# define mi_cprefixlen mi_prefixlen /* it's the same value */
#endif
/* for when checking a compound word */
int mi_compoff; /* start of following word offset */
char_u mi_compflags[MAXWLEN]; /* flags for compound words used */
int mi_complen; /* nr of compound words used */
int mi_compextra; /* nr of COMPOUNDROOT words */
/* others */
int mi_result; /* result so far: SP_BAD, SP_OK, etc. */
int mi_capflags; /* WF_ONECAP WF_ALLCAP WF_KEEPCAP */
win_T *mi_win; /* window being checked (a win_T, not the
buffer itself) */
/* for NOBREAK */
int mi_result2; /* "mi_result" without following word */
char_u *mi_end2; /* "mi_end" without following word */
} matchinf_T;
/*
* The tables used for recognizing word characters according to spelling.
* These are only used for the first 256 characters of 'encoding'.
* Every table has 256 entries, one for each possible byte value.
*/
typedef struct spelltab_S
{
char_u st_isw[256]; /* flags: is word char */
char_u st_isu[256]; /* flags: is uppercase char */
char_u st_fold[256]; /* chars: folded case */
char_u st_upper[256]; /* chars: upper case */
} spelltab_T;
/* the file-local instance of the tables */
static spelltab_T spelltab;
/* presumably TRUE once "spelltab" has been filled in - confirm at the users */
static int did_set_spelltab;
/* Bits used in <charflags> of the SN_CHARFLAGS section. */
#define CF_WORD 0x01 /* word character */
#define CF_UPPER 0x02 /* upper-case character */
/* Forward declarations of file-local functions for the spell character
* tables and for writing the prefix conditions.
* The parameter lists are wrapped in __ARGS(), a macro defined outside this
* file. */
static void clear_spell_chartab __ARGS((spelltab_T *sp));
static int set_spell_finish __ARGS((spelltab_T *new_st));
static int spell_iswordp __ARGS((char_u *p, win_T *wp));
static int spell_iswordp_nmw __ARGS((char_u *p));
#ifdef FEAT_MBYTE
/* only available with multi-byte support */
static int spell_mb_isword_class __ARGS((int cl));
static int spell_iswordp_w __ARGS((int *p, win_T *wp));
#endif
static int write_spell_prefcond __ARGS((FILE *fd, garray_T *gap));
/*
* For finding suggestions: At each node in the tree these states are tried:
* The current state of a level is kept in "ts_state" of trystate_T.
* The values count up from STATE_START, which is zero.
*/
typedef enum
{
STATE_START = 0, /* At start of node check for NUL bytes (goodword
* ends); if badword ends there is a match, otherwise
* try splitting word. */
STATE_NOPREFIX, /* try without prefix */
STATE_SPLITUNDO, /* Undo splitting. */
STATE_ENDNUL, /* Past NUL bytes at start of the node. */
STATE_PLAIN, /* Use each byte of the node. */
STATE_DEL, /* Delete a byte from the bad word. */
STATE_INS_PREP, /* Prepare for inserting bytes. */
STATE_INS, /* Insert a byte in the bad word. */
STATE_SWAP, /* Swap two bytes. */
STATE_UNSWAP, /* Undo swap two characters. */
STATE_SWAP3, /* Swap two characters over three. */
STATE_UNSWAP3, /* Undo swap two characters over three. */
STATE_UNROT3L, /* Undo rotate three characters left */
STATE_UNROT3R, /* Undo rotate three characters right */
STATE_REP_INI, /* Prepare for using REP items. */
STATE_REP, /* Use matching REP items from the .aff file. */
STATE_REP_UNDO, /* Undo a REP item replacement. */
STATE_FINAL /* End of this node. */
} state_T;
/*
* Struct to keep the state at each level in suggest_try_change().
* Most members are a single byte (char_u). That is enough for an index or
* length in a word, since MAXWLEN is below 256.
*/
typedef struct trystate_S
{
state_T ts_state; /* state at this level, STATE_ */
int ts_score; /* score */
idx_T ts_arridx; /* index in tree array, start of node */
short ts_curi; /* index in list of child nodes */
char_u ts_fidx; /* index in fword[], case-folded bad word */
char_u ts_fidxtry; /* ts_fidx at which bytes may be changed */
char_u ts_twordlen; /* valid length of tword[] */
char_u ts_prefixdepth; /* stack depth for end of prefix or
* PFD_PREFIXTREE or PFD_NOPREFIX */
char_u ts_flags; /* TSF_ flags */
#ifdef FEAT_MBYTE
char_u ts_tcharlen; /* number of bytes in tword character */
char_u ts_tcharidx; /* current byte index in tword character */
char_u ts_isdiff; /* DIFF_ values */
char_u ts_fcharstart; /* index in fword where badword char started */
#endif
char_u ts_prewordlen; /* length of word in "preword[]" */
char_u ts_splitoff; /* index in "tword" after last split */
char_u ts_splitfidx; /* "ts_fidx" at word split */
char_u ts_complen; /* nr of compound words used */
char_u ts_compsplit; /* index for "compflags" where word was split */
char_u ts_save_badflags; /* su_badflags saved here */
char_u ts_delidx; /* index in fword for char that was deleted,
valid when "ts_flags" has TSF_DIDDEL */
} trystate_T;
/* values for ts_isdiff */
#define DIFF_NONE 0 /* no different byte (yet) */
#define DIFF_YES 1 /* different byte found */
#define DIFF_INSERT 2 /* inserting character */
/* values for ts_flags */
#define TSF_PREFIXOK 1 /* already checked that prefix is OK */
#define TSF_DIDSPLIT 2 /* tried split at this point */
#define TSF_DIDDEL 4 /* did a delete, "ts_delidx" has index */
/* special values ts_prefixdepth */
#define PFD_NOPREFIX 0xff /* not using prefixes */
#define PFD_PREFIXTREE 0xfe /* walking through the prefix tree */
#define PFD_NOTSPECIAL 0xfd /* highest value that's not special */
/* mode values for find_word */
#define FIND_FOLDWORD 0 /* find word case-folded */
#define FIND_KEEPWORD 1 /* find keep-case word */
#define FIND_PREFIX 2 /* find word after prefix */
#define FIND_COMPOUND 3 /* find case-folded compound word */
#define FIND_KEEPCOMPOUND 4 /* find keep-case compound word */
/*
 * Prototypes for the static (file-local) functions, declared up front so
 * that they can be called before their definition appears.
 * Every parameter list is wrapped in the __ARGS(()) macro (defined
 * elsewhere; presumably it drops the list for pre-ANSI compilers, matching
 * the K&R style definitions used in this file).
 * A few functions are only declared when FEAT_MBYTE or FEAT_EVAL is defined.
 */
static slang_T *slang_alloc __ARGS((char_u *lang));
static void slang_free __ARGS((slang_T *lp));
static void slang_clear __ARGS((slang_T *lp));
static void slang_clear_sug __ARGS((slang_T *lp));
static void find_word __ARGS((matchinf_T *mip, int mode));
static int match_checkcompoundpattern __ARGS((char_u *ptr, int wlen, garray_T *gap));
static int can_compound __ARGS((slang_T *slang, char_u *word, char_u *flags));
static int can_be_compound __ARGS((trystate_T *sp, slang_T *slang, char_u *compflags, int flag));
static int match_compoundrule __ARGS((slang_T *slang, char_u *compflags));
static int valid_word_prefix __ARGS((int totprefcnt, int arridx, int flags, char_u *word, slang_T *slang, int cond_req));
static void find_prefix __ARGS((matchinf_T *mip, int mode));
static int fold_more __ARGS((matchinf_T *mip));
static int spell_valid_case __ARGS((int wordflags, int treeflags));
static int no_spell_checking __ARGS((win_T *wp));
static void spell_load_lang __ARGS((char_u *lang));
static char_u *spell_enc __ARGS((void));
static void int_wordlist_spl __ARGS((char_u *fname));
static void spell_load_cb __ARGS((char_u *fname, void *cookie));
static slang_T *spell_load_file __ARGS((char_u *fname, char_u *lang, slang_T *old_lp, int silent));
static char_u *read_cnt_string __ARGS((FILE *fd, int cnt_bytes, int *lenp));
static int read_region_section __ARGS((FILE *fd, slang_T *slang, int len));
static int read_charflags_section __ARGS((FILE *fd));
static int read_prefcond_section __ARGS((FILE *fd, slang_T *lp));
static int read_rep_section __ARGS((FILE *fd, garray_T *gap, short *first));
static int read_sal_section __ARGS((FILE *fd, slang_T *slang));
static int read_words_section __ARGS((FILE *fd, slang_T *lp, int len));
static void count_common_word __ARGS((slang_T *lp, char_u *word, int len, int count));
static int score_wordcount_adj __ARGS((slang_T *slang, int score, char_u *word, int split));
static int read_sofo_section __ARGS((FILE *fd, slang_T *slang));
static int read_compound __ARGS((FILE *fd, slang_T *slang, int len));
static int byte_in_str __ARGS((char_u *str, int byte));
static int init_syl_tab __ARGS((slang_T *slang));
static int count_syllables __ARGS((slang_T *slang, char_u *word));
static int set_sofo __ARGS((slang_T *lp, char_u *from, char_u *to));
static void set_sal_first __ARGS((slang_T *lp));
#ifdef FEAT_MBYTE
static int *mb_str2wide __ARGS((char_u *s));
#endif
static int spell_read_tree __ARGS((FILE *fd, char_u **bytsp, idx_T **idxsp, int prefixtree, int prefixcnt));
static idx_T read_tree_node __ARGS((FILE *fd, char_u *byts, idx_T *idxs, int maxidx, idx_T startidx, int prefixtree, int maxprefcondnr));
static void clear_midword __ARGS((win_T *buf));
static void use_midword __ARGS((slang_T *lp, win_T *buf));
static int find_region __ARGS((char_u *rp, char_u *region));
static int captype __ARGS((char_u *word, char_u *end));
static int badword_captype __ARGS((char_u *word, char_u *end));
static void spell_reload_one __ARGS((char_u *fname, int added_word));
static void set_spell_charflags __ARGS((char_u *flags, int cnt, char_u *upp));
static int set_spell_chartab __ARGS((char_u *fol, char_u *low, char_u *upp));
static int spell_casefold __ARGS((char_u *p, int len, char_u *buf, int buflen));
static int check_need_cap __ARGS((linenr_T lnum, colnr_T col));
static void spell_find_suggest __ARGS((char_u *badptr, int badlen, suginfo_T *su, int maxcount, int banbadword, int need_cap, int interactive));
#ifdef FEAT_EVAL
static void spell_suggest_expr __ARGS((suginfo_T *su, char_u *expr));
#endif
static void spell_suggest_file __ARGS((suginfo_T *su, char_u *fname));
static void spell_suggest_intern __ARGS((suginfo_T *su, int interactive));
static void suggest_load_files __ARGS((void));
static void tree_count_words __ARGS((char_u *byts, idx_T *idxs));
static void spell_find_cleanup __ARGS((suginfo_T *su));
static void onecap_copy __ARGS((char_u *word, char_u *wcopy, int upper));
static void allcap_copy __ARGS((char_u *word, char_u *wcopy));
static void suggest_try_special __ARGS((suginfo_T *su));
static void suggest_try_change __ARGS((suginfo_T *su));
static void suggest_trie_walk __ARGS((suginfo_T *su, langp_T *lp, char_u *fword, int soundfold));
static void go_deeper __ARGS((trystate_T *stack, int depth, int score_add));
#ifdef FEAT_MBYTE
static int nofold_len __ARGS((char_u *fword, int flen, char_u *word));
#endif
static void find_keepcap_word __ARGS((slang_T *slang, char_u *fword, char_u *kword));
static void score_comp_sal __ARGS((suginfo_T *su));
static void score_combine __ARGS((suginfo_T *su));
static int stp_sal_score __ARGS((suggest_T *stp, suginfo_T *su, slang_T *slang, char_u *badsound));
static void suggest_try_soundalike_prep __ARGS((void));
static void suggest_try_soundalike __ARGS((suginfo_T *su));
static void suggest_try_soundalike_finish __ARGS((void));
static void add_sound_suggest __ARGS((suginfo_T *su, char_u *goodword, int score, langp_T *lp));
static int soundfold_find __ARGS((slang_T *slang, char_u *word));
static void make_case_word __ARGS((char_u *fword, char_u *cword, int flags));
static void set_map_str __ARGS((slang_T *lp, char_u *map));
static int similar_chars __ARGS((slang_T *slang, int c1, int c2));
static void add_suggestion __ARGS((suginfo_T *su, garray_T *gap, char_u *goodword, int badlen, int score, int altscore, int had_bonus, slang_T *slang, int maxsf));
static void check_suggestions __ARGS((suginfo_T *su, garray_T *gap));
static void add_banned __ARGS((suginfo_T *su, char_u *word));
static void rescore_suggestions __ARGS((suginfo_T *su));
static void rescore_one __ARGS((suginfo_T *su, suggest_T *stp));
static int cleanup_suggestions __ARGS((garray_T *gap, int maxscore, int keep));
static void spell_soundfold __ARGS((slang_T *slang, char_u *inword, int folded, char_u *res));
static void spell_soundfold_sofo __ARGS((slang_T *slang, char_u *inword, char_u *res));
static void spell_soundfold_sal __ARGS((slang_T *slang, char_u *inword, char_u *res));
#ifdef FEAT_MBYTE
static void spell_soundfold_wsal __ARGS((slang_T *slang, char_u *inword, char_u *res));
#endif
static int soundalike_score __ARGS((char_u *goodsound, char_u *badsound));
static int spell_edit_score __ARGS((slang_T *slang, char_u *badword, char_u *goodword));
static int spell_edit_score_limit __ARGS((slang_T *slang, char_u *badword, char_u *goodword, int limit));
#ifdef FEAT_MBYTE
static int spell_edit_score_limit_w __ARGS((slang_T *slang, char_u *badword, char_u *goodword, int limit));
#endif
static void dump_word __ARGS((slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, linenr_T lnum));
static linenr_T dump_prefixes __ARGS((slang_T *slang, char_u *word, char_u *pat, int *dir, int round, int flags, linenr_T startlnum));
static buf_T *open_spellbuf __ARGS((void));
static void close_spellbuf __ARGS((buf_T *buf));
/*
 * Use our own character-case definitions, because the current locale may
 * differ from what the .spl file uses.
 * These must not be called with negative number!  Values below 256 are used
 * directly as an index in the "spelltab" tables and only the upper bound is
 * checked.
 * Every variant evaluates its argument "c" more than once, thus the argument
 * must not have side effects.
 */
#ifndef FEAT_MBYTE
/* Non-multi-byte implementation.  Characters of 256 and above are returned
 * unchanged and are never considered upper case. */
# define SPELL_TOFOLD(c) ((c) < 256 ? (int)spelltab.st_fold[c] : (c))
# define SPELL_TOUPPER(c) ((c) < 256 ? (int)spelltab.st_upper[c] : (c))
# define SPELL_ISUPPER(c) ((c) < 256 ? spelltab.st_isu[c] : FALSE)
#else
# if defined(HAVE_WCHAR_H)
# include <wchar.h> /* for towupper() and towlower() */
# endif
/* Multi-byte implementation. For Unicode we can call utf_*(), but don't do
 * that for ASCII, because we don't want to use 'casemap' here. Otherwise use
 * the "w" library function for characters above 255 if available.
 * Order of the tests in each macro:
 * 1. "enc_utf8" set and c >= 128: use the utf_*() function;
 * 2. c < 256: use the "spelltab" table;
 * 3. otherwise: use the wide-character library function when the matching
 *    HAVE_* symbol is defined, else leave the character unchanged (or FALSE
 *    for SPELL_ISUPPER). */
# ifdef HAVE_TOWLOWER
# define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
: (c) < 256 ? (int)spelltab.st_fold[c] : (int)towlower(c))
# else
# define SPELL_TOFOLD(c) (enc_utf8 && (c) >= 128 ? utf_fold(c) \
: (c) < 256 ? (int)spelltab.st_fold[c] : (c))
# endif
# ifdef HAVE_TOWUPPER
# define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
: (c) < 256 ? (int)spelltab.st_upper[c] : (int)towupper(c))
# else
# define SPELL_TOUPPER(c) (enc_utf8 && (c) >= 128 ? utf_toupper(c) \
: (c) < 256 ? (int)spelltab.st_upper[c] : (c))
# endif
# ifdef HAVE_ISWUPPER
# define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
: (c) < 256 ? spelltab.st_isu[c] : iswupper(c))
# else
# define SPELL_ISUPPER(c) (enc_utf8 && (c) >= 128 ? utf_isupper(c) \
: (c) < 256 ? spelltab.st_isu[c] : (FALSE))
# endif
#endif
/*
 * Messages used in several places of this file.  They are wrapped in N_()
 * here (presumably only marking them for translation); e_format is passed
 * through _() where it is given to EMSG() in find_word().
 */
static char *e_format = N_("E759: Format error in spell file");
static char *e_spell_trunc = N_("E758: Truncated spell file");
static char *e_afftrailing = N_("Trailing text in %s line %d: %s");
static char *e_affname = N_("Affix name too long in %s line %d: %s");
static char *e_affform = N_("E761: Format error in affix file FOL, LOW or UPP");
static char *e_affrange = N_("E762: Character in FOL, LOW or UPP is out of range");
static char *msg_compressing = N_("Compressing word tree...");
/* Remember what "z?" replaced.  Both start out as NULL. */
static char_u *repl_from = NULL;
static char_u *repl_to = NULL;
/*
 * Main spell-checking function.
 * "wp" is the window whose spell settings ("wp->w_s") are used.
 * "ptr" points to a character that could be the start of a word.
 * "*attrp" is set to the highlight index for a badly spelled word. For a
 * non-word or when it's OK it remains unchanged.  The values used are
 * HLF_SPB (bad or banned), HLF_SPR (rare), HLF_SPL (any other result that
 * is not OK) and HLF_SPC (word is fine but should start with a capital).
 * This must only be called when 'spelllang' is not empty.
 *
 * "capcol" is used to check for a Capitalised word after the end of a
 * sentence. If it's zero then perform the check. Return the column where to
 * check next, or -1 when no sentence end was found. If it's NULL then don't
 * worry.
 *
 * When "docount" is TRUE a word that is found to be OK is passed to
 * count_common_word(), once, for the first language in which it is OK.
 *
 * Returns the length of the word in bytes, also when it's OK, so that the
 * caller can skip over the word.  Returns 1 without looking further when
 * "ptr" is at a space or control character, or when no language is loaded.
 */
int
spell_check(wp, ptr, attrp, capcol, docount)
win_T *wp; /* current window */
char_u *ptr;
hlf_T *attrp;
int *capcol; /* column to check for Capital */
int docount; /* count good words */
{
matchinf_T mi; /* Most things are put in "mi" so that it can
be passed to functions quickly. */
int nrlen = 0; /* found a number first */
int c;
int wrongcaplen = 0;
int lpi;
int count_word = docount;
/* A word never starts at a space or a control character. Return quickly
 * then, skipping over the character. */
if (*ptr <= ' ')
return 1;
/* Return here when loading language files failed. */
if (wp->w_s->b_langp.ga_len == 0)
return 1;
vim_memset(&mi, 0, sizeof(matchinf_T));
/* A number is always OK. Also skip hexadecimal numbers 0xFF99 and
 * 0X99FF. But always do check spelling to find "3GPP" and "11
 * julifeest". */
if (*ptr >= '0' && *ptr <= '9')
{
if (*ptr == '0' && (ptr[1] == 'x' || ptr[1] == 'X'))
mi.mi_end = skiphex(ptr + 2);
else
mi.mi_end = skipdigits(ptr);
/* "nrlen" is what is returned further down when the text is not a known
 * word, so that only the number is skipped. */
nrlen = (int)(mi.mi_end - ptr);
}
/* Find the normal end of the word (until the next non-word character). */
mi.mi_word = ptr;
mi.mi_fend = ptr;
if (spell_iswordp(mi.mi_fend, wp))
{
do
{
mb_ptr_adv(mi.mi_fend);
} while (*mi.mi_fend != NUL && spell_iswordp(mi.mi_fend, wp));
if (capcol != NULL && *capcol == 0 && wp->w_s->b_cap_prog != NULL)
{
/* Check word starting with capital letter. */
c = PTR2CHAR(ptr);
if (!SPELL_ISUPPER(c))
wrongcaplen = (int)(mi.mi_fend - ptr);
}
}
/* From here on "*capcol" means "no sentence end found", unless the
 * end-of-sentence pattern matches below. */
if (capcol != NULL)
*capcol = -1;
/* We always use the characters up to the next non-word character,
 * also for bad words. */
mi.mi_end = mi.mi_fend;
/* Check caps type later. */
mi.mi_capflags = 0;
mi.mi_cend = NULL;
mi.mi_win = wp;
/* case-fold the word with one non-word character, so that we can check
 * for the word end. */
if (*mi.mi_fend != NUL)
mb_ptr_adv(mi.mi_fend);
(void)spell_casefold(ptr, (int)(mi.mi_fend - ptr), mi.mi_fword,
MAXWLEN + 1);
mi.mi_fwordlen = (int)STRLEN(mi.mi_fword);
/* The word is bad unless we recognize it. */
mi.mi_result = SP_BAD;
mi.mi_result2 = SP_BAD;
/*
 * Loop over the languages specified in 'spelllang'.
 * We check them all, because a word may be matched longer in another
 * language.
 */
for (lpi = 0; lpi < wp->w_s->b_langp.ga_len; ++lpi)
{
mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, lpi);
/* If reloading fails the language is still in the list but everything
 * has been cleared. */
if (mi.mi_lp->lp_slang->sl_fidxs == NULL)
continue;
/* Check for a matching word in case-folded words. */
find_word(&mi, FIND_FOLDWORD);
/* Check for a matching word in keep-case words. */
find_word(&mi, FIND_KEEPWORD);
/* Check for matching prefixes. */
find_prefix(&mi, FIND_FOLDWORD);
/* For a NOBREAK language, may want to use a word without a following
 * word as a backup. */
if (mi.mi_lp->lp_slang->sl_nobreak && mi.mi_result == SP_BAD
&& mi.mi_result2 != SP_BAD)
{
mi.mi_result = mi.mi_result2;
mi.mi_end = mi.mi_end2;
}
/* Count the word in the first language where it's found to be OK. */
if (count_word && mi.mi_result == SP_OK)
{
count_common_word(mi.mi_lp->lp_slang, ptr,
(int)(mi.mi_end - ptr), 1);
count_word = FALSE;
}
}
if (mi.mi_result != SP_OK)
{
/* If we found a number skip over it. Allows for "42nd". Do flag
 * rare and local words, e.g., "3GPP". */
if (nrlen > 0)
{
if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED)
return nrlen;
}
/* When we are at a non-word character there is no error, just
 * skip over the character (try looking for a word after it). */
else if (!spell_iswordp_nmw(ptr))
{
if (capcol != NULL && wp->w_s->b_cap_prog != NULL)
{
regmatch_T regmatch;
/* Check for end of sentence. */
regmatch.regprog = wp->w_s->b_cap_prog;
regmatch.rm_ic = FALSE;
if (vim_regexec(&regmatch, ptr, 0))
*capcol = (int)(regmatch.endp[0] - ptr);
}
#ifdef FEAT_MBYTE
if (has_mbyte)
return (*mb_ptr2len)(ptr);
#endif
return 1;
}
else if (mi.mi_end == ptr)
/* Always include at least one character. Required for when there
 * is a mixup in "midword". */
mb_ptr_adv(mi.mi_end);
else if (mi.mi_result == SP_BAD
&& LANGP_ENTRY(wp->w_s->b_langp, 0)->lp_slang->sl_nobreak)
{
char_u *p, *fp;
int save_result = mi.mi_result;
/* First language in 'spelllang' is NOBREAK. Find first position
 * at which any word would be valid. */
mi.mi_lp = LANGP_ENTRY(wp->w_s->b_langp, 0);
if (mi.mi_lp->lp_slang->sl_fidxs != NULL)
{
/* "p" walks the original word and "fp" the case-folded word, one
 * character at a time in parallel. */
p = mi.mi_word;
fp = mi.mi_fword;
for (;;)
{
mb_ptr_adv(p);
mb_ptr_adv(fp);
if (p >= mi.mi_end)
break;
mi.mi_compoff = (int)(fp - mi.mi_fword);
find_word(&mi, FIND_COMPOUND);
if (mi.mi_result != SP_BAD)
{
/* A word can start at "p": the bad text ends there. */
mi.mi_end = p;
break;
}
}
/* find_word() was only used to probe, the word is still bad. */
mi.mi_result = save_result;
}
}
if (mi.mi_result == SP_BAD || mi.mi_result == SP_BANNED)
*attrp = HLF_SPB;
else if (mi.mi_result == SP_RARE)
*attrp = HLF_SPR;
else
*attrp = HLF_SPL;
}
if (wrongcaplen > 0 && (mi.mi_result == SP_OK || mi.mi_result == SP_RARE))
{
/* Report SpellCap only when the word isn't badly spelled. */
*attrp = HLF_SPC;
return wrongcaplen;
}
return (int)(mi.mi_end - ptr);
}
/*
 * Check if the word at "mip->mi_word" is in the tree.
 * When "mode" is FIND_FOLDWORD check in fold-case word tree.
 * When "mode" is FIND_KEEPWORD check in keep-case word tree.
 * When "mode" is FIND_PREFIX check for word after prefix in fold-case word
 * tree.
 * When "mode" is FIND_COMPOUND or FIND_KEEPCOMPOUND check in the fold-case
 * or keep-case word tree respectively, starting at offset "mip->mi_compoff",
 * thus after the previously found word(s).
 *
 * The tree of the language "mip->mi_lp" is used.  Nothing happens when that
 * tree is empty.
 * This function calls itself to check the word that follows in a compound
 * word or, for a NOBREAK language, the next word.
 *
 * For a match mip->mi_result is updated.
 */
static void
find_word(mip, mode)
matchinf_T *mip;
int mode;
{
idx_T arridx = 0;
int endlen[MAXWLEN]; /* length at possible word endings */
idx_T endidx[MAXWLEN]; /* possible word endings */
int endidxcnt = 0;
int len;
int wlen = 0;
int flen;
int c;
char_u *ptr;
idx_T lo, hi, m;
#ifdef FEAT_MBYTE
char_u *s;
#endif
char_u *p;
int res = SP_BAD;
slang_T *slang = mip->mi_lp->lp_slang;
unsigned flags;
char_u *byts;
idx_T *idxs;
int word_ends;
int prefix_found;
int nobreak_result;
if (mode == FIND_KEEPWORD || mode == FIND_KEEPCOMPOUND)
{
/* Check for word with matching case in keep-case tree. */
ptr = mip->mi_word;
flen = 9999; /* no case folding, always enough bytes */
byts = slang->sl_kbyts;
idxs = slang->sl_kidxs;
if (mode == FIND_KEEPCOMPOUND)
/* Skip over the previously found word(s). */
wlen += mip->mi_compoff;
}
else
{
/* Check for case-folded in case-folded tree. */
ptr = mip->mi_fword;
flen = mip->mi_fwordlen; /* available case-folded bytes */
byts = slang->sl_fbyts;
idxs = slang->sl_fidxs;
if (mode == FIND_PREFIX)
{
/* Skip over the prefix. */
wlen = mip->mi_prefixlen;
flen -= mip->mi_prefixlen;
}
else if (mode == FIND_COMPOUND)
{
/* Skip over the previously found word(s). */
wlen = mip->mi_compoff;
flen -= mip->mi_compoff;
}
}
if (byts == NULL)
return; /* array is empty */
/*
 * Repeat advancing in the tree until:
 * - there is a byte that doesn't match,
 * - we reach the end of the tree,
 * - or we reach the end of the line.
 */
for (;;)
{
/* Ran out of case-folded bytes while there is more text: fold more. */
if (flen <= 0 && *mip->mi_fend != NUL)
flen = fold_more(mip);
/* The node starts with the number of sibling bytes that follow. */
len = byts[arridx++];
/* If the first possible byte is a zero the word could end here.
 * Remember this index, we first check for the longest word. */
if (byts[arridx] == 0)
{
if (endidxcnt == MAXWLEN)
{
/* Must be a corrupted spell file. */
EMSG(_(e_format));
return;
}
endlen[endidxcnt] = wlen;
endidx[endidxcnt++] = arridx++;
--len;
/* Skip over the zeros, there can be several flag/region
 * combinations. */
while (len > 0 && byts[arridx] == 0)
{
++arridx;
--len;
}
if (len == 0)
break; /* no children, word must end here */
}
/* Stop looking at end of the line. */
if (ptr[wlen] == NUL)
break;
/* Perform a binary search in the list of accepted bytes. */
c = ptr[wlen];
if (c == TAB) /* <Tab> is handled like <Space> */
c = ' ';
lo = arridx;
hi = arridx + len - 1;
while (lo < hi)
{
m = (lo + hi) / 2;
if (byts[m] > c)
hi = m - 1;
else if (byts[m] < c)
lo = m + 1;
else
{
lo = hi = m;
break;
}
}
/* Stop if there is no matching byte. */
if (hi < lo || byts[lo] != c)
break;
/* Continue at the child (if there is one). */
arridx = idxs[lo];
++wlen;
--flen;
/* One space in the good word may stand for several spaces in the
 * checked word. */
if (c == ' ')
{
for (;;)
{
if (flen <= 0 && *mip->mi_fend != NUL)
flen = fold_more(mip);
if (ptr[wlen] != ' ' && ptr[wlen] != TAB)
break;
++wlen;
--flen;
}
}
}
/*
 * Verify that one of the possible endings is valid. Try the longest
 * first.
 */
while (endidxcnt > 0)
{
--endidxcnt;
arridx = endidx[endidxcnt];
wlen = endlen[endidxcnt];
#ifdef FEAT_MBYTE
if ((*mb_head_off)(ptr, ptr + wlen) > 0)
continue; /* not at first byte of character */
#endif
if (spell_iswordp(ptr + wlen, mip->mi_win))
{
/* A word character follows: this ending is only useful when another
 * word may follow directly (compounding or NOBREAK). */
if (slang->sl_compprog == NULL && !slang->sl_nobreak)
continue; /* next char is a word character */
word_ends = FALSE;
}
else
word_ends = TRUE;
/* The prefix flag is before compound flags. Once a valid prefix flag
 * has been found we try compound flags. */
prefix_found = FALSE;
#ifdef FEAT_MBYTE
if (mode != FIND_KEEPWORD && has_mbyte)
{
/* Compute byte length in original word, length may change
 * when folding case. This can be slow, take a shortcut when the
 * case-folded word is equal to the keep-case word. */
p = mip->mi_word;
if (STRNCMP(ptr, p, wlen) != 0)
{
for (s = ptr; s < ptr + wlen; mb_ptr_adv(s))
mb_ptr_adv(p);
wlen = (int)(p - mip->mi_word);
}
}
#endif
/* Check flags and region. For FIND_PREFIX check the condition and
 * prefix ID.
 * Repeat this if there are more flags/region alternatives until there
 * is a match.
 * Each alternative is a zero byte in "byts"; its flags are in "idxs" at
 * the same index. */
res = SP_BAD;
for (len = byts[arridx - 1]; len > 0 && byts[arridx] == 0;
--len, ++arridx)
{
flags = idxs[arridx];
/* For the fold-case tree check that the case of the checked word
 * matches with what the word in the tree requires.
 * For keep-case tree the case is always right. For prefixes we
 * don't bother to check. */
if (mode == FIND_FOLDWORD)
{
if (mip->mi_cend != mip->mi_word + wlen)
{
/* mi_capflags was set for a different word length, need
 * to do it again. */
mip->mi_cend = mip->mi_word + wlen;
mip->mi_capflags = captype(mip->mi_word, mip->mi_cend);
}
if (mip->mi_capflags == WF_KEEPCAP
|| !spell_valid_case(mip->mi_capflags, flags))
continue;
}
/* When mode is FIND_PREFIX the word must support the prefix:
 * check the prefix ID and the condition. Do that for the list at
 * mip->mi_prefarridx that find_prefix() filled. */
else if (mode == FIND_PREFIX && !prefix_found)
{
c = valid_word_prefix(mip->mi_prefcnt, mip->mi_prefarridx,
flags,
mip->mi_word + mip->mi_cprefixlen, slang,
FALSE);
if (c == 0)
continue;
/* Use the WF_RARE flag for a rare prefix. */
if (c & WF_RAREPFX)
flags |= WF_RARE;
prefix_found = TRUE;
}
if (slang->sl_nobreak)
{
if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND)
&& (flags & WF_BANNED) == 0)
{
/* NOBREAK: found a valid following word. That's all we
 * need to know, so return. */
mip->mi_result = SP_OK;
break;
}
}
else if ((mode == FIND_COMPOUND || mode == FIND_KEEPCOMPOUND
|| !word_ends))
{
/* If there is no compound flag or the word is shorter than
 * COMPOUNDMIN reject it quickly.
 * Makes you wonder why someone puts a compound flag on a word
 * that's too short... Myspell compatibility requires this
 * anyway.
 * The compound flag is the top byte of "flags". */
if (((unsigned)flags >> 24) == 0
|| wlen - mip->mi_compoff < slang->sl_compminlen)
continue;
#ifdef FEAT_MBYTE
/* For multi-byte chars check character length against
 * COMPOUNDMIN. */
if (has_mbyte
&& slang->sl_compminlen > 0
&& mb_charlen_len(mip->mi_word + mip->mi_compoff,
wlen - mip->mi_compoff) < slang->sl_compminlen)
continue;
#endif
/* Limit the number of compound words to COMPOUNDWORDMAX if no
 * maximum for syllables is specified. */
if (!word_ends && mip->mi_complen + mip->mi_compextra + 2
> slang->sl_compmax
&& slang->sl_compsylmax == MAXWLEN)
continue;
/* Don't allow compounding on a side where an affix was added,
 * unless COMPOUNDPERMITFLAG was used. */
if (mip->mi_complen > 0 && (flags & WF_NOCOMPBEF))
continue;
if (!word_ends && (flags & WF_NOCOMPAFT))
continue;
/* Quickly check if compounding is possible with this flag.
 * The first word of a compound is checked against the start flags, a
 * later word against all flags. */
if (!byte_in_str(mip->mi_complen == 0
? slang->sl_compstartflags
: slang->sl_compallflags,
((unsigned)flags >> 24)))
continue;
/* If there is a match with a CHECKCOMPOUNDPATTERN rule
 * discard the compound word. */
if (match_checkcompoundpattern(ptr, wlen, &slang->sl_comppat))
continue;
if (mode == FIND_COMPOUND)
{
int capflags;
/* Need to check the caps type of the appended compound
 * word. */
#ifdef FEAT_MBYTE
if (has_mbyte && STRNCMP(ptr, mip->mi_word,
mip->mi_compoff) != 0)
{
/* case folding may have changed the length */
p = mip->mi_word;
for (s = ptr; s < ptr + mip->mi_compoff; mb_ptr_adv(s))
mb_ptr_adv(p);
}
else
#endif
p = mip->mi_word + mip->mi_compoff;
capflags = captype(p, mip->mi_word + wlen);
if (capflags == WF_KEEPCAP || (capflags == WF_ALLCAP
&& (flags & WF_FIXCAP) != 0))
continue;
if (capflags != WF_ALLCAP)
{
/* When the character before the word is a word
 * character we do not accept a Onecap word. We do
 * accept a no-caps word, even when the dictionary
 * word specifies ONECAP. */
mb_ptr_back(mip->mi_word, p);
if (spell_iswordp_nmw(p)
? capflags == WF_ONECAP
: (flags & WF_ONECAP) != 0
&& capflags != WF_ONECAP)
continue;
}
}
/* If the word ends the sequence of compound flags of the
 * words must match with one of the COMPOUNDRULE items and
 * the number of syllables must not be too large. */
mip->mi_compflags[mip->mi_complen] = ((unsigned)flags >> 24);
mip->mi_compflags[mip->mi_complen + 1] = NUL;
if (word_ends)
{
char_u fword[MAXWLEN];
if (slang->sl_compsylmax < MAXWLEN)
{
/* "fword" is only needed for checking syllables. */
if (ptr == mip->mi_word)
(void)spell_casefold(ptr, wlen, fword, MAXWLEN);
else
vim_strncpy(fword, ptr, endlen[endidxcnt]);
}
/* When "fword" was not filled above, can_compound() does not look at
 * it: it only counts syllables when sl_compsylmax < MAXWLEN. */
if (!can_compound(slang, fword, mip->mi_compflags))
continue;
}
else if (slang->sl_comprules != NULL
&& !match_compoundrule(slang, mip->mi_compflags))
/* The compound flags collected so far do not match any
 * COMPOUNDRULE, discard the compounded word. */
continue;
}
/* Check NEEDCOMPOUND: can't use word without compounding. */
else if (flags & WF_NEEDCOMP)
continue;
nobreak_result = SP_OK;
if (!word_ends)
{
int save_result = mip->mi_result;
char_u *save_end = mip->mi_end;
langp_T *save_lp = mip->mi_lp;
int lpi;
/* Check that a valid word follows. If there is one and we
 * are compounding, it will set "mi_result", thus we are
 * always finished here. For NOBREAK we only check that a
 * valid word follows.
 * Recursive! */
if (slang->sl_nobreak)
mip->mi_result = SP_BAD;
/* Find following word in case-folded tree. */
mip->mi_compoff = endlen[endidxcnt];
#ifdef FEAT_MBYTE
if (has_mbyte && mode == FIND_KEEPWORD)
{
/* Compute byte length in case-folded word from "wlen":
 * byte length in keep-case word. Length may change when
 * folding case. This can be slow, take a shortcut when
 * the case-folded word is equal to the keep-case word. */
p = mip->mi_fword;
if (STRNCMP(ptr, p, wlen) != 0)
{
for (s = ptr; s < ptr + wlen; mb_ptr_adv(s))
mb_ptr_adv(p);
mip->mi_compoff = (int)(p - mip->mi_fword);
}
}
#endif
/* "c" is only used by the disabled "#if 0" code below. */
c = mip->mi_compoff;
++mip->mi_complen;
if (flags & WF_COMPROOT)
++mip->mi_compextra;
/* For NOBREAK we need to try all NOBREAK languages, at least
 * to find the ".add" file(s). */
for (lpi = 0; lpi < mip->mi_win->w_s->b_langp.ga_len; ++lpi)
{
if (slang->sl_nobreak)
{
mip->mi_lp = LANGP_ENTRY(mip->mi_win->w_s->b_langp, lpi);
if (mip->mi_lp->lp_slang->sl_fidxs == NULL
|| !mip->mi_lp->lp_slang->sl_nobreak)
continue;
}
find_word(mip, FIND_COMPOUND);
/* When NOBREAK any word that matches is OK. Otherwise we
 * need to find the longest match, thus try with keep-case
 * and prefix too. */
if (!slang->sl_nobreak || mip->mi_result == SP_BAD)
{
/* Find following word in keep-case tree. */
mip->mi_compoff = wlen;
find_word(mip, FIND_KEEPCOMPOUND);
#if 0 /* Disabled, a prefix must not appear halfway a compound word,
unless the COMPOUNDPERMITFLAG is used and then it can't be a
postponed prefix. */
if (!slang->sl_nobreak || mip->mi_result == SP_BAD)
{
/* Check for following word with prefix. */
mip->mi_compoff = c;
find_prefix(mip, FIND_COMPOUND);
}
#endif
}
/* Without NOBREAK only the current language is used, thus the loop
 * is executed only once. */
if (!slang->sl_nobreak)
break;
}
/* Undo the changes made for the recursive calls. */
--mip->mi_complen;
if (flags & WF_COMPROOT)
--mip->mi_compextra;
mip->mi_lp = save_lp;
if (slang->sl_nobreak)
{
nobreak_result = mip->mi_result;
mip->mi_result = save_result;
mip->mi_end = save_end;
}
else
{
if (mip->mi_result == SP_OK)
break;
continue;
}
}
if (flags & WF_BANNED)
res = SP_BANNED;
else if (flags & WF_REGION)
{
/* Check region. */
if ((mip->mi_lp->lp_region & (flags >> 16)) != 0)
res = SP_OK;
else
res = SP_LOCAL;
}
else if (flags & WF_RARE)
res = SP_RARE;
else
res = SP_OK;
/* Always use the longest match and the best result. For NOBREAK
 * we separately keep the longest match without a following good
 * word as a fall-back.
 * NOTE(review): the ">" comparisons rely on the SP_ values (defined
 * elsewhere) being ordered so that a smaller value is a better result. */
if (nobreak_result == SP_BAD)
{
if (mip->mi_result2 > res)
{
mip->mi_result2 = res;
mip->mi_end2 = mip->mi_word + wlen;
}
else if (mip->mi_result2 == res
&& mip->mi_end2 < mip->mi_word + wlen)
mip->mi_end2 = mip->mi_word + wlen;
}
else if (mip->mi_result > res)
{
mip->mi_result = res;
mip->mi_end = mip->mi_word + wlen;
}
else if (mip->mi_result == res && mip->mi_end < mip->mi_word + wlen)
mip->mi_end = mip->mi_word + wlen;
if (mip->mi_result == SP_OK)
break;
}
/* Stop trying shorter endings once the word was found to be OK. */
if (mip->mi_result == SP_OK)
break;
}
}
/*
* Return TRUE if there is a match between the word ptr[wlen] and
* CHECKCOMPOUNDPATTERN rules, assuming that we will concatenate with another
* word.
* A match means that the first part of CHECKCOMPOUNDPATTERN matches at the
* end of ptr[wlen] and the second part matches after it.
*/
static int
match_checkcompoundpattern(ptr, wlen, gap)
char_u *ptr;
int wlen;
garray_T *gap; /* &sl_comppat */
{
int i;
char_u *p;
int len;
for (i = 0; i + 1 < gap->ga_len; i += 2)
{
p = ((char_u **)gap->ga_data)[i + 1];
if (STRNCMP(ptr + wlen, p, STRLEN(p)) == 0)
{
/* Second part matches at start of following compound word, now
* check if first part matches at end of previous word. */
p = ((char_u **)gap->ga_data)[i];
len = (int)STRLEN(p);
if (len <= wlen && STRNCMP(ptr + wlen - len, p, len) == 0)
return TRUE;
}
}
return FALSE;
}
/*
 * Decide whether the compound flags in "flags" form an allowed sequence and
 * whether "word" stays within the syllable limit.
 * "flags" is a NUL-terminated string with one byte per compound flag.
 * "word" is only looked at when "slang->sl_compsylmax" is below MAXWLEN.
 * Return FALSE when the language has no compound regexp or "flags" does not
 * match it.  When there are too many syllables the result is TRUE only if
 * the number of flags is below "slang->sl_compmax".  Otherwise return TRUE.
 */
static int
can_compound(slang, word, flags)
    slang_T     *slang;
    char_u      *word;
    char_u      *flags;
{
    regmatch_T  rm;
#ifdef FEAT_MBYTE
    char_u      utf_flags[MAXWLEN * 2];
    char_u      *dst;
    int         n;
#endif
    char_u      *subject = flags;

    if (slang->sl_compprog == NULL)
        return FALSE;

#ifdef FEAT_MBYTE
    if (enc_utf8)
    {
        /* The regexp works on utf-8 text, thus every single-byte flag is
         * turned into a utf-8 character first. */
        dst = utf_flags;
        for (n = 0; flags[n] != NUL; ++n)
            dst += mb_char2bytes(flags[n], dst);
        *dst = NUL;
        subject = utf_flags;
    }
#endif

    rm.regprog = slang->sl_compprog;
    rm.rm_ic = FALSE;
    if (!vim_regexec(&rm, subject, 0))
        return FALSE;

    /* Counting syllables may be slow, therefore it is done last and only
     * when a syllable limit is set. */
    if (slang->sl_compsylmax >= MAXWLEN)
        return TRUE;
    if (count_syllables(slang, word) <= slang->sl_compsylmax)
        return TRUE;

    /* Too many syllables: still OK when the number of compound words is
     * below COMPOUNDWORDMAX. */
    return (int)STRLEN(flags) < slang->sl_compmax;
}
/*
 * Return TRUE when appending "flag" to the compound flags collected in
 * "compflags" can still lead to a valid compound word.
 * "sp" supplies "ts_complen" (the number of flags collected) and
 * "ts_compsplit" (the index where the current compound starts).
 * When "slang->sl_comprules" is set (the COMPOUNDRULE lines have no
 * wildcards) the flags are also checked against those rules.
 * "compflags" is used as scratch space but restored before returning.
 */
static int
can_be_compound(sp, slang, compflags, flag)
    trystate_T  *sp;
    slang_T     *slang;
    char_u      *compflags;
    int         flag;
{
    char_u      *allowed;
    int         rule_ok;

    /* The first word of a compound must have one of the start flags, a
     * later word one of all the flags. */
    if (sp->ts_complen == sp->ts_compsplit)
        allowed = slang->sl_compstartflags;
    else
        allowed = slang->sl_compallflags;
    if (!byte_in_str(allowed, flag))
        return FALSE;

    /* Checking the rules requires that they have no wildcards and only makes
     * sense with two or more words. */
    if (slang->sl_comprules == NULL || sp->ts_complen <= sp->ts_compsplit)
        return TRUE;

    /* Temporarily append "flag", match against the rules, then cut it off
     * again. */
    compflags[sp->ts_complen] = flag;
    compflags[sp->ts_complen + 1] = NUL;
    rule_ok = match_compoundrule(slang, compflags + sp->ts_compsplit);
    compflags[sp->ts_complen] = NUL;

    return rule_ok;
}
/*
 * Return TRUE if the compound flags in compflags[] match the start of any
 * compound rule.  This is used to stop trying a compound if the flags
 * collected so far can't possibly match any compound rule.
 * Caller must check that slang->sl_comprules is not NULL.
 *
 * "slang->sl_comprules" holds the rules separated by "/".  An item in a rule
 * is either a single flag or a list of alternative flags in [].
 * "compflags" is a NUL-terminated string with one byte per flag.
 *
 * A rule with a "[" that has no matching "]" is handled as if the "]" is
 * just before the end of the string; the terminating NUL is never skipped.
 */
static int
match_compoundrule(slang, compflags)
    slang_T     *slang;
    char_u      *compflags;
{
    char_u      *p;
    int         i;
    int         c;

    /* loop over all the COMPOUNDRULE entries */
    for (p = slang->sl_comprules; *p != NUL; ++p)
    {
        /* loop over the flags in the compound word we have made, match
         * them against the current rule entry */
        for (i = 0; ; ++i)
        {
            c = compflags[i];
            if (c == NUL)
                /* found a rule that matches for the flags we have so far */
                return TRUE;
            if (*p == '/' || *p == NUL)
                break;  /* end of rule, it's too short */
            if (*p == '[')
            {
                int match = FALSE;

                /* compare against all the flags in []; keep going after a
                 * match to end up at the "]" */
                ++p;
                while (*p != ']' && *p != NUL)
                    if (*p++ == c)
                        match = TRUE;
                if (!match)
                    break;  /* none matches */
            }
            else if (*p != c)
                break;  /* flag of word doesn't match flag in pattern */

            /* Advance past the matched flag or the "]".  Stay on the NUL
             * when the "[" was not terminated, otherwise the next round
             * would read beyond the end of the rules. */
            if (*p != NUL)
                ++p;
        }

        /* Skip to the next "/", where the next pattern starts. */
        p = vim_strchr(p, '/');
        if (p == NULL)
            break;
    }

    /* Checked all the rules and none of them match the flags, so there
     * can't possibly be a compound starting with these flags. */
    return FALSE;
}
/*
 * Look for a prefix that can be used with the word "word".
 * The candidates are the "totprefcnt" entries in "slang->sl_pidxs" starting
 * at "arridx"; they are tried from the last one back to the first.
 * An entry is accepted when:
 * - its lowest byte equals the prefix ID, which is the top byte of "flags",
 * - it is not marked WF_PFX_NC while "flags" contains WF_HAS_AFF,
 * - its condition, if there is one, matches "word"; an entry without a
 *   condition is skipped when "cond_req" is TRUE.
 * Return the accepted entry, which is non-zero and includes the WF_RAREPFX
 * flag for a rare prefix.  Return zero when no entry is accepted.
 */
static int
valid_word_prefix(totprefcnt, arridx, flags, word, slang, cond_req)
    int         totprefcnt;     /* nr of prefix IDs */
    int         arridx;         /* idx in sl_pidxs[] */
    int         flags;
    char_u      *word;
    slang_T     *slang;
    int         cond_req;       /* only use prefixes with a condition */
{
    int         want_id = (unsigned)flags >> 24;
    int         n;
    int         entry;
    regprog_T   *cond;
    regmatch_T  rm;

    for (n = totprefcnt; --n >= 0; )
    {
        entry = slang->sl_pidxs[arridx + n];

        /* The prefix ID must be the one the word asks for. */
        if ((entry & 0xff) != want_id)
            continue;

        /* A prefix that doesn't combine can't be used when the word already
         * has a suffix. */
        if ((flags & WF_HAS_AFF) && (entry & WF_PFX_NC))
            continue;

        /* The index of the condition is in the two bytes above the prefix
         * ID byte. */
        cond = slang->sl_prefprog[((unsigned)entry >> 8) & 0xffff];
        if (cond == NULL)
        {
            if (cond_req)
                continue;
        }
        else
        {
            rm.regprog = cond;
            rm.rm_ic = FALSE;
            if (!vim_regexec(&rm, word, 0))
                continue;
        }

        /* It's a match! Return the WF_ flags. */
        return entry;
    }
    return 0;
}
/*
 * Walk the prefix tree of the language in "mip" along the case-folded word
 * and, at every node where a prefix may end, call find_word() with
 * FIND_PREFIX to check the word that follows the prefix.
 *
 * When "mode" is FIND_COMPOUND the walk starts mip->mi_compoff bytes into
 * the folded word: the prefix is looked for after the word(s) that were
 * already found in a compound.
 *
 * Results are reported through find_word(), which gets the list of
 * candidate prefixes in mip->mi_prefarridx / mip->mi_prefcnt and the prefix
 * length in mip->mi_prefixlen (and mip->mi_cprefixlen with FEAT_MBYTE).
 */
static void
find_prefix(mip, mode)
    matchinf_T  *mip;
    int         mode;
{
    slang_T     *slang = mip->mi_lp->lp_slang;
    char_u      *byts = slang->sl_pbyts;
    idx_T       *idxs;
    char_u      *start;         /* where the prefix starts in mi_fword */
    idx_T       node = 0;       /* current position in byts[]/idxs[] */
    idx_T       lo, hi, m;
    int         nsib;           /* nr of sibling bytes left at this node */
    int         nends;          /* nr of NUL siblings (prefix ends) */
    int         depth = 0;      /* nr of prefix bytes matched so far */
    int         avail;          /* case-folded bytes still available */
    int         c;

    if (byts == NULL)
        return;                 /* there is no prefix tree */
    idxs = slang->sl_pidxs;

    /* Prefixes are stored case-folded, thus match with the folded word. */
    start = mip->mi_fword;
    avail = mip->mi_fwordlen;
    if (mode == FIND_COMPOUND)
    {
        /* Start after the word(s) found before. */
        start += mip->mi_compoff;
        avail -= mip->mi_compoff;
    }

    /*
     * Go down the tree one byte at a time.  Stop when the byte is not
     * found among the siblings, when a node has no children or when the
     * end of the text is reached.
     */
    for (;;)
    {
        /* Fold more text when we ran out and there is more in the line. */
        if (avail == 0 && *mip->mi_fend != NUL)
            avail = fold_more(mip);

        /* A node starts with the sibling count, the siblings follow. */
        nsib = byts[node];
        ++node;

        if (byts[node] == 0)
        {
            /* A NUL sibling: a prefix may end here.  There can be several,
             * each with its own condition.  Pass them all on to
             * find_word(), the word to check is the same for each. */
            mip->mi_prefarridx = node;
            for (nends = 0; nsib > 0 && byts[node] == 0; ++nends)
            {
                ++node;
                --nsib;
            }
            mip->mi_prefcnt = nends;

            /* Length of the prefix, counted from the start of mi_fword. */
            mip->mi_prefixlen = (mode == FIND_COMPOUND)
                                        ? depth + mip->mi_compoff : depth;
#ifdef FEAT_MBYTE
            if (has_mbyte)
            {
                /* The length in the original word may be different. */
                mip->mi_cprefixlen = nofold_len(mip->mi_fword,
                                        mip->mi_prefixlen, mip->mi_word);
            }
            else
                mip->mi_cprefixlen = mip->mi_prefixlen;
#endif
            find_word(mip, FIND_PREFIX);

            if (nsib == 0)
                break;          /* only NUL siblings: no longer prefix */
        }

        /* Nothing left to match with at the end of the text. */
        c = start[depth];
        if (c == NUL)
            break;

        /* Binary search for "c" in the remaining (sorted) siblings. */
        lo = node;
        hi = node + nsib - 1;
        while (lo < hi)
        {
            m = (lo + hi) / 2;
            if (byts[m] == c)
            {
                lo = hi = m;
                break;
            }
            if (byts[m] > c)
                hi = m - 1;
            else
                lo = m + 1;
        }
        if (hi < lo || byts[lo] != c)
            break;              /* byte not found */

        /* Found it, go on with the child node and the next byte. */
        node = idxs[lo];
        ++depth;
        --avail;
    }
}
/*
 * Case-fold some more text into mip->mi_fword.  At least one character is
 * taken from mip->mi_fend; for efficiency this goes on until after the next
 * non-word character (that character is included when it is not the NUL).
 * mip->mi_fend and mip->mi_fwordlen are updated.
 * Returns the number of bytes that were added to mip->mi_fword.
 */
static int
fold_more(mip)
    matchinf_T  *mip;
{
    char_u      *from = mip->mi_fend;
    char_u      *dest;
    int         added;

    /* Always take one character, then the rest of the word characters. */
    mb_ptr_adv(mip->mi_fend);
    while (*mip->mi_fend != NUL && spell_iswordp(mip->mi_fend, mip->mi_win))
        mb_ptr_adv(mip->mi_fend);

    /* Also take the non-word character, it is needed to detect the end of
     * the word. */
    if (*mip->mi_fend != NUL)
        mb_ptr_adv(mip->mi_fend);

    /* Append the folded text after what was folded before. */
    dest = mip->mi_fword + mip->mi_fwordlen;
    (void)spell_casefold(from, (int)(mip->mi_fend - from),
                             dest,
                             MAXWLEN - mip->mi_fwordlen);
    added = (int)STRLEN(dest);
    mip->mi_fwordlen += added;

    return added;
}
/*
* Check case flags for a word. Return TRUE if the word has the requested
* case.
*/
static int
spell_valid_case(wordflags, treeflags)
int wordflags; /* flags for the checked word. */
int treeflags; /* flags for the word in the spell tree */
{
return ((wordflags == WF_ALLCAP && (treeflags & WF_FIXCAP) == 0)
|| ((treeflags & (WF_ALLCAP | WF_KEEPCAP)) == 0
&& ((treeflags & WF_ONECAP) == 0
|| (wordflags & WF_ONECAP) != 0)));
}
/*
 * Check that spell checking can be used in window "wp": 'spell' is set,
 * 'spelllang' is not empty and at least one language was loaded.
 * Returns FALSE when it can.  Otherwise gives error E756 and returns TRUE.
 */
static int
no_spell_checking(wp)
    win_T       *wp;
{
    if (wp->w_p_spell
            && *wp->w_s->b_p_spl != NUL
            && wp->w_s->b_langp.ga_len != 0)
        return FALSE;

    EMSG(_("E756: Spell checking is not enabled"));
    return TRUE;
}
/*
 * Move the cursor of window "wp" to the next spell error.
 * "dir" is FORWARD or BACKWARD.
 * "allwords" is TRUE to stop at any bad word ("[s"/"]s"), FALSE to stop only
 * at words with attribute HLF_SPB ("[S"/"]S").
 * "curline" is FALSE for "[s", "]s", "[S" and "]S".
 * "curline" is TRUE to find word under/after cursor in the same line.
 * For Insert mode completion "dir" is BACKWARD and "curline" is TRUE: move
 * to after badly spelled word before the cursor.
 * "attrp", when not NULL, gets the attribute of the bad word (only when
 * "dir" is FORWARD).
 * Return 0 if not found (also when spell checking is not enabled, when
 * interrupted or when out of memory), length of the badly spelled word
 * otherwise.  The cursor is only moved when a bad word was found.
 */
int
spell_move_to(wp, dir, allwords, curline, attrp)
    win_T       *wp;
    int         dir;            /* FORWARD or BACKWARD */
    int         allwords;       /* TRUE for "[s"/"]s", FALSE for "[S"/"]S" */
    int         curline;
    hlf_T       *attrp;         /* return: attributes of bad word or NULL
                                   (only when "dir" is FORWARD) */
{
    linenr_T    lnum;
    pos_T       found_pos;
    int         found_len = 0;
    char_u      *line;
    char_u      *p;
    char_u      *endp;
    /* Must be initialized: "attr" is inspected after the word loop, which
     * does not run at all for an empty line. */
    hlf_T       attr = HLF_COUNT;
    int         len;
# ifdef FEAT_SYN_HL
    int         has_syntax = syntax_present(wp);
# endif
    int         col;
    int         can_spell;
    char_u      *buf = NULL;
    int         buflen = 0;
    int         skip = 0;
    int         capcol = -1;
    int         found_one = FALSE;
    int         wrapped = FALSE;

    if (no_spell_checking(wp))
        return 0;

    /*
     * Start looking for bad word at the start of the line, because we can't
     * start halfway a word, we don't know where it starts or ends.
     *
     * When searching backwards, we continue in the line to find the last
     * bad word (in the cursor line: before the cursor).
     *
     * We concatenate the start of the next line, so that wrapped words work
     * (e.g. "et<line-break>cetera").  Doesn't work when searching backwards
     * though...
     */
    lnum = wp->w_cursor.lnum;
    clearpos(&found_pos);

    while (!got_int)
    {
        line = ml_get_buf(wp->w_buffer, lnum, FALSE);

        /* Make sure "buf" can hold the line plus the start of the next
         * line, as appended by spell_cat_line(). */
        len = (int)STRLEN(line);
        if (buflen < len + MAXWLEN + 2)
        {
            vim_free(buf);
            buflen = len + MAXWLEN + 2;
            buf = alloc(buflen);
            if (buf == NULL)
                break;
        }

        /* In first line check first word for Capital. */
        if (lnum == 1)
            capcol = 0;

        /* For checking first word with a capital skip white space. */
        if (capcol == 0)
            capcol = (int)(skipwhite(line) - line);
        else if (curline && wp == curwin)
        {
            /* For spellbadword(): check if first word needs a capital. */
            col = (int)(skipwhite(line) - line);
            if (check_need_cap(lnum, col))
                capcol = col;

            /* Need to get the line again, may have looked at the previous
             * one. */
            line = ml_get_buf(wp->w_buffer, lnum, FALSE);
        }

        /* Copy the line into "buf" and append the start of the next line if
         * possible. */
        STRCPY(buf, line);
        if (lnum < wp->w_buffer->b_ml.ml_line_count)
            spell_cat_line(buf + STRLEN(buf),
                          ml_get_buf(wp->w_buffer, lnum + 1, FALSE), MAXWLEN);

        /* Only words that start in this line are checked; "skip" bytes at
         * the start were already part of a word in the previous line. */
        p = buf + skip;
        endp = buf + len;
        while (p < endp)
        {
            /* When searching backward don't search after the cursor.  Unless
             * we wrapped around the end of the buffer. */
            if (dir == BACKWARD
                    && lnum == wp->w_cursor.lnum
                    && !wrapped
                    && (colnr_T)(p - buf) >= wp->w_cursor.col)
                break;

            /* start of word */
            attr = HLF_COUNT;
            len = spell_check(wp, p, &attr, &capcol, FALSE);

            if (attr != HLF_COUNT)
            {
                /* We found a bad word.  Check the attribute. */
                if (allwords || attr == HLF_SPB)
                {
                    /* When searching forward only accept a bad word after
                     * the cursor. */
                    if (dir == BACKWARD
                            || lnum != wp->w_cursor.lnum
                            || (lnum == wp->w_cursor.lnum
                                && (wrapped
                                    || (colnr_T)(curline ? p - buf + len
                                                         : p - buf)
                                                      > wp->w_cursor.col)))
                    {
# ifdef FEAT_SYN_HL
                        if (has_syntax)
                        {
                            /* Syntax items may disable spell checking at
                             * this position. */
                            col = (int)(p - buf);
                            (void)syn_get_id(wp, lnum, (colnr_T)col,
                                                    FALSE, &can_spell, FALSE);
                            if (!can_spell)
                                attr = HLF_COUNT;
                        }
                        else
#endif
                            can_spell = TRUE;

                        if (can_spell)
                        {
                            found_one = TRUE;
                            found_pos.lnum = lnum;
                            found_pos.col = (int)(p - buf);
#ifdef FEAT_VIRTUALEDIT
                            found_pos.coladd = 0;
#endif
                            if (dir == FORWARD)
                            {
                                /* No need to search further. */
                                wp->w_cursor = found_pos;
                                vim_free(buf);
                                if (attrp != NULL)
                                    *attrp = attr;
                                return len;
                            }
                            else if (curline)
                                /* Insert mode completion: put cursor after
                                 * the bad word. */
                                found_pos.col += len;
                            found_len = len;
                        }
                    }
                    else
                        found_one = TRUE;
                }
            }

            /* advance to character after the word */
            p += len;
            capcol -= len;
        }

        if (dir == BACKWARD && found_pos.lnum != 0)
        {
            /* Use the last match in the line (before the cursor). */
            wp->w_cursor = found_pos;
            vim_free(buf);
            return found_len;
        }

        if (curline)
            break;      /* only check cursor line */

        /* Advance to next line. */
        if (dir == BACKWARD)
        {
            /* If we are back at the starting line and searched it again there
             * is no match, give up. */
            if (lnum == wp->w_cursor.lnum && wrapped)
                break;

            if (lnum > 1)
                --lnum;
            else if (!p_ws)
                break;      /* at first line and 'nowrapscan' */
            else
            {
                /* Wrap around to the end of the buffer.  May search the
                 * starting line again and accept the last match. */
                lnum = wp->w_buffer->b_ml.ml_line_count;
                wrapped = TRUE;
                if (!shortmess(SHM_SEARCH))
                    give_warning((char_u *)_(top_bot_msg), TRUE);
            }
            capcol = -1;
        }
        else
        {
            if (lnum < wp->w_buffer->b_ml.ml_line_count)
                ++lnum;
            else if (!p_ws)
                break;      /* at last line and 'nowrapscan' */
            else
            {
                /* Wrap around to the start of the buffer.  May search the
                 * starting line again and accept the first match. */
                lnum = 1;
                wrapped = TRUE;
                if (!shortmess(SHM_SEARCH))
                    give_warning((char_u *)_(bot_top_msg), TRUE);
            }

            /* If we are back at the starting line and there is no match then
             * give up. */
            if (lnum == wp->w_cursor.lnum && (!found_one || wrapped))
                break;

            /* Skip the characters at the start of the next line that were
             * included in a match crossing line boundaries. */
            if (attr == HLF_COUNT)
                skip = (int)(p - endp);
            else
                skip = 0;

            /* Capcol skips over the inserted space. */
            --capcol;

            /* But after empty line check first word in next line */
            if (*skipwhite(line) == NUL)
                capcol = 0;
        }

        line_breakcheck();
    }

    vim_free(buf);
    return 0;
}
/*
 * For spell checking: put the start of the following line "line" in "buf",
 * so that a word that continues in the next line can be checked.
 * Leading white space and the characters '*', '#', '/', '"' and Tab are
 * replaced by spaces (plus one extra space), so that the column offsets are
 * kept; win_line() uses this to skip those bytes if the word was OK.
 * Fewer than "maxlen" bytes are stored.  Nothing is stored when the line
 * only has such characters or when the blanked-out part would not fit.
 */
void
spell_cat_line(buf, line, maxlen)
    char_u      *buf;
    char_u      *line;
    int         maxlen;
{
    char_u      *text = skipwhite(line);
    int         pad;

    /* Skip over characters that are not part of a word, and the white
     * space that follows each of them. */
    while (vim_strchr((char_u *)"*#/\"\t", *text) != NULL)
        text = skipwhite(text + 1);

    /* Only worth concatenating if there is something else than spaces. */
    if (*text == NUL)
        return;

    pad = (int)(text - line) + 1;
    if (pad >= maxlen - 1)
        return;

    vim_memset(buf, ' ', pad);
    vim_strncpy(buf + pad, text, maxlen - 1 - pad);
}
/*
 * Structure used for the cookie argument of do_in_runtimepath().
 * spell_load_lang() fills it in and passes it on; spell_load_cb() receives
 * it for every spell file found and stores the result of loading in it.
 */
typedef struct spelload_S
{
char_u sl_lang[MAXWLEN + 1]; /* language name; passed as "lang" to spell_load_file(), which makes it empty to signal an error */
slang_T *sl_slang; /* resulting slang_T struct of the file that was loaded last, NULL when nothing was loaded */
int sl_nobreak; /* NOBREAK language found: a loaded file had sl_nobreak set */
} spelload_T;
/*
 * Load word list(s) for "lang" from Vim spell file(s).
 * "lang" must be the language without the region: e.g., "en".
 *
 * The file "spell/{lang}.{enc}.spl" is searched for in 'runtimepath', where
 * {enc} is the result of spell_enc().  When that fails
 * "spell/{lang}.ascii.spl" is tried.  On VMS a "_" is used instead of the
 * "." after {lang}.  With FEAT_AUTOCMD: when both fail the
 * EVENT_SPELLFILEMISSING autocommands are applied and, if that returns
 * non-zero, the search is done once more.
 * A warning message is given when do_in_runtimepath() reported failure.
 * Otherwise, when a file was loaded, all matching "add.spl" files in
 * 'runtimepath' are loaded as well.
 */
static void
spell_load_lang(lang)
char_u *lang;
{
char_u fname_enc[85];
int r;
spelload_T sl;
#ifdef FEAT_AUTOCMD
int round;
#endif
/* Copy the language name to pass it to spell_load_cb() as a cookie.
 * It's truncated when an error is detected. */
STRCPY(sl.sl_lang, lang);
sl.sl_slang = NULL;
sl.sl_nobreak = FALSE;
#ifdef FEAT_AUTOCMD
/* We may retry when no spell file is found for the language, an
 * autocommand may load it then. */
for (round = 1; round <= 2; ++round)
#endif
{
/*
 * Find the first spell file for "lang" in 'runtimepath' and load it.
 * The size passed is 5 less than the buffer size: further down "spl" at
 * the end of the name is overwritten with the longer "add.spl".
 */
vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5,
#ifdef VMS
"spell/%s_%s.spl",
#else
"spell/%s.%s.spl",
#endif
lang, spell_enc());
r = do_in_runtimepath(fname_enc, FALSE, spell_load_cb, &sl);
/* An empty sl.sl_lang means loading a file failed with an error, then
 * don't try the fallback. */
if (r == FAIL && *sl.sl_lang != NUL)
{
/* Try loading the ASCII version. */
vim_snprintf((char *)fname_enc, sizeof(fname_enc) - 5,
#ifdef VMS
"spell/%s_ascii.spl",
#else
"spell/%s.ascii.spl",
#endif
lang);
r = do_in_runtimepath(fname_enc, FALSE, spell_load_cb, &sl);
#ifdef FEAT_AUTOCMD
/* Only in the first round: give autocommands a chance to provide the
 * missing file, then search again. */
if (r == FAIL && *sl.sl_lang != NUL && round == 1
&& apply_autocmds(EVENT_SPELLFILEMISSING, lang,
curbuf->b_fname, FALSE, curbuf))
continue;
break;
#endif
}
#ifdef FEAT_AUTOCMD
/* The first attempt did not fail (or an error was detected): no retry. */
break;
#endif
}
if (r == FAIL)
{
smsg((char_u *)
#ifdef VMS
_("Warning: Cannot find word list \"%s_%s.spl\" or \"%s_ascii.spl\""),
#else
_("Warning: Cannot find word list \"%s.%s.spl\" or \"%s.ascii.spl\""),
#endif
lang, spell_enc(), lang);
}
else if (sl.sl_slang != NULL)
{
/* At least one file was loaded, now load ALL the additions.
 * The pattern is made by replacing the "spl" at the end of the name that
 * was used last with "add.spl". */
STRCPY(fname_enc + STRLEN(fname_enc) - 3, "add.spl");
do_in_runtimepath(fname_enc, TRUE, spell_load_cb, &sl);
}
}
/*
* Return the encoding used for spell checking: Use 'encoding', except that we
* use "latin1" for "latin9". And limit to 60 characters (just in case).
*/
static char_u *
spell_enc()
{
#ifdef FEAT_MBYTE
if (STRLEN(p_enc) < 60 && STRCMP(p_enc, "iso-8859-15") != 0)
return p_enc;
#endif
return (char_u *)"latin1";
}
/*
 * Store the name of the .spl file for the internal wordlist in
 * "fname[MAXPATHL]".  The name is made from SPL_FNAME_TMPL with the name of
 * the internal wordlist and the spell encoding filled in.
 */
static void
int_wordlist_spl(fname)
    char_u      *fname;
{
    char_u      *enc = spell_enc();

    vim_snprintf((char *)fname, MAXPATHL, SPL_FNAME_TMPL, int_wordlist, enc);
}
/*
* Allocate a new slang_T for language "lang". "lang" can be NULL.
* Caller must fill "sl_next".
*/
static slang_T *
slang_alloc(lang)
char_u *lang;
{
slang_T *lp;
lp = (slang_T *)alloc_clear(sizeof(slang_T));
if (lp != NULL)
{
if (lang != NULL)
lp->sl_name = vim_strsave(lang);
ga_init2(&lp->sl_rep, sizeof(fromto_T), 10);
ga_init2(&lp->sl_repsal, sizeof(fromto_T), 10);
lp->sl_compmax = MAXWLEN;
lp->sl_compsylmax = MAXWLEN;
hash_init(&lp->sl_wordcount);
}
return lp;
}
/*
 * Free everything an slang_T holds: what was loaded from the spell file,
 * the language name, the file name and finally the structure itself.
 * "lp" must not be used afterwards.
 */
static void
slang_free(lp)
    slang_T     *lp;
{
    /* slang_clear() leaves the two names alone, free them separately. */
    slang_clear(lp);
    vim_free(lp->sl_fname);
    vim_free(lp->sl_name);

    vim_free(lp);
}
/*
 * Free everything in an slang_T that was read from the spell file (and the
 * .sug file) and reset the fields to their defaults, so that the file can
 * be loaded again.  "sl_name" and "sl_fname" are kept.
 */
static void
slang_clear(lp)
    slang_T     *lp;
{
    garray_T    *reps[2];
    garray_T    *gap;
    fromto_T    *rep;
    salitem_T   *sal;
    int         which;
    int         i;

    /* The word trees: case-folded, keep-case and prefix. */
    vim_free(lp->sl_fbyts);
    vim_free(lp->sl_kbyts);
    vim_free(lp->sl_pbyts);
    vim_free(lp->sl_fidxs);
    vim_free(lp->sl_kidxs);
    vim_free(lp->sl_pidxs);
    lp->sl_fbyts = NULL;
    lp->sl_kbyts = NULL;
    lp->sl_pbyts = NULL;
    lp->sl_fidxs = NULL;
    lp->sl_kidxs = NULL;
    lp->sl_pidxs = NULL;

    /* REP and REPSAL items: free the from/to strings, then the array. */
    reps[0] = &lp->sl_rep;
    reps[1] = &lp->sl_repsal;
    for (which = 0; which < 2; ++which)
    {
        gap = reps[which];
        for ( ; gap->ga_len > 0; --gap->ga_len)
        {
            rep = &((fromto_T *)gap->ga_data)[gap->ga_len - 1];
            vim_free(rep->ft_from);
            vim_free(rep->ft_to);
        }
        ga_clear(gap);
    }

    /* "sl_sal" either holds SAL items or the SOFOFROM/SOFOTO lists. */
    gap = &lp->sl_sal;
    if (!lp->sl_sofo)
    {
        /* SAL items: free salitem_T items */
        for ( ; gap->ga_len > 0; --gap->ga_len)
        {
            sal = &((salitem_T *)gap->ga_data)[gap->ga_len - 1];
            /* sm_oneof and sm_rules point into sm_lead, don't free them. */
            vim_free(sal->sm_lead);
            vim_free(sal->sm_to);
#ifdef FEAT_MBYTE
            vim_free(sal->sm_lead_w);
            vim_free(sal->sm_oneof_w);
            vim_free(sal->sm_to_w);
#endif
        }
    }
    else if (gap->ga_data != NULL)
    {
        /* SOFOFROM and SOFOTO items: free lists of wide characters.
         * For latin1 "ga_len" is set to 1 without adding an item, thus
         * "ga_data" must be checked. */
        for (i = 0; i < gap->ga_len; ++i)
            vim_free(((int **)gap->ga_data)[i]);
    }
    ga_clear(gap);

    /* Prefix conditions: the compiled programs and the array itself. */
    i = 0;
    while (i < lp->sl_prefixcnt)
        vim_free(lp->sl_prefprog[i++]);
    lp->sl_prefixcnt = 0;
    vim_free(lp->sl_prefprog);
    lp->sl_prefprog = NULL;

    /* Plain strings read from various sections. */
    vim_free(lp->sl_info);
    vim_free(lp->sl_midword);
    lp->sl_info = NULL;
    lp->sl_midword = NULL;

    /* Compound info. */
    vim_free(lp->sl_compprog);
    lp->sl_compprog = NULL;
    vim_free(lp->sl_comprules);
    lp->sl_comprules = NULL;
    vim_free(lp->sl_compstartflags);
    lp->sl_compstartflags = NULL;
    vim_free(lp->sl_compallflags);
    lp->sl_compallflags = NULL;

    /* Syllable info and compound patterns. */
    vim_free(lp->sl_syllable);
    lp->sl_syllable = NULL;
    ga_clear(&lp->sl_syl_items);
    ga_clear_strings(&lp->sl_comppat);

    /* Word counts: free the items and start with an empty table. */
    hash_clear_all(&lp->sl_wordcount, WC_KEY_OFF);
    hash_init(&lp->sl_wordcount);

#ifdef FEAT_MBYTE
    hash_clear_all(&lp->sl_map_hash, 0);
#endif

    /* Clear info from .sug file. */
    slang_clear_sug(lp);

    /* Back to the defaults. */
    lp->sl_compmax = MAXWLEN;
    lp->sl_compsylmax = MAXWLEN;
    lp->sl_compminlen = 0;
    lp->sl_regions[0] = NUL;
}
/*
 * Drop what was loaded from the .sug file in "lp": the tree in
 * "sl_sbyts"/"sl_sidxs" and the buffer "sl_sugbuf".  Also resets the
 * "loaded" flag and the timestamp.
 */
static void
slang_clear_sug(lp)
    slang_T     *lp;
{
    vim_free(lp->sl_sbyts);
    vim_free(lp->sl_sidxs);
    close_spellbuf(lp->sl_sugbuf);

    lp->sl_sbyts = NULL;
    lp->sl_sidxs = NULL;
    lp->sl_sugbuf = NULL;

    lp->sl_sugtime = 0;
    lp->sl_sugloaded = FALSE;
}
/*
 * Callback for do_in_runtimepath(): load the spell file "fname".
 * "cookie" points to the spelload_T of the caller.  It provides the
 * language name and, when loading worked, gets the resulting slang_T in
 * "sl_slang".  The NOBREAK property is passed on between files: an ".add"
 * file inherits it from a file loaded before, any other file that has it
 * is remembered in the cookie.
 */
static void
spell_load_cb(fname, cookie)
    char_u      *fname;
    void        *cookie;
{
    spelload_T  *load = (spelload_T *)cookie;
    slang_T     *loaded;

    loaded = spell_load_file(fname, load->sl_lang, NULL, FALSE);
    if (loaded == NULL)
        return;

    if (load->sl_nobreak && loaded->sl_add)
        /* When a previously loaded file has NOBREAK also use it for the
         * ".add" files. */
        loaded->sl_nobreak = TRUE;
    else if (loaded->sl_nobreak)
        load->sl_nobreak = TRUE;

    load->sl_slang = loaded;
}
/*
 * Load one spell file and store the info into a slang_T.
 *
 * This is invoked in three ways:
 * - From spell_load_cb() to load a spell file for the first time. "lang" is
 * the language name, "old_lp" is NULL. Will allocate an slang_T.
 * - To reload a spell file that was changed. "lang" is NULL and "old_lp"
 * points to the existing slang_T.
 * - Just after writing a .spl file; it's read back to produce the .sug file.
 * "old_lp" is NULL and "lang" is NULL. Will allocate an slang_T.
 *
 * The file is read in this order: <fileID>, <versionnr>, the sections up to
 * <sectionend> and then the three word trees.
 *
 * Returns the slang_T the spell file was loaded into. NULL for error.
 * On error: when "lang" is not NULL it is made an empty string (this is how
 * spell_load_lang() notices the error) and an slang_T that was allocated
 * here is freed again; "old_lp" is never freed here.
 * While reading, sourcing_name and sourcing_lnum are set for error messages;
 * they are restored before returning.
 */
static slang_T *
spell_load_file(fname, lang, old_lp, silent)
char_u *fname;
char_u *lang;
slang_T *old_lp;
int silent; /* no error if file doesn't exist */
{
FILE *fd;
char_u buf[VIMSPELLMAGICL];
char_u *p;
int i;
int n;
int len;
char_u *save_sourcing_name = sourcing_name;
linenr_T save_sourcing_lnum = sourcing_lnum;
slang_T *lp = NULL;
int c = 0;
int res;
fd = mch_fopen((char *)fname, "r");
if (fd == NULL)
{
/* Can't open the file: an error message, unless "silent" is set, then
 * there is only a message when 'verbose' is above 2. */
if (!silent)
EMSG2(_(e_notopen), fname);
else if (p_verbose > 2)
{
verbose_enter();
smsg((char_u *)e_notopen, fname);
verbose_leave();
}
goto endFAIL;
}
if (p_verbose > 2)
{
verbose_enter();
smsg((char_u *)_("Reading spell file \"%s\""), fname);
verbose_leave();
}
if (old_lp == NULL)
{
/* Not reloading: allocate a new slang_T to load the file into. */
lp = slang_alloc(lang);
if (lp == NULL)
goto endFAIL;
/* Remember the file name, used to reload the file when it's updated. */
lp->sl_fname = vim_strsave(fname);
if (lp->sl_fname == NULL)
goto endFAIL;
/* Check for .add.spl (_add.spl for VMS). */
lp->sl_add = strstr((char *)gettail(fname), SPL_FNAME_ADD) != NULL;
}
else
lp = old_lp;
/* Set sourcing_name, so that error messages mention the file name. */
sourcing_name = fname;
sourcing_lnum = 0;
/*
 * <HEADER>: <fileID>
 */
for (i = 0; i < VIMSPELLMAGICL; ++i)
buf[i] = getc(fd); /* <fileID> */
if (STRNCMP(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0)
{
EMSG(_("E757: This does not look like a spell file"));
goto endFAIL;
}
/* Only files with exactly version VIMSPELLVERSION are accepted. */
c = getc(fd); /* <versionnr> */
if (c < VIMSPELLVERSION)
{
EMSG(_("E771: Old spell file, needs to be updated"));
goto endFAIL;
}
else if (c > VIMSPELLVERSION)
{
EMSG(_("E772: Spell file is for newer version of Vim"));
goto endFAIL;
}
/*
 * <SECTIONS>: <section> ... <sectionend>
 * <section>: <sectionID> <sectionflags> <sectionlen> (section contents)
 */
for (;;)
{
n = getc(fd); /* <sectionID> or <sectionend> */
if (n == SN_END)
break;
c = getc(fd); /* <sectionflags> */
len = get4c(fd); /* <sectionlen> */
if (len < 0)
goto truncerr;
/* "res" is set by the section readers that return SP_*ERROR flags; it
 * is checked below, after the switch. */
res = 0;
switch (n)
{
case SN_INFO:
lp->sl_info = read_string(fd, len); /* <infotext> */
if (lp->sl_info == NULL)
goto endFAIL;
break;
case SN_REGION:
res = read_region_section(fd, lp, len);
break;
case SN_CHARFLAGS:
res = read_charflags_section(fd);
break;
case SN_MIDWORD:
lp->sl_midword = read_string(fd, len); /* <midword> */
if (lp->sl_midword == NULL)
goto endFAIL;
break;
case SN_PREFCOND:
res = read_prefcond_section(fd, lp);
break;
case SN_REP:
res = read_rep_section(fd, &lp->sl_rep, lp->sl_rep_first);
break;
case SN_REPSAL:
res = read_rep_section(fd, &lp->sl_repsal, lp->sl_repsal_first);
break;
case SN_SAL:
res = read_sal_section(fd, lp);
break;
case SN_SOFO:
res = read_sofo_section(fd, lp);
break;
case SN_MAP:
/* The string is only needed to set up the map info, free it
 * right away. */
p = read_string(fd, len); /* <mapstr> */
if (p == NULL)
goto endFAIL;
set_map_str(lp, p);
vim_free(p);
break;
case SN_WORDS:
res = read_words_section(fd, lp, len);
break;
case SN_SUGFILE:
lp->sl_sugtime = get8ctime(fd); /* <timestamp> */
break;
case SN_NOSPLITSUGS:
lp->sl_nosplitsugs = TRUE; /* only sets the flag, nothing is read here */
break;
case SN_COMPOUND:
res = read_compound(fd, lp, len);
break;
case SN_NOBREAK:
lp->sl_nobreak = TRUE;
break;
case SN_SYLLABLE:
lp->sl_syllable = read_string(fd, len); /* <syllable> */
if (lp->sl_syllable == NULL)
goto endFAIL;
if (init_syl_tab(lp) == FAIL)
goto endFAIL;
break;
default:
/* Unsupported section. When it's required give an error
 * message. When it's not required skip the contents. */
if (c & SNF_REQUIRED)
{
EMSG(_("E770: Unsupported section in spell file"));
goto endFAIL;
}
while (--len >= 0)
if (getc(fd) < 0)
goto truncerr;
break;
}
/* Turn the SP_*ERROR flag in "res" into an error message and bail out.
 * NOTE: the code that reads the trees, below this loop, jumps back to
 * this label to share the error handling. */
someerror:
if (res == SP_FORMERROR)
{
EMSG(_(e_format));
goto endFAIL;
}
if (res == SP_TRUNCERROR)
{
/* Also jumped to directly when a truncated file is detected above. */
truncerr:
EMSG(_(e_spell_trunc));
goto endFAIL;
}
if (res == SP_OTHERERROR)
goto endFAIL;
}
/* <LWORDTREE> */
res = spell_read_tree(fd, &lp->sl_fbyts, &lp->sl_fidxs, FALSE, 0);
if (res != 0)
goto someerror;
/* <KWORDTREE> */
res = spell_read_tree(fd, &lp->sl_kbyts, &lp->sl_kidxs, FALSE, 0);
if (res != 0)
goto someerror;
/* <PREFIXTREE> */
res = spell_read_tree(fd, &lp->sl_pbyts, &lp->sl_pidxs, TRUE,
lp->sl_prefixcnt);
if (res != 0)
goto someerror;
/* For a new file link it in the list of spell files. */
if (old_lp == NULL && lang != NULL)
{
lp->sl_next = first_lang;
first_lang = lp;
}
goto endOK;
endFAIL:
if (lang != NULL)
/* truncating the name signals the error to spell_load_lang() */
*lang = NUL;
/* Only free what was allocated here, not the slang_T of the caller. */
if (lp != NULL && old_lp == NULL)
slang_free(lp);
lp = NULL;
/* Common exit: close the file and restore the sourcing info. */
endOK:
if (fd != NULL)
fclose(fd);
sourcing_name = save_sourcing_name;
sourcing_lnum = save_sourcing_lnum;
return lp;
}
/*
 * Read a length field from "fd" in "cnt_bytes" bytes, MSB first.
 * Allocate memory, read the string into it and add a NUL at the end.
 * Returns NULL when the count is zero and when there is an error.
 * Sets "*cntp" to SP_*ERROR when there is an error, length of the result
 * otherwise:
 * - SP_TRUNCERROR when the file ends inside the length field or when the
 *   length is negative,
 * - SP_OTHERERROR when read_string() fails.
 * The caller must free the returned string.
 */
static char_u *
read_cnt_string(fd, cnt_bytes, cntp)
    FILE        *fd;
    int         cnt_bytes;
    int         *cntp;
{
    int         cnt = 0;
    int         i;
    int         c;
    char_u      *str;

    /* read the length bytes, MSB first */
    for (i = 0; i < cnt_bytes; ++i)
    {
        c = getc(fd);
        if (c == EOF)
        {
            /* Don't fold EOF into the count: that could result in a bogus
             * positive length for a truncated file. */
            *cntp = SP_TRUNCERROR;
            return NULL;
        }
        /* Shift as unsigned, so that a set high bit is well defined. */
        cnt = (int)(((unsigned)cnt << 8) + (unsigned)c);
    }
    if (cnt < 0)
    {
        *cntp = SP_TRUNCERROR;
        return NULL;
    }
    *cntp = cnt;
    if (cnt == 0)
        return NULL;        /* nothing to read, return NULL */

    str = read_string(fd, cnt);
    if (str == NULL)
        *cntp = SP_OTHERERROR;
    return str;
}
/*
* Read SN_REGION: <regionname> ...
* Return SP_*ERROR flags.
*/
static int
read_region_section(fd, lp, len)
FILE *fd;
slang_T *lp;
int len;
{
int i;
if (len > 16)
return SP_FORMERROR;
for (i = 0; i < len; ++i)
lp->sl_regions[i] = getc(fd); /* <regionname> */
lp->sl_regions[len] = NUL;
return 0;
}
/*
 * Read SN_CHARFLAGS section: <charflagslen> <charflags>
 *                            <folcharslen> <folchars>
 * When both strings are present they are passed to set_spell_charflags().
 * Return SP_*ERROR flags: the error from reading a string, or SP_FORMERROR
 * when only one of the two strings is present.  Zero otherwise.
 */
static int
read_charflags_section(fd)
    FILE        *fd;
{
    char_u      *flags;
    char_u      *fol;
    int         flagslen;
    int         follen;
    int         res;

    /* <charflagslen> <charflags> */
    flags = read_cnt_string(fd, 1, &flagslen);
    if (flagslen < 0)
        return flagslen;

    /* <folcharslen> <folchars> */
    fol = read_cnt_string(fd, 2, &follen);
    if (follen < 0)
    {
        vim_free(flags);
        return follen;
    }

    if (flags != NULL && fol != NULL)
    {
        /* Set the word-char flags and fill SPELL_ISUPPER() table. */
        set_spell_charflags(flags, flagslen, fol);
        res = 0;
    }
    else if (flags == NULL && fol == NULL)
        res = 0;
    else
        /* When <charflagslen> is zero then <folcharslen> must also be
         * zero, and the other way around. */
        res = SP_FORMERROR;

    vim_free(flags);
    vim_free(fol);
    return res;
}
/*
 * Read SN_PREFCOND section: <prefcondcnt> <prefcond> ...
 * Allocates lp->sl_prefprog with one entry for each condition and sets
 * lp->sl_prefixcnt.  An entry stays NULL for an empty condition, otherwise
 * it is the compiled regexp for "^" followed by the condition string.
 * Return SP_*ERROR flags: SP_FORMERROR for a count that is not positive or
 * an invalid condition length, SP_OTHERERROR when out of memory.
 */
static int
read_prefcond_section(fd, lp)
    FILE        *fd;
    slang_T     *lp;
{
    int         cnt = get2c(fd);                /* <prefcondcnt> */
    int         idx;
    int         condlen;
    int         k;
    char_u      pat[MAXWLEN + 1];

    if (cnt <= 0)
        return SP_FORMERROR;

    lp->sl_prefprog = (regprog_T **)alloc_clear(
                                         (unsigned)sizeof(regprog_T *) * cnt);
    if (lp->sl_prefprog == NULL)
        return SP_OTHERERROR;
    lp->sl_prefixcnt = cnt;

    for (idx = 0; idx < cnt; ++idx)
    {
        /* <prefcond> : <condlen> <condstr> */
        condlen = getc(fd);                     /* <condlen> */
        if (condlen < 0 || condlen >= MAXWLEN)
            return SP_FORMERROR;

        /* An empty condition: leave the entry NULL. */
        if (condlen == 0)
            continue;

        /* Put a '^' before the condition, so that it matches at one
         * position only, and compile the regexp program. */
        pat[0] = '^';
        for (k = 1; k <= condlen; ++k)
            pat[k] = getc(fd);                  /* <condstr> */
        pat[k] = NUL;
        lp->sl_prefprog[idx] = vim_regcomp(pat, RE_MAGIC + RE_STRING);
    }

    return 0;
}
/*
 * Read REP or REPSAL items section from "fd": <repcount> <rep> ...
 * The items are appended to growarray "gap" until it has <repcount> items.
 * "first" is a table with 256 entries.  It is filled with, for each byte,
 * the index of the first item of which the "from" string starts with that
 * byte, -1 when there is none.
 * Return SP_*ERROR flags.  An empty "from" or "to" string is a format
 * error.
 */
static int
read_rep_section(fd, gap, first)
    FILE        *fd;
    garray_T    *gap;
    short       *first;
{
    int         cnt = get2c(fd);                /* <repcount> */
    fromto_T    *item;
    int         slen;
    int         idx;

    if (cnt < 0)
        return SP_TRUNCERROR;
    if (ga_grow(gap, cnt) == FAIL)
        return SP_OTHERERROR;

    /* <rep> : <repfromlen> <repfrom> <reptolen> <repto> */
    while (gap->ga_len < cnt)
    {
        item = &((fromto_T *)gap->ga_data)[gap->ga_len];

        item->ft_from = read_cnt_string(fd, 1, &slen);
        if (slen <= 0)
            return slen < 0 ? slen : SP_FORMERROR;

        item->ft_to = read_cnt_string(fd, 1, &slen);
        if (slen <= 0)
        {
            /* The item is not counted yet, free "from" here. */
            vim_free(item->ft_from);
            return slen < 0 ? slen : SP_FORMERROR;
        }

        /* Only count the item when it is complete. */
        ++gap->ga_len;
    }

    /* Fill the first-index table.  Going backwards the entry for the
     * lowest index is stored last, thus that is the one that remains. */
    for (idx = 0; idx < 256; ++idx)
        first[idx] = -1;
    for (idx = gap->ga_len - 1; idx >= 0; --idx)
    {
        item = &((fromto_T *)gap->ga_data)[idx];
        first[*item->ft_from] = idx;
    }

    return 0;
}
/*
 * Read SN_SAL section: <salflags> <salcount> <sal> ...
 * The items are stored in the growarray slang->sl_sal as salitem_T.  When
 * there is at least one item an extra item with an empty "sm_lead" is added
 * to mark the end.  "sl_sofo" is reset and the flags in <salflags> set
 * "sl_followup", "sl_collapse" and "sl_rem_accents".
 *
 * Each <salfrom> is split in up to three parts, all stored in the memory
 * allocated for "sm_lead":
 * - sm_lead: the bytes before the first special character,
 * - sm_oneof: the bytes inside "(...)", when that directly follows,
 * - sm_rules: whatever remains.
 *
 * Return SP_*ERROR flags.
 */
static int
read_sal_section(fd, slang)
FILE *fd;
slang_T *slang;
{
int i;
int cnt;
garray_T *gap;
salitem_T *smp;
int ccnt;
char_u *p;
int c = NUL;
slang->sl_sofo = FALSE;
i = getc(fd); /* <salflags> */
if (i & SAL_F0LLOWUP)
slang->sl_followup = TRUE;
if (i & SAL_COLLAPSE)
slang->sl_collapse = TRUE;
if (i & SAL_REM_ACCENTS)
slang->sl_rem_accents = TRUE;
cnt = get2c(fd); /* <salcount> */
if (cnt < 0)
return SP_TRUNCERROR;
gap = &slang->sl_sal;
ga_init2(gap, sizeof(salitem_T), 10);
/* One more than <salcount>: room for the end marker added below. */
if (ga_grow(gap, cnt + 1) == FAIL)
return SP_OTHERERROR;
/* <sal> : <salfromlen> <salfrom> <saltolen> <salto> */
for (; gap->ga_len < cnt; ++gap->ga_len)
{
smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
ccnt = getc(fd); /* <salfromlen> */
if (ccnt < 0)
return SP_TRUNCERROR;
/* One allocation holds sm_lead, sm_oneof and sm_rules, each followed by
 * a NUL.  The '(' and ')' around sm_oneof are not stored. */
if ((p = alloc(ccnt + 2)) == NULL)
return SP_OTHERERROR;
smp->sm_lead = p;
/* Read up to the first special char into sm_lead. */
for (i = 0; i < ccnt; ++i)
{
c = getc(fd); /* <salfrom> */
if (vim_strchr((char_u *)"0123456789(-<^$", c) != NULL)
break;
*p++ = c;
}
/* When the loop stopped at a special char then "c" holds it and "i" is
 * its index; it has not been stored yet. */
smp->sm_leadlen = (int)(p - smp->sm_lead);
*p++ = NUL;
/* Put (abc) chars in sm_oneof, if any. */
if (c == '(')
{
smp->sm_oneof = p;
for (++i; i < ccnt; ++i)
{
c = getc(fd); /* <salfrom> */
if (c == ')')
break;
*p++ = c;
}
*p++ = NUL;
/* Get the byte after the ')', if there is one: it is the first byte of
 * sm_rules and is stored below. */
if (++i < ccnt)
c = getc(fd);
}
else
smp->sm_oneof = NULL;
/* Any following chars go in sm_rules. */
smp->sm_rules = p;
if (i < ccnt)
/* store the char we got while checking for end of sm_lead */
*p++ = c;
for (++i; i < ccnt; ++i)
*p++ = getc(fd); /* <salfrom> */
*p++ = NUL;
/* <saltolen> <salto> */
smp->sm_to = read_cnt_string(fd, 1, &ccnt);
if (ccnt < 0)
{
/* This item is not counted in ga_len yet, free it here. */
vim_free(smp->sm_lead);
return ccnt;
}
#ifdef FEAT_MBYTE
if (has_mbyte)
{
/* convert the multi-byte strings to wide char strings */
smp->sm_lead_w = mb_str2wide(smp->sm_lead);
/* sm_leadlen is set again from mb_charlen(), replacing the byte
 * count stored above. */
smp->sm_leadlen = mb_charlen(smp->sm_lead);
if (smp->sm_oneof == NULL)
smp->sm_oneof_w = NULL;
else
smp->sm_oneof_w = mb_str2wide(smp->sm_oneof);
if (smp->sm_to == NULL)
smp->sm_to_w = NULL;
else
smp->sm_to_w = mb_str2wide(smp->sm_to);
/* A conversion that failed: free everything of this item. */
if (smp->sm_lead_w == NULL
|| (smp->sm_oneof_w == NULL && smp->sm_oneof != NULL)
|| (smp->sm_to_w == NULL && smp->sm_to != NULL))
{
vim_free(smp->sm_lead);
vim_free(smp->sm_to);
vim_free(smp->sm_lead_w);
vim_free(smp->sm_oneof_w);
vim_free(smp->sm_to_w);
return SP_OTHERERROR;
}
}
#endif
}
if (gap->ga_len > 0)
{
/* Add one extra entry to mark the end with an empty sm_lead. Avoids
 * that we need to check the index every time. */
smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
if ((p = alloc(1)) == NULL)
return SP_OTHERERROR;
p[0] = NUL;
smp->sm_lead = p;
smp->sm_leadlen = 0;
smp->sm_oneof = NULL;
smp->sm_rules = p;
smp->sm_to = NULL;
#ifdef FEAT_MBYTE
if (has_mbyte)
{
smp->sm_lead_w = mb_str2wide(smp->sm_lead);
smp->sm_leadlen = 0;
smp->sm_oneof_w = NULL;
smp->sm_to_w = NULL;
}
#endif
++gap->ga_len;
}
/* Fill the first-index table. */
set_sal_first(slang);
return 0;
}
/*
 * Read SN_WORDS: <word> ...
 * The section is "len" bytes long and consists of NUL terminated words.
 * Every word is added to the common words of "lp" with a count of 10.
 * Return SP_*ERROR flags: SP_TRUNCERROR when the file ends inside a word,
 * SP_FORMERROR when a word does not fit in MAXWLEN bytes.
 */
static int
read_words_section(fd, lp, len)
    FILE        *fd;
    slang_T     *lp;
    int         len;
{
    int         consumed = 0;   /* nr of bytes of the section handled */
    int         wlen;           /* length of the current word */
    int         c;
    char_u      word[MAXWLEN];

    while (consumed < len)
    {
        /* Read one word at a time, including the terminating NUL. */
        wlen = 0;
        do
        {
            c = getc(fd);
            if (c == EOF)
                return SP_TRUNCERROR;
            word[wlen] = c;
            if (c != NUL)
            {
                /* The last byte of "word" can only be used for the NUL. */
                if (wlen == MAXWLEN - 1)
                    return SP_FORMERROR;
                ++wlen;
            }
        } while (c != NUL);

        /* Init the count to 10. */
        count_common_word(lp, word, -1, 10);
        consumed += wlen + 1;
    }

    return 0;
}
/*
 * Add a word to the hashtable of common words in "lp".
 * If it's already there then the counter is increased by "count"; when the
 * counter overflows it is set to MAXWORDCOUNT.
 * "len" is the length of "word" in bytes, or -1 when "word" is NUL
 * terminated.  A word that does not fit in MAXWLEN bytes, including the
 * terminating NUL, is ignored.  Nothing is added when out of memory.
 */
static void
count_common_word(lp, word, len, count)
    slang_T     *lp;
    char_u      *word;
    int         len;        /* word length, -1 for upto NUL */
    int         count;      /* 1 to count once, 10 to init */
{
    hash_T      hash;
    hashitem_T  *hi;
    wordcount_T *wc;
    char_u      buf[MAXWLEN];
    char_u      *p;

    if (len == -1)
        p = word;
    else if (len < 0 || len >= MAXWLEN)
        /* vim_strncpy() stores "len" bytes plus a NUL, that would write
         * past the end of "buf". */
        return;
    else
    {
        /* Make a NUL terminated copy to use as the hash key. */
        vim_strncpy(buf, word, len);
        p = buf;
    }

    hash = hash_hash(p);
    hi = hash_lookup(&lp->sl_wordcount, p, hash);
    if (HASHITEM_EMPTY(hi))
    {
        /* New word: the text is stored in the item itself, wc_word is
         * used as the key. */
        wc = (wordcount_T *)alloc((unsigned)(sizeof(wordcount_T) + STRLEN(p)));
        if (wc == NULL)
            return;
        STRCPY(wc->wc_word, p);
        wc->wc_count = count;
        hash_add_item(&lp->sl_wordcount, hi, wc->wc_word, hash);
    }
    else
    {
        wc = HI2WC(hi);
        if ((wc->wc_count += count) < (unsigned)count)  /* check for overflow */
            wc->wc_count = MAXWORDCOUNT;
    }
}
/*
 * Adjust the score of common words: when "word" is in the table of common
 * words of "slang" a bonus is subtracted from "score".  The bonus depends
 * on the count of the word and is halved when "split" is set.
 * Returns the new score, which is never negative.  When the word is not in
 * the table "score" is returned unmodified.
 */
static int
score_wordcount_adj(slang, score, word, split)
    slang_T     *slang;
    int         score;
    char_u      *word;
    int         split;      /* word was split, less bonus */
{
    hashitem_T  *hi = hash_find(&slang->sl_wordcount, word);
    wordcount_T *wc;
    int         bonus;
    int         adjusted;

    if (HASHITEM_EMPTY(hi))
        return score;

    /* The higher the count, the bigger the bonus. */
    wc = HI2WC(hi);
    bonus = SCORE_COMMON3;
    if (wc->wc_count < SCORE_THRES2)
        bonus = SCORE_COMMON1;
    else if (wc->wc_count < SCORE_THRES3)
        bonus = SCORE_COMMON2;

    adjusted = score - (split ? bonus / 2 : bonus);
    return adjusted < 0 ? 0 : adjusted;
}
/*
 * Read SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
 * Sets slang->sl_sofo.  When both strings are present they are passed to
 * set_sofo() and its result is returned.
 * Return SP_*ERROR flags: the error from reading a string, or SP_FORMERROR
 * when only one of the two strings is present.  Zero when both are empty.
 */
static int
read_sofo_section(fd, slang)
    FILE        *fd;
    slang_T     *slang;
{
    char_u      *from;
    char_u      *to;
    int         fromlen;
    int         tolen;
    int         res;

    slang->sl_sofo = TRUE;

    /* <sofofromlen> <sofofrom> */
    from = read_cnt_string(fd, 2, &fromlen);
    if (fromlen < 0)
        return fromlen;

    /* <sofotolen> <sofoto> */
    to = read_cnt_string(fd, 2, &tolen);
    if (tolen < 0)
    {
        vim_free(from);
        return tolen;
    }

    if (from == NULL && to == NULL)
        res = 0;
    else if (from == NULL || to == NULL)
        res = SP_FORMERROR;     /* only one of two strings is an error */
    else
        /* Store the info in slang->sl_sal and/or slang->sl_sal_first. */
        res = set_sofo(slang, from, to);

    vim_free(from);
    vim_free(to);
    return res;
}
/*
 * Read the compound section from the .spl file:
 *	<compmax> <compminlen> <compsylmax> <compoptions> <compflags>
 *
 * "len" is the number of bytes in the section.  The results are stored in
 * "slang": sl_compmax, sl_compminlen, sl_compsylmax, sl_compoptions,
 * sl_comppat, sl_compstartflags, sl_compallflags, sl_comprules and
 * sl_compprog.
 *
 * Returns zero when OK, SP_*ERROR flags otherwise.
 */
    static int
read_compound(fd, slang, len)
    FILE	*fd;
    slang_T	*slang;
    int		len;
{
    int		todo = len;	/* bytes of the section not consumed yet */
    int		c;
    int		atstart;	/* 1: at start of item, 2: inside [] at start */
    char_u	*pat;		/* regexp pattern built from <compflags> */
    char_u	*pp;		/* write position in "pat" */
    char_u	*cp;		/* write position in sl_compstartflags */
    char_u	*ap;		/* write position in sl_compallflags */
    char_u	*crp;		/* write position in sl_comprules, or NULL */
    int		cnt;
    garray_T	*gap;

    if (todo < 2)
        return SP_FORMERROR;	/* need at least two bytes */

    /* For the three limits a value that is too small (this includes EOF)
     * is replaced by a default. */
    --todo;
    c = getc(fd);					/* <compmax> */
    if (c < 2)
        c = MAXWLEN;
    slang->sl_compmax = c;

    --todo;
    c = getc(fd);					/* <compminlen> */
    if (c < 1)
        c = 0;
    slang->sl_compminlen = c;

    --todo;
    c = getc(fd);					/* <compsylmax> */
    if (c < 1)
        c = MAXWLEN;
    slang->sl_compsylmax = c;

    c = getc(fd);					/* <compoptions> */
    if (c != 0)
        ungetc(c, fd);	    /* be backwards compatible with Vim 7.0b */
    else
    {
        --todo;
        c = getc(fd);	    /* only use the lower byte for now */
        --todo;
        slang->sl_compoptions = c;

        gap = &slang->sl_comppat;
        c = get2c(fd);					/* <comppatcount> */
        if (c < 0)
            /* A negative count is treated as a truncated file, like the
             * get3c()/get4c() results elsewhere in this file, instead of
             * being used as a size for the growarray. */
            return SP_TRUNCERROR;
        todo -= 2;
        ga_init2(gap, sizeof(char_u *), c);
        if (ga_grow(gap, c) == OK)
            while (--c >= 0)
            {
                ((char_u **)(gap->ga_data))[gap->ga_len++] =
                                                 read_cnt_string(fd, 1, &cnt);
                                            /* <comppatlen> <comppattext> */
                if (cnt < 0)
                    return cnt;
                todo -= cnt + 1;
            }
    }
    /* The fixed part and the patterns used more bytes than the section has. */
    if (todo < 0)
        return SP_FORMERROR;

    /* Turn the COMPOUNDRULE items into a regexp pattern:
     * "a[bc]/a*b+" -> "^\(a[bc]\|a*b\+\)$".
     * Inserting backslashes may double the length, "^\(\)$<Nul>" is 7 bytes.
     * Conversion to utf-8 may double the size. */
    c = todo * 2 + 7;
#ifdef FEAT_MBYTE
    if (enc_utf8)
        c += todo * 2;
#endif
    pat = alloc((unsigned)c);
    if (pat == NULL)
        return SP_OTHERERROR;

    /* We also need a list of all flags that can appear at the start and one
     * for all flags.  Both are stored in "slang" right away, only "pat" is
     * freed here when something fails. */
    cp = alloc(todo + 1);
    if (cp == NULL)
    {
        vim_free(pat);
        return SP_OTHERERROR;
    }
    slang->sl_compstartflags = cp;
    *cp = NUL;

    ap = alloc(todo + 1);
    if (ap == NULL)
    {
        vim_free(pat);
        return SP_OTHERERROR;
    }
    slang->sl_compallflags = ap;
    *ap = NUL;

    /* And a list of all patterns in their original form, for checking whether
     * compounding may work in match_compoundrule().  This is freed when we
     * encounter a wildcard, the check doesn't work then.
     * A failed allocation is not an error: "crp" is NULL then and the list
     * is simply not made. */
    crp = alloc(todo + 1);
    slang->sl_comprules = crp;

    pp = pat;
    *pp++ = '^';
    *pp++ = '\\';
    *pp++ = '(';

    atstart = 1;
    while (todo-- > 0)
    {
        c = getc(fd);					/* <compflags> */
        if (c == EOF)
        {
            vim_free(pat);
            return SP_TRUNCERROR;
        }

        /* Add all flags to "sl_compallflags". */
        if (vim_strchr((char_u *)"?*+[]/", c) == NULL
                && !byte_in_str(slang->sl_compallflags, c))
        {
            *ap++ = c;
            *ap = NUL;
        }

        if (atstart != 0)
        {
            /* At start of item: copy flags to "sl_compstartflags".  For a
             * [abc] item set "atstart" to 2 and copy up to the ']'. */
            if (c == '[')
                atstart = 2;
            else if (c == ']')
                atstart = 0;
            else
            {
                if (!byte_in_str(slang->sl_compstartflags, c))
                {
                    *cp++ = c;
                    *cp = NUL;
                }
                if (atstart == 1)
                    atstart = 0;
            }
        }

        /* Copy flag to "sl_comprules", unless we run into a wildcard. */
        if (crp != NULL)
        {
            if (c == '?' || c == '+' || c == '*')
            {
                vim_free(slang->sl_comprules);
                slang->sl_comprules = NULL;
                crp = NULL;
            }
            else
                *crp++ = c;
        }

        if (c == '/')	    /* slash separates two items */
        {
            *pp++ = '\\';
            *pp++ = '|';
            atstart = 1;
        }
        else		    /* normal char, "[abc]" and '*' are copied as-is */
        {
            if (c == '?' || c == '+' || c == '~')
                *pp++ = '\\';	    /* "a?" becomes "a\?", "a+" becomes "a\+" */
#ifdef FEAT_MBYTE
            if (enc_utf8)
                pp += mb_char2bytes(c, pp);
            else
#endif
                *pp++ = c;
        }
    }

    *pp++ = '\\';
    *pp++ = ')';
    *pp++ = '$';
    *pp = NUL;

    if (crp != NULL)
        *crp = NUL;

    slang->sl_compprog = vim_regcomp(pat, RE_MAGIC + RE_STRING + RE_STRICT);
    vim_free(pat);
    if (slang->sl_compprog == NULL)
        return SP_FORMERROR;

    return 0;
}
/*
 * Check whether byte "n" occurs in the NUL-terminated byte string "str".
 * Like strchr() but independent of locale.
 * The terminating NUL itself is never matched.
 * Returns TRUE when found, FALSE otherwise.
 */
    static int
byte_in_str(str, n)
    char_u	*str;
    int		n;
{
    int		i = 0;

    while (str[i] != NUL)
    {
        if (str[i] == n)
            return TRUE;
        ++i;
    }
    return FALSE;
}
/* Size of the buffer for one syllable item; init_syl_tab() rejects items of
* SY_MAXLEN bytes or longer. */
#define SY_MAXLEN 30
/* One of the items that follow the first slash in "sl_syllable", stored in
* the "sl_syl_items" growarray by init_syl_tab(). */
typedef struct syl_item_S
{
char_u sy_chars[SY_MAXLEN]; /* the sequence of chars */
int sy_len; /* number of bytes used in sy_chars */
} syl_item_T;
/*
* Truncate "slang->sl_syllable" at the first slash and put the following items
* in "slang->sl_syl_items".
*/
static int
init_syl_tab(slang)
slang_T *slang;
{
char_u *p;
char_u *s;
int l;
syl_item_T *syl;
ga_init2(&slang->sl_syl_items, sizeof(syl_item_T), 4);
p = vim_strchr(slang->sl_syllable, '/');
while (p != NULL)
{
*p++ = NUL;
if (*p == NUL) /* trailing slash */
break;
s = p;
p = vim_strchr(p, '/');
if (p == NULL)
l = (int)STRLEN(s);
else
l = (int)(p - s);
if (l >= SY_MAXLEN)
return SP_FORMERROR;
if (ga_grow(&slang->sl_syl_items, 1) == FAIL)
return SP_OTHERERROR;
syl = ((syl_item_T *)slang->sl_syl_items.ga_data)
+ slang->sl_syl_items.ga_len++;
vim_strncpy(syl->sy_chars, s, l);
syl->sy_len = l;
}
return OK;
}
/*
 * Count the number of syllables in "word".
 * When "word" contains spaces only the syllables after the last space are
 * counted.
 * At each position the longest matching item in "sl_syl_items" counts as one
 * syllable.  When no item matches, a sequence of characters that appear in
 * "sl_syllable" counts as one syllable.
 * Returns zero if syllables are not defined.
 */
    static int
count_syllables(slang, word)
    slang_T	*slang;
    char_u	*word;
{
    int		nsyl = 0;
    int		in_syl_chars = FALSE;	/* previous char was a syllable char
                                         * that has been counted already */
    char_u	*wp = word;
    int		adv;			/* number of bytes to advance */
    int		idx;
    syl_item_T	*item;
    int		ch;

    if (slang->sl_syllable == NULL)
        return 0;

    while (*wp != NUL)
    {
        if (*wp == ' ')
        {
            /* A space starts a new word: forget what was counted so far. */
            nsyl = 0;
            ++wp;
            continue;
        }

        /* Look for the longest syllable item that matches here. */
        adv = 0;
        for (idx = 0; idx < slang->sl_syl_items.ga_len; ++idx)
        {
            item = ((syl_item_T *)slang->sl_syl_items.ga_data) + idx;
            if (item->sy_len > adv
                    && STRNCMP(wp, item->sy_chars, item->sy_len) == 0)
                adv = item->sy_len;
        }

        if (adv != 0)
        {
            /* An item matched, that is one syllable. */
            ++nsyl;
            in_syl_chars = FALSE;
        }
        else
        {
            /* No item matched, check for a single syllable character. */
#ifdef FEAT_MBYTE
            ch = mb_ptr2char(wp);
            adv = (*mb_ptr2len)(wp);
#else
            ch = *wp;
            adv = 1;
#endif
            if (vim_strchr(slang->sl_syllable, ch) == NULL)
                in_syl_chars = FALSE;	/* not one, look for next syllable */
            else if (!in_syl_chars)
            {
                ++nsyl;			/* first of a sequence: count it */
                in_syl_chars = TRUE;	/* following ones are not counted */
            }
        }
        wp += adv;
    }
    return nsyl;
}
/*
 * Set the SOFOFROM and SOFOTO items in language "lp".
 * "from" and "to" must contain the same number of characters; each character
 * in "from" is mapped to the character at the same position in "to".
 *
 * Without multi-byte characters the mapping is stored in "sl_sal_first[]"
 * and "sl_sal.ga_len" is set to 1 to indicate we have soundfolding.
 * With multi-byte characters "from" characters below 256 are stored in
 * "sl_sal_first[]", the others in the lists that "sl_sal" points to.
 *
 * Returns zero when OK, SP_*ERROR flags when there is something wrong.
 */
    static int
set_sofo(lp, from, to)
    slang_T	*lp;
    char_u	*from;
    char_u	*to;
{
    int		i;

#ifdef FEAT_MBYTE
    garray_T	*gap;
    int		**lists;
    char_u	*fp;		/* position in "from" */
    char_u	*tp;		/* position in "to" */
    int		fc;		/* character from "from" */
    int		tc;		/* character from "to" */
    int		*inp;

    if (has_mbyte)
    {
        /* Use "sl_sal" as an array with 256 pointers to a list of wide
         * characters.  The index is the low byte of the character.
         * The list contains from-to pairs with a terminating NUL.
         * sl_sal_first[] is used for latin1 "from" characters. */
        gap = &lp->sl_sal;
        ga_init2(gap, sizeof(int *), 1);
        if (ga_grow(gap, 256) == FAIL)
            return SP_OTHERERROR;
        vim_memset(gap->ga_data, 0, sizeof(int *) * 256);
        gap->ga_len = 256;
        lists = (int **)gap->ga_data;

        /* Pass one: count how many pairs go into each list.  sl_sal_first[]
         * temporarily holds the counts. */
        fp = from;
        tp = to;
        while (*fp != NUL && *tp != NUL)
        {
            fc = mb_cptr2char_adv(&fp);
            mb_cptr_adv(tp);
            if (fc >= 256)
                ++lp->sl_sal_first[fc & 0xff];
        }
        if (*fp != NUL || *tp != NUL)	/* lengths differ */
            return SP_FORMERROR;

        /* Allocate every list that is needed: two ints for each pair plus
         * one for the terminating NUL.  Start with an empty list. */
        for (i = 0; i < 256; ++i)
        {
            if (lp->sl_sal_first[i] <= 0)
                continue;
            inp = (int *)alloc(sizeof(int) * (lp->sl_sal_first[i] * 2 + 1));
            if (inp == NULL)
                return SP_OTHERERROR;
            lists[i] = inp;
            *inp = 0;
        }

        /* Pass two: put the characters up to 255 in sl_sal_first[], the
         * rest in a sl_sal list. */
        vim_memset(lp->sl_sal_first, 0, sizeof(salfirst_T) * 256);
        fp = from;
        tp = to;
        while (*fp != NUL && *tp != NUL)
        {
            fc = mb_cptr2char_adv(&fp);
            tc = mb_cptr2char_adv(&tp);
            if (fc < 256)
            {
                /* mapping byte to char is done in sl_sal_first[] */
                lp->sl_sal_first[fc] = tc;
                continue;
            }

            /* Append the from-to chars at the end of the list with
             * the low byte. */
            inp = lists[fc & 0xff];
            while (*inp != 0)
                ++inp;
            inp[0] = fc;		/* from char */
            inp[1] = tc;		/* to char */
            inp[2] = NUL;		/* NUL at the end */
        }
    }
    else
#endif
    {
        /* mapping bytes to bytes is done in sl_sal_first[] */
        if (STRLEN(from) != STRLEN(to))
            return SP_FORMERROR;

        i = 0;
        while (to[i] != NUL)
        {
            lp->sl_sal_first[from[i]] = to[i];
            ++i;
        }
        lp->sl_sal.ga_len = 1;		/* indicates we have soundfolding */
    }

    return 0;
}
/*
* Fill the first-index table for "lp".
* For every possible byte value "lp->sl_sal_first[]" is set to the index of
* the first item in "lp->sl_sal" whose lead starts with that byte, or to -1
* when there is no such item.
* With multi-byte characters the low byte of the first wide lead character is
* used, and the items in "lp->sl_sal" are reordered so that all items with
* the same index byte follow each other.
*/
static void
set_sal_first(lp)
slang_T *lp;
{
salfirst_T *sfirst;
int i;
salitem_T *smp;
int c;
garray_T *gap = &lp->sl_sal;
sfirst = lp->sl_sal_first;
/* Start with "no item" for every byte value. */
for (i = 0; i < 256; ++i)
sfirst[i] = -1;
smp = (salitem_T *)gap->ga_data;
for (i = 0; i < gap->ga_len; ++i)
{
#ifdef FEAT_MBYTE
if (has_mbyte)
/* Use the lowest byte of the first character. For latin1 it's
* the character, for other encodings it should differ for most
* characters. */
c = *smp[i].sm_lead_w & 0xff;
else
#endif
c = *smp[i].sm_lead;
/* Only the first item found for an index byte is remembered. */
if (sfirst[c] == -1)
{
sfirst[c] = i;
#ifdef FEAT_MBYTE
if (has_mbyte)
{
int n;
/* Make sure all entries with this byte are following each
* other. Move the ones that are in the wrong position. Do
* keep the same ordering! */
while (i + 1 < gap->ga_len
&& (*smp[i + 1].sm_lead_w & 0xff) == c)
/* Skip over entry with same index byte. */
++i;
/* Here "i" is the last entry of the group, "n" is the distance to
* the entry being inspected. */
for (n = 1; i + n < gap->ga_len; ++n)
if ((*smp[i + n].sm_lead_w & 0xff) == c)
{
salitem_T tsal;
/* Move entry with same index byte after the entries
* we already found. The "n" entries in between are
* shifted up one position to make room. */
++i;
--n;
tsal = smp[i + n];
mch_memmove(smp + i + 1, smp + i,
sizeof(salitem_T) * n);
smp[i] = tsal;
}
}
#endif
}
}
}
#ifdef FEAT_MBYTE
/*
 * Convert the multi-byte string "s" into a string of wide characters, one
 * int for each character, terminated with a zero.
 * Returns the result in allocated memory, NULL when out of memory.
 */
    static int *
mb_str2wide(s)
    char_u	*s;
{
    int		*wide;
    int		*dst;
    char_u	*src = s;

    wide = (int *)alloc(sizeof(int) * (mb_charlen(s) + 1));
    if (wide == NULL)
        return NULL;

    for (dst = wide; *src != NUL; ++dst)
        *dst = mb_ptr2char_adv(&src);
    *dst = NUL;

    return wide;
}
#endif
/*
* Read a tree from the .spl or .sug file.
* Allocates the memory and stores pointers in "bytsp" and "idxsp".
* This is skipped when the tree has zero length.
* Returns zero when OK, SP_ value for an error.
*/
static int
spell_read_tree(fd, bytsp, idxsp, prefixtree, prefixcnt)
FILE *fd;
char_u **bytsp;
idx_T **idxsp;
int prefixtree; /* TRUE for the prefix tree */
int prefixcnt; /* when "prefixtree" is TRUE: prefix count */
{
int len;
int idx;
char_u *bp;
idx_T *ip;
/* The tree size was computed when writing the file, so that we can
* allocate it as one long block. <nodecount> */
len = get4c(fd);
if (len < 0)
return SP_TRUNCERROR;
if (len > 0)
{
/* Allocate the byte array. */
bp = lalloc((long_u)len, TRUE);
if (bp == NULL)
return SP_OTHERERROR;
*bytsp = bp;
/* Allocate the index array. */
ip = (idx_T *)lalloc_clear((long_u)(len * sizeof(int)), TRUE);
if (ip == NULL)
return SP_OTHERERROR;
*idxsp = ip;
/* Recursively read the tree and store it in the array. */
idx = read_tree_node(fd, bp, ip, len, 0, prefixtree, prefixcnt);
if (idx < 0)
return idx;
}
return 0;
}
/*
* Read one row of siblings from the spell file and store it in the byte array
* "byts" and index array "idxs". Recursively read the children.
*
* NOTE: The code here must match put_node()!
*
* The row is stored as the sibling count in byts[startidx], followed by one
* entry for each sibling:
* - end of word: byts[] is zero, idxs[] holds the flags/region/affix ID, or
*   for the prefix tree the pflags, condition nr and affix ID;
* - shared sibling: byts[] is <xbyte>, idxs[] is the <nodeidx> from the file;
* - other sibling: byts[] is the byte, idxs[] is the index where the row with
*   its children was stored.
*
* Returns the index (>= 0) following the siblings.
* Returns SP_TRUNCERROR if the file is shorter than expected.
* Returns SP_FORMERROR if there is a format error.
*/
static idx_T
read_tree_node(fd, byts, idxs, maxidx, startidx, prefixtree, maxprefcondnr)
FILE *fd;
char_u *byts;
idx_T *idxs;
int maxidx; /* size of arrays */
idx_T startidx; /* current index in "byts" and "idxs" */
int prefixtree; /* TRUE for reading PREFIXTREE */
int maxprefcondnr; /* maximum for <prefcondnr> */
{
int len;
int i;
int n;
idx_T idx = startidx;
int c;
int c2;
/* Temporarily added to idxs[] entries that hold a <nodeidx>, so that the
* second loop below can tell them apart from siblings that need their
* children read. */
#define SHARED_MASK 0x8000000
len = getc(fd); /* <siblingcount> */
if (len <= 0)
return SP_TRUNCERROR;
/* The count and all the siblings must fit in the arrays. */
if (startidx + len >= maxidx)
return SP_FORMERROR;
byts[idx++] = len;
/* Read the byte values, flag/region bytes and shared indexes. */
for (i = 1; i <= len; ++i)
{
c = getc(fd); /* <byte> */
if (c < 0)
return SP_TRUNCERROR;
if (c <= BY_SPECIAL)
{
if (c == BY_NOFLAGS && !prefixtree)
{
/* No flags, all regions. */
idxs[idx] = 0;
c = 0;
}
else if (c != BY_INDEX)
{
if (prefixtree)
{
/* Read the optional pflags byte, the prefix ID and the
* condition nr. In idxs[] store the prefix ID in the low
* byte, the condition index shifted up 8 bits, the flags
* shifted up 24 bits. */
if (c == BY_FLAGS)
c = getc(fd) << 24; /* <pflags> */
else
c = 0;
c |= getc(fd); /* <affixID> */
n = get2c(fd); /* <prefcondnr> */
if (n >= maxprefcondnr)
return SP_FORMERROR;
c |= (n << 8);
}
else /* c must be BY_FLAGS or BY_FLAGS2 */
{
/* Read flags and optional region and prefix ID. In
* idxs[] the flags go in the low two bytes, region above
* that and prefix ID above the region. */
c2 = c;
c = getc(fd); /* <flags> */
if (c2 == BY_FLAGS2)
c = (getc(fd) << 8) + c; /* <flags2> */
if (c & WF_REGION)
c = (getc(fd) << 16) + c; /* <region> */
if (c & WF_AFX)
c = (getc(fd) << 24) + c; /* <affixID> */
}
idxs[idx] = c;
/* A zero byte value marks the end of a word. */
c = 0;
}
else /* c == BY_INDEX */
{
/* <nodeidx> */
n = get3c(fd);
if (n < 0 || n >= maxidx)
return SP_FORMERROR;
idxs[idx] = n + SHARED_MASK;
c = getc(fd); /* <xbyte> */
}
}
/* For a byte above BY_SPECIAL idxs[idx] is not set here, it gets its
* value in the loop below. */
byts[idx++] = c;
}
/* Recursively read the children for non-shared siblings.
* Skip the end-of-word ones (zero byte value) and the shared ones (and
* remove SHARED_MASK) */
for (i = 1; i <= len; ++i)
if (byts[startidx + i] != 0)
{
if (idxs[startidx + i] & SHARED_MASK)
idxs[startidx + i] &= ~SHARED_MASK;
else
{
/* The children are stored right after what was read so far. */
idxs[startidx + i] = idx;
idx = read_tree_node(fd, byts, idxs, maxidx, idx,
prefixtree, maxprefcondnr);
if (idx < 0)
break;
}
}
return idx;
}
/*
* Parse 'spelllang' and set w_s->b_langp accordingly.
* Returns NULL if it's OK, an error message otherwise.
*/
char_u *
did_set_spelllang(wp)
win_T *wp;
{
garray_T ga;
char_u *splp;
char_u *region;
char_u region_cp[3];
int filename;
int region_mask;
slang_T *slang;
int c;
char_u lang[MAXWLEN + 1];
char_u spf_name[MAXPATHL];
int len;
char_u *p;
int round;
char_u *spf;
char_u *use_region = NULL;
int dont_use_region = FALSE;
int nobreak = FALSE;
int i, j;
langp_T *lp, *lp2;
static int recursive = FALSE;
char_u *ret_msg = NULL;
char_u *spl_copy;
/* We don't want to do this recursively. May happen when a language is
* not available and the SpellFileMissing autocommand opens a new buffer
* in which 'spell' is set. */
if (recursive)
return NULL;
recursive = TRUE;
ga_init2(&ga, sizeof(langp_T), 2);
clear_midword(wp);
/* Make a copy of 'spelllang', the SpellFileMissing autocommands may change
* it under our fingers. */
spl_copy = vim_strsave(wp->w_s->b_p_spl);
if (spl_copy == NULL)
goto theend;
/* loop over comma separated language names. */
for (splp = spl_copy; *splp != NUL; )
{
/* Get one language name. */
copy_option_part(&splp, lang, MAXWLEN, ",");
region = NULL;
len = (int)STRLEN(lang);
/* If the name ends in ".spl" use it as the name of the spell file.
* If there is a region name let "region" point to it and remove it
* from the name. */
if (len > 4 && fnamecmp(lang + len - 4, ".spl") == 0)
{
filename = TRUE;
/* Locate a region and remove it from the file name. */
p = vim_strchr(gettail(lang), '_');
if (p != NULL && ASCII_ISALPHA(p[1]) && ASCII_ISALPHA(p[2])
&& !ASCII_ISALPHA(p[3]))
{
vim_strncpy(region_cp, p + 1, 2);
mch_memmove(p, p + 3, len - (p - lang) - 2);
len -= 3;
region = region_cp;
}
else
dont_use_region = TRUE;
/* Check if we loaded this language before. */
for (slang = first_lang; slang != NULL; slang = slang->sl_next)
if (fullpathcmp(lang, slang->sl_fname, FALSE) == FPC_SAME)
break;
}
else
{
filename = FALSE;
if (len > 3 && lang[len - 3] == '_')
{
region = lang + len - 2;
len -= 3;
lang[len] = NUL;
}
else
dont_use_region = TRUE;
/* Check if we loaded this language before. */
for (slang = first_lang; slang != NULL; slang = slang->sl_next)
if (STRICMP(lang, slang->sl_name) == 0)
break;
}
if (region != NULL)
{
/* If the region differs from what was used before then don't
* use it for 'spellfile'. */
if (use_region != NULL && STRCMP(region, use_region) != 0)
dont_use_region = TRUE;
use_region = region;
}
/* If not found try loading the language now. */
if (slang == NULL)
{
if (filename)
(void)spell_load_file(lang, lang, NULL, FALSE);
else
{
spell_load_lang(lang);
#ifdef FEAT_AUTOCMD
/* SpellFileMissing autocommands may do anything, including
* destroying the buffer we are using... */
if (!buf_valid(wp->w_buffer))
{
ret_msg = (char_u *)"E797: SpellFileMissing autocommand deleted buffer";
goto theend;
}
#endif
}
}
/*
* Loop over the languages, there can be several files for "lang".
*/
for (slang = first_lang; slang != NULL; slang = slang->sl_next)
if (filename ? fullpathcmp(lang, slang->sl_fname, FALSE) == FPC_SAME
: STRICMP(lang, slang->sl_name) == 0)
{
region_mask = REGION_ALL;
if (!filename && region != NULL)
{
/* find region in sl_regions */
c = find_region(slang->sl_regions, region);
if (c == REGION_ALL)
{
if (slang->sl_add)
{
if (*slang->sl_regions != NUL)
/* This addition file is for other regions. */
region_mask = 0;
}
else
/* This is probably an error. Give a warning and
* accept the words anyway. */
smsg((char_u *)
_("Warning: region %s not supported"),
region);
}
else
region_mask = 1 << c;
}
if (region_mask != 0)
{
if (ga_grow(&ga, 1) == FAIL)
{
ga_clear(&ga);
ret_msg = e_outofmem;
goto theend;
}
LANGP_ENTRY(ga, ga.ga_len)->lp_slang = slang;
LANGP_ENTRY(ga, ga.ga_len)->lp_region = region_mask;
++ga.ga_len;
use_midword(slang, wp);
if (slang->sl_nobreak)
nobreak = TRUE;
}
}
}
/* round 0: load int_wordlist, if possible.
* round 1: load first name in 'spellfile'.
* round 2: load second name in 'spellfile'.
* etc. */
spf = curwin->w_s->b_p_spf;
for (round = 0; round == 0 || *spf != NUL; ++round)
{
if (round == 0)
{
/* Internal wordlist, if there is one. */
if (int_wordlist == NULL)
continue;
int_wordlist_spl(spf_name);
}
else
{
/* One entry in 'spellfile'. */
copy_option_part(&spf, spf_name, MAXPATHL - 5, ",");
STRCAT(spf_name, ".spl");
/* If it was already found above then skip it. */
for (c = 0; c < ga.ga_len; ++c)
{
p = LANGP_ENTRY(ga, c)->lp_slang->sl_fname;
if (p != NULL && fullpathcmp(spf_name, p, FALSE) == FPC_SAME)
break;
}
if (c < ga.ga_len)
continue;
}
/* Check if it was loaded already. */
for (slang = first_lang; slang != NULL; slang = slang->sl_next)
if (fullpathcmp(spf_name, slang->sl_fname, FALSE) == FPC_SAME)
break;
if (slang == NULL)
{
/* Not loaded, try loading it now. The language name includes the
* region name, the region is ignored otherwise. for int_wordlist
* use an arbitrary name. */
if (round == 0)
STRCPY(lang, "internal wordlist");
else
{
vim_strncpy(lang, gettail(spf_name), MAXWLEN);
p = vim_strchr<