From 708662a581b71edf7963fdb88c81848d1818bd90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Henrique=20Guard=C3=A3o=20Gandarez?= <782854+gandarez@users.noreply.github.com> Date: Tue, 22 Aug 2023 22:51:13 -0300 Subject: [PATCH] feat: improve regex analysers in XML (#831) --- lexer.go | 18 ++++++++++---- lexers/embedded/c++.xml | 6 +++-- lexers/embedded/c.xml | 6 +++-- regexp.go | 11 ++++++--- serialise.go | 55 +++++++++++++++++++++++++++++------------ 5 files changed, 67 insertions(+), 29 deletions(-) diff --git a/lexer.go b/lexer.go index 060c3f0d3..0f12fcc28 100644 --- a/lexer.go +++ b/lexer.go @@ -60,15 +60,23 @@ type Config struct { // Analyse is a list of regexes to match against the input. // - // The sum of all the score of matching patterns will be + // If a match is found, the score is returned if single attribute is set to true, + // otherwise the sum of all the score of matching patterns will be // used as the final score. - Analyse []AnalyseConfig `xml:"analyse,omitempty"` + Analyse *AnalyseConfig `xml:"analyse,omitempty"` } -// AnalyseConfig defines a single regex analyser pattern. +// AnalyseConfig defines the list of regexes analysers. type AnalyseConfig struct { - Regex string `xml:"regex,attr"` - Score float32 `xml:"score,attr"` + Regexes []RegexConfig `xml:"regex,omitempty"` + // If true, the score is returned despite other matches. + Single bool `xml:"single,attr"` +} + +// RegexConfig defines a single regex pattern and its score in case of match. +type RegexConfig struct { + Pattern string `xml:"pattern,attr"` + Score float32 `xml:"score,attr"` } // Token output to formatter. diff --git a/lexers/embedded/c++.xml b/lexers/embedded/c++.xml index 455c03368..a6bfb1140 100644 --- a/lexers/embedded/c++.xml +++ b/lexers/embedded/c++.xml @@ -19,8 +19,10 @@ text/x-c++hdr text/x-c++src true - - + + + + diff --git a/lexers/embedded/c.xml b/lexers/embedded/c.xml index 744732902..da57298e6 100644 --- a/lexers/embedded/c.xml +++ b/lexers/embedded/c.xml @@ -11,8 +11,10 @@ image/x-xbitmap image/x-xpixmap true - - + + + + diff --git a/regexp.go b/regexp.go index 3e0de6bd3..f49d5dace 100644 --- a/regexp.go +++ b/regexp.go @@ -298,11 +298,12 @@ func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer { return r } -func (r *RegexLexer) AnalyseText(text string) float32 { // nolint +// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0. +func (r *RegexLexer) AnalyseText(text string) float32 { if r.analyser != nil { return r.analyser(text) } - return 0.0 + return 0 } // SetConfig replaces the Config for this Lexer. @@ -311,7 +312,8 @@ func (r *RegexLexer) SetConfig(config *Config) *RegexLexer { return r } -func (r *RegexLexer) Config() *Config { // nolint +// Config returns the Config for this Lexer. +func (r *RegexLexer) Config() *Config { return r.config } @@ -406,7 +408,8 @@ func (r *RegexLexer) needRules() error { return err } -func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint +// Tokenise text using lexer, returning an iterator. +func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { err := r.needRules() if err != nil { return nil, err diff --git a/serialise.go b/serialise.go index f6ad589a7..a0c072785 100644 --- a/serialise.go +++ b/serialise.go @@ -131,35 +131,58 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) { if err != nil { return nil, err } + for _, glob := range append(config.Filenames, config.AliasFilenames...) { _, err := filepath.Match(glob, "") if err != nil { return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err) } } - type regexAnalyse struct { - re *regexp2.Regexp - score float32 - } - regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse)) - for _, ra := range config.Analyse { - re, err := regexp2.Compile(ra.Regex, regexp2.None) - if err != nil { - return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err) + + var analyserFn func(string) float32 + + if config.Analyse != nil { + type regexAnalyse struct { + re *regexp2.Regexp + score float32 } - regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score}) - } - return &RegexLexer{ - config: config, - analyser: func(text string) float32 { + + regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse.Regexes)) + + for _, ra := range config.Analyse.Regexes { + re, err := regexp2.Compile(ra.Pattern, regexp2.None) + if err != nil { + return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Pattern, err) + } + + regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score}) + } + + analyserFn = func(text string) float32 { var score float32 + for _, ra := range regexAnalysers { - if ok, _ := ra.re.MatchString(text); ok { + ok, err := ra.re.MatchString(text) + if err != nil { + return 0 + } + + if ok && config.Analyse.Single { + return ra.score + } + + if ok { score += ra.score } } + return score - }, + } + } + + return &RegexLexer{ + config: config, + analyser: analyserFn, fetchRulesFunc: func() (Rules, error) { var lexer struct { Config