From 708662a581b71edf7963fdb88c81848d1818bd90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Henrique=20Guard=C3=A3o=20Gandarez?=
<782854+gandarez@users.noreply.github.com>
Date: Tue, 22 Aug 2023 22:51:13 -0300
Subject: [PATCH] feat: improve regex analysers in XML (#831)
---
lexer.go | 18 ++++++++++----
lexers/embedded/c++.xml | 6 +++--
lexers/embedded/c.xml | 6 +++--
regexp.go | 11 ++++++---
serialise.go | 55 +++++++++++++++++++++++++++++------------
5 files changed, 67 insertions(+), 29 deletions(-)
diff --git a/lexer.go b/lexer.go
index 060c3f0d3..0f12fcc28 100644
--- a/lexer.go
+++ b/lexer.go
@@ -60,15 +60,23 @@ type Config struct {
// Analyse is a list of regexes to match against the input.
//
- // The sum of all the score of matching patterns will be
+ // If the single attribute is set to true, the score of the first matching
+ // pattern is returned; otherwise the sum of the scores of all matching patterns will be
// used as the final score.
- Analyse []AnalyseConfig `xml:"analyse,omitempty"`
+ Analyse *AnalyseConfig `xml:"analyse,omitempty"`
}
-// AnalyseConfig defines a single regex analyser pattern.
+// AnalyseConfig defines the list of regex analysers.
type AnalyseConfig struct {
- Regex string `xml:"regex,attr"`
- Score float32 `xml:"score,attr"`
+ Regexes []RegexConfig `xml:"regex,omitempty"`
+ // If true, the score of the first matching regex is returned, ignoring any other matches.
+ Single bool `xml:"single,attr"`
+}
+
+// RegexConfig defines a single regex pattern and its score in case of match.
+type RegexConfig struct {
+ Pattern string `xml:"pattern,attr"`
+ Score float32 `xml:"score,attr"`
}
// Token output to formatter.
diff --git a/lexers/embedded/c++.xml b/lexers/embedded/c++.xml
index 455c03368..a6bfb1140 100644
--- a/lexers/embedded/c++.xml
+++ b/lexers/embedded/c++.xml
@@ -19,8 +19,10 @@
text/x-c++hdr
text/x-c++src
true
-
-
+
+
+
+
diff --git a/lexers/embedded/c.xml b/lexers/embedded/c.xml
index 744732902..da57298e6 100644
--- a/lexers/embedded/c.xml
+++ b/lexers/embedded/c.xml
@@ -11,8 +11,10 @@
image/x-xbitmap
image/x-xpixmap
true
-
-
+
+
+
+
diff --git a/regexp.go b/regexp.go
index 3e0de6bd3..f49d5dace 100644
--- a/regexp.go
+++ b/regexp.go
@@ -298,11 +298,12 @@ func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
return r
}
-func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
+// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
+func (r *RegexLexer) AnalyseText(text string) float32 {
if r.analyser != nil {
return r.analyser(text)
}
- return 0.0
+ return 0
}
// SetConfig replaces the Config for this Lexer.
@@ -311,7 +312,8 @@ func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
return r
}
-func (r *RegexLexer) Config() *Config { // nolint
+// Config returns the Config for this Lexer.
+func (r *RegexLexer) Config() *Config {
return r.config
}
@@ -406,7 +408,8 @@ func (r *RegexLexer) needRules() error {
return err
}
-func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
+// Tokenise text using lexer, returning an iterator.
+func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
err := r.needRules()
if err != nil {
return nil, err
diff --git a/serialise.go b/serialise.go
index f6ad589a7..a0c072785 100644
--- a/serialise.go
+++ b/serialise.go
@@ -131,35 +131,58 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
if err != nil {
return nil, err
}
+
for _, glob := range append(config.Filenames, config.AliasFilenames...) {
_, err := filepath.Match(glob, "")
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
- type regexAnalyse struct {
- re *regexp2.Regexp
- score float32
- }
- regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
- for _, ra := range config.Analyse {
- re, err := regexp2.Compile(ra.Regex, regexp2.None)
- if err != nil {
- return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err)
+
+ var analyserFn func(string) float32
+
+ if config.Analyse != nil {
+ type regexAnalyse struct {
+ re *regexp2.Regexp
+ score float32
}
- regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
- }
- return &RegexLexer{
- config: config,
- analyser: func(text string) float32 {
+
+ regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse.Regexes))
+
+ for _, ra := range config.Analyse.Regexes {
+ re, err := regexp2.Compile(ra.Pattern, regexp2.None)
+ if err != nil {
+ return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Pattern, err)
+ }
+
+ regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
+ }
+
+ analyserFn = func(text string) float32 {
var score float32
+
for _, ra := range regexAnalysers {
- if ok, _ := ra.re.MatchString(text); ok {
+ ok, err := ra.re.MatchString(text)
+ if err != nil {
+ return 0
+ }
+
+ if ok && config.Analyse.Single {
+ return ra.score
+ }
+
+ if ok {
score += ra.score
}
}
+
return score
- },
+ }
+ }
+
+ return &RegexLexer{
+ config: config,
+ analyser: analyserFn,
fetchRulesFunc: func() (Rules, error) {
var lexer struct {
Config