Skip to content

Commit

Permalink
feat: improve regex analysers in XML (#831)
Browse files Browse the repository at this point in the history
  • Loading branch information
gandarez committed Aug 23, 2023
1 parent d37de78 commit 708662a
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 29 deletions.
18 changes: 13 additions & 5 deletions lexer.go
Expand Up @@ -60,15 +60,23 @@ type Config struct {

// Analyse is a list of regexes to match against the input.
//
// The sum of all the score of matching patterns will be
// If the single attribute is set to true, the score of the first matching pattern is returned,
// otherwise the sum of the scores of all matching patterns will be
// used as the final score.
Analyse []AnalyseConfig `xml:"analyse,omitempty"`
Analyse *AnalyseConfig `xml:"analyse,omitempty"`
}

// AnalyseConfig defines a single regex analyser pattern.
// AnalyseConfig defines the list of regex analysers.
type AnalyseConfig struct {
Regex string `xml:"regex,attr"`
Score float32 `xml:"score,attr"`
Regexes []RegexConfig `xml:"regex,omitempty"`
// If true, the score of the first matching regex is returned and any remaining regexes are not evaluated.
Single bool `xml:"single,attr"`
}

// RegexConfig defines a single regex pattern and its score in case of match.
//
// Both fields are populated from XML attributes of a <regex> element, e.g.
// <regex pattern="using namespace " score="0.4" />.
type RegexConfig struct {
// Pattern is the regular expression source, read from the "pattern" attribute.
Pattern string `xml:"pattern,attr"`
// Score is the value contributed to the analyser result when Pattern matches,
// read from the "score" attribute.
Score float32 `xml:"score,attr"`
}

// Token output to formatter.
Expand Down
6 changes: 4 additions & 2 deletions lexers/embedded/c++.xml
Expand Up @@ -19,8 +19,10 @@
<mime_type>text/x-c++hdr</mime_type>
<mime_type>text/x-c++src</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="#include &lt;[a-z_]+>" score="0.2" />
<analyse regex="using namespace " score="0.4" />
<analyse single="true">
<regex pattern="#include &lt;[a-z_]+>" score="0.2" />
<regex pattern="using namespace " score="0.4" />
</analyse>
</config>
<rules>
<state name="classname">
Expand Down
6 changes: 4 additions & 2 deletions lexers/embedded/c.xml
Expand Up @@ -11,8 +11,10 @@
<mime_type>image/x-xbitmap</mime_type>
<mime_type>image/x-xpixmap</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="(?m)^\s*#include &lt;" score="0.1"/>
<analyse regex="(?m)^\s*#ifn?def " score="0.1" />
<analyse single="true" >
<regex pattern="(?m)^\s*#include &lt;" score="0.1" />
<regex pattern="(?m)^\s*#ifn?def " score="0.1" />
</analyse>
</config>
<rules>
<state name="statement">
Expand Down
11 changes: 7 additions & 4 deletions regexp.go
Expand Up @@ -298,11 +298,12 @@ func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
return r
}

func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
func (r *RegexLexer) AnalyseText(text string) float32 {
if r.analyser != nil {
return r.analyser(text)
}
return 0.0
return 0
}

// SetConfig replaces the Config for this Lexer.
Expand All @@ -311,7 +312,8 @@ func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
return r
}

func (r *RegexLexer) Config() *Config { // nolint
// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
return r.config
}

Expand Down Expand Up @@ -406,7 +408,8 @@ func (r *RegexLexer) needRules() error {
return err
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
// Tokenise text using lexer, returning an iterator.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
err := r.needRules()
if err != nil {
return nil, err
Expand Down
55 changes: 39 additions & 16 deletions serialise.go
Expand Up @@ -131,35 +131,58 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
if err != nil {
return nil, err
}

for _, glob := range append(config.Filenames, config.AliasFilenames...) {
_, err := filepath.Match(glob, "")
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
type regexAnalyse struct {
re *regexp2.Regexp
score float32
}
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
for _, ra := range config.Analyse {
re, err := regexp2.Compile(ra.Regex, regexp2.None)
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err)

var analyserFn func(string) float32

if config.Analyse != nil {
type regexAnalyse struct {
re *regexp2.Regexp
score float32
}
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
}
return &RegexLexer{
config: config,
analyser: func(text string) float32 {

regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse.Regexes))

for _, ra := range config.Analyse.Regexes {
re, err := regexp2.Compile(ra.Pattern, regexp2.None)
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Pattern, err)
}

regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
}

analyserFn = func(text string) float32 {
var score float32

for _, ra := range regexAnalysers {
if ok, _ := ra.re.MatchString(text); ok {
ok, err := ra.re.MatchString(text)
if err != nil {
return 0
}

if ok && config.Analyse.Single {
return ra.score
}

if ok {
score += ra.score
}
}

return score
},
}
}

return &RegexLexer{
config: config,
analyser: analyserFn,
fetchRulesFunc: func() (Rules, error) {
var lexer struct {
Config
Expand Down

0 comments on commit 708662a

Please sign in to comment.