Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: improve regex analysers in XML #831

Merged
merged 1 commit into from Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 13 additions & 5 deletions lexer.go
Expand Up @@ -60,15 +60,23 @@ type Config struct {

// Analyse is a list of regexes to match against the input.
//
// The sum of all the score of matching patterns will be
// If the single attribute is set to true, the score of the first matching
// pattern is returned; otherwise the sum of the scores of all matching
// patterns is used as the final score.
Analyse []AnalyseConfig `xml:"analyse,omitempty"`
Analyse *AnalyseConfig `xml:"analyse,omitempty"`
}

// AnalyseConfig defines a single regex analyser pattern.
// AnalyseConfig defines the list of regex analysers.
type AnalyseConfig struct {
Regex string `xml:"regex,attr"`
Score float32 `xml:"score,attr"`
Regexes []RegexConfig `xml:"regex,omitempty"`
// If true, the score of the first matching regex is returned and any other matches are ignored.
Single bool `xml:"single,attr"`
}

// RegexConfig defines a single regex pattern and its score in case of match.
//
// Both fields are populated from XML attributes of the same names
// ("pattern" and "score").
type RegexConfig struct {
// Pattern is the source text of the regular expression.
Pattern string `xml:"pattern,attr"`
// Score is the value associated with a successful match of Pattern.
Score float32 `xml:"score,attr"`
}

// Token output to formatter.
Expand Down
6 changes: 4 additions & 2 deletions lexers/embedded/c++.xml
Expand Up @@ -19,8 +19,10 @@
<mime_type>text/x-c++hdr</mime_type>
<mime_type>text/x-c++src</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="#include &lt;[a-z_]+>" score="0.2" />
<analyse regex="using namespace " score="0.4" />
<analyse single="true">
<regex pattern="#include &lt;[a-z_]+>" score="0.2" />
<regex pattern="using namespace " score="0.4" />
</analyse>
</config>
<rules>
<state name="classname">
Expand Down
6 changes: 4 additions & 2 deletions lexers/embedded/c.xml
Expand Up @@ -11,8 +11,10 @@
<mime_type>image/x-xbitmap</mime_type>
<mime_type>image/x-xpixmap</mime_type>
<ensure_nl>true</ensure_nl>
<analyse regex="(?m)^\s*#include &lt;" score="0.1"/>
<analyse regex="(?m)^\s*#ifn?def " score="0.1" />
<analyse single="true" >
<regex pattern="(?m)^\s*#include &lt;" score="0.1" />
<regex pattern="(?m)^\s*#ifn?def " score="0.1" />
</analyse>
</config>
<rules>
<state name="statement">
Expand Down
11 changes: 7 additions & 4 deletions regexp.go
Expand Up @@ -298,11 +298,12 @@ func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
return r
}

func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
func (r *RegexLexer) AnalyseText(text string) float32 {
if r.analyser != nil {
return r.analyser(text)
}
return 0.0
return 0
}

// SetConfig replaces the Config for this Lexer.
Expand All @@ -311,7 +312,8 @@ func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
return r
}

func (r *RegexLexer) Config() *Config { // nolint
// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
return r.config
}

Expand Down Expand Up @@ -406,7 +408,8 @@ func (r *RegexLexer) needRules() error {
return err
}

func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
// Tokenise text using lexer, returning an iterator.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
err := r.needRules()
if err != nil {
return nil, err
Expand Down
55 changes: 39 additions & 16 deletions serialise.go
Expand Up @@ -131,35 +131,58 @@ func NewXMLLexer(from fs.FS, path string) (*RegexLexer, error) {
if err != nil {
return nil, err
}

for _, glob := range append(config.Filenames, config.AliasFilenames...) {
_, err := filepath.Match(glob, "")
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
}
}
type regexAnalyse struct {
re *regexp2.Regexp
score float32
}
regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse))
for _, ra := range config.Analyse {
re, err := regexp2.Compile(ra.Regex, regexp2.None)
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Regex, err)

var analyserFn func(string) float32

if config.Analyse != nil {
type regexAnalyse struct {
re *regexp2.Regexp
score float32
}
regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
}
return &RegexLexer{
config: config,
analyser: func(text string) float32 {

regexAnalysers := make([]regexAnalyse, 0, len(config.Analyse.Regexes))

for _, ra := range config.Analyse.Regexes {
re, err := regexp2.Compile(ra.Pattern, regexp2.None)
if err != nil {
return nil, fmt.Errorf("%s: %q is not a valid analyser regex: %w", config.Name, ra.Pattern, err)
}

regexAnalysers = append(regexAnalysers, regexAnalyse{re, ra.Score})
}

analyserFn = func(text string) float32 {
var score float32

for _, ra := range regexAnalysers {
if ok, _ := ra.re.MatchString(text); ok {
ok, err := ra.re.MatchString(text)
if err != nil {
return 0
}

if ok && config.Analyse.Single {
return ra.score
}

if ok {
score += ra.score
}
}

return score
},
}
}

return &RegexLexer{
config: config,
analyser: analyserFn,
fetchRulesFunc: func() (Rules, error) {
var lexer struct {
Config
Expand Down