masses/lint-rules-from-freqs

#!/usr/bin/perl
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# any tests that get less than this % of matches on *both* spam or nonspam, are
# reported.
my $LOW_MATCHES_PERCENT = 0.03;
my $scoreset = 0;

sub usage {
  die "
lint-rules-from-freqs: perform 'lint' testing on SpamAssassin rules and scores

usage: ./lint-rules-from-freqs [-f falsefreqs] < freqs > badtests

This analyzes SpamAssassin tests, based on the hit frequencies and S/O ratios
from a mass-check logfile pair.

The 'freqs' argument is the frequency of hits in all messages ('hit-frequencies
-x -p' output).

The 'falsefreqs' argument is frequencies of hits in false-positives and
false-negatives only ('hit-frequencies -x -p -f' output).

";
}

my $opt_falsefreqs;
while ($#ARGV >= 0) {
  $_ = shift @ARGV;
  if (/^-f/) { $_ = shift @ARGV; $opt_falsefreqs = $_; }
  elsif (/^-s/) { $_ = shift @ARGV; $scoreset = $_; }
  else { usage(); }
}

print "BAD TESTS REPORT\n";
readrules();
print "\n" .((scalar keys %rulefile) + 1). " rules found.\n";
print "\nRule file syntax issues:\n\n";
lintrules();

if ($opt_falsefreqs) {
  open (FALSE, "<$opt_falsefreqs");
  while (<FALSE>) {
    if (!/^\s*([\d\.]+)/) {
      my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
      next unless ($name =~ /\S/);
      $falsefreqs_spam{$name} = $spam;
      $falsefreqs_nons{$name} = $nons;
      $falsefreqs_so{$name} = $so;
    }
  }
  close FALSE;
}

while (<>) {
  if (!/^\s*([\d\.]+)/) {
    $output{'a_header'} = $_; next;
  }

  my $badrule;
  my ($overall, $spam, $nons, $so, $score, $name) = split (' ');
  next unless ($name =~ /\S/);

  my $ffspam = $falsefreqs_spam{$name};
  my $ffnons = $falsefreqs_nons{$name};
  my $ffso = $falsefreqs_so{$name};

  my $tf = $tflags{$name};
  next if ($tf =~ /net/ && ($scoreset % 2) == 0);
  next if ($tf =~ /userconf/);

  if ($overall == 0.0 && $spam == 0.0 && $nons == 0.0) {        # sanity!
    $badrule = 'no matches';

  } else {
    if ($score < 0.0) {
      # negative score with more spams than nonspams? bad rule.
      if ($tf !~ /nice/ && $so > 0.5 && $score < 0.5) {
        $badrule = 'non-nice but -ve score';
      }

      if ($tf =~ /nice/ && $so > 0.5 && $score < 0.5) {
        if ($ffso < 0.5) {
          $badrule = 'fn';
        } else {
          # ignore, the FNs are overridden by other tests so it doesn't
          # affect the overall results.
        }
      }

      # low number of matches overall
      if ($nons < $LOW_MATCHES_PERCENT) 
                 { $badrule ||= ''; $badrule .= ', low matches'; }

    } elsif ($score > 0.0) {
      # positive score with more nonspams than spams? bad.
      if ($tf =~ /nice/ && $so < 0.5 && $score > 0.5) {
        $badrule = 'nice but +ve score';
      }

      if ($tf !~ /nice/ && $so < 0.5 && $score > 0.5) {
        if ($ffso > 0.5) {
          $badrule = 'fp';
        } else {
          # ignore, the FPs are overridden by other tests so it doesn't
          # affect the overall results.
        }
      }

      # low number of matches overall
      if ($spam < $LOW_MATCHES_PERCENT) 
                 { $badrule ||= ''; $badrule .= ', low matches'; }

    } elsif ($score == 0.0) {
      $badrule = 'score is 0';
    }
  }

  if (defined $badrule) {
    $badrule =~ s/^, //; chomp;
    $output{$badrule} .= $_ . " ($badrule)\n";
  }
}

# do all but 'no/low matches' first
print "\nHigh-priority issues:\n\n";
foreach my $badness (sort keys %output) {
  next if ($badness eq 'no matches');
  next if ($badness eq 'low matches');
  print $output{$badness};
  delete $output{$badness};
}

# now go back and do the other 2 (if they're there)
print "\nLow-priority issues:\n\n";
foreach my $badness (sort keys %output) {
  next unless defined ($output{$badness});
  print $output{$badness};
  delete $output{$badness};
}
exit;


sub concat_rule_lang {
  my $rule = shift;
  my $lang = shift;

  if (defined $lang && $lang ne '') {
    return "[$lang]_$rule";
  } else {
    return $rule;
  }
}

# note: do not use parse-rules-for-masses here, we need to do linting instead
# of your average parse
sub readrules {
  my @files = <../rules/[0-9]*.cf>;
  my $file;
  %rulesfound = ();
  %langs = ();
  foreach $file (@files) {
    open (IN, "<$file");
    while (<IN>) {
      s/#.*$//g; s/^\s+//; s/\s+$//; next if /^$/;

      # make all the foo-bar stuff foo_bar
      1 while s/^(\S+)-/\1_/g;
      1 while s/^(lang\s+\S+\s+\S+)-/\1_/g;

      my $lang = '';
      if (s/^lang\s+(\S+)\s+//) {
        $lang = $1; $langs{$1} = 1;
      }

      if (/^(header|rawbody|body|full|uri|meta|mimeheader)\s+(\S+)\s+/) {
        $rulesfound{$2} = 1;
        $rulefile{$2} ||= $file;
        $scorefile{$1} = $file;
        $score{$2} ||= 1.0;
        $tflags{$2} ||= '';
        $descfile{$2} ||= $file;       # a rule with no score or desc is OK
	$description{$2}->{$lang} = undef;

        if (/^body\s+\S+\s+eval:/) {
          # ignored
        } elsif (/^body\s+\S+\s+(.*)$/) {
          my $re = $1;

	  # If there's a ( in a rule where it should be (?:, flag it.
	  # but ignore [abc(] ...
          if ($re =~ /[^\\]\([^\?]/ && $re !~ /\[[^\]]*[^\\]\(/) { 
            print "warning: non-(?:...) capture in regexp in $file: $_\n";
          }
          if ($re =~ /\.[\*\+]/) { 
            print "warning: .* in regexp in $file: $_\n";
          }
          if ($re =~ /[^\\]\{(\d*),?(\d*?)\}/) {
            if ($1 > 120 || $2 > 120) {
              print "warning: long .{n} in regexp in $file: $_\n";
            }
          }
        }

      } elsif (/^describe\s+(\S+)\s+(.*?)\s*$/) {
        $rulesfound{$1} = 1;
        $descfile{concat_rule_lang ($1, $lang)} ||= $file;
        $descfile{$1} ||= $file;
	$description{$1}->{$lang} = $2;
      } elsif (/^tflags\s+(\S+)\s+(.+)$/) {
        $rulesfound{$1} = 1;
        $tflags{$1} = $2;
        $tflagsfile{concat_rule_lang ($1, $lang)} = $file;
        $tflagsfile{$1} = $file;
      } elsif (/^score\s+(\S+)\s+(.+)$/) {
        $rulesfound{$1} = 1;
        $scorefile{concat_rule_lang ($1, $lang)} = $file;
        $scorefile{$1} = $file;
        $score{$1} = $2;
      } elsif (/^(clear_report_template|clear_spamtrap_template|report|spamtrap|
                clear_terse_report_template|terse_report|
                required_score|ok_locales|ok_languages|test|lang|
                spamphrase|whitelist_from|require_version|
		clear_unsafe_report_template|unsafe_report|
		(?:bayes_)?auto_learn_threshold_nonspam|(?:bayes_)?auto_learn_threshold_spam|
		(?:bayes_)?auto_learn
                )/x) {
        next;
      } else {
        print "warning: unknown rule in $file: $_\n";
      }
    }
    close IN;
  }
  @langsfound = sort keys %langs;
  @rulesfound = sort keys %rulesfound;
}

sub lintrules {
  my %possible_renames = ();

  foreach my $rule (@rulesfound) {
    my $match = $rule;
    $match =~ s/_\d+[^_]+$//gs;    # trim e.g. "_20K"
    $match =~ s/[^A-Z]+//gs;    # trim numbers etc.

    if (defined ($rulefile{$rule}) && $possible_renames{$match} !~ / \Q$rule\E\b/) {
      $possible_renames{$match} .= " ".$rule;
    }
    $possible_rename_matches{$rule} = $match;
  }

  foreach my $lang ('', @langsfound) {
    foreach my $baserule (@rulesfound) {
      next if ( $baserule =~ /^__/ || $baserule =~ /^T_/ );

      my $rule = concat_rule_lang ($baserule, $lang);
      my $f = $descfile{$rule};
      my $warned = '';

      if (defined $f && !defined ($rulefile{$rule})
                && !defined ($rulefile{$baserule}))
      {
        print "warning: $baserule has description, but no rule: $f\n";
        $warned .= ' lamedesc';
      }

	# Check our convention for rule length
	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && length $baserule > 22 ) {
	  print "warning: $baserule has a name longer than 22 chars: $f\n";
	}
 	# Check our convention for rule length
	if ( (($lang ne '' && defined($rulefile{$rule})) || ($lang eq '' && defined ($rulefile{$baserule}))) && defined $description{$baserule}->{$lang} && length $description{$baserule}->{$lang} > 50 ) {
	  print "warning: $baserule has a description longer than 50 chars: $f\n";
	}

      # lang rule trumps normal rule
      $f = $rulefile{$rule} || $rulefile{$baserule};
      # if the rule exists, and the language/rule description doesn't exist ...
      if ( defined $f && !defined $description{$baserule}->{$lang} )
      {
        print "warning: $baserule exists, ",( $lang ne '' ? "lang $lang, " : "" ),"but has no description: $f\n";
        $warned .= ' lamedesc';
      }


      $f = $scorefile{$rule};
      if (defined $f && !defined ($rulefile{$rule})
                && !defined ($rulefile{$baserule}))
      {
        print "warning: $baserule has score, but no rule: $f\n";
        $warned .= ' lamescore';
      }

      my $r = $possible_rename_matches{$rule};
      if ($warned ne '' && defined $r) {
        my @matches = split (' ', $possible_renames{$r});
        if (scalar @matches != 0) {
          my $text = '';

          # now try and figure out "nearby" rules with no description/score
          foreach my $baser (@matches) {
            my $blang;
            if ($descfile{$rule} =~ /text_(\S\S)\./) {
              $blang = $1;
            }
            my $r = concat_rule_lang ($baser, $blang);
            #warn "$r $descfile{$r} $descfile{$baser}";
            next if ($warned =~ /lamedesc/ && (defined $descfile{$r}));
            next if ($warned =~ /lamescore/ && (defined $scorefile{$r}));
            $text .= " $baser";
          }

          if ($text ne '') {
            print "warning: (possible renamed rule? $text)\n";
          }
        }
      }
    }
  }
}