masses/logs-to-c

#!/usr/bin/perl -w
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

logs-to-c - Convert a mass-check log into perceptron format

=head1 SYNOPSIS

logs-to-c [options]

 Options:
    -c,--cffile=path	  Use path as the rules directory
    -s,--scoreset=n	  Use scoreset n
    --spam=file           Location of spam mass-check log
    --ham=file            Location of ham mass-check log

=head1 DESCRIPTION

B<logs-to-c> will read the mass-check logs F<spam.log> and F<ham.log>,
or the files specified by the B<--spam> and B<--ham> options, and convert
them into the format needed by the perceptron. This is a format that is
simple for the perceptron to parse, but is not very readable to
humans.

=head1 BUGS

Please report bugs to http://bugzilla.spamassassin.org/

=head1 SEE ALSO

L<mass-check(1)>, L<perceptron(1)>

=cut

use Getopt::Long qw(:config auto_help bundling);
use strict;

# Option defaults.  These must stay package ("our") variables: GetOptions()
# is called without explicit destinations, so Getopt::Long stores each value
# in the package variable named $opt_<primary option name>.
our $opt_cffile = "../rules";
our $opt_spam = 'spam.log';
our $opt_ham = 'ham.log';
our $opt_scoreset = 0;

# The POD documents -c and -s.  With "bundling" enabled a single dash only
# matches one-character option names, so those aliases have to be declared
# explicitly; the primary names are unchanged, so the values still land in
# $opt_cffile and $opt_scoreset.  A command line that cannot be parsed is
# fatal, so that a mistyped option cannot make the run silently fall back
# to the defaults above.
GetOptions("cffile|c=s", "spam=s", "ham=s", "scoreset|s=i")
  or die "logs-to-c: invalid command line, try --help\n";

my $is_spam = '';		# vec aligned with @tests_hit
my @tests_hit = ();		# one packed string of rule hits per message
my %mutable_tests = ();		# rule name -> 1 if its score may be tuned

# %rules is expected to be filled in by the file that readscores() loads
# with require; readscores() then copies it into %allrules.
# NOTE(review): %scores is presumably set by that same generated file.
our (%rules, %allrules, %scores);

my (%ignored_rule, %range_lo, %range_hi);
my %rule_to_index;		# filled in by writescores_c()

readscores();

print "Reading per-message hit stat logs and scores...\n";
my ($num_tests, $num_spam, $num_ham);

# read_ranges() consults %allrules and %scores, so it has to run after
# readscores(); readlogs() filters on %scores and %allrules as well.
read_ranges();
readlogs();

print "Writing logs and current scores as C code...\n";
writescores_c();

# show memory usage before we exit
# print "Running \"ps aux\"...\n";
# open(PS, "ps aux|");
# while(<PS>) {
# print if $. == 1 || /\b$$\b/;
# }
# close(PS);

exit 0;

# code to freeze/thaw test lines in as little space as possible
# this could be faster, but improves memory usage by a phenomenal
# amount over arrayrefs or strings of comma-separated-values
#
# These declarations sit below the top-level "exit 0", so a run-time
# initializer written here is never executed.  The counter is therefore
# seeded at compile time in a BEGIN block.  It starts at 0 and is
# incremented before use, so the first index handed out is 1.  Index 0 must
# never be issued: callers look names up with
# "$long_to_short{$name} || new_short($name)", which treats 0 as "unknown".
my $short_index;
BEGIN { $short_index = 0; }
my %long_to_short;		# rule name   -> short index
my @short_to_long;		# short index -> rule name

# Allocate the next short index for the rule name in $_[0], record the
# mapping in both directions, and return the new index.
sub new_short {
  my ($name) = @_;
  $short_index++;
  $long_to_short{$name} = $short_index;
  $short_to_long[$short_index] = $name;
  return $short_index;
}

# Pack a list of rule names into one compact string.
#
# $_[0] is a reference to an array of rule names.  Each name is replaced by
# its short index (allocated on first sight through new_short()) and the
# indexes are packed with the "w" template (BER compressed integers).
#
# uses less than half the memory of join on ',' and even better
# compared to Storable::freeze
sub freeze_tests {
  my @names = @{$_[0]};
  my @indexes;
  foreach my $name (@names) {
    my $idx = $long_to_short{$name};
    $idx = new_short($name) unless $idx;
    push @indexes, $idx;
  }
  return pack("w*", @indexes);
}

# Expand a string packed with the "w*" template (as produced by
# freeze_tests()) back into the list of rule names, by looking each
# unpacked short index up in @short_to_long.
sub thaw_tests {
  my @names;
  foreach my $idx (unpack("w*", $_[0])) {
    push @names, $short_to_long[$idx];
  }
  return @names;
}

# Read the spam and ham mass-check logs ($opt_spam, then $opt_ham).
#
# For every accepted log line this records, at the same message index:
#   - one bit in the $is_spam vec (1 for the spam log, 0 for the ham log);
#   - a packed list of the rules that hit, in @tests_hit.
# Rules that have no entry in %scores, and rules flagged issubrule in
# %allrules, are dropped.  On return $num_spam, $num_ham and $num_tests
# hold the message counts.  Dies if either log cannot be opened.
sub readlogs {
  my $count = 0;		# index of the next message
  $num_spam = $num_ham = 0;

  # Pair each log with its class explicitly instead of comparing file
  # names, so the ham pass is still ham when both options name one file.
  foreach my $source ([$opt_spam, 1], [$opt_ham, 0]) {
    my ($file, $isspam) = @{$source};

    # three-argument open: the file name comes from the command line and
    # must not be able to smuggle in an open mode
    open (my $in, '<', $file)
      or die "Could not open file '$file': $!";

    my $caught;			# 1st parameter of log line
    my $rules;			# 4th parameter of log line
    my $restofline;             # intermediate parse buffer

    while (defined(my $msgline = <$in>)) {
      # without this, the last rule name keeps the newline whenever the
      # rule list is the final field, and is then discarded as unknown
      chomp $msgline;

      # faster log-reading code from hit-frequencies.
      # the additional split() is for this case:
      # ".  -20 /path  time=1112116980,scantime=0,format=f,reuse=no"
      # in other words, no hits.  split(' ') cannot deal with this
      # correctly, seeing (".", "-20", "/path", "time=...etc").  Work
      # around this by using a literal / / regexp split to discard
      # the csv stuff we don't want out of the rest of the line.

      ($caught, undef, $restofline) = split(' ', $msgline, 3);
      next unless (defined $caught && $caught =~ /^[Y\.]$/
                   && defined $restofline && length $restofline);
      (undef, $rules) = split(/ /, $restofline, 3);
      $rules = '' unless defined $rules;   # line ends after the path field

      # get tests, but ignore unknown tests and subrules
      my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} }
                  split(/,/, $rules);

      if ($isspam) {
        $num_spam++;
        vec($is_spam, $count, 1) = 1;
      }
      else {
        $num_ham++;
        vec($is_spam, $count, 1) = 0;
      }

      # inlined for speed.
      # ORIGINAL: $tests_hit[$count] = freeze_tests(\@tests);
      $tests_hit[$count] = pack("w*", map
                  {
                    $long_to_short{$_} || new_short($_);
                  } @tests);

      # TODO: benchmark using foreach(), map() is often slower

      $count++;                  # increment line
    }
    close $in;
  }
  $num_tests = $count;
}

# Generate and load the rule and score tables.
#
# Runs ../build/parse-rules-for-masses on the rules directory $opt_cffile
# for scoreset $opt_scoreset, loads the Perl file it writes with require,
# removes that temporary file, and copies %rules into %allrules.
# Dies if the helper fails or the generated file cannot be loaded.
sub readscores {
  print "Reading scores from \"$opt_cffile\"...\n";
  my $tmpf = "tmp/rules$$.pl";

  # List form: no shell is involved, so a rules path containing shell
  # metacharacters is passed through untouched.
  my @cmd = ("../build/parse-rules-for-masses",
             "-d", $opt_cffile, "-s", $opt_scoreset, "-o", $tmpf);
  system(@cmd) == 0
    or die "parse-rules-for-masses failed (wait status $?)\n";

  # A path that does not start with "/", "./" or "../" is searched for in
  # @INC, and "." is no longer in @INC since Perl 5.26, so anchor the path
  # to the current directory explicitly.
  my $loaded = eval { require "./$tmpf"; 1 };
  my $err = $@;
  unlink $tmpf;			# clean up even when the load failed
  die $err unless $loaded;

  %allrules = %rules;           # ensure it stays global
}

# Write the per-rule score table and the matching C declarations.
#
# Assigns every rule in %scores an index (stored in %rule_to_index), then
# writes:
#   tmp/scores.data - for each non-ignored rule, the records
#                     .<index> n<name> b<score> m<mutable> l<lo> h<hi>
#   tmp/scores.h    - C array declarations sized for that table
# and finally calls writetests_c() with the largest number of mutable,
# non-ignored hits found on any one message, plus one.
# Dies if an output file cannot be written.
sub writescores_c {
  my $output = '';		# records destined for tmp/scores.data
  my $size = 0;			# number of rules written
  my $mutable = 0;		# number of written rules that are mutable

    # jm: now, score-ranges-from-freqs has tflags to work from, so
    # it will always list all mutable tests.

  # Sort order: non-ignored rules first, then mutable before immutable,
  # then by name.  Mutable rules therefore get the lowest indexes.
  my @index_to_rule = sort {($ignored_rule{$a} <=> $ignored_rule{$b}) ||
                            ($mutable_tests{$b} <=> $mutable_tests{$a}) ||
                            ($a cmp $b)} (keys %scores);

  my $max_hits_per_msg = 0;
  foreach my $msg (0 .. $num_tests - 1) {
    my @hits = grep {(! $ignored_rule{$_}) && $mutable_tests{$_}}
                    (thaw_tests($tests_hit[$msg]));
    if ((scalar(@hits)+1) > $max_hits_per_msg) {
      $max_hits_per_msg = scalar(@hits)+1;
    }
  }

  foreach my $i (0 .. $#index_to_rule) {
    my $name = $index_to_rule[$i];
    $rule_to_index{$name} = $i;	# ignored rules get an index too

    if ($ignored_rule{$name}) { next; }

    if ($mutable_tests{$name} == 0) {
      # immutable: pin the range to the current score
      $range_lo{$name} = $range_hi{$name} = $scores{$name};
    } else {
      $mutable++;
      if ($range_lo{$name} > $range_hi{$name}) {
        ($range_lo{$name},$range_hi{$name}) =
         ($range_hi{$name},$range_lo{$name});
      }
      #$range_lo{$name} ||= 0.1;
      #$range_hi{$name} ||= 1.5;

      # no default score found? start from the midpoint of the range and
      # let the optimizer adjust from there.  this seems to help avoid a
      # load of really good rules getting 1.0 scores
      if ($allrules{$name}->{no_score_found}) {
        $scores{$name} = ($range_hi{$name} + $range_lo{$name}) / 2.0;
      }
    }

    $output .= ".".$i."\n".
                "n".$name."\n".
                "b".$scores{$name}."\n".
                "m".$mutable_tests{$name}."\n".
                "l".$range_lo{$name}."\n".
                "h".$range_hi{$name}."\n";
    $size++;
  }

  # A failed open or a failed close (buffered write error) must not go
  # unnoticed: the perceptron would be built from stale or truncated files.
  open (my $dat, '>', "tmp/scores.data")
    or die "Could not write 'tmp/scores.data': $!";
  print {$dat} "N$size\n", "M$mutable\n", # informational only
   $output;
  close $dat
    or die "Could not finish writing 'tmp/scores.data': $!";

  open (my $hdr, '>', "tmp/scores.h")
    or die "Could not write 'tmp/scores.h': $!";
  print {$hdr} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_scores = $size;
int num_mutable = $mutable;
unsigned char is_mutable[$size];
double range_lo[$size];
double range_hi[$size];
double bestscores[$size];
char *score_names[$size];
double tmp_scores[$size][2];
unsigned char ny_hit[$mutable];
unsigned char yn_hit[$mutable];

double lookup[$mutable];

/* readscores() is defined in tests.h */

";
  close $hdr
    or die "Could not finish writing 'tmp/scores.h': $!";

  writetests_c($max_hits_per_msg); # make sure $rule_to_index is around
}

# Write the de-duplicated per-message hit data and its C loader.
#
# $_[0] is the value emitted as max_hits_per_msg (second dimension of the
# tests_hit array in the generated header).
#
# Messages with the same spam/ham bit and the same set of non-ignored,
# indexed rules are collapsed into one record carrying a count.  Output:
#   tmp/tests.h    - C declarations followed by the C code stored after
#                    __DATA__ in this file
#   tmp/tests.data - for each distinct pattern, the records
#                    .<index> n<mutable hits> s<is spam> t<rule index>...
#                    b<sum of immutable scores> c<message count>
# Dies if an output file cannot be written.
sub writetests_c {
  my $max_hits_per_msg = $_[0];

  my %uniq_files = ();	# first message of a pattern -> output index
  my %count_keys = ();	# pattern key -> number of messages with it
  my %file_key = ();	# first message of a pattern -> its pattern key

  foreach my $msg (0 .. $num_tests - 1) {
    my $uniq_key = vec($is_spam, $msg, 1) . " ";

    my @good_tests =
     grep {length($_) && (! $ignored_rule{$_}) &&
           (defined($rule_to_index{$_}))} (thaw_tests($tests_hit[$msg]));

    # sorted rule indexes, so hit order in the log does not matter
    @good_tests = sort {$a <=> $b} (map {$rule_to_index{$_}} (@good_tests));

    $uniq_key .= join(" ",@good_tests);

    if (exists($count_keys{$uniq_key})) {
      $count_keys{$uniq_key}++;
    } else {
      $count_keys{$uniq_key} = 1;
      $file_key{$msg} = $uniq_key;
      $uniq_files{$msg} = scalar(keys(%count_keys)) - 1;
    }
  }

  my $num_nondup = scalar(keys(%uniq_files));

  # A failed open or a failed close (buffered write error) must not go
  # unnoticed: the perceptron would be built from stale or truncated files.
  open (my $top, '>', "tmp/tests.h")
    or die "Could not write 'tmp/tests.h': $!";
  print {$top} "
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

int num_tests = $num_tests;
int num_nondup = $num_nondup;
int num_spam = $num_spam;
int num_ham = $num_ham;
int max_hits_per_msg = $max_hits_per_msg;
unsigned char num_tests_hit[$num_nondup];
unsigned char is_spam[$num_nondup];
unsigned short tests_hit[$num_nondup][$max_hits_per_msg];
double scores[$num_nondup];
double tmp_total[$num_nondup];
int tests_count[$num_nondup];

";
  # append the C loader functions kept after __DATA__, without touching
  # the global $_
  my $c_loader = join ('', <DATA>);
  print {$top} $c_loader;
  close $top
    or die "Could not finish writing 'tmp/tests.h': $!";

  open (my $dat, '>', "tmp/tests.data")
    or die "Could not write 'tmp/tests.data': $!";

  foreach my $msg (sort {$a <=> $b} (keys %uniq_files)) {
    print {$dat} ".".$uniq_files{$msg}."\n";

    my $out = '';
    $out .= "s".vec($is_spam, $msg, 1)."\n";

    my $base_score = 0;
    my $num_tests_hit = 0;
    foreach my $test (thaw_tests($tests_hit[$msg])) {
      if ($test eq '') { next; }

      if ($ignored_rule{$test}) {
        # this is not a log-worthy event anymore, since we have a lot
        # of T_ test rules that are ignored during perceptron runs
        # warn "ignored rule $test got a hit in $file!\n";
        next;
      }

      if (!defined $rule_to_index{$test}) {
        warn "test with no C index: $test\n";
        next;
      }

      if ($mutable_tests{$test}) {
        $num_tests_hit++;
        $out .= "t".$rule_to_index{$test}."\n";

        if ($num_tests_hit >= $max_hits_per_msg) {
          die "Need to increase \$max_hits_per_msg";
        }
      } else {
        $base_score += $scores{$test};
      }
    }

    $out .= "b" . $base_score . "\n"; # score to add in for non-mutable tests
    $out .= "c" . $count_keys{$file_key{$msg}} . "\n";

    print {$dat} "n".$num_tests_hit."\n".$out;
  }
  close $dat
    or die "Could not finish writing 'tmp/tests.data': $!";
}

# Load score ranges and mutability from tmp/ranges.data, then decide for
# every known rule whether it is ignored, whether it is mutable, and what
# its starting score is.
#
# Fills %range_lo, %range_hi, %ignored_rule and %mutable_tests, and adjusts
# %scores.  Reads %allrules and %scores, so readscores() must have run
# first.  Dies if tmp/ranges.data is missing or cannot be opened.
#
# Three passes:
#   1. every line of ranges.data;
#   2. every rule in %allrules that pass 1 did not give a range;
#   3. every rule that has a %range_lo entry: fix up "nice" scores and
#      clamp the score to its range.
sub read_ranges {
  if (!-f 'tmp/ranges.data') {
    die "need to make 'tmp/ranges.data' first";
  }

  # read ranges, and mutableness, from ranges.data.
  open (IN, "<tmp/ranges.data")
  	or die "need to run score-ranges-from-freqs first!";

  my $count = 0;		# not used anywhere in this sub
  while (<IN>) {
    # line format: "<lo> <hi> <mutable flag> <rule name>"; lines of any
    # other shape are skipped
    /^(\S+) (\S+) (\d+) (\S+)$/ or next;
    my $t = $4;
    $range_lo{$t} = $1+0;
    $range_hi{$t} = $2+0;
    my $mut = $3+0;

    if ($allrules{$t}->{issubrule}) {
      # warn "$t: ignoring, is sub-rule\n";    # no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }
    if ($t =~ /^T_/) {
      # warn "$t: ignoring, is T_ test rule\n";    # no need to warn
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      $range_lo{$t} = 0.01;    # clamp to insignificant range
      $range_hi{$t} = 0.01;
      next;
    }
    # lo == hi == 0: the rule cannot contribute anything
    if (($range_lo{$t} == $range_hi{$t}) && (! $range_lo{$t})) {
      warn "$t: ignoring, score and range == 0\n";
      $ignored_rule{$t} = 1;
      $mutable_tests{$t} = 0;
      next;
    }

    $ignored_rule{$t} = 0;

    # mutable only if ranges.data says so, the range is not a single
    # point, and the rule's tflags do not contain "userconf"
    if (!$mut) {
      $mutable_tests{$t} = 0;
    } elsif ($range_lo{$t} == $range_hi{$t}) {
      $mutable_tests{$t} = 0;
    } elsif ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
      $mutable_tests{$t} = 0;
    } else {
      $mutable_tests{$t} = 1;
    }
    # an immutable rule whose score is 0 (or missing) is ignored as well
    unless ($mutable_tests{$t} || $scores{$t}) {
      warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
      $ignored_rule{$t} = 1;
    }
  }
  close IN;

  # catch up on the ones missed; seems to be userconf or 0-hitters mostly.
  foreach my $t (sort keys %allrules) {
    next if ($t eq '_scoreset');
    # already handled by the ranges.data pass above
    next if (exists($range_lo{$t}));

    if ($allrules{$t}->{issubrule}) {
      if (!$ignored_rule{$t}) {
        # warn "$t: ignoring, is sub-rule\n";  # no need to warn here
        $ignored_rule{$t} = 1;
      }
      $mutable_tests{$t} = 0;
      next;
    }
    if ($t =~ /^T_/) {
      if (!$ignored_rule{$t}) {
        # warn "$t: ignoring, is T_ test rule\n";  # no need to warn here
        $ignored_rule{$t} = 1;
	$range_lo{$t} = 0.01;    # clamp to insignificant range
	$range_hi{$t} = 0.01;
      }
      $mutable_tests{$t} = 0;
      next;
    }
    $ignored_rule{$t} = 0;
    # force immutable unless %mutable_tests already has an entry for this
    # rule and its tflags do not contain "userconf"
    unless (exists($mutable_tests{$t}) &&
	    ($allrules{$t}->{tflags} !~ m/\buserconf\b/i)) {
      $mutable_tests{$t} = 0;
    }
    unless ($mutable_tests{$t} || $scores{$t}) {
      if (!$ignored_rule{$t}) {
        warn "$t: ignoring, immutable and score == 0 in this scoreset\n";
        $ignored_rule{$t} = 1;
      }
    }
  }
  # final pass: only rules that ended up with a %range_lo entry
  foreach my $t (keys %range_lo) {
    next if ($ignored_rule{$t});
    if ($mutable_tests{$t}) {
      # a rule tagged "nice" whose score is exactly 1 (or 0.01 for a T_
      # rule) has its sign flipped
      # NOTE(review): presumably 1 / 0.01 are the default scores assigned
      # upstream -- confirm against parse-rules-for-masses
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -1;
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -0.01;
      }
      # mutable scores are kept strictly inside the range, 0.001 away
      # from either bound
      if ($scores{$t} >= $range_hi{$t}) {
	$scores{$t} = $range_hi{$t} - 0.001;
      } elsif ($scores{$t} <= $range_lo{$t}) {
	$scores{$t} = $range_lo{$t} + 0.001;
      }
    } else {
      if ($allrules{$t}->{tflags} =~ m/\buserconf\b/i) {
	# userconf rules keep their score untouched
	next;
      } elsif ($range_lo{$t} == $range_hi{$t}) {
	# single-point range: the score is that point
	$scores{$t} = $range_lo{$t};
	next;
      }
      # same "nice" sign flip as in the mutable branch
      if (($scores{$t} == 1) && ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -1;
      } elsif (($scores{$t} == 0.01) && ($t =~ m/^T_/) &&
	       ($allrules{$t}->{tflags} =~ m/\bnice\b/i)) {
	$scores{$t} = -0.01;
      }
      # immutable scores are clamped onto the range, bounds included
      if ($scores{$t} > $range_hi{$t}) {
	$scores{$t} = $range_hi{$t};
      } elsif ($scores{$t} < $range_lo{$t}) {
	$scores{$t} = $range_lo{$t};
      }
    }
  }
}


__DATA__

/*
 * Load the de-duplicated per-message records from tmp/tests.data into the
 * global arrays declared at the top of this header.
 *
 * Each line is one record: a command character followed by a number.
 *   .  index of the message the following records belong to
 *   n  number of hits for that message (also restarts the hit slot counter)
 *   s  is_spam flag
 *   b  base score
 *   t  index of one rule that hit; stored in the next free hit slot
 *   c  number of log messages this record stands for
 * Lines starting with any other character are ignored.
 *
 * Exits with status 1 if the data file cannot be opened.
 */
void loadtests (void) {
  FILE *fin = fopen ("tmp/tests.data", "r");
  char buf[256];
  int file = 0;
  int tnum = 0;

  /* without this check a missing file crashes inside fgets() */
  if (fin == NULL) {
    perror ("tmp/tests.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;

    /* parse the payload both ways; each command uses the one it needs */
    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);

    if (cmd == '.') {
      file = arg;

    } else if (cmd == 'n') {
      tnum = 0;
      num_tests_hit[file] = arg;

    } else if (cmd == 's') {
      is_spam[file] = arg;

    } else if (cmd == 'b') {
      scores[file] = argd;

    } else if (cmd == 't') {
      tests_hit[file][tnum] = arg; tnum++;

    } else if (cmd == 'c') {
      tests_count[file] = arg;

    }
  }
  fclose(fin);

  printf ("Read test results for %d messages (%d total).\n", file+1,
	  num_tests);
}

/*
 * Load the per-rule score table from tmp/scores.data into the global
 * arrays bestscores, range_lo, range_hi, score_names and is_mutable.
 *
 * Each line is one record: a command character followed by its payload.
 *   .  index of the rule the following records belong to
 *   n  rule name (copied with strdup and never freed)
 *   b  current score
 *   m  mutable flag
 *   l  lower bound of the score range
 *   h  upper bound of the score range
 * Lines starting with any other character are ignored.
 *
 * Exits with status 1 if the data file cannot be opened.
 */
void loadscores (void) {
  FILE *fin = fopen ("tmp/scores.data", "r");
  char buf[256];
  int snum = 0;

  /* without this check a missing file crashes inside fgets() */
  if (fin == NULL) {
    perror ("tmp/scores.data");
    exit (1);
  }

  while (fgets (buf, 255, fin) != NULL) {
    char cmd;
    long arg;
    float argd;
    char *str, *white;

    /* parse the payload every way; each command uses the one it needs */
    cmd = (char) *buf;
    arg = strtol (buf+1, NULL, 10);
    argd = (float)strtod (buf+1, NULL);
    str = buf+1;

    /* cut the payload at the newline so names are stored without it */
    while ((white = strchr (str, '\n')) != NULL) {
      *white = '\0';
    }

    if (cmd == '.') {
      snum = arg;

    } else if (cmd == 'b') {
      bestscores[snum] = argd;

    } else if (cmd == 'l') {
      range_lo[snum] = argd;

    } else if (cmd == 'h') {
      range_hi[snum] = argd;

    } else if (cmd == 'n') {
      score_names[snum] = strdup (str);	/* leaky leak ;) */

    } else if (cmd == 'm') {
      is_mutable[snum] = arg;
    }
  }
  fclose(fin);

  printf ("Read scores for %d tests.\n", num_scores);
}