/* This program uses stochastic gradient descent to learn a scoreset for
* SpamAssassin. You'll need to run logs-to-c from spamassassin/masses to
* generate the stuff in tmp.
*
* <@LICENSE>
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* </@LICENSE>
*/
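/* Example invocation (illustrative values; the flags are documented in
 * usage() below):
 *
 *   ./perceptron -e 30 -l 1.0 -t 5.0 -w 0.999 -p 2.0
 *
 * trains for 30 epochs at learning rate 1.0 with a slight per-epoch
 * weight decay and writes the learned scores to perceptron.scores. */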
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/time.h>
#include <math.h>
#include <unistd.h>
#include "tmp/scores.h"
#include "tmp/tests.h"
/* Ensure that multiple error functions have not been chosen. */
#ifdef ENTROPIC_ERROR
#ifdef LEAST_SQUARES_ERROR
#warning Both entropic error and least squares error chosen. Using least squares.
#endif
#endif
/* Choose the least squares error function by default. */
#ifndef ENTROPIC_ERROR
#ifndef LEAST_SQUARES_ERROR
#define LEAST_SQUARES_ERROR
#endif
#endif
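/* Note: the error function is chosen at compile time; building with
 * -DENTROPIC_ERROR selects the tanh/entropic variant (see evaluate_test()
 * and train()), otherwise the logistic/least-squares variant is used. */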
#define OUTPUT_FILE "perceptron.scores"
/* #define IGNORE_SCORE_RANGES 1 */
void init_wheel ();
void destroy_wheel ();
int get_random_test ();
void init_weights();
void destroy_weights ();
void write_weights (FILE * fp);
void scale_scores (double old_threshold, double new_threshold);
double evaluate_test (int test);
double evaluate_test_nogain (int test);
void train (int num_epochs, double learning_rate);
void usage ();
/* Convert between perceptron weight space and SpamAssassin score space. */
#define weight_to_score(x) (-threshold*(x)/bias)
#define score_to_weight(x) (-(x)*bias/threshold)
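/* For intuition (hypothetical numbers): with bias = -2.5 and
 * threshold = 5.0, weight_to_score(1.0) = -5.0 * 1.0 / -2.5 = 2.0, and
 * score_to_weight() is the exact inverse.  A net input of 0 therefore
 * corresponds to a total SpamAssassin score equal to the threshold
 * (see write_weights()). */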
int wheel_size;       /* The number of entries in the roulette wheel (NOT THE
                         SIZE OF THE ROULETTE_WHEEL ARRAY!). */
int * roulette_wheel; /* Used for roulette wheel selection. */
double ham_preference = 2.0;
#define DEFAULT_THRESHOLD 5.0
double threshold = DEFAULT_THRESHOLD;
double * weights; /* The weights of the single-layer perceptron. */
double bias; /* The network bias for the single-layer perceptron. */
int num_epochs = 15;
double learning_rate = 2.0;
double weight_decay = 1.0;
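/* Model summary (explanatory note): the classifier is a single-layer
 * perceptron.  For a message, the net input is the bias plus the weight of
 * every mutable rule the message hits, plus its immutable base score
 * translated into weight space; evaluate_test() then squashes the net
 * input through the chosen activation. */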
/* Initialize the roulette wheel and populate it, replicating harder-to-classify hams. */
void init_wheel () {
  int i;
  int spam = 0, ham = 0;

  /* cache locations on the roulette wheel for fast lookups; the extra
   * entry keeps roulette_wheel[num_nondup] valid as an end marker, so the
   * last instance is included in the wheel */
  roulette_wheel = (int*)calloc(num_nondup + 1, sizeof(int));
  wheel_size = 0;
  roulette_wheel[0] = 0;
  for (i = 0; i < num_nondup; i++) {
    int slot_size = 1;

    /* Hams with more tests are rare and harder to classify but are the
     * most important to classify correctly.  They are thus replicated in
     * the training set proportionally to their difficulty. */
    if ( ! is_spam[i] ) {
      slot_size += (int)(num_tests_hit[i] * ham_preference * tests_count[i]);
    } else {
      /* Duplicate instances are collapsed in the database; tests_count[i]
       * is the multiplicity of instance i, so each spam gets one slot per
       * duplicate. */
      slot_size = tests_count[i];
    }
    wheel_size += slot_size;
    if ( ! is_spam[i] ) {
      ham += slot_size;
    } else {
      spam++;
    }
    roulette_wheel[i+1] = roulette_wheel[i] + slot_size;
  }
  printf ("Modified training set statistics: %d spam, %d ham.\n", spam, ham);
}
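/* For intuition (hypothetical numbers): with three instances whose slot
 * sizes are 3, 1 and 2, the cumulative wheel is [0, 3, 4, 6] and
 * wheel_size == 6; a draw r in [0, wheel_size) selects the instance i
 * with roulette_wheel[i] <= r < roulette_wheel[i+1]. */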
/* Free the resources for the roulette wheel selector. */
void destroy_wheel () {
  if ( roulette_wheel ) {
    free (roulette_wheel);
    roulette_wheel = 0;
    wheel_size = 0;
  }
}
/* Get a random test using roulette wheel selection.  This is not used
 * anymore. */
int get_random_test () {
  int r;
  int bottom, middle, top;

  r = lrand48() % wheel_size;
  bottom = 0;
  top = num_nondup - 1;
  middle = top / 2;
  while (1) {
    if ( r < roulette_wheel[bottom+1] ) {
      return bottom;
    } else if ( r >= roulette_wheel[top] ) {
      return top;
    } else if ( r >= roulette_wheel[middle] && r < roulette_wheel[middle+1] ) {
      return middle;
    } else if ( r < roulette_wheel[middle] ) {
      top = middle - 1;
      bottom++;
      middle = bottom + (top-bottom)/2;
    } else {
      bottom = middle + 1;
      top--;
      middle = bottom + (top-bottom)/2;
    }
  }
}
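/* (train() now expands the wheel into an array that is reshuffled each
 * epoch instead of sampling from it; see the comment there.) */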
/* Allocate the weights.  The bias is drawn from [-0.5, 0.5); each weight
 * is drawn uniformly from its score range. */
void init_weights () {
  int i;
  weights = (double*)calloc(num_scores, sizeof(double));
  bias = drand48() - 0.5;
  for (i = 0; i < num_scores; i++) {
    weights[i] = range_lo[i] + drand48() * (range_hi[i] - range_lo[i]);
  }
}
/* Free the resources allocated for the weights. */
void destroy_weights () {
  if ( weights ) {
    free(weights);
    weights = 0;
  }
}
/* Writes out the weights in SpamAssassin score space. */
void write_weights (FILE * fp) {
  int i;
  int ga_nn, ga_yy, ga_ny, ga_yn;
  double nnscore, yyscore, nyscore, ynscore;

  ga_nn = ga_yy = ga_ny = ga_yn = 0;
  nnscore = yyscore = nyscore = ynscore = 0;
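  /* Tally naming convention (explanatory note): the first letter is the
   * ground truth and the second is the classifier's verdict, so ga_yy =
   * correctly-flagged spam, ga_nn = correctly-passed ham, ga_ny = false
   * positives (ham flagged as spam) and ga_yn = false negatives. */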
  /* Run through all of the instances in the training set and tally up
   * the scores. */
  for (i = 0; i < num_nondup; i++) {
    double score = weight_to_score(evaluate_test_nogain(i)) + threshold;
    if ( score >= threshold && is_spam[i] ) {
      ga_yy += tests_count[i];
      yyscore += tests_count[i] * score;
    } else if ( score < threshold && !is_spam[i] ) {
      ga_nn += tests_count[i];
      nnscore += tests_count[i] * score;
    } else if ( score >= threshold && !is_spam[i] ) {
      ga_ny += tests_count[i];
      nyscore += tests_count[i] * score;
    } else {
      ga_yn += tests_count[i];
      ynscore += tests_count[i] * score;
    }
  }

  /* This is copied from the dump() function in craig-evolve.c.  It
   * outputs some nice statistics about the learned classifier. */
  fprintf (fp, "\n# SUMMARY for threshold %3.1f:\n", threshold);
  fprintf (fp, "# Correctly non-spam: %6d %4.2f%%\n",
           ga_nn, (ga_nn / (float) num_ham) * 100.0);
  fprintf (fp, "# Correctly spam: %6d %4.2f%%\n",
           ga_yy, (ga_yy / (float) num_spam) * 100.0);
  fprintf (fp, "# False positives: %6d %4.2f%%\n",
           ga_ny, (ga_ny / (float) num_ham) * 100.0);
  fprintf (fp, "# False negatives: %6d %4.2f%%\n",
           ga_yn, (ga_yn / (float) num_spam) * 100.0);
  fprintf (fp, "# Average score for spam: %3.3f ham: %3.1f\n",
           (ynscore + yyscore) / ((double)(ga_yn + ga_yy)),
           (nyscore + nnscore) / ((double)(ga_nn + ga_ny)));
  fprintf (fp, "# Average for false-pos: %3.3f false-neg: %3.1f\n",
           (nyscore / (double)ga_ny), (ynscore / (double)ga_yn));
  fprintf (fp, "# TOTAL: %6d %3.2f%%\n\n", num_tests, 100.0);

  for (i = 0; i < num_scores; i++) {
    if ( is_mutable[i] ) {
      fprintf(fp, "score %-30s %2.3f # [%2.3f..%2.3f]\n",
              score_names[i], weight_to_score(weights[i]),
              range_lo[i], range_hi[i]);
    } else {
      fprintf(fp, "score %-30s %2.3f # not mutable\n",
              score_names[i], range_lo[i]);
    }
  }
}
/* This is to support Daniel's threshold thing. */
void scale_scores (double old_threshold, double new_threshold) {
  int i;

  /* No need to scale something to itself. */
  if ( old_threshold == new_threshold ) {
    return;
  }

  for (i = 0; i < num_scores; i++) {
    if ( is_mutable[i] ) {
      range_lo[i] = range_lo[i] * new_threshold / old_threshold;
      range_hi[i] = range_hi[i] * new_threshold / old_threshold;
    }
  }
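  /* Worked example (hypothetical numbers): rescaling from a threshold of
   * 5.0 to 10.0 doubles every mutable range, e.g. [0.5, 3.0] becomes
   * [1.0, 6.0], so each rule keeps the same score-to-threshold ratio. */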
  /* Maybe we don't want this bit.  This prescaling stuff makes my
   * brain hurt. */
  /*
  for (i = 0; i < num_nondup; i++) {
    scores[i] = scores[i] * new_threshold / old_threshold;
  }
  */
}
/* Computes the value of the activation function of the perceptron for
 * a given input. */
double evaluate_test (int test) {
#ifdef LEAST_SQUARES_ERROR
  return 1/(1+exp(-evaluate_test_nogain(test)));
#else
#ifdef ENTROPIC_ERROR
  return tanh(evaluate_test_nogain(test));
#endif
#endif
}
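/* Note on the two activations (explanatory note): with
 * LEAST_SQUARES_ERROR the logistic sigmoid 1/(1+exp(-net)) squashes the
 * net input into (0,1), matching the 0/1 is_spam targets; with
 * ENTROPIC_ERROR, tanh(net) squashes into (-1,1), matching the
 * (2*is_spam - 1) targets used in train(). */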
/* Computes the value of the transfer function (in this case, linear) for
 * an input defined in tests_hit[test]. */
double evaluate_test_nogain (int test) {
  double sum;
  int i;

  sum = bias;
  for (i = 0; i < num_tests_hit[test]; i++) {
    sum += weights[tests_hit[test][i]];
  }
  /* Translate the immutable scores to weight space. */
  sum += score_to_weight(scores[test]);
  return sum;
}
/* Trains the perceptron using stochastic gradient descent. */
void train (int num_epochs, double learning_rate) {
  int epoch, random_test;
  int i, j;
  int * tests;
  double y_out, error, delta;

  /* Initialize and populate an array containing indices of training
   * instances.  This is shuffled on every epoch and then iterated
   * through.  I had originally planned to use roulette wheel selection,
   * but shuffled selection seems to work better. */
  tests = (int*)calloc(wheel_size, sizeof(int));
  for (i = 0, j = 0; i < num_nondup-1; i++) {
    for (; j < roulette_wheel[i+1]; j++) {
      tests[j] = i;
    }
  }
  for (; j < wheel_size; j++) {
    tests[j] = num_nondup-1;
  }
  for (epoch = 0; epoch < num_epochs; epoch++) {
    /* decay the weights on every epoch to smooth out statistical
     * anomalies */
    if ( weight_decay != 1.0 ) {
      bias *= weight_decay;
      for (i = 0; i < num_mutable; i++) {
        weights[i] *= weight_decay;
      }
    }

    /* shuffle the training instances (a Fisher-Yates shuffle) */
    for (i = 0; i < wheel_size-1; i++) {
      int tmp;
      int r = lrand48 () % (wheel_size - i);
      tmp = tests[i];
      tests[i] = tests[r+i];
      tests[r+i] = tmp;
    }

    for (j = 0; j < wheel_size; j++) {
      /* select a random test (they have been randomized above) */
      random_test = tests[j];
      /* compute the output of the network */
      y_out = evaluate_test(random_test);
#ifdef LEAST_SQUARES_ERROR
      /* compute the error gradient for the logsig node with least
       * squares error */
      error = is_spam[random_test] - y_out;
      delta = y_out * (1-y_out) * error / (num_tests_hit[random_test]+1)
              * learning_rate;
#else
#ifdef ENTROPIC_ERROR
      /* compute the error gradient for the tanh node with entropic
       * error */
      error = (2.0*is_spam[random_test]-1) - y_out;
      delta = error / (num_tests_hit[random_test]+1) * learning_rate;
#endif
#endif
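      /* Derivation (explanatory note): for the logistic unit y = s(net)
       * with squared error E = (t-y)^2/2, dE/dnet = -(t-y)*y*(1-y), so
       * the descent step per active weight is lr*(t-y)*y*(1-y); for tanh
       * with entropic (cross-entropy) error the activation derivative
       * cancels, leaving lr*(t-y).  Both are divided by
       * num_tests_hit+1, the number of inputs active for this instance
       * (counting the bias), so the total update per instance does not
       * grow with the number of rules hit. */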
      /* adjust the weights to descend the steepest part of the error
       * gradient (the bias is not updated on the final epoch) */
      if ( epoch + 1 < num_epochs ) {
        bias += delta;
      }
      for (i = 0; i < num_tests_hit[random_test]; i++) {
        int idx = tests_hit[random_test][i];
        weights[idx] += delta;
#ifdef IGNORE_SCORE_RANGES
        /* Constrain the weights so that nice rules are always <= 0 etc. */
        if ( range_lo[idx] >= 0 && weights[idx] < 0 ) {
          weights[idx] = 0;
        } else if ( range_hi[idx] <= 0 && weights[idx] > 0 ) {
          weights[idx] = 0;
        }
#else
        /* Clamp each weight to its permitted score range. */
        if ( weights[idx] < score_to_weight(range_lo[idx]) ) {
          weights[idx] = score_to_weight(range_lo[idx]);
        } else if ( weights[idx] > score_to_weight(range_hi[idx]) ) {
          weights[idx] = score_to_weight(range_hi[idx]);
        }
#endif
      }
    }
  }
  free(tests);
}
void usage () {
  printf ("usage: perceptron [args]\n"
          "\n"
          " -p ham_preference = adds extra ham to training set multiplied by number of\n"
          "                     tests hit (2.0 default)\n"
          " -e num_epochs = number of epochs to train (15 default)\n"
          " -l learning_rate = learning rate for gradient descent (2.0 default)\n"
          " -t threshold = minimum threshold for spam (5.0 default)\n"
          " -w weight_decay = per-epoch decay of learned weight and bias (1.0 default)\n"
          " -h = print this help\n"
          "\n");
  exit(30);
}
int main (int argc, char ** argv) {
  struct timeval tv, tv_start, tv_end;
  long long int t_usec;
  FILE * fp;
  int arg;

  /* Read the command line options */
  while ((arg = getopt (argc, argv, "p:e:l:t:w:h?")) != -1) {
    switch (arg) {
    case 'p':
      ham_preference = atof(optarg);
      break;
    case 'e':
      num_epochs = atoi(optarg);
      break;
    case 'l':
      learning_rate = atof(optarg);
      break;
    case 't':
      threshold = atof(optarg);
      break;
    case 'w':
      weight_decay = atof(optarg);
      break;
    case 'h':
    case '?':
      usage();
      break;
    }
  }

  /* Seed the PRNG (the casts avoid overflowing time_t arithmetic on
   * 32-bit systems) */
  gettimeofday (&tv, 0);
  t_usec = (long long)tv.tv_sec * 1000000 + tv.tv_usec;
  srand48 ((int)t_usec);

  /* Load the instances and score constraints generated by logs-to-c. */
  loadtests();
  loadscores();

  /* If the threshold has been changed, the ranges and scores need to be
   * scaled so that the output of the program will not be affected. */
  scale_scores (DEFAULT_THRESHOLD, threshold);

  /* Replicate instances from the training set to bias against false
   * positives. */
  init_wheel ();

  /* Allocate and initialize the weight vector. */
  init_weights ();

  /* Train the network using stochastic gradient descent. */
  gettimeofday(&tv_start, 0);
  train(num_epochs, learning_rate);
  gettimeofday(&tv_end, 0);
  t_usec = (long long)tv_end.tv_sec * 1000000 + tv_end.tv_usec -
           ((long long)tv_start.tv_sec * 1000000 + tv_start.tv_usec);
  printf ("Training time = %fs.\n", t_usec / 1000000.0f);

  /* Translate the learned weights to SA score space and output to a
   * file. */
  fp = fopen (OUTPUT_FILE, "w");
  if ( fp ) {
    write_weights(fp);
    fclose(fp);
  } else {
    perror (OUTPUT_FILE);
  }

  /* Free resources */
  destroy_weights ();
  destroy_wheel ();

  return 0;
}