Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

First commit

  • Loading branch information...
commit 760487f9e16f5cd8cad69dca2a32f822596426b3 0 parents
Jonathan Moss authored
Showing with 55,049 additions and 0 deletions.
  1. +2 −0  .gitignore
  2. 0  README
  3. +127 −0 build.xml
  4. +6 −0 package.properties
  5. +185 −0 src/bootstrap.php
  6. +9 −0 src/tutorials/wotsit/examples/preTrained.php
  7. +19 −0 src/tutorials/wotsit/examples/training.php
  8. +70 −0 src/tutorials/wotsit/wotsit.pkg
  9. +185 −0 src/wotsit/Classifier.php
  10. +45 −0 src/wotsit/Feature.php
  11. +79 −0 src/wotsit/classifier/Fisher.php
  12. +59 −0 src/wotsit/classifier/NaiveBayesian.php
  13. +53 −0 src/wotsit/feature/ExtractorCollection.php
  14. +65 −0 src/wotsit/feature/LinkExtractor.php
  15. +88 −0 src/wotsit/feature/NGramExtractor.php
  16. +672 −0 src/wotsit/feature/Stemmer.php
  17. +93 −0 src/wotsit/feature/WordExtractor.php
  18. +67 −0 src/wotsit/iClassifier.php
  19. +28 −0 src/wotsit/iExtractFeatures.php
  20. +62 −0 src/wotsit/iStorage.php
  21. +170 −0 src/wotsit/storage/Dbm.php
  22. +120 −0 src/wotsit/storage/Memory.php
  23. +211 −0 src/wotsit/storage/MongoDb.php
  24. +68 −0 src/wotsit/storage/PersistentMemory.php
  25. +274 −0 src/wotsit/storage/Sqlite.php
  26. +32 −0 tests/bdd/BDDTests.php
  27. +75 −0 tests/bdd/TestFeature.php
  28. +89 −0 tests/bdd/feature/TestExtractorCollection.php
  29. +87 −0 tests/bdd/feature/TestLinkExtractor.php
  30. +93 −0 tests/bdd/feature/TestNGramExtractor.php
  31. +133 −0 tests/bdd/feature/TestStemmer.php
  32. +103 −0 tests/bdd/feature/TestWordExtractor.php
  33. +23,531 −0 tests/bdd/fixtures/stemmer-full.csv
  34. +1,349 −0 tests/bdd/fixtures/stemmer.csv
  35. +163 −0 tests/bdd/storage/TestDbm.php
  36. +147 −0 tests/bdd/storage/TestMemory.php
  37. +176 −0 tests/bdd/storage/TestMongoDb.php
  38. +168 −0 tests/bdd/storage/TestPersistentMemory.php
  39. +158 −0 tests/bdd/storage/TestSqlite.php
  40. +16 −0 tests/integration-tests/IntegrationTests.php
  41. +66 −0 tests/integration-tests/TestFisher.php
  42. +66 −0 tests/integration-tests/TestNaiveBayesian.php
  43. +5 −0 tests/integration-tests/fixtures/sampleData.txt
  44. +71 −0 tests/integration-tests/storage/TestDbm.php
  45. +75 −0 tests/integration-tests/storage/TestMongoDb.php
  46. +37 −0 tests/unit-tests/TestFeature.php
  47. +32 −0 tests/unit-tests/UnitTests.php
  48. +44 −0 tests/unit-tests/feature/TestExtractorCollection.php
  49. +43 −0 tests/unit-tests/feature/TestLinkExtractor.php
  50. +52 −0 tests/unit-tests/feature/TestNGramExtractor.php
  51. +85 −0 tests/unit-tests/feature/TestStemmer.php
  52. +59 −0 tests/unit-tests/feature/TestWordExtractor.php
  53. +23,531 −0 tests/unit-tests/fixtures/stemmer-full.csv
  54. +1,349 −0 tests/unit-tests/fixtures/stemmer.csv
  55. +98 −0 tests/unit-tests/storage/TestDbm.php
  56. +74 −0 tests/unit-tests/storage/TestMemory.php
  57. +97 −0 tests/unit-tests/storage/TestMongoDb.php
  58. +107 −0 tests/unit-tests/storage/TestPersistentMemory.php
  59. +81 −0 tests/unit-tests/storage/TestSqlite.php
2  .gitignore
@@ -0,0 +1,2 @@
+Wotsit.phar
+docs/*
0  README
No changes.
127 build.xml
@@ -0,0 +1,127 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project name="Wotsit Classsifier" default="build-all">
+
+ <property file="package.properties" />
+
+    <!--
+        Unit test files handed to the phpunit batch runner.
+        Exclude patterns are resolved relative to the fileset dir, and the stemmer
+        test lives in the feature/ sub directory, so it needs the **/ prefix to match.
+    -->
+    <fileset dir="tests/unit-tests" id="unit-tests">
+        <exclude name="**/TestStemmer.php" />
+        <include name="**/Test*.php" />
+        <exclude name="UnitTests.php" />
+    </fileset>
+
+    <!-- Integration test files; the suite file itself (IntegrationTests.php) is never run as a test case -->
+    <fileset dir="tests/integration-tests" id="integration-tests">
+        <include name="**/Test*.php" />
+        <exclude name="IntegrationTests.php" />
+    </fileset>
+
+ <fileset dir="src/wotsit" id="source">
+ <include name="**/*.php" />
+ </fileset>
+
+ <target name="lint-source" description="checks source for syntax errors">
+ <phplint haltonfailure="true">
+ <fileset refid="source"/>
+ </phplint>
+ </target>
+
+ <target name="api-docs" description="Creates documentation">
+ <delete dir="docs/api" quiet="true" includeemptydirs="true" verbose="false" failonerror="false" />
+ <mkdir dir="docs/api" />
+ <phpdoc title="${package.title} Documentation" destdir="docs/api" sourcecode="yes"
+ output="HTML:frames:DOM/default" quiet="true" defaultpackagename="${package.default}">
+ <fileset refid="source" />
+ <fileset dir="src/tutorials">
+ <include name="**/*.pkg" />
+ <include name="**/*.cls" />
+ </fileset>
+ </phpdoc>
+
+ </target>
+
+ <target name="unit-report" description="Unit test report">
+ <phpunit haltonfailure="true" haltonerror="true" printsummary="true">
+ <formatter type="xml" outfile="test-report.xml" />
+ <batchtest>
+ <fileset refid="unit-tests" />
+ </batchtest>
+ </phpunit>
+
+ <delete dir="docs/test_report" quiet="true" includeemptydirs="true" verbose="false" failonerror="false" />
+ <mkdir dir="docs/test_report" />
+ <phpunitreport infile="test-report.xml" format="frames" todir="docs/test_report" />
+ <delete file="test-report.xml" />
+ </target>
+
+ <target name="coverage-report" description="generates unit test coverage report">
+ <delete dir="docs/coverage" quiet="true" includeemptydirs="true" verbose="false" failonerror="false" />
+ <mkdir dir="docs/coverage" />
+ <exec command="phpunit --coverage-html docs/coverage/ tests/unit-tests/UnitTests.php" checkreturn="false" />
+ </target>
+
+ <target name="bdd-report" description="generates bdd testdox report">
+ <delete dir="docs/bdd_report" quiet="true" includeemptydirs="true" verbose="false" failonerror="false" />
+ <mkdir dir="docs/bdd_report" />
+ <exec command="phpunit --testdox-html docs/bdd_report/index.html tests/bdd/BDDTests.php" checkreturn="false" />
+ </target>
+
+ <target name="integration-report" description="Integration test report">
+ <phpunit haltonfailure="true" haltonerror="true" printsummary="false">
+ <formatter type="xml" outfile="integration-tests.xml" />
+ <formatter type="plain" usefile="false" />
+ <batchtest>
+ <fileset refid="integration-tests" />
+ </batchtest>
+ </phpunit>
+ <delete dir="docs/integration_report" quiet="true" includeemptydirs="true" verbose="false" failonerror="false" />
+ <mkdir dir="docs/integration_report" />
+ <phpunitreport infile="integration-tests.xml" format="frames" todir="docs/integration_report" />
+ <delete file="integration-tests.xml"/>
+ </target>
+
+    <!--
+        Builds the phar library: copies the whitespace-stripped sources and unit tests
+        into a temporary export directory, packages that directory with
+        src/bootstrap.php as the stub, then removes the export directory again.
+    -->
+    <target name="package" description="build the phar library">
+        <tstamp>
+            <!-- %M is minutes; %I would repeat the hour in 12-hour notation -->
+            <format property="date.created" pattern="%Y-%m-%d %H:%M:%S" />
+            <format property="date.year" pattern="%Y" />
+            <format property="date.build" pattern="%s" />
+        </tstamp>
+
+        <mkdir dir="export/src/wotsit" />
+        <copy todir="export/src/wotsit">
+            <fileset refid="source" />
+            <filterchain>
+                <stripwhitespace />
+            </filterchain>
+        </copy>
+
+        <mkdir dir="export/tests/unit-tests" />
+        <copy todir="export/tests/unit-tests">
+            <fileset dir="tests/unit-tests">
+                <include name="**/*" />
+            </fileset>
+            <filterchain>
+                <stripwhitespace />
+            </filterchain>
+        </copy>
+
+        <delete file="${package.file}" quiet="true" />
+
+        <pharpackage basedir="export" destfile="${package.file}" compression="gzip" stub="src/bootstrap.php" alias="${package.title}">
+            <fileset dir="export">
+                <include name="**/*" />
+            </fileset>
+            <metadata>
+                <element name="Title" value="${package.title}" />
+                <element name="Author" value="${package.author}" />
+                <element name="Description" value="${package.description}" />
+                <element name="Copyright" value="${package.copyright} ${date.year}" />
+                <element name="Build" value="${date.build}" />
+                <element name="Created Date" value="${date.created}" />
+            </metadata>
+        </pharpackage>
+
+        <delete dir="export" quiet="true" includeemptydirs="true" />
+    </target>
+
+ <target name="build-all" depends="lint-source, unit-report, bdd-report, coverage-report, api-docs, package, integration-report"></target>
+
+</project>
6 package.properties
@@ -0,0 +1,6 @@
+package.title = Wotsit
+package.file = Wotsit.phar
+package.author = Jonathan Moss <jonathan.moss@tangentone.com.au>
+package.copyright = Tangent/One Australia Pty Ltd (c)
+package.description = Wotsit is a heuristics based classification package
+package.default = wotsit
185 src/bootstrap.php
@@ -0,0 +1,185 @@
+<?php
+/**
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+Phar::mapPhar();
+
+// Always register the autoloader: library users need it, and so does the
+// command line runner once it starts loading the bundled unit tests.
+spl_autoload_register(array('WotsitAutoloader', 'load'));
+
+// Only behave as a command line tool when the archive itself is the script
+// being executed. A CLI script that merely includes the archive also has
+// $argv[0] set, and must not end up in the runner (which prints help and dies).
+if (php_sapi_name() == 'cli' && isset($argv[0]) && realpath($argv[0]) === realpath(__FILE__)) {
+    $runner = new CliRunner();
+    $runner->processParameters($argv);
+}
+
+/**
+ * Autoloader for the wotsit package
+ *
+ * Resolves the package's class and interface names to their files inside
+ * the phar archive by means of a static lookup table.
+ */
+class WotsitAutoloader
+{
+
+    /**
+     * Map of class/interface name to the phar:// path of the file declaring it
+     *
+     * @var array
+     */
+    private static $classes = array(
+        'wotsit_classifier_NaiveBayesian' => 'phar://Wotsit/src/wotsit/classifier/NaiveBayesian.php',
+        'wotsit_classifier_Fisher' => 'phar://Wotsit/src/wotsit/classifier/Fisher.php',
+        'wotsit_feature_ExtractorCollection' => 'phar://Wotsit/src/wotsit/feature/ExtractorCollection.php',
+        'wotsit_feature_LinkExtractor' => 'phar://Wotsit/src/wotsit/feature/LinkExtractor.php',
+        'wotsit_feature_NGramExtractor' => 'phar://Wotsit/src/wotsit/feature/NGramExtractor.php',
+        // NOTE(review): class name taken from the type hint used by the extractors - confirm against Stemmer.php
+        'wotsit_feature_Stemmer' => 'phar://Wotsit/src/wotsit/feature/Stemmer.php',
+        'wotsit_feature_WordExtractor' => 'phar://Wotsit/src/wotsit/feature/WordExtractor.php',
+        'wotsit_storage_Dbm' => 'phar://Wotsit/src/wotsit/storage/Dbm.php',
+        'wotsit_storage_Memory' => 'phar://Wotsit/src/wotsit/storage/Memory.php',
+        'wotsit_storage_MongoDb' => 'phar://Wotsit/src/wotsit/storage/MongoDb.php',
+        'wotsit_storage_PersistentMemory' => 'phar://Wotsit/src/wotsit/storage/PersistentMemory.php',
+        'wotsit_storage_Sqlite' => 'phar://Wotsit/src/wotsit/storage/Sqlite.php',
+        'wotsit_Classifier' => 'phar://Wotsit/src/wotsit/Classifier.php',
+        'wotsit_Feature' => 'phar://Wotsit/src/wotsit/Feature.php',
+        'wotsit_iClassifier' => 'phar://Wotsit/src/wotsit/iClassifier.php',
+        'wotsit_iExtractFeatures' => 'phar://Wotsit/src/wotsit/iExtractFeatures.php',
+        'wotsit_iStorage' => 'phar://Wotsit/src/wotsit/iStorage.php',
+    );
+
+    /**
+     * Class loader callback for spl_autoload_register()
+     *
+     * Includes the file mapped to the requested name. Names that are not part
+     * of the map are left for other registered autoloaders to resolve.
+     *
+     * @param string $className The class or interface being requested
+     * @return boolean true when a mapped file was included, false otherwise
+     */
+    public static function load($className)
+    {
+        if (!isset(self::$classes[$className])) {
+            return false;
+        }
+        include self::$classes[$className];
+        return true;
+    }
+
+}
+
+/**
+ * Cli runner
+ *
+ * Implements the commands available when the archive is executed from the
+ * command line: running the bundled unit tests, listing the archive content
+ * and printing the usage text.
+ */
+class CliRunner
+{
+
+    /**
+     * Dispatches to the requested command
+     *
+     * Arguments starting with '--' are treated as switches, everything else as
+     * a command. 'test' takes precedence over 'list'; any other input
+     * (including 'help' and '--help') prints the usage text.
+     *
+     * @param array $argv The raw argument vector, $argv[0] being the archive file name
+     * @return void
+     */
+    public function processParameters(array $argv)
+    {
+        $filename = $argv[0];
+        $switches = array();
+        $commands = array();
+        foreach (array_slice($argv, 1) as $arg) {
+            if (substr($arg, 0, 2) == '--') {
+                $switches[] = $arg;
+            } else {
+                $commands[] = $arg;
+            }
+        }
+
+        if (in_array('test', $commands)) {
+            require_once 'PHPUnit/Autoload.php';
+            $verbose = in_array('--verbose', $switches);
+            $this->printHeader($filename);
+            if (in_array('--testdox', $switches)) {
+                $listener = new PHPUnit_Util_TestDox_ResultPrinter_Text();
+            } else {
+                $listener = new PHPUnit_TextUI_ResultPrinter(null, $verbose);
+            }
+            $this->runTests($listener);
+        } elseif (in_array('list', $commands)) {
+            $this->printHeader($filename);
+            $this->listContent($filename);
+        } else {
+            // 'help', '--help' and anything unrecognised all end up here
+            $this->printHeader($filename);
+            $this->printHelp();
+        }
+    }
+
+    /**
+     * Runs the unit test suite bundled in the archive and terminates the script
+     *
+     * The process exit status follows the shell convention: 0 when every test
+     * passed, 1 otherwise.
+     *
+     * @param PHPUnit_Framework_TestListener $listener Receives and prints the test events
+     * @return void
+     */
+    private function runTests(PHPUnit_Framework_TestListener $listener)
+    {
+        echo "RUNNING TEST SUITE:\n^^^^^^^^^^^^^^^^^^^\n";
+        set_include_path(get_include_path() . PATH_SEPARATOR . 'Wotsit.phar');
+        require_once 'phar://Wotsit/tests/unit-tests/UnitTests.php';
+        $suite = UnitTests::suite();
+        $result = new PHPUnit_Framework_TestResult;
+        $result->addListener($listener);
+        $suite->run($result);
+        echo "\n";
+        // casting wasSuccessful() to int would report success as exit status 1
+        exit($result->wasSuccessful() ? 0 : 1);
+    }
+
+    /**
+     * Prints the metadata stored in the archive followed by the path of every file in it
+     *
+     * @param string $filename The archive file name
+     * @return void
+     */
+    private function listContent($filename)
+    {
+        $p = new Phar($filename, 0);
+        echo "LISTING METADATA\n^^^^^^^^^^^^^^^^\n";
+        foreach ($p->getMetadata() as $key => $value) {
+            echo "\t{$key}: {$value}\n";
+        }
+
+        echo "\nLISTING CONTENTS\n^^^^^^^^^^^^^^^^\n";
+        foreach (new RecursiveIteratorIterator($p) as $file) {
+            $path = $file->getPathname();
+            // keep only the part of the path that follows the archive file name
+            $path = substr($path, strpos($path, $filename) + strlen($filename));
+            echo "\t{$path}\n";
+        }
+    }
+
+    /**
+     * Prints the title and author taken from the archive metadata
+     *
+     * @param string $filename The archive file name
+     * @return void
+     */
+    private function printHeader($filename)
+    {
+        $p = new Phar($filename, 0);
+
+        $meta = $p->getMetadata();
+        echo <<<EOD
+{$meta['Title']} by {$meta['Author']}
+
+EOD;
+
+    }
+
+    /**
+     * Prints the usage text and terminates the script
+     *
+     * @return void
+     */
+    private function printHelp()
+    {
+        echo <<<EOD
+Usage: php Wotsit.phar [switches] test
+       php Wotsit.phar list
+       php Wotsit.phar help
+
+  --verbose   Will output a more verbose test report
+  --testdox   Will output the test report in testdox format
+  --help      Will output this help text
+
+
+EOD;
+        die();
+    }
+
+}
+
+__HALT_COMPILER();
9 src/tutorials/wotsit/examples/preTrained.php
@@ -0,0 +1,9 @@
+<?php
+// Open the storage that holds the data learned during earlier training
+$storage = new wotsit_storage_Sqlite('data.db');
+
+// Features are the individual words of the text
+$extractor = new wotsit_feature_WordExtractor();
+
+// Combine extractor and storage into a ready to use classifier
+$classifier = new wotsit_classifier_NaiveBayesian($extractor, $storage);
+
+// Ask the classifier for the category that best fits the passed in text
+$category = $classifier->classify($someText);
19 src/tutorials/wotsit/examples/training.php
@@ -0,0 +1,19 @@
+<?php
+// Create the storage object that will hold the learned data
+$storage = new wotsit_storage_Sqlite('new.db');
+
+// NB: You can also pass in a stemmer if you desire
+$extractor = new wotsit_feature_WordExtractor();
+$classifier = new wotsit_classifier_NaiveBayesian($extractor, $storage);
+
+// Should be examples of documents which are in the category 'good'
+$goodData = array();
+// Should be examples of documents which are in the category 'bad'
+$badData = array();
+
+// Feed every example to the classifier under its category: first 'good', then 'bad'
+$trainingSets = array(
+    'good' => $goodData,
+    'bad' => $badData,
+);
+foreach ($trainingSets as $category => $documents) {
+    foreach ($documents as $document) {
+        $classifier->train($document, $category);
+    }
+}
70 src/tutorials/wotsit/wotsit.pkg
@@ -0,0 +1,70 @@
+<refentry id="{@id}">
+ <refnamediv>
+ <refname>
+ Wotsit
+ </refname>
+ <refpurpose>
+ An introductory tutorial to the Wotsit Classifier
+ </refpurpose>
+ </refnamediv>
+
+ <refsynopsisdiv>
+ <author>
+ <authorblurb>
+ {@link mailto:jonathan.moss@tangentone.com.au Jonathan
+ Moss}
+ </authorblurb>
+ </author>
+ <para>
+ The Wotsit package provides a document classification framework
+ </para>
+ </refsynopsisdiv>
+ {@toc}
+ <refsect1 id="{@id intro}">
+ <title>Overview</title>
+ <para>
+ This package comprises a suite of classes for automatic classification
+ of documents based on statistical analysis of the document against previously
+ seen documents.
+ </para>
+ <para>
+ It should be noted that the classifiers in the package are heuristics based and are therefore
+ not 100% accurate. They also rely on some level of training before they become useful.
+ </para>
+ </refsect1>
+
+ <refsect1 id="{@id basic_usage}">
+ <title>Basic Usage</title>
+ <para>
+ In the following section we go over the basics of Wotsit: how we create a classifier,
+ how we train it and how to use it thereafter.
+ </para>
+ <refsect2 id="{@id using_a_classfier}">
+ <title>Creating and using a pre-trained classifier</title>
+ <para>
+ Using a pre-trained classifier is very easy. e.g.
+ {@example preTrained.php}
+ </para>
+ </refsect2>
+ <refsect2 id="{@id training_a_classfier}">
+ <title>Training a classifier</title>
+ <para>
+ Training a classifier is also quite straightforward. All you need is to
+ initialise the classifier and then feed it sample data. e.g.
+ {@example training.php}
+ </para>
+ <para>
+ Of course there is nothing stopping you taking a pre-trained dataset and
+ refining it with your own data.
+ </para>
+ <para>
+ You can also constantly refine the learned data by allowing users or administrators
+ to feed back their own decisions. For example, you could automatically mark comments
+ as 'good' or 'bad' based on the output of the classifier, but allow users to mark the
+ comment differently and then use their feedback to 're-train' the classifier.
+ </para>
+ </refsect2>
+ </refsect1>
+
+
+</refentry>
185 src/wotsit/Classifier.php
@@ -0,0 +1,185 @@
+<?php
+/**
+ * @package wotsit
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+/**
+ * The abstract base class for Classifiers
+ *
+ * Implements training, classification and threshold handling on top of a
+ * feature extractor and a storage backend. Derived classes must provide
+ * getProbability() to score an item against a single category.
+ *
+ * @package wotsit
+ */
+abstract class wotsit_Classifier implements wotsit_iClassifier
+{
+
+    /**
+     * Turns the items being trained or classified into features
+     *
+     * @var wotsit_iExtractFeatures
+     */
+    protected $featureExtractor;
+
+    /**
+     * Holds the feature and category counts gathered during training
+     *
+     * @var wotsit_iStorage
+     */
+    protected $storage;
+
+    /**
+     * Thresholds keyed by category, see setThreshold()
+     *
+     * @var array
+     */
+    protected $thresholds;
+
+    /**
+     * @param wotsit_iExtractFeatures $featureExtractor Used to extract the features of an item
+     * @param wotsit_iStorage         $storage          Used to read and update the learned counts
+     */
+    public function __construct(wotsit_iExtractFeatures $featureExtractor, wotsit_iStorage $storage)
+    {
+        $this->featureExtractor = $featureExtractor;
+        $this->storage = $storage;
+        $this->thresholds = array();
+    }
+
+    /**
+     * Returns the probability that the given item fits within the specified category
+     *
+     * Must be implemented by derived classes to allow for classification
+     *
+     * @param string $item     The item to test
+     * @param string $category The category to test it in
+     * @return float
+     */
+    protected abstract function getProbability($item, $category);
+
+    /**
+     * Adds an item to the classifier with a given category
+     *
+     * Increments the count of every feature extracted from the input for the
+     * category, then increments the count of the category itself.
+     *
+     * @param mixed  $input    The item to extract features from
+     * @param string $category The category it should be placed in
+     */
+    public function train($input, $category)
+    {
+        foreach ($this->featureExtractor->getFeatures($input) as $feature) {
+            $this->storage->incrementFeatureCount($feature, $category);
+        }
+        $this->storage->incrementCategoryCount($category);
+    }
+
+    /**
+     * Returns the category that the item best fits in
+     *
+     * The most probable category only wins when its probability is at least
+     * its threshold times the probability of every other category. If that is
+     * not the case, or if no category is known yet, the default is returned.
+     *
+     * @param mixed  $input   The item to classify
+     * @param string $default The default to return if we are unsure
+     * @return string
+     */
+    public function classify($input, $default = null)
+    {
+        $probabilities = $this->classifications($input);
+        if (empty($probabilities)) {
+            // nothing has been trained yet, so there is no best category to pick
+            return $default;
+        }
+
+        // classifications() sorts descending, so the first entry is the best match
+        reset($probabilities);
+        $bestCategory = key($probabilities);
+        $max = $probabilities[$bestCategory];
+        $threshold = $this->getThreshold($bestCategory);
+
+        // check the best match is above the threshold factor compared to other categories
+        foreach (array_slice($probabilities, 1) as $probability) {
+            if (($probability * $threshold) > $max) {
+                return $default;
+            }
+        }
+        return $bestCategory;
+    }
+
+    /**
+     * Returns an array of category probabilities
+     *
+     * The array is keyed by the category and is sorted in descending order
+     *
+     * @param mixed $input The item to classify
+     * @return array
+     */
+    public function classifications($input)
+    {
+        $probabilities = array();
+        foreach ($this->storage->getCategories() as $category) {
+            $probabilities[$category] = $this->getProbability($input, $category);
+        }
+        arsort($probabilities, SORT_NUMERIC);
+        return $probabilities;
+    }
+
+    /**
+     * Sets the threshold for a given category
+     *
+     * The threshold is the factor by which the probability of this category
+     * has to exceed the probability of every other category before classify()
+     * accepts it as the answer.
+     *
+     * Default threshold is 1.0
+     *
+     * @param string $category  The category
+     * @param float  $threshold The threshold
+     */
+    public function setThreshold($category, $threshold)
+    {
+        $this->thresholds[$category] = $threshold;
+    }
+
+    /**
+     * Returns the threshold for the specified category
+     *
+     * @param string $category The category
+     * @return float The threshold for the given category (default 1.0)
+     */
+    public function getThreshold($category)
+    {
+        if (array_key_exists($category, $this->thresholds)) {
+            return $this->thresholds[$category];
+        }
+        return 1.0;
+    }
+
+    /**
+     * Returns the probability that the given feature is in the given category
+     *
+     * This is the feature count within the category divided by the category
+     * count, or 0.0 for a category without any trained items.
+     *
+     * @param wotsit_Feature $feature  The feature
+     * @param string         $category The category
+     * @return float The probability that the given feature is in the given category
+     */
+    protected function featureProbability(wotsit_Feature $feature, $category)
+    {
+        $categoryCount = $this->storage->getCategoryCount($category);
+        if ($categoryCount == 0) {
+            return 0.0;
+        }
+        return $this->storage->getFeatureCount($feature, $category) / $categoryCount;
+    }
+
+    /**
+     * Returns the weighted probability that a given feature is in a given category
+     *
+     * Blends the assumed probability with the measured one: the assumed
+     * probability counts with the weight of the feature, the measured one with
+     * the number of times the feature was seen across all categories.
+     *
+     * @param wotsit_Feature $feature            The feature (supplies the weight)
+     * @param string         $category           The category
+     * @param float          $assumedProbability The assumed probability (default 0.5)
+     * @return float The weighted probability that the given feature is in the given category
+     */
+    protected function weightedProbability(wotsit_Feature $feature, $category, $assumedProbability = 0.5)
+    {
+        $baseProbability = $this->featureProbability($feature, $category);
+        $totals = 0;
+        foreach ($this->storage->getCategories() as $aCategory) {
+            $totals += $this->storage->getFeatureCount($feature, $aCategory);
+        }
+        $weight = $feature->getWeight();
+        $divisor = $weight + $totals;
+        if ($divisor == 0) {
+            // zero weight and never seen: there is no evidence, fall back to the assumption
+            return $assumedProbability;
+        }
+        return (($weight * $assumedProbability) + ($totals * $baseProbability)) / $divisor;
+    }
+
+}
+?>
45 src/wotsit/Feature.php
@@ -0,0 +1,45 @@
+<?php
+/**
+ * @package wotsit
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+/**
+ * Value object pairing a feature value with the weight it carries
+ *
+ * @package wotsit
+ */
+class wotsit_Feature
+{
+
+    /**
+     * @var string
+     */
+    private $value;
+
+    /**
+     * @var float
+     */
+    private $weight;
+
+    /**
+     * Both arguments are normalised on the way in: the value to a string,
+     * the weight to a float.
+     *
+     * @param string $value  The feature itself
+     * @param float  $weight The weight of the feature (default 1.0)
+     */
+    public function __construct($value, $weight = 1.0)
+    {
+        $this->weight = floatval($weight);
+        $this->value = strval($value);
+    }
+
+    /**
+     * @return string The feature value
+     */
+    public function getValue()
+    {
+        return $this->value;
+    }
+
+    /**
+     * @return float The weight of the feature
+     */
+    public function getWeight()
+    {
+        return $this->weight;
+    }
+}
79 src/wotsit/classifier/Fisher.php
@@ -0,0 +1,79 @@
+<?php
+/**
+ * @package wotsit
+ * @subpackage classifier
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+/**
+ * This class contains a classifier implementation based on the Fisher method
+ *
+ * The weighted probabilities of all features of an item are combined and the
+ * combined score is run through the inverse chi-squared function.
+ *
+ * @package wotsit
+ * @subpackage classifier
+ */
+class wotsit_classifier_Fisher extends wotsit_Classifier implements wotsit_iClassifier
+{
+
+    /**
+     * @param wotsit_iExtractFeatures $featureExtractor Used to extract the features of an item
+     * @param wotsit_iStorage         $storage          Used to read and update the learned counts
+     */
+    public function __construct(wotsit_iExtractFeatures $featureExtractor, wotsit_iStorage $storage)
+    {
+        parent::__construct($featureExtractor, $storage);
+    }
+
+    /**
+     * Returns the share the given category has in the feature's frequency
+     *
+     * The per category frequency of the feature (as computed by the parent
+     * class) is divided by the sum of its frequencies over all categories.
+     *
+     * @param wotsit_Feature $feature  The feature
+     * @param string         $category The category
+     * @return float 0 when the feature was never seen in the category
+     */
+    protected function featureProbability(wotsit_Feature $feature, $category)
+    {
+        $probabilityInChosenCategory = parent::featureProbability($feature, $category);
+        if ($probabilityInChosenCategory == 0) {
+            return 0;
+        }
+
+        // cannot be zero here: it includes the non-zero frequency of the chosen category
+        $frequencySum = 0.0;
+        foreach ($this->storage->getCategories() as $aCategory) {
+            $frequencySum += parent::featureProbability($feature, $aCategory);
+        }
+        return $probabilityInChosenCategory / $frequencySum;
+    }
+
+    /**
+     * Returns the probability that the given $item is in the given $category
+     *
+     * The logarithms of the feature probabilities are summed instead of
+     * multiplying the probabilities themselves: for items with many features
+     * the product underflows to zero, which would turn the score into log(0).
+     *
+     * @param string $item
+     * @param string $category
+     * @return float
+     */
+    public function getProbability($item, $category)
+    {
+        $features = $this->featureExtractor->getFeatures($item);
+        $logSum = 0.0;
+        foreach ($features as $feature) {
+            $featureProbability = $this->weightedProbability($feature, $category);
+            if ($featureProbability <= 0) {
+                // a single impossible feature makes the whole product zero
+                return 0.0;
+            }
+            $logSum += log($featureProbability);
+        }
+        $fscore = -2.0 * $logSum;
+        return $this->inverseChi2($fscore, count($features) * 2);
+    }
+
+    /**
+     * Calculates the inverse Chi^2
+     *
+     * @param float $chi
+     * @param float $degreesOfFreedom
+     * @return float A value capped at 1.0
+     */
+    public function inverseChi2($chi, $degreesOfFreedom)
+    {
+        $m = $chi / 2.0;
+        $sum = exp(-$m);
+        $term = $sum;
+
+        // every pass adds the next term of the series: term_i = term_(i-1) * m / i
+        for ($i = 1; $i < ($degreesOfFreedom / 2); $i++) {
+            $term *= $m / $i;
+            $sum += $term;
+        }
+
+        return min($sum, 1.0);
+    }
+}
+?>
59 src/wotsit/classifier/NaiveBayesian.php
@@ -0,0 +1,59 @@
+<?php
+/**
+ * @package wotsit
+ * @subpackage classifier
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+/**
+ * This class contains a Naive Bayesian classifier implementation
+ *
+ * @package wotsit
+ * @subpackage classifier
+ */
+class wotsit_classifier_NaiveBayesian extends wotsit_Classifier implements wotsit_iClassifier
+{
+
+    /**
+     * @param wotsit_iExtractFeatures $featureExtractor Used to extract the features of any given document
+     * @param wotsit_iStorage         $storage          Used to read and update the learned counts
+     */
+    public function __construct(wotsit_iExtractFeatures $featureExtractor, wotsit_iStorage $storage)
+    {
+        parent::__construct($featureExtractor, $storage);
+    }
+
+    /**
+     * Returns the product of the weighted probabilities of all features of a document
+     *
+     * @param string $item     The item to categorise
+     * @param string $category The category to test it in
+     * @return float The probability that $item fits within $category (1.0 for an item without features)
+     */
+    protected function getDocumentProbability($item, $category)
+    {
+        $features = $this->featureExtractor->getFeatures($item);
+        $documentProbability = 1.0;
+        foreach ($features as $feature) {
+            $documentProbability *= $this->weightedProbability($feature, $category);
+        }
+        return $documentProbability;
+    }
+
+    /**
+     * Returns the product of Pr($item, $category) and Pr($category)
+     *
+     * Pr($category) is the category count divided by the total count held by
+     * the storage.
+     *
+     * @param string $item     The item
+     * @param string $category The category
+     * @return float The probability, 0.0 while the storage holds no training data
+     */
+    protected function getProbability($item, $category)
+    {
+        $totalCount = $this->storage->getTotalCount();
+        if ($totalCount == 0) {
+            // nothing trained yet: avoid dividing by zero
+            return 0.0;
+        }
+        $categoryProbability = $this->storage->getCategoryCount($category) / $totalCount;
+        $documentProbability = $this->getDocumentProbability($item, $category);
+        return $categoryProbability * $documentProbability;
+    }
+}
53 src/wotsit/feature/ExtractorCollection.php
@@ -0,0 +1,53 @@
+<?php
+/**
+ * @package wotsit
+ * @subpackage feature
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+/**
+ * Lets several feature extractors be used as if they were a single one
+ *
+ * @package wotsit
+ * @subpackage feature
+ */
+class wotsit_feature_ExtractorCollection implements wotsit_iExtractFeatures
+{
+
+    /**
+     * The registered extractors, in the order they were added
+     *
+     * @var array
+     */
+    private $extractors = array();
+
+    /**
+     * Appends an extractor to the collection
+     *
+     * @param wotsit_iExtractFeatures $extractor
+     * @return wotsit_feature_ExtractorCollection The collection itself, to allow chaining
+     */
+    public function addExtractor(wotsit_iExtractFeatures $extractor)
+    {
+        array_push($this->extractors, $extractor);
+        return $this;
+    }
+
+    /**
+     * Runs every registered extractor over the input and returns the combined features
+     *
+     * The features of the individual extractors are appended one after the
+     * other into a single list; the keys they were returned under are dropped.
+     *
+     * @param mixed $input
+     * @return array
+     */
+    public function getFeatures($input)
+    {
+        $combined = array();
+        foreach ($this->extractors as $member) {
+            $extracted = $member->getFeatures($input);
+            foreach ($extracted as $item) {
+                array_push($combined, $item);
+            }
+        }
+        return $combined;
+    }
+
+}
65 src/wotsit/feature/LinkExtractor.php
@@ -0,0 +1,65 @@
+<?php
+/**
+ * @package wotsit
+ * @subpackage feature
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+/**
+ * Feature extractor that treats the links found in a document as its features
+ *
+ * Implements the wotsit_iExtractFeatures interface.
+ *
+ * @package wotsit
+ * @subpackage feature
+ */
+class wotsit_feature_LinkExtractor implements wotsit_iExtractFeatures
+{
+
+    /**
+     * The pattern used to find links; the outermost pair of parentheses acts as the pattern delimiter
+     */
+    const URL_REGEX = "((([hH][tT][tT][pP][sS]?|[fF][tT][pP])\:\/\/)?([\w\.\-]+(\:[\w\.\&%\$\-]+)*@)?((([^\s\(\)\<\>\\\"\.\[\]\,@;:]+)(\.[^\s\(\)\<\>\\\"\.\[\]\,@;:]+)*(\.[a-zA-Z]{2,4}))|((([01]?\d{1,2}|2[0-4]\d|25[0-5])\.){3}([01]?\d{1,2}|2[0-4]\d|25[0-5])))(\b\:(6553[0-5]|655[0-2]\d|65[0-4]\d{2}|6[0-4]\d{3}|[1-5]\d{4}|[1-9]\d{0,3}|0)\b)?((\/[^\/][\w\.\,\?\'\\\/\+&%\$#\=~_\-@]*)*[^\.\,\?\"\'\(\)\[\]!;<>{}\s\x7F-\xFF])?)";
+
+    /**
+     * The weight given to every link feature
+     *
+     * @var float
+     */
+    protected $linkWeight = 1.2;
+
+    /**
+     * Returns the features found in the document, which for this extractor are its links
+     *
+     * @param string $input The document to extract features from
+     * @return array An array of wotsit_Feature objects
+     */
+    public function getFeatures($input)
+    {
+        return $this->getLinkFeatures($input);
+    }
+
+    /**
+     * Changes the weight given to the link features extracted from now on
+     *
+     * @param float $weight
+     * @return wotsit_feature_LinkExtractor The extractor itself, to allow chaining
+     */
+    public function setLinkWeight($weight = 1.2)
+    {
+        $this->linkWeight = (float)$weight;
+        return $this;
+    }
+
+    /**
+     * Returns one feature per distinct link in the input
+     *
+     * Links are lower-cased before they are compared, so links differing only
+     * in case end up as a single feature.
+     *
+     * @param string $input
+     * @return array
+     */
+    protected function getLinkFeatures($input)
+    {
+        $matched = preg_match_all(self::URL_REGEX, $input, $matches);
+        if (!$matched || !isset($matches[0])) {
+            return array();
+        }
+
+        // keyed by the link itself so that repeated links overwrite each other
+        $byLink = array();
+        foreach ($matches[0] as $rawLink) {
+            $normalised = strtolower($rawLink);
+            $byLink[$normalised] = new wotsit_Feature($normalised, $this->linkWeight);
+        }
+        return array_values($byLink);
+    }
+}
88 src/wotsit/feature/NGramExtractor.php
@@ -0,0 +1,88 @@
+<?php
+/**
+ * @package wotsit
+ * @subpackage feature
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+/**
+ * Feature extractor that breaks the words of a document up into n-grams
+ *
+ * Builds on wotsit_feature_WordExtractor for splitting the input into word
+ * tokens, then cuts every token into overlapping pieces of a fixed length.
+ * Each token is padded with (length - 1) spaces on both sides first, so with
+ * the n-gram length set to 3 the word 'jon' results in these tokens:
+ *
+ * <code>
+ * [0] => '  j';
+ * [1] => ' jo';
+ * [2] => 'jon';
+ * [3] => 'on ';
+ * [4] => 'n  ';
+ * </code>
+ *
+ * n-grams are more tolerant to mis-spellings and intentional typos but at the cost of additional computation and many more tokens
+ *
+ * @package wotsit
+ * @subpackage feature
+ */
+class wotsit_feature_NGramExtractor extends wotsit_feature_WordExtractor
+{
+
+    /**
+     * The length of the n-grams being created
+     */
+    protected $nGramLength;
+
+    /**
+     * @param int                    $nGramLength   The length of the n-grams to create
+     * @param int                    $minimumLength Passed on to the word extractor
+     * @param int                    $maximumLength Passed on to the word extractor
+     * @param wotsit_feature_Stemmer $stemmer       Passed on to the word extractor (optional)
+     */
+    public function __construct($nGramLength = 3, $minimumLength = 2, $maximumLength = 20, wotsit_feature_Stemmer $stemmer = null)
+    {
+        parent::__construct($minimumLength, $maximumLength, $stemmer);
+        $this->nGramLength = $nGramLength;
+    }
+
+    /**
+     * Returns an array of features found in the document
+     *
+     * Tags are stripped from the input before it is tokenized; the tokens are
+     * then cut into n-grams which are handed to tokensToFeatures().
+     *
+     * @param string $input The document to extract features from
+     * @return array The result of tokensToFeatures() for the n-grams of the document
+     */
+    public function getFeatures($input)
+    {
+        $words = $this->tokenize(strip_tags($input));
+        return $this->tokensToFeatures($this->createNGrams($words));
+    }
+
+    /**
+     * Converts an array of words into an array of n-grams
+     *
+     * The result is keyed by the n-gram itself, so every distinct n-gram
+     * appears only once. With an n-gram length of 3, JON becomes:
+     * <code>
+     * '  J', ' JO', 'JON', 'ON ', 'N  '
+     * </code>
+     *
+     * @param array $tokens
+     * @return array
+     */
+    protected function createNGrams(array $tokens)
+    {
+        $width = $this->nGramLength;
+        $overlap = $width - 1;
+        // the same padding goes on both sides of every token
+        $pad = str_pad('', $overlap, ' ');
+
+        $grams = array();
+        foreach ($tokens as $word) {
+            $padded = $pad . $word . $pad;
+            $limit = strlen($padded) - $overlap;
+            $offset = 0;
+            while ($offset < $limit) {
+                $piece = substr($padded, $offset, $width);
+                $grams[$piece] = $piece;
+                $offset++;
+            }
+        }
+        return $grams;
+    }
+}
672 src/wotsit/feature/Stemmer.php
@@ -0,0 +1,672 @@
+<?php
+
+/*************************************************************************
+ * *
+ * class.stemmer.inc *
+ * *
+ *************************************************************************
+ * *
+ * Implementation of the Porter Stemming Algorithm *
+ * *
+ * Copyright (c) 2003-2007 Jon Abernathy <jon@chuggnutt.com> *
+ * All rights reserved. *
+ * *
+ * This script is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * The GNU General Public License can be found at *
+ * http://www.gnu.org/copyleft/gpl.html. *
+ * *
+ * This script is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * Author(s): Jon Abernathy <jon@chuggnutt.com> *
+ * *
+ * Last modified: 08/08/07 *
+ * *
+ * @package wotsit *
+ * @subpackage feature *
+ * *
+ *************************************************************************/
+
+
+/**
+ * Takes a word, or list of words, and reduces them to their English stems.
+ *
+ * This is a fairly faithful implementation of the Porter stemming algorithm that
+ * reduces English words to their stems, originally adapted from the ANSI-C code found
+ * on the official Porter Stemming Algorithm website, located at
+ * http://www.tartarus.org/~martin/PorterStemmer and later changed to conform
+ * more accurately to the algorithm itself.
+ *
+ * There is a deviation in the way compound words are stemmed, such as
+ * hyphenated words and words starting with certain prefixes. For instance,
+ * "international" should be reduced to "internation" and not "intern," but
+ * an unmodified version of the algorithm will do just that. Currently, only
+ * hyphenated words are accounted for.
+ *
+ * Thanks to Mike Boone (http://www.boonedocks.net/) for finding a fatal
+ * error in the is_consonant() private function dealing with short word stems beginning
+ * with "Y".
+ *
+ * Additional thanks to Mark Plumbley for finding an additional problem with
+ * short words beginning with "Y"--the word "yves" for example. I fixed the
+ * _o() and is_consonant() private functions to appropriately sanity check the values
+ * being passed around. Updated 3/12/04.
+ *
+ * Thanks to Andrew Jeffries (http://www.nextgendevelopment.co.uk/) for
+ * discovering a bug for words beginning with "yy"--this would cause the
+ * is_consonant() method checking either of these first "y"s to fall into
+ * a recursive infinite loop and crash the program. Updated 9/23/05.
+ *
+ * 11/09/05, big update. Prompted by an email from Richard Shelquist, I went
+ * back over the class and fixed some errors in the algorithm; in particular
+ * I made sure to conform EXACTLY to the written algorithm found at
+ * the Stemmer website. This class now takes the test vocabulary file found at
+ * http://tartarus.org/~martin/PorterStemmer/voc.txt and stems every single
+ * word exactly as shown in the output file found at
+ * http://tartarus.org/~martin/PorterStemmer/output.txt, with two exceptions:
+ * "ycleped" and "ycliped", which I believe my version stems correctly, due
+ * to assuming the "Y" at the beginning of a word followed by a consonant--
+ * as in "Yvette"--is to be treated as a vowel and NOT a consonant. Yeah,
+ * that's arrogant; allow me some, okay?
+ * Of course, should someone find an exception after boasting of my arrogance,
+ * please let me know. I'm only human, after all.
+ *
+ * Thanks to Damon Sauve (http://www.shopping.com/) for suggesting a better
+ * fix to the handling of hyphenated words (in his case, multi-hyphenated
+ * words). His fix used a regular expression to extract the final part of the
+ * hyphenated word, while mine does a substr() split instead. Also, his version
+ * allows dots and apostrophes in words, such as URLs and contractions, and
+ * I realize this is a real-world scenario that I didn't account for, so it's
+ * been incorporated.
+ *
+ * @author Jon Abernathy <jon@chuggnutt.com>
+ * @version 2.1
+ * @package wotsit
+ * @subpackage feature
+ */
class wotsit_feature_Stemmer
{
    /**
     * Step 2 suffix rules, grouped by the second-to-last character of the word.
     *
     * Each rule is array(suffix, replacement). Rules are tried in the listed
     * order and the first suffix that matches ends the step. The 'bli' and
     * 'logi' rules are departures from the original algorithm, adapted from
     * the departure in the ANSI-C version.
     *
     * @var array
     */
    private static $step2Rules = array(
        'a' => array(array('ational', 'ate'), array('tional', 'tion')),
        'c' => array(array('enci', 'ence'), array('anci', 'ance')),
        'e' => array(array('izer', 'ize')),
        'l' => array(
            array('bli', 'ble'),
            array('alli', 'al'),
            array('entli', 'ent'),
            array('eli', 'e'),
            array('ousli', 'ous'),
        ),
        'o' => array(
            array('ization', 'ize'),
            array('isation', 'ize'),
            array('ation', 'ate'),
            array('ator', 'ate'),
        ),
        's' => array(
            array('alism', 'al'),
            array('iveness', 'ive'),
            array('fulness', 'ful'),
            array('ousness', 'ous'),
        ),
        't' => array(array('aliti', 'al'), array('iviti', 'ive'), array('biliti', 'ble')),
        'g' => array(array('logi', 'log')),
    );

    /**
     * Step 3 suffix rules, grouped by the last character of the word.
     *
     * @var array
     */
    private static $step3Rules = array(
        'e' => array(array('icate', 'ic'), array('ative', ''), array('alize', 'al')),
        'i' => array(array('iciti', 'ic')),
        'l' => array(array('ical', 'ic'), array('ful', '')),
        's' => array(array('ness', '')),
    );

    /**
     * Step 4 suffix rules, grouped by the second-to-last character of the word.
     *
     * The 'o' group ('ion' after s/t, and 'ou') has an extra precondition and
     * is handled directly in _step_4().
     *
     * @var array
     */
    private static $step4Rules = array(
        'a' => array(array('al', '')),
        'c' => array(array('ance', ''), array('ence', '')),
        'e' => array(array('er', '')),
        'i' => array(array('ic', '')),
        'l' => array(array('able', ''), array('ible', '')),
        'n' => array(array('ant', ''), array('ement', ''), array('ment', ''), array('ent', '')),
        's' => array(array('ism', '')),
        't' => array(array('ate', ''), array('iti', '')),
        'u' => array(array('ous', '')),
        'v' => array(array('ive', '')),
        'z' => array(array('ize', '')),
    );

    /**
     * Takes a word and returns it reduced to its stem.
     *
     * The word is lower-cased, a trailing "'s" is dropped and every character
     * other than a-z, 0-9, dots, apostrophes and hyphens is removed. If the
     * remaining word is longer than two characters it is stemmed according to
     * the five-step Porter stemming algorithm; shorter words are returned as is.
     *
     * Note special cases here: hyphenated words (such as half-life) will
     * only have the base after the last hyphen stemmed (so half-life would
     * only have "life" subject to stemming). Handles multi-hyphenated
     * words, too.
     *
     * @param string $word Word to reduce
     * @access public
     * @return string|false Stemmed word, or false when no word was supplied
     */
    public function stemWord($word)
    {
        // Deliberately not empty(): the token "0" is a legitimate word.
        if ( $word === null || $word === false || $word === '' || $word === array() ) {
            return false;
        }

        $word = strtolower($word);

        // Strip punctuation, etc. Keep ' and . for URLs and contractions.
        if ( substr($word, -2) == "'s" ) {
            $word = substr($word, 0, -2);
        }
        $word = preg_replace("/[^a-z0-9'.-]/", '', $word);

        // Everything up to and including the last hyphen is kept verbatim.
        $first = '';
        $hyphenAt = strrpos($word, '-');
        if ( $hyphenAt !== false ) {
            $first = substr($word, 0, $hyphenAt + 1);
            $word = substr($word, $hyphenAt + 1);
        }

        if ( strlen($word) > 2 ) {
            $word = $this->_step_1($word);
            $word = $this->_step_2($word);
            $word = $this->_step_3($word);
            $word = $this->_step_4($word);
            $word = $this->_step_5($word);
        }

        return $first . $word;
    }

    /**
     * Takes a list of words and returns them reduced to their stems.
     *
     * $words can be either a string or an array. If it is a string, it will
     * be split into separate words on runs of whitespace, commas, semicolons
     * or plus signs (the split pattern's character class contains a literal
     * "+"). If an array, it assumes one word per element. Words for which
     * stemWord() returns false are left out of the result.
     *
     * @param mixed $words String or array of word(s) to reduce
     * @access public
     * @return array|false List of word stems, or false when nothing was supplied
     */
    public function stemWords($words)
    {
        // Deliberately not empty(): the string "0" is a legitimate word.
        if ( $words === null || $words === false || $words === '' || $words === array() ) {
            return false;
        }

        if ( !is_array($words) ) {
            $words = preg_split("/[\s+,;]+/", trim($words));
        }

        $results = array();
        foreach ( $words as $word ) {
            $stem = $this->stemWord($word);
            if ( false !== $stem ) {
                $results[] = $stem;
            }
        }

        return $results;
    }

    /**
     * Performs steps 1a, 1b and 1c of the Porter Stemming Algorithm.
     *
     * First, if the word is in plural form, it is reduced to singular form.
     * Then, any -ed or -ing endings are removed as appropriate, and finally,
     * words ending in "y" with a vowel in the stem have the "y" changed to "i".
     *
     * @param string $word Word to reduce
     * @access private
     * @return string Reduced word
     */
    private function _step_1( $word )
    {
        // Step 1a
        if ( substr($word, -1) == 's' ) {
            if ( substr($word, -4) == 'sses' || substr($word, -3) == 'ies' ) {
                $word = substr($word, 0, -2);
            } elseif ( substr($word, -2, 1) != 's' ) {
                // Second-to-last character is not "s": drop the plural "s"
                $word = substr($word, 0, -1);
            }
        }

        // Step 1b
        if ( substr($word, -3) == 'eed' ) {
            if ( $this->count_vc(substr($word, 0, -3)) > 0 ) {
                // Convert '-eed' to '-ee'
                $word = substr($word, 0, -1);
            }
        } elseif ( preg_match('/([aeiou]|[^aeiou]y).*(ed|ing)$/', $word) ) { // vowel in stem
            // Strip '-ed' or '-ing'
            $word = substr($word, 0, ( substr($word, -2) == 'ed' ) ? -2 : -3);

            $ending = substr($word, -2);
            if ( $ending == 'at' || $ending == 'bl' || $ending == 'iz' ) {
                $word .= 'e';
            } else {
                $last_char = substr($word, -1, 1);
                $next_to_last = substr($word, -2, 1);
                // Strip ending double consonants to single, unless "l", "s" or "z"
                if ( $this->is_consonant($word, -1) &&
                     $last_char == $next_to_last &&
                     !in_array($last_char, array('l', 's', 'z'), true) ) {
                    $word = substr($word, 0, -1);
                } elseif ( $this->count_vc($word) == 1 && $this->_o($word) ) {
                    // If VC, and cvc (but not w,x,y at end)
                    $word .= 'e';
                }
            }
        }

        // Step 1c
        // Turn y into i when another vowel in stem
        if ( preg_match('/([aeiou]|[^aeiou]y).*y$/', $word) ) { // vowel in stem
            $word = substr($word, 0, -1) . 'i';
        }
        return $word;
    }

    /**
     * Performs step 2 of the Porter Stemming Algorithm.
     *
     * Step 2 maps double suffixes to single ones, so "-ization" (which is
     * "-ize" plus "-ation") becomes "-ize". The candidate suffixes are looked
     * up by the second-to-last character of the word, which reduces the
     * number of string comparisons.
     *
     * Note: for this step (and steps 3 and 4), the algorithm requires that if
     * a suffix match is found, then the step ends, regardless of whether a
     * replacement occurred.
     *
     * @param string $word Word to reduce
     * @access private
     * @return string Reduced word
     */
    private function _step_2( $word )
    {
        return $this->applyRules($word, self::$step2Rules, substr($word, -2, 1), 0);
    }

    /**
     * Performs step 3 of the Porter Stemming Algorithm.
     *
     * Step 3 works with a similar strategy to step 2, though the candidate
     * suffixes are looked up by the last character.
     *
     * @param string $word Word to reduce
     * @access private
     * @return string Reduced word
     */
    private function _step_3( $word )
    {
        return $this->applyRules($word, self::$step3Rules, substr($word, -1), 0);
    }

    /**
     * Performs step 4 of the Porter Stemming Algorithm.
     *
     * Step 4 works similarly to steps 3 and 2, above, though it only removes
     * an ending when the remaining stem has a measure greater than 1
     * (vowel-consonant-vowel-consonant).
     *
     * @param string $word Word to reduce
     * @access private
     * @return string Reduced word
     */
    private function _step_4( $word )
    {
        $key = substr($word, -2, 1);

        if ( $key == 'o' ) {
            // special cases: "-ion" is only a candidate after "s" or "t"
            $ending = substr($word, -4);
            if ( $ending == 'sion' || $ending == 'tion' ) {
                if ( $this->_replace($word, 'ion', '', 1) ) {
                    return $word;
                }
            }
            $this->_replace($word, 'ou', '', 1);
            return $word;
        }

        return $this->applyRules($word, self::$step4Rules, $key, 1);
    }

    /**
     * Performs step 5 of the Porter Stemming Algorithm.
     *
     * Step 5 removes a final "-e" and changes "-ll" to "-l" when the measure
     * of the word is large enough.
     *
     * @param string $word Word to reduce
     * @access private
     * @return string Reduced word
     */
    private function _step_5( $word )
    {
        if ( substr($word, -1) == 'e' ) {
            $short = substr($word, 0, -1);
            $measure = $this->count_vc($short);
            // Remove in vcvc context, or in vc context unless the stem ends cvc
            if ( $measure > 1 || ( $measure == 1 && !$this->_o($short) ) ) {
                $word = $short;
            }
        }
        // Only reduce "-ll" in vcvc context...
        if ( substr($word, -2) == 'll' && $this->count_vc($word) > 1 ) {
            $word = substr($word, 0, -1);
        }
        return $word;
    }

    /**
     * Tries one group of suffix rules against the word.
     *
     * The rules stored under $key are tried in order; the first rule whose
     * suffix is found ends the search, whether or not the replacement was
     * actually applied.
     *
     * @param string  $word  Word to reduce
     * @param array   $rules Rule table: key => list of array(suffix, replacement)
     * @param string  $key   Character selecting the group of rules to try
     * @param integer $m     Measure the remaining stem must exceed for a replacement
     * @access private
     * @return string Reduced word
     */
    private function applyRules( $word, array $rules, $key, $m )
    {
        if ( isset($rules[$key]) ) {
            foreach ( $rules[$key] as $rule ) {
                if ( $this->_replace($word, $rule[0], $rule[1], $m) ) {
                    break;
                }
            }
        }
        return $word;
    }

    /**
     * Checks that the specified letter (position) in the word is a consonant.
     *
     * Regular vowels always return FALSE, while "y" is a special case: if the
     * preceding character is a vowel, "y" is a consonant, otherwise it's a
     * vowel. A "y" in the first position is judged by the letter following it.
     *
     * And, if checking "y" in the first position and the word starts with "yy",
     * return true even though it's not a legitimate word (this prevents
     * endless recursion between the two letters).
     *
     * @param string  $word Word to check
     * @param integer $pos  Position in the string to check (may be negative)
     * @access private
     * @return boolean
     */
    private function is_consonant( $word, $pos )
    {
        $length = strlen($word);

        // Sanity checking $pos: clamp to the beginning (too far back) or the
        // end (too far forward) of the word.
        if ( abs($pos) > $length ) {
            $pos = ( $pos < 0 ) ? 0 : -1;
        }

        $char = substr($word, $pos, 1);
        if ( $char === 'y' ) {
            if ( $pos == 0 || $length == -$pos ) {
                // First letter: check second letter of word.
                // If word starts with "yy", return true.
                if ( substr($word, 1, 1) == 'y' ) {
                    return true;
                }
                return !($this->is_consonant($word, 1));
            }
            return !($this->is_consonant($word, $pos - 1));
        }

        return !in_array($char, array('a', 'e', 'i', 'o', 'u'), true);
    }

    /**
     * Counts (measures) the number of vowel-consonant occurrences.
     *
     * Counts the number of occurrences of vowels (1 or more) followed by
     * consonants (1 or more), ignoring any beginning consonants or trailing
     * vowels. A legitimate VC combination counts as 1 (ie. VCVC = 2,
     * VCVCVC = 3, etc.). This equals the number of positions at which a
     * consonant directly follows a vowel.
     *
     * @param string $word Word to measure
     * @access private
     * @return integer
     */
    private function count_vc( $word )
    {
        $measure = 0;
        $prev_vowel = false;
        $length = strlen($word);
        for ( $i = 0; $i < $length; $i++ ) {
            $is_c = $this->is_consonant($word, $i);
            if ( $is_c && $prev_vowel ) {
                $measure++;
            }
            $prev_vowel = !$is_c;
        }
        return $measure;
    }

    /**
     * Checks for a specific consonant-vowel-consonant condition.
     *
     * This private function is named directly from the original algorithm. It
     * checks that the last three characters of the word are
     * consonant-vowel-consonant, with the final consonant NOT being one
     * of "w", "x" or "y".
     *
     * @param string $word Word to check
     * @access private
     * @return boolean
     */
    private function _o( $word )
    {
        if ( strlen($word) < 3 ) {
            return false;
        }
        if ( !$this->is_consonant($word, -1) || $this->is_consonant($word, -2) ||
             !$this->is_consonant($word, -3) ) {
            return false;
        }
        return !in_array(substr($word, -1), array('w', 'x', 'y'), true);
    }

    /**
     * Replaces suffix, if found and word measure is a minimum count.
     *
     * @param string  $word    Word to check and modify (passed by reference)
     * @param string  $suffix  Suffix to look for
     * @param string  $replace Suffix replacement
     * @param integer $m       Word measure value that the stem must be greater
     *                         than to replace
     * @access private
     * @return boolean True when the suffix was found, even if no replacement was made
     */
    private function _replace( &$word, $suffix, $replace, $m = 0 )
    {
        $sl = strlen($suffix);
        if ( substr($word, -$sl) !== $suffix ) {
            return false;
        }

        $short = substr_replace($word, '', -$sl);
        if ( $this->count_vc($short) > $m ) {
            $word = $short . $replace;
        }
        // Found this suffix, doesn't matter if replacement succeeded
        return true;
    }

}
93 src/wotsit/feature/WordExtractor.php
@@ -0,0 +1,93 @@
+<?php
+/**
+ * @package wotsit
+ * @subpackage feature
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
/**
 * The wotsit_feature_WordExtractor class implements the wotsit_iExtractFeatures interface
 *
 * This class provides a simple split on whitespace characters with word length constraints.
 * When a stemmer is supplied, every accepted word is reduced to its stem.
 *
 * @package wotsit
 * @subpackage feature
 */
class wotsit_feature_WordExtractor implements wotsit_iExtractFeatures
{

    /**
     * Shortest token (in bytes) that is accepted as a word
     *
     * @var int
     */
    protected $minimumLength;

    /**
     * Longest token (in bytes) that is accepted as a word
     *
     * @var int
     */
    protected $maximumLength;

    /**
     * Optional stemmer; null disables stemming
     *
     * @var wotsit_feature_Stemmer
     */
    protected $stemmer;

    /**
     * @param int                    $minimumLength The minimum word length
     * @param int                    $maximumLength The maximum word length
     * @param wotsit_feature_Stemmer $stemmer       Optional stemmer applied to every accepted word
     */
    public function __construct($minimumLength = 2, $maximumLength = 20, wotsit_feature_Stemmer $stemmer = null)
    {
        $this->minimumLength = $minimumLength;
        $this->maximumLength = $maximumLength;
        $this->stemmer = $stemmer;
    }

    /**
     * Returns an array of features found in the document
     *
     * @param string $input The document to extract features from
     * @return array An array of features extracted from the provided document
     */
    public function getFeatures($input)
    {
        $decodedInput = strip_tags($input); //just in case
        $tokens = $this->tokenize($decodedInput);
        return $this->tokensToFeatures($tokens);
    }

    /**
     * Returns an array of unique, lower-cased words
     *
     * The input is split on whitespace. Tokens whose byte length (before
     * stemming) falls outside the configured bounds are dropped. If a stemmer
     * is configured each remaining token is stemmed, and tokens for which the
     * stemmer yields nothing (false or an empty string, e.g. pure punctuation)
     * are dropped as well. The result is keyed by the word, so duplicates
     * collapse into one entry.
     *
     * @param string $input
     * @return array An array of words
     */
    protected function tokenize($input)
    {
        $words = array();
        $tokens = preg_split('/[\s]+/', $input, -1, PREG_SPLIT_NO_EMPTY);

        foreach ($tokens as $token) {
            $length = strlen($token);
            if (($length < $this->minimumLength) || ($length > $this->maximumLength)) {
                continue;
            }

            $token = trim(strtolower($token));
            if (null !== $this->stemmer) {
                $token = $this->stemmer->stemWord($token);
                if (false === $token || '' === $token) {
                    // Nothing left after stemming; an empty feature is meaningless.
                    continue;
                }
            }
            $words[$token] = $token;
        }
        return $words;
    }

    /**
     * Converts an array of tokens into features
     *
     * @param array $tokens
     * @return array One wotsit_Feature per token, in the order given
     */
    protected function tokensToFeatures(array $tokens)
    {
        $features = array();
        foreach ($tokens as $token) {
            $features[] = new wotsit_Feature($token);
        }
        return $features;
    }
}
67 src/wotsit/iClassifier.php
@@ -0,0 +1,67 @@
+<?php
+/**
+ * @package wotsit
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
/**
 * Contract implemented by every classifier
 *
 * @package wotsit
 */
interface wotsit_IClassifier
{

    /**
     * Picks the category the item fits best
     *
     * When the best fit is below its threshold (i.e. we are unsure) the
     * default category is returned instead.
     *
     * @param mixed  $input   The item to classify
     * @param string $default The category to fall back to when we are unsure
     * @return string
     */
    public function classify($input, $default = null);

    /**
     * Lists the probability of every category for the item
     *
     * The returned array is keyed by category and sorted in descending order.
     *
     * @param mixed $input The item to classify
     * @return array
     */
    public function classifications($input);

    /**
     * Teaches the filter that an item belongs to a category
     *
     * @param mixed  $input    The item to extract features from
     * @param string $category The category the item should be placed in
     */
    public function train($input, $category);

    /**
     * Defines the threshold of a category
     *
     * If the probability that an item fits the category is less than the
     * threshold, we are not sure the classification is correct.
     *
     * Default threshold is 1.0
     *
     * @param string $category  The category
     * @param float  $threshold The threshold
     */
    public function setThreshold($category, $threshold);

    /**
     * Reports the threshold of a category
     *
     * @param string $category The category
     * @return float The threshold for the given category (default 1.0)
     */
    public function getThreshold($category);

}
28 src/wotsit/iExtractFeatures.php
@@ -0,0 +1,28 @@
+<?php
+/**
+ * @package wotsit
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
/**
 * Interface for a feature extractor
 *
 * Feature extractors take an input and return an array of wotsit_Feature
 * objects, one for each feature found within the input.
 *
 * There is no restriction on the type of input
 *
 * @package wotsit
 */
interface wotsit_iExtractFeatures
{
    /**
     * Extracts the features found in the input
     *
     * @param mixed $input
     * @return array
     */
    public function getFeatures($input);

}
62 src/wotsit/iStorage.php
@@ -0,0 +1,62 @@
+<?php
+/**
+ * @package wotsit
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
/**
 * Contract implemented by every wotsit storage backend
 *
 * @package wotsit
 */
interface wotsit_iStorage
{
    /**
     * Bumps the count of a feature within a category
     *
     * @param wotsit_Feature $feature  The feature
     * @param string         $category The category
     */
    public function incrementFeatureCount(wotsit_Feature $feature, $category);

    /**
     * Reports the count of a feature within a category
     *
     * @param wotsit_Feature $feature  The feature
     * @param string         $category The category
     * @return int The count associated with the specified feature and category
     */
    public function getFeatureCount(wotsit_Feature $feature, $category);

    /**
     * Bumps the count of a category
     *
     * @param string $category The category
     */
    public function incrementCategoryCount($category);

    /**
     * Reports the count of a category
     *
     * @param string $category
     * @return int The count for the specified category
     */
    public function getCategoryCount($category);

    /**
     * Reports the sum of all category counts
     *
     * @return int The sum of all category counts
     */
    public function getTotalCount();

    /**
     * Lists the known categories
     *
     * @return array The categories
     */
    public function getCategories();

}
170 src/wotsit/storage/Dbm.php
@@ -0,0 +1,170 @@
+<?php
+/**
+ * @package wotsit
+ * @subpackage storage
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ */
+
/**
 * DBM storage object
 *
 * Keeps all counts in a single DBM file accessed through PHP's dba extension.
 * Feature counts are stored under keys built from FEATURE_KEY, category counts
 * under the bare category name and the list of known categories as a
 * serialized array under CATEGORY_LIST_KEY.
 *
 * NOTE(review): because category counts use the bare category name as key, a
 * category named like CATEGORY_LIST_KEY (or like a feature key) would collide
 * with other records. The key layout is left untouched so that existing files
 * stay readable.
 *
 * @package wotsit
 * @subpackage storage
 */
class wotsit_storage_Dbm implements wotsit_iStorage
{

    const CATEGORY_LIST_KEY = 'WOTSIT_CATEGORIES';

    const FEATURE_KEY = "WOTSIT_%s:%s";

    /**
     * Handle to the DBM file resource
     *
     * @var resource
     */
    private $dbm;

    /**
     * @var string
     */
    private $filepath;

    /**
     * @param string $filePath Path of the DBM file; it is created when missing
     *
     * @throws RuntimeException If the dba extension is not loaded or the file cannot be opened
     */
    public function __construct($filePath)
    {
        if (!extension_loaded('dba')) {
            throw new RuntimeException("The dba extension is not loaded");
        }
        $this->filepath = $filePath;
        $this->openDbmFile();
    }

    /**
     * Flushes, optimizes and closes the DBM file
     */
    public function __destruct()
    {
        if (!$this->dbm) {
            // Nothing to flush if the handle was never opened.
            return;
        }
        dba_sync($this->dbm);
        dba_optimize($this->dbm);
        dba_close($this->dbm);
        $this->dbm = null;
    }

    /**
     * Opens the DBM file in 'c' mode (read/write, create if it does not exist)
     *
     * @throws RuntimeException If the file cannot be opened
     * @return void
     */
    private function openDbmFile()
    {
        $handle = dba_open($this->filepath, 'c');
        if (false === $handle) {
            throw new RuntimeException(sprintf('Unable to open the DBM file "%s"', $this->filepath));
        }
        $this->dbm = $handle;
    }

    /**
     * Increases the count for the category of the specified feature
     *
     * NOTE(review): this is a read-modify-write sequence with no locking of its own.
     *
     * @param wotsit_Feature $feature  The feature
     * @param string         $category The category
     */
    public function incrementFeatureCount(wotsit_Feature $feature, $category)
    {
        $count = $this->getFeatureCount($feature, $category) + 1;
        dba_replace($this->getFeatureKey($feature, $category), (string) $count, $this->dbm);
    }

    /**
     * Returns the count for a category of the specified feature
     *
     * @param wotsit_Feature $feature  The feature
     * @param string         $category The category
     * @return int The count associated with the specified feature and category (0 when unknown)
     */
    public function getFeatureCount(wotsit_Feature $feature, $category)
    {
        return $this->fetchCount($this->getFeatureKey($feature, $category));
    }

    /**
     * Builds the DBM key under which a feature count is stored
     *
     * @param wotsit_Feature $feature
     * @param string         $category
     * @return string
     */
    private function getFeatureKey(wotsit_Feature $feature, $category)
    {
        return sprintf(self::FEATURE_KEY, $category, $feature->getValue());
    }

    /**
     * Increases the count for the specified category
     *
     * The category is added to the stored category list first.
     *
     * @param string $category The category
     * @return void
     */
    public function incrementCategoryCount($category)
    {
        $this->ensureCategory($category);
        $count = $this->getCategoryCount($category) + 1;
        dba_replace($category, (string) $count, $this->dbm);
    }

    /**
     * Returns the count for the specified category
     *
     * @param string $category
     * @return int The count for the specified category (0 when unknown)
     */
    public function getCategoryCount($category)
    {
        return $this->fetchCount($category);
    }

    /**
     * Returns the sum of all category counts
     *
     * @return int The sum of all category counts
     */
    public function getTotalCount()
    {
        $totalCount = 0;
        foreach ($this->getCategories() as $category) {
            $totalCount += $this->getCategoryCount($category);
        }
        return $totalCount;
    }

    /**
     * Returns an array of categories
     *
     * @return array The categories, as a plain list
     */
    public function getCategories()
    {
        return array_values($this->readCategoryList());
    }

    /**
     * Ensures a category is present in the dbm file
     *
     * @param string $category
     */
    private function ensureCategory($category)
    {
        $data = $this->readCategoryList();
        $data[$category] = $category;
        dba_replace(self::CATEGORY_LIST_KEY, serialize($data), $this->dbm);
    }

    /**
     * Reads an integer count stored under the given key
     *
     * @param string $key
     * @return int The stored count, or 0 when the key does not exist
     */
    private function fetchCount($key)
    {
        $count = dba_fetch($key, $this->dbm);
        if (false === $count) {
            return 0;
        }
        return (int) $count;
    }

    /**
     * Reads the stored category list
     *
     * NOTE(review): the list is restored with unserialize(); this is only safe
     * as long as the DBM file itself is trusted.
     *
     * @return array The categories keyed by their own name; empty when none
     *               are stored or the stored value cannot be unserialized
     */
    private function readCategoryList()
    {
        if (!dba_exists(self::CATEGORY_LIST_KEY, $this->dbm)) {
            return array();
        }
        $data = unserialize(dba_fetch(self::CATEGORY_LIST_KEY, $this->dbm));
        return is_array($data) ? $data : array();
    }
}
120 src/wotsit/storage/Memory.php
@@ -0,0 +1,120 @@
+<?php
+/**
+ * @package wotsit
+ * @subpackage storage
+ * @author Jonathan Moss <jonathan.moss@tangentone.com.au>
+ * @copyright 2010 Tangent/One Au
+ * @version SVN: $Id$
+ */
+
+/**
+ * A transient memory storage engine.
+ *