# benwaine/BayesPHP

Added Licences, some readme info and an example file.

commit 9165c6b7e8c6b903ef3bff0e5256b2ef1cabfd8f
Ben Waine authored
2  BayesPHP/Autoloader.php
 @@ -1,6 +1,6 @@
46 BayesPHP/Classifier.php
 @@ -1,9 +1,34 @@ resultOb = \$result; @@ -27,6 +59,13 @@ public function __construct(SResult \$result, Stemer \$stemer) \$this->stemer = \$stemer; } + /** + * Classify a string using the results of a sampling process. + * + * @param string \$string String to classify + * + * @return Classifier\Result + */ public function classify(\$string) { @@ -72,6 +111,13 @@ public function classify(\$string) return new Classifier\Result(\$string, \$posProbs, \$negProbs); } + /** + * Combines the word probabilities of a sample using Bayes' formula. + * + * @param array \$probs + * + * @return double + */ private function calculateProbability(\$probs) { \$products = \array_product(\$probs);
102 BayesPHP/Classifier/Result.php
 @@ -1,37 +1,131 @@ string = \$string; \$this->positive = \$positive; \$this->negative = \$negative; - \$this->threshold = 0.7; + \$this->threshold = \$threshold; } + /** + * Returns the probabilities that the classified string resides in a class. + * + * @param integer \$resultType Used to specify which classification class to return. Default to 'BOTH'. + * + * @return array|double + */ public function getProbabilities(\$resultType = self::RESULT_BOTH) { if(\$resultType == self::RESULT_BOTH) @@ -52,6 +146,12 @@ public function getProbabilities(\$resultType = self::RESULT_BOTH) } } + /** + * Returns the result of the classification process. + * The 'threshold' value is taken into account. (See constructor) + * + * @return integer + */ public function getResult() { if(\$this->positive == \$this->negative)
26 BayesPHP/Exception/BadArgument.php
 @@ -1,6 +1,32 @@
76 BayesPHP/Sample.php
 @@ -1,9 +1,35 @@ array(), 'n' => array()). + * @param Stemer \$stemer A stemer used to stem the words in each of the text samples. + * @param WordCounter \$counter A word counter used to count words in text samples. + */ public function __construct(\$sample, Stemer \$stemer, WordCounter \$counter) { \$this->setSample(\$sample); @@ -35,6 +68,15 @@ public function __construct(\$sample, Stemer \$stemer, WordCounter \$counter) \$this->counter = \$counter; } + /** + * Set the sample used to produce the BayesPHP\Sample\Result object. + * MUST be in the format array('p' => array(), 'n'=> array()). + * MUST have an equal number of positive and negative text samples. + * + * @param array \$sample Text samples. + * + * @return void + */ public function setSample(array \$sample) { @@ -51,12 +93,17 @@ public function setSample(array \$sample) \$this->sample = \$sample; } + /** + * Process the sample supplied and produce a BayesPHP\Sample\Result object. + * + * @return \BayesPHP\Sample\Result + */ public function process() { // Both samples are the same size. // as asserted in the setSample method \$sampleSize = count(\$this->sample['p']); - //var_dump(\$this->sample); + \$positiveWCs = \$this->wordCountSample(\$this->sample['p']); \$negativeWCs = \$this->wordCountSample(\$this->sample['n']); @@ -71,6 +118,13 @@ public function process() return \$result; } + /** + * Counts the words in a text sample. + * + * @param array \$sample An array of text samples. + * + * @return array + */ private function wordCountSample(array \$sample) { @@ -88,6 +142,14 @@ private function wordCountSample(array \$sample) return \$counts; } + /** + * Calculate the probability that a word appeared in a set of text samples. + * + * @param array \$words An array of word counts. 
+ * @param integer \$sampleSize + * + * @return array + */ private function calculateProbabilities(array \$words, \$sampleSize) { \$resultArray = array(); @@ -100,6 +162,15 @@ private function calculateProbabilities(array \$words, \$sampleSize) return \$resultArray; } + /** + * Takes an array of positive and negative probabilities and + * reindexes them to produce an array in the format array('word' => array('p' => 0.33, 'n' => 0.12)) + * + * @param array \$positive Positive word probabilities + * @param array \$negative Negative word probabilities + * + * @return array + */ private function handleResults(array \$positive, array \$negative) { \$outResults = array(); @@ -129,7 +200,8 @@ private function handleResults(array \$positive, array \$negative) \$outResults[\$word] = array('p' => 0, 'n' => \$occur); } } - + unset (\$word, \$occur); + ksort(\$outResults); return \$outResults;
68 BayesPHP/Sample/Result.php
 @@ -1,27 +1,95 @@ probabilities = \$result; } + /** + * Returns an array consisting of probabilities that words appear in a sample + * based on number of encounters in previous samples. + * + * @return array + */ public function getAllProbabilities() { return \$this->probabilities; } + /** + * Returns an array consisting of probabilities that a given word appears in a sample + * based on number of encounters in previous samples. + * + * @return array|double + */ public function getWordProbability(\$word, \$return = self::RESULT_BOTH) { if(array_key_exists(\$word, \$this->probabilities))
107 BayesPHP/Stemer.php
 @@ -1,14 +1,63 @@ lowerCasing = \$lowerCasing; } + /** + * Set an array of punctuation to exclude from the subject. + * + * @param array \$punctuation Array containing symbols / punctuation + * + * @return void + */ public function setPunctuation(array \$punctuation) { \$this->punctuation = \$punctuation; } + /** + * Set an array of words to use as a blacklist + * + * @param array \$words Array of words to use as a blacklist + * + * @return void + */ public function setWordBlacklist(array \$words) { \$this->wordBlacklist = \$words; } - + + /** + * Process the string. Uses blacklist / lower casing / punctuation filters as applied. + * + * @param string \$string String to stem. + * + * @return string + */ public function process(\$string) { \$string = \$this->tokenActions(\$string); @@ -55,6 +132,13 @@ public function process(\$string) return \$string; } + /** + * Executes any actions required on individual tokens in the string. + * + * @param string \$string String to carry out token actions on. + * + * @return string + */ private function tokenActions(\$string) { \$string = trim(\$string); @@ -84,16 +168,37 @@ private function tokenActions(\$string) return implode(' ', \$pieces); } + /** + * Checks the blacklist for presence of a word. + * + * @param string \$word Word to check blacklist for. + * + * @return boolean + */ private function checkBlacklist(\$word) { return (in_array(\$word, \$this->wordBlacklist)); } + /** + * Lower case the string. + * + * @param string \$string String to lower case. + * + * @return string + */ private function lowerCase(\$string) { return \strtolower(\$string); } + /** + * Remove any punctuation or symbols specified in the punctuation array. + * + * @param string \$string String to filter. + * + * @return string + */ private function punctuation(\$string) {
53 BayesPHP/WordCounter.php
 @@ -1,15 +1,57 @@ counts = array(); } + /** + * Adds a sample of text to the word counts recorded by this class. + * + * @param string \$string Text sample + * + * @return void + */ public function addToSample(\$string) { \$words = explode(' ', \$string); @@ -34,11 +76,22 @@ public function addToSample(\$string) } } + /** + * Gets an array containing the counts of all the strings submitted using the + * addToSample() method. + * + * @return array + */ public function getWordCounts() { return \$this->counts; } + /** + * Reset the word counter. This makes it suitable for reuse. + * + * @return void + */ public function reset() { \$this->counts = array();
12 README
 @@ -0,0 +1,12 @@ +BayesPHP is a small library designed to make the classification of text using +the Bayes algorithm easy and simple. + +REQUIREMENTS + +PHP 5.3 +Mockery https://github.com/padraic/mockery required to run unit tests + +EXAMPLES + +Examples are provided in examples/Example.php. +
8 Tests/BayesPHP/Sample/ResultTest.php
 @@ -2,8 +2,6 @@ namespace BayesPHP\Sample; -require_once dirname(__FILE__) . '/../../../BayesPHP/Sample/Result.php'; - /** * Test class for Result. * Generated by PHPUnit on 2011-05-28 at 23:32:26. @@ -113,10 +111,6 @@ public function testGetWordProbabilityNeg(\$word, \$expected) \$this->assertEquals(\$expected, \$this->object->getWordProbability(\$word, Result::RESULT_NEG)); } - - - - } -?> +
2  Tests/bootstrap.php
 @@ -11,6 +11,4 @@ \$loader = new \Mockery\Loader; \$loader->register(); - - ?>
17 examples/Autoload.php
 @@ -1,4 +1,21 @@
127 examples/Example.php
 @@ -0,0 +1,127 @@ +register(); + + +// Create a new positive Sample based on some text input + +\$posSample = array( + 'I love the fox!', + 'What a great fox, im in love', + 'no one loves a fox like I do', + 'foxes are a great pet you would love one', + 'foxes are great I love them so much', + 'if you love foxes you probably should look at this' +); + +// Create a new negative sample based on some text inputs + +\$negSample = array( + 'Oh I hate foxes, dirty animals!', + 'the fox is a dirty animal', + 'foxes are in my dirty hate list', + 'if there is one animal I hate most, its a fox', + 'ew your a dirty fox', + 'no one can hate a dirty fox in the same way I do' +); + +// The sample array used in the BayesPHP\Sample class must be in the following +// format array('p' => array(), 'n'=> array()) + +\$textSample = array('p' => \$posSample, 'n' => \$negSample); + + +// The BayesPHP\Sample class uses two utility classes: a word counter and a stemer. +// The Word counter counts all the words in the positive and negative samples. + +\$wordCounter = new BayesPHP\WordCounter(); + +// The stemer reduces the number of tokens in the sample that have the same meaning. +// eg Hello, hello, hEllo and Hello! are all reduced to 'hello' +// This improves classifier accuracy. + +// First parameter dictates lower casing +// Second parameter dictates punctuation to use when stemming +\$stemer = new BayesPHP\Stemer(true, array(',', '!', '.', )); + +// A blacklist of words can be added to the stemer. +// Any words on the list will be removed from the sample pre classification +// Suggestion: use a common word list and always remove the subject word. +\$stemer->setWordBlacklist(array('fox', 'i', 'a', 'if')); + +\$sample = new BayesPHP\Sample(\$textSample, \$stemer, \$wordCounter); + + +// Process the sample and produce a result object. +\$result = \$sample->process(); + + +// The result object is used as the input to the BayesPHP\Classifier object. 
+// The classifier classifies text inputs based on the results of the sampling process. + +// First parameter is the result object. +// Second parameter is a stemer instance. (preferably one with the same settings used in the sample process) +\$classifier = new BayesPHP\Classifier(\$result, \$stemer); + +\$posResult = \$classifier->classify('I love that fox'); +\$negResult = \$classifier->classify('I hate dirty foxes!'); +\$nuResult = \$classifier->classify('Nothing to do with foxes'); + +// The result of a classification is a BayesPHP\Classifier\Result +// You can get a result (one of the class's result constants) or view the probabilities +// of each classification. + +// When using the get result method + +echo 'Positive Result: ' . \$posResult->getResult(); + +echo PHP_EOL; + +var_dump(\$posResult->getProbabilities()); + +echo PHP_EOL; + +echo 'Negative Result: ' . \$negResult->getResult(); + +echo PHP_EOL; + +var_dump(\$negResult->getProbabilities()); + +echo PHP_EOL; + +echo 'Neutral Result: ' . \$nuResult->getResult(); + +echo PHP_EOL; + +var_dump(\$nuResult->getProbabilities()); + +echo PHP_EOL; + +var_dump(\$posResult, \$negResult); + + +?>
