-
Notifications
You must be signed in to change notification settings - Fork 3
/
Sample.php
212 lines (176 loc) · 5.72 KB
/
Sample.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
<?php
/*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This software consists of work done by Ben Waine
* and is licensed under the LGPL. For more information, see
* http://ben-waine.co.uk/
*/
namespace BayesPHP;
use BayesPHP\Sample\Result as Result;
use BayesPHP\Stemer as Stemer;
use BayesPHP\WordCounter as WordCounter;
/**
* BayesPHP\Sample analyses an array of samples and produces a result object used
* by the BayesPHP\Classifer to classify strings of text.
*
* @package BayesPHP
* @subpackage Sample
* @author Ben Waine
*/
class Sample
{
    /**
     * Text samples in the format array('p' => array(), 'n' => array()).
     * Remains null until setSample() has been called successfully.
     *
     * @var array|null
     */
    private $sample;

    /**
     * Stemer applied to every text sample before it is counted.
     *
     * @var Stemer
     */
    private $stemer;

    /**
     * Word counter used to accumulate word counts for a set of samples.
     *
     * @var WordCounter
     */
    private $counter;

    /**
     * Initialises an instance of BayesPHP\Sample.
     *
     * The text samples themselves are supplied afterwards via setSample().
     *
     * @param Stemer      $stemer  A stemer used to stem the words in each of the text samples.
     * @param WordCounter $counter A word counter used to count words in text samples.
     */
    public function __construct(Stemer $stemer, WordCounter $counter)
    {
        $this->stemer = $stemer;
        $this->counter = $counter;
    }

    /**
     * Set the sample used to produce the BayesPHP\Sample\Result object.
     * MUST be in the format array('p' => array(), 'n' => array()).
     * MUST have an equal number of positive and negative text samples.
     *
     * @param array $sample Text samples.
     *
     * @return void
     *
     * @throws \BayesPHP\Exception\BadArgument When a key is missing, a value
     *                                         is not an array, or the two
     *                                         sample sets differ in size.
     */
    public function setSample(array $sample)
    {
        if (!\array_key_exists('p', $sample) || !\array_key_exists('n', $sample))
        {
            throw new \BayesPHP\Exception\BadArgument('Sample must contain both P and N keys');
        }

        // Reject non-array entries here: count() on a scalar is a TypeError
        // on PHP 8 and would otherwise only fail later inside process().
        if (!\is_array($sample['p']) || !\is_array($sample['n']))
        {
            throw new \BayesPHP\Exception\BadArgument('Positive and Negative samples must both be arrays.');
        }

        if (\count($sample['p']) !== \count($sample['n']))
        {
            throw new \BayesPHP\Exception\BadArgument('Positive and Negative samples mus be equal.');
        }

        $this->sample = $sample;
    }

    /**
     * Process the sample supplied and produce a BayesPHP\Sample\Result object.
     *
     * @return \BayesPHP\Sample\Result
     *
     * @throws \LogicException When called before a sample has been set.
     */
    public function process()
    {
        // Fail with a clear message instead of an opaque count(null) error.
        if (!\is_array($this->sample))
        {
            throw new \LogicException('A sample must be set with setSample() before calling process().');
        }

        // Both sample sets are the same size, as asserted in setSample(),
        // so the positive count serves as the divisor for both.
        $sampleSize = \count($this->sample['p']);

        $positiveWCs = $this->wordCountSample($this->sample['p']);
        $negativeWCs = $this->wordCountSample($this->sample['n']);

        $probsPos = $this->calculateProbabilities($positiveWCs, $sampleSize);
        $probsNeg = $this->calculateProbabilities($negativeWCs, $sampleSize);

        return new Result($this->handleResults($probsPos, $probsNeg));
    }

    /**
     * Stems every text sample, feeds it to the word counter and returns the
     * accumulated word counts. The counter is reset afterwards so that the
     * next call starts from whatever state reset() leaves it in.
     *
     * @param array $sample An array of text samples.
     *
     * @return array Word counts as returned by the counter's getWordCounts().
     */
    private function wordCountSample(array $sample)
    {
        foreach ($sample as $string)
        {
            $stemedString = $this->stemer->process($string);
            $this->counter->addToSample($stemedString);
        }

        $counts = $this->counter->getWordCounts();
        $this->counter->reset();

        return $counts;
    }

    /**
     * Calculate the probability that a word appeared in a set of text samples
     * by dividing each word's count by the number of samples.
     *
     * @param array   $words      An array of word counts, indexed by word.
     * @param integer $sampleSize Number of text samples the counts came from.
     *
     * @return array Probabilities indexed by word.
     */
    private function calculateProbabilities(array $words, $sampleSize)
    {
        $resultArray = array();

        foreach ($words as $word => $appearances)
        {
            $resultArray[$word] = $appearances / $sampleSize;
        }

        return $resultArray;
    }

    /**
     * Takes an array of positive and negative probabilities and
     * reindexes them to produce an array in the format
     * array('word' => array('p' => 0.33, 'n' => 0.12)), sorted by word.
     * A word missing from one side gets a probability of 0 for that side.
     *
     * @param array $positive Positive word probabilities
     * @param array $negative Negative word probabilities
     *
     * @return array
     */
    private function handleResults(array $positive, array $negative)
    {
        $outResults = array();

        // The result starts empty and array keys are unique, so every
        // positive word is a fresh entry; no existence check is needed.
        foreach ($positive as $word => $occur)
        {
            $outResults[$word] = array('p' => $occur, 'n' => 0);
        }

        foreach ($negative as $word => $occur)
        {
            if (\array_key_exists($word, $outResults))
            {
                $outResults[$word]['n'] = $occur;
            }
            else
            {
                $outResults[$word] = array('p' => 0, 'n' => $occur);
            }
        }

        \ksort($outResults);

        return $outResults;
    }
}