-
Notifications
You must be signed in to change notification settings - Fork 4
/
PosTrainingSet.php
51 lines (47 loc) · 1.06 KB
/
PosTrainingSet.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
<?php
use NlpTools\Documents\TrainingSet;
use NlpTools\Documents\WordDocument;
class PosTrainingSet extends TrainingSet
{
    /**
     * Read word-tag pairs from a file and create a TrainingSet.
     *
     * Each input line is expected to hold a word followed by its tag,
     * separated by whitespace, e.g. "dog NN". Any run of spaces or tabs
     * is accepted as the separator and surrounding whitespace is ignored.
     * Fields after the tag are ignored.
     *
     * Every line contributes its first field to the shared token list that
     * is handed to each WordDocument, so a blank line contributes an empty
     * string token. Lines with fewer than two fields do not produce a
     * document of their own.
     *
     * @param  string|Iterator $file    The filename that contains the tagged words, or an iterator returning lines in the above format
     * @param  int             $context Passed unchanged as the third argument of every WordDocument (presumably the context window size; confirm against WordDocument)
     * @return PosTrainingSet  The training set built from the file
     */
    public static function fromFile($file, $context=1)
    {
        if (!($file instanceof Iterator)) {
            $file = new SplFileObject($file);
        }

        $words = array(); // first field of every line, indexed by line position
        $tags = array();  // line position => tag, only for lines that carry a tag
        foreach ($file as $line) {
            // Split on whitespace runs rather than single spaces so that
            // "word  tag" or "word\ttag" do not yield an empty or missing tag.
            // A blank line still yields array("") here, keeping its token slot.
            $fields = preg_split('/\s+/', trim($line));

            $idx = count($words);
            $words[] = $fields[0];
            if (count($fields) >= 2) {
                $tags[$idx] = $fields[1];
            }
        }

        // Documents are created only after the whole token list is known,
        // because every WordDocument receives the complete list.
        $tset = new self();
        foreach ($tags as $idx => $tag) {
            $tset->addDocument(
                $tag,
                new WordDocument($words, $idx, $context)
            );
        }

        return $tset;
    }
}