-
Notifications
You must be signed in to change notification settings - Fork 0
/
collection_features_test.py
54 lines (41 loc) · 1.95 KB
/
collection_features_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import book_classification as bc
from nose.tools import *
from book_classification.tests.books import *
class DummyFeatures:
    """Minimal stand-in for a features object, used by the encoder tests.

    The only behaviour it offers is ``items()``, which hands back exactly
    what was passed to the constructor.
    """

    def __init__(self, items):
        # Stored by reference (no copy); items() returns this same object.
        self._items = items

    def items(self):
        """Return the items supplied at construction time."""
        return self._items
def test_CollectionFeaturesExtractorWorks():
    """Check the collection extractor against the per-book extractor.

    For every book of the extracted collection, the features looked up
    with ``by_book`` must equal what the wrapped extractor yields when it
    is applied to that book directly.
    """
    book_extractor = bc.FrequenciesExtractor(bc.BasicTokenizer())
    extracted = bc.CollectionFeaturesExtractor(book_extractor).extract_from(
        trainingCollection)

    for book in extracted.collection().books():
        # Reference value: run the single-book extractor on its own.
        expected = book_extractor.extract_from(book)
        eq_(extracted.by_book(book), expected)
def test_FeaturesEncoderCanEncodeAndDecode():
    """Encoding and then decoding known names gives back the same pairs."""
    pairs = [("one", 25), ("three", 10), ("two", 50)]
    encoder = bc.FeaturesEncoder(["one", "two", "three"])

    # NOTE: a direct check of the encoded form against
    # [(0, 25), (2, 10), (1, 50)] was commented out in this test; only the
    # encode/decode round trip is asserted.
    round_tripped = encoder.decode(encoder.encode(DummyFeatures(pairs)))
    eq_(list(round_tripped), pairs)
def test_FeaturesEncoderIgnoresUnknownNames():
    """Names the encoder was not built with are dropped by the round trip."""
    encoder = bc.FeaturesEncoder(["one", "two", "three"])
    # "blah" and "hi" are not among the names given to the encoder.
    pairs = [
        ("one", 25),
        ("blah", 12),
        ("three", 10),
        ("two", 50),
        ("hi", 1000),
    ]
    known_pairs = [("one", 25), ("three", 10), ("two", 50)]

    # NOTE: a direct check of the encoded form against
    # [(0, 25), (2, 10), (1, 50)] was commented out in this test; only the
    # encode/decode round trip is asserted.
    round_tripped = encoder.decode(encoder.encode(DummyFeatures(pairs)))
    eq_(list(round_tripped), known_pairs)
def test_CollectionFeaturesMatrixExtractor():
    """Check shape, non-zero count and total of the extracted matrices.

    The matrix extractor is built from the training collection and then
    applied first to the training collection and afterwards to the testing
    collection. Both results are expected to have shape ``(2, 11)``.
    """
    tolerance = 10**-10  # absolute tolerance for the sum comparisons
    matrix_extractor = bc.CollectionFeaturesMatrixExtractor(
        bc.FrequenciesExtractor(bc.BasicTokenizer()), trainingCollection)

    # (collection, expected non-zero entries, expected sum of all entries)
    cases = (
        (trainingCollection, 13, 2),
        (testingCollection, 7, 0.633333333333),
    )
    for collection, expected_nnz, expected_sum in cases:
        matrix = matrix_extractor.extract_from(collection)
        eq_(matrix.shape, (2, 11))
        # `.nnz` suggests a sparse matrix type -- TODO confirm.
        eq_(matrix.nnz, expected_nnz)
        ok_(abs(matrix.sum() - expected_sum) < tolerance)