-
Notifications
You must be signed in to change notification settings - Fork 0
/
features_extractors_test.py
78 lines (58 loc) · 2.63 KB
/
features_extractors_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import book_classification as bc
from nose.tools import *
def test_CanExtractVocabularies():
    """Check the vocabulary extracted from a token sequence with repeats.

    Seven input tokens covering three distinct words must yield three
    entries, each mapped to 1, and ``total_counts()`` must equal 3.
    """
    tokens = ["one", "two", "one", "three", "three", "three", "three"]
    extractor = bc.VocabulariesExtractor(bc.DummySequenceTokenizer())
    result = extractor.extract_from(tokens)

    eq_(len(result), 3)
    eq_(result.total_counts(), 3)
    # Every distinct token is expected to map to 1, however often it repeats.
    eq_(dict(result.items()), {'one': 1, 'two': 1, 'three': 1})
def test_CanExtractFrequencies():
    """Check the relative frequencies extracted from a token sequence.

    In the seven-token input "one" occurs twice, "two" once and "three"
    four times, so the expected values are 2/7, 1/7 and 4/7, and
    ``total_counts()`` must equal 7.
    """
    tokenizer = bc.DummySequenceTokenizer()
    extractor = bc.FrequenciesExtractor(tokenizer)
    frequencies = extractor.extract_from(
        ["one", "two", "one", "three", "three", "three", "three"])
    expected = {
        'three': 0.5714285714285714,
        'one': 0.2857142857142857,
        'two': 0.14285714285714285,
    }

    eq_(len(frequencies), 3)
    eq_(frequencies.total_counts(), 7)

    actual = dict(frequencies.items())
    # The key sets must match exactly. The values are floats, so they are
    # compared with a tolerance rather than bit-for-bit: a last-digit
    # rounding difference must not fail the test.
    eq_(set(actual), set(expected))
    for token, value in expected.items():
        assert_almost_equal(actual[token], value)
def test_CanExtractSeries():
    """Check the per-token position lists extracted from a token sequence.

    Each distinct token must map to the list of zero-based indices at which
    it occurs in the input, and ``total_counts()`` must equal 7, the number
    of input tokens.
    """
    tokens = ["one", "two", "one", "three", "three", "two", "three"]
    extractor = bc.SeriesExtractor(bc.DummySequenceTokenizer())
    result = extractor.extract_from(tokens)

    eq_(len(result), 3)
    eq_(result.total_counts(), 7)
    eq_(dict(result.items()),
        {'one': [0, 2], 'two': [1, 5], 'three': [3, 4, 6]})
def test_CanExtractEntropies():
    """Check the per-token entropy values extracted from grouped tokens.

    The extractor is built with ``DummySequenceTokenizer`` and
    ``DummyGrouper`` and is fed four token groups. The result must hold one
    value per distinct token (three in total) and report ``total_counts()``
    of 4.
    """
    tokenizer = bc.DummySequenceTokenizer()
    grouper = bc.DummyGrouper()
    extractor = bc.EntropiesExtractor(tokenizer, grouper)
    entropies = extractor.extract_from(
        [["one", "two"], ["one", "three"], ["one", "two"], ["one"]])
    expected = {'two': 0.5, 'one': 0.960964047443681, 'three': -0.0}

    eq_(len(entropies), 3)
    eq_(entropies.total_counts(), 4)

    actual = dict(entropies.items())
    # The key sets must match exactly. The values are computed floats, so
    # they are compared with a tolerance rather than bit-for-bit; this also
    # keeps the check independent of the sign of a zero result.
    eq_(set(actual), set(expected))
    for token, value in expected.items():
        assert_almost_equal(actual[token], value)
def test_CanExtractPairwiseAssociation():
    """Check pairwise association values for a token sequence (disabled).

    The test is currently switched off: it raises ``SkipTest`` before doing
    any work. The body below the ``raise`` is kept unchanged so the test can
    be re-enabled by deleting the ``raise`` statement.
    """
    import unittest

    # The test used to be disabled with a bare ``return``, which made the
    # runner count it as a pass although nothing was checked. Raising
    # SkipTest keeps it disabled but reports it as skipped instead.
    raise unittest.SkipTest(
        "PairwiseAssociationExtractor test is disabled; reason not recorded")

    tokenizer = bc.HashingTokenizerFilter(bc.DummySequenceTokenizer())
    # NOTE(review): the grouper is passed as a class, not an instance --
    # confirm this is what PairwiseAssociationExtractor expects.
    grouper = bc.SlidingGrouper
    weights = [0.1, 0.2, 0.4, 0.2, 0.1]
    extractor = bc.PairwiseAssociationExtractor(tokenizer, grouper, weights)
    assocs = extractor.extract_from(
        ["one", "two", "one", "three", "three", "two",
         "three", "one", "two", "three", "one", "one"])
    expected = {
        ('three', 'one'): 0.0225, ('three', 'two'): 0.022500000000000003,
        ('one', 'three'): 0.015, ('one', 'one'): 0.0225, ('two', 'one'): 0.01,
        ('two', 'two'): 0.02, ('two', 'three'): 0.02,
        ('three', 'three'): 0.05500000000000001, ('one', 'two'): 0.0125
    }
    eq_(len(assocs), 9)
    eq_(assocs.total_counts(), 40)
    eq_(dict(assocs.items()), expected)