-
Notifications
You must be signed in to change notification settings - Fork 0
/
classification.py
136 lines (105 loc) · 5.15 KB
/
classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import book_classification as bc
import random
class ClassificationModel:
    """Authorship classification pipeline over a book collection.

    Composes three collaborator objects: a feature extractor, a
    dimensionality reducer (fit_transform/transform interface), and a
    classifier (fit/predict interface).
    """

    def __init__(self, extractor, dim_reducer, classifier):
        self._extractor = extractor
        self._dim_reducer = dim_reducer
        self._classifier = classifier

    def fit(self, collection):
        """Train the reducer and classifier on `collection`.

        Remembers the training collection and builds the matrix extractor
        and author indexer used later by predict().
        """
        self._training = collection
        self._collection_matrix_extractor = bc.CollectionFeaturesMatrixExtractor(
            self._extractor, self._training)
        self._authors_indexer = bc.NumericIndexer(self._training.authors())
        features = self._collection_matrix_extractor.extract_from(self._training)
        labels = self.encode_authors(self._training)
        self._classifier.fit(self._dim_reducer.fit_transform(features), labels)

    def predict(self, collection):
        """Classify `collection`; returns a ClassificationResults instance."""
        features = self._collection_matrix_extractor.extract_from(collection)
        # XXX: if passed as strings, they will be encoded by svm
        expected = self.encode_authors(collection)
        predicted = self._classifier.predict(self._dim_reducer.transform(features))
        return ClassificationResults(self, collection,
                                     self.decode_authors(expected),
                                     self.decode_authors(predicted))

    def encode_authors(self, collection):
        """Map each book's author name to its numeric index."""
        encode = self._authors_indexer.encode
        return [encode(book.author()) for book in collection.books()]

    def decode_authors(self, sequence):
        """Map numeric author indices back to author names."""
        decode = self._authors_indexer.decode
        return [decode(index) for index in sequence]
# TODO: integrate with sklearn metrics, produce useful visualizations,
# and add a class for comparing results across runs
class ClassificationResults:
    """Expected vs. predicted authors for a classified collection.

    Built by ClassificationModel.predict(); `expected` and `predicted`
    are parallel sequences of (decoded) author labels.
    """

    def __init__(self, classification_model, collection, expected, predicted):
        self._classification_model = classification_model
        self._collection = collection
        self._expected = expected
        self._predicted = predicted

    # allow all sklearn metrics, with proxy
    def confusion_matrix(self):
        pass

    def metric(self):
        """Return prediction accuracy in [0, 1]; 0.0 for an empty result set."""
        if not self._expected:
            # Guard: the original raised ZeroDivisionError on empty input.
            return 0.0
        hits = sum(1 for e, p in zip(self._expected, self._predicted) if e == p)
        return hits / len(self._expected)

    def baseline_metric(self):
        """Accuracy of a random guesser that picks authors in proportion
        to their share of the collection (sum of squared class priors).
        """
        total = len(self._collection)
        # BUG FIX: the original computed len(books_by(a) / total) — dividing
        # the books collection by an int *inside* len(). The length must be
        # taken first, then divided by the collection size.
        return sum((len(self._collection.books_by(a)) / total) ** 2
                   for a in self._collection.authors())
class ExperimentSeries:
    """Placeholder base for experiment-series classes (ESOver*)."""
class ESOverAuthorsCount:
    """Experiment series: classification accuracy as the author count grows.

    For each candidate number of authors (2..total), samples several author
    subsets and runs several train/test trials per subset.
    """

    def __init__(self, book_collection, classification_model):
        self._book_collection = book_collection
        self._classification_model = classification_model
        self._config = {
            'num_books': 8,
            'training_percentage': 0.6,
            'num_trials': 3
        }

    def set_parameters(self, config):
        """Merge `config` entries into the current experiment configuration."""
        self._config.update(config)

    def run_experiment(self):
        """Run all trials; returns a list (one entry per author count,
        starting at 2 authors) of lists of accuracy metrics."""
        books_per_author = self._config['num_books']
        pool = self._book_collection.selection().exclude_authors_below(books_per_author)
        pool = pool.selection().sample_authors(10)
        author_total = len(pool.authors())
        results = []
        # XXX: allow passing the total number of experiments to evaluate, and
        # constants to ponderate between trials and author sets
        for size in range(2, author_total + 1):
            subset_count = round(author_total / size)
            trials = min(size, self._config['num_trials'])
            scores = []
            for _ in range(subset_count):
                subset = pool.selection().sample_authors(size)
                for _ in range(trials):
                    sampled = subset.selection().sample_books_per_author(books_per_author)
                    training, testing = sampled.selection().split_per_author_percentage(
                        self._config['training_percentage'])
                    self._classification_model.fit(training)
                    scores.append(self._classification_model.predict(testing).metric())
            results.append(scores)
        return results
class ESOverTrainingProportion:
    """Experiment series: classification accuracy as the training fraction grows.

    Sweeps the train/test split percentage from 1/num_steps up to
    (num_steps-1)/num_steps, running several trials per step.
    """

    def __init__(self, book_collection, classification_model):
        self._book_collection = book_collection
        self._classification_model = classification_model
        self._config = {
            'num_books': 15,
            'num_authors': 4,
            'num_steps': 10,
            'num_trials': 6,
        }

    def set_parameters(self, config):
        """Merge `config` entries into the current experiment configuration."""
        self._config.update(config)

    def run_experiment(self):
        """Run all trials; returns a list (one entry per split step) of
        lists of accuracy metrics. Note the sweep never reaches a 100%
        training split — range(1, num_steps) excludes the last step."""
        steps = self._config['num_steps']
        trials = self._config['num_trials']
        results = []
        for step in range(1, steps):
            fraction = step / steps
            scores = []
            for _ in range(trials):
                sample = self._book_collection.selection().sample_authors_with_books(
                    self._config['num_authors'], self._config['num_books'])
                training, testing = sample.selection().split_per_author_percentage(fraction)
                self._classification_model.fit(training)
                scores.append(self._classification_model.predict(testing).metric())
            results.append(scores)
        return results