#!/usr/bin/python

import svmlight
import ngrams
import os
import pickle
import numpy
import matplotlib.pyplot as plt
from classifier import LinearSVMClassifier
import data

TRAIN_SIZE = 300
TEST_SIZE = 1000-TRAIN_SIZE
K = 3

class FeatureMap:
    """
    SVM light requires features to be identified with numbers,
    so this object maps features (strings) to numbers
    """
    def __init__(self):
        self.fmap = {}
        self.size = 1
    def hasFeature(self,f):
        return f in self.fmap
    def getFeature(self,f):
        return self.fmap[f]
    def getID(self,id):
        return self.fmap[id]
    def addFeature(self,f):
        if f not in self.fmap:
            self.fmap[f]=self.size
            self.fmap[self.size]=f
            self.size += 1
    def getSize(self):
        return self.size

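# Illustrative sketch (not used by the pipeline below): FeatureMap stores both the
# string->ID and ID->string pairs in the same dict, and IDs start at 1 since
# SVM light feature indices must be positive.
def example_feature_map():
    fm = FeatureMap()
    fm.addFeature("the cat")
    assert fm.getFeature("the cat") == 1    # string -> numeric feature ID
    assert fm.getID(1) == "the cat"         # numeric feature ID -> string
    assert fm.getSize() == 2                # size starts at 1, so this is count + 1
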
class Indexes:
    """
    Indexes object generates indices for different configurations
    Modes:
        'r' : random
        'd' : deterministic
        'k' : k-fold cross-validation
    """
    def __init__(self,mode='r',iterations=10):
        self.mode = mode
        self.iterations = iterations
        self.pos_train_ind = None
        self.pos_test_ind = None
        self.neg_train_ind = None
        self.neg_test_ind = None
        self.gen_indices = generate_indices(mode,iterations)
    def next(self):
        (a,b,c,d) = self.gen_indices.next()
        self.pos_train_ind = a
        self.pos_test_ind = b
        self.neg_train_ind = c
        self.neg_test_ind = d
    def get_pos_train_ind(self):
        return self.pos_train_ind
    def get_pos_test_ind(self):
        return self.pos_test_ind
    def get_neg_train_ind(self):
        return self.neg_train_ind
    def get_neg_test_ind(self):
        return self.neg_test_ind

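# Illustrative sketch of how Indexes is driven (run_svm below does the same):
# each next() call draws a fresh train/test split over the 1000 documents per class.
def example_indexes():
    ind = Indexes('r', 2)            # random mode, 2 iterations
    ind.next()
    train = ind.get_pos_train_ind()  # TRAIN_SIZE indices into os.listdir("pos")
    test = ind.get_pos_test_ind()    # the remaining TEST_SIZE indices
    return (len(train), len(test))   # -> (300, 700) with the constants above
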
def test_svmlight():
    training_data = [(1, [(1,2),(2,5),(3,6),(5,1),(4,2),(6,1)]),
                     (1, [(1,2),(2,1),(3,4),(5,3),(4,1),(6,1)]),
                     (1, [(1,2),(2,2),(3,4),(5,1),(4,1),(6,1)]),
                     (1, [(1,2),(2,1),(3,3),(5,1),(4,1),(6,1)]),
                     (-1, [(1,2),(2,1),(3,1),(5,3),(4,2),(6,1)]),
                     (-1, [(1,1),(2,1),(3,1),(5,3),(4,1),(6,1)]),
                     (-1, [(1,1),(2,2),(3,1),(5,3),(4,1),(6,1)]),
                     (-1, [(1,1),(2,1),(3,1),(5,1),(4,3),(6,1)]),
                     (-1, [(1,2),(2,1),(3,1),(5,2),(4,1),(6,5)]),
                     (-1, [(7,10)])]

    test_data = [(0, [(1,2),(2,6),(3,4),(5,1),(4,1),(6,1)]),
                 (0, [(1,2),(2,6),(3,4)])]

    model = svmlight.learn(training_data, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    predictions = svmlight.classify(model, test_data)
    for p in predictions:
        print '%.8f' % p
    # output should be 2 positive numbers

94 def gen_ngrams(n=2,data="pos"):
95 "Generate ngrams and save locally"
96 temp = []
97 for i in os.listdir("%s" % data):
98 temp.append(open("%s/" % data + i).read())
99 temp = "\n".join(temp)
100 aggregate_ngrams = ngrams.ngrams(n, temp)
101 pickle.dump(aggregate_ngrams, open("%s_%sgram.dump" % (data,n),'w'))
102
103 def gen_all_ngrams():
104 "Generate a bunch of ngrams for convenience"
105 gen_ngrams(n=1,data="pos")
106 gen_ngrams(n=1,data="neg")
107 gen_ngrams(n=2,data="pos")
108 gen_ngrams(n=2,data="neg")
109 gen_ngrams(n=3,data="pos")
110 gen_ngrams(n=3,data="neg")
111
112 def load_ngrams(n,data="pos"):
113 "Load ngram data from disk"
114 return pickle.load(open("%s_%sgram.dump" % (data,n)))
115
116 def gen_feature_map(strings,fmap):
117 for string in strings:
118 fmap.addFeature(string)
119
def load_features(n,fmap):
    "Register mid-frequency n-grams from both corpora in the feature map"
    print "Positive data"
    pos_counts = load_ngrams(n,"pos")
    v = pos_counts.values()
    upper = numpy.percentile(v,99.85)
    lower = numpy.percentile(v,65)
    print "> filtering %s values" % len(v)
    items = filter(lambda x: x[1] > lower and x[1] < upper, pos_counts.items())
    keys = [item[0] for item in items]
    print "> gen_feature_map with %s keys" % len(keys)
    gen_feature_map(keys,fmap)
    print "Negative data"
    neg_counts = load_ngrams(n,"neg")
    v = neg_counts.values()
    upper = numpy.percentile(v,99.85)
    lower = numpy.percentile(v,65)
    print "> filtering %s values" % len(v)
    items = filter(lambda x: x[1] > lower and x[1] < upper, neg_counts.items())
    keys = [item[0] for item in items]
    print "> gen_feature_map with %s keys" % len(keys)
    gen_feature_map(keys,fmap)

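# Toy illustration of the percentile filter above: keep only n-grams whose counts
# fall strictly between the 65th and 99.85th percentiles, dropping both rare and
# extremely common grams. The counts here are made up.
def example_percentile_filter():
    counts = {'a b': 1, 'c d': 5, 'e f': 7, 'g h': 100}
    v = counts.values()
    upper = numpy.percentile(v, 99.85)
    lower = numpy.percentile(v, 65)
    return [k for (k, c) in counts.items() if c > lower and c < upper]   # -> ['e f']
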
def training_set(ind,n=3):
    """
    Build the training matrix for the documents selected by ind.
    Positive documents get label 1, negative documents label 2.

    Caution: Do not use 0 as a label because it evaluates to False
    """
    pos = os.listdir("pos")
    feature_vectors = [ngrams.ngrams(n, open("pos/"+pos[i]).read()) for i in ind.get_pos_train_ind()]
    labels = [1 for i in ind.get_pos_train_ind()]
    neg = os.listdir("neg")
    feature_vectors.extend([ngrams.ngrams(n, open("neg/"+neg[i]).read()) for i in ind.get_neg_train_ind()])
    labels.extend([2 for i in ind.get_neg_train_ind()])
    (matrix, gramsdict) = ngrams.ngrams_to_matrix(feature_vectors, labels, return_gramsdict=True)
    return (matrix.asMatrix(), gramsdict)

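# Minimal sketch of how training_set() is consumed; it mirrors run_svm below and
# assumes the pos/ and neg/ corpora plus the ngrams module are available. The exact
# layout of `train` is whatever ngrams.ngrams_to_matrix produces.
def example_training_set():
    ind = Indexes('r', 1)
    ind.next()
    (train, gramsdict) = training_set(ind, n=2)
    return numpy.array(train, dtype=numpy.uint16).T   # transposed, as run_svm feeds it to data.Data
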
def get_accuracy(results):
    # results holds decision values: positive test documents in the first half,
    # negative test documents in the second half
    size = len(results)/2
    pos_correct = len(numpy.nonzero(numpy.array(results[0:size]) > 0.0)[0])
    neg_correct = len(numpy.nonzero(numpy.array(results[size:]) < 0.0)[0])
    pos_accuracy = float(pos_correct)/size
    neg_accuracy = float(neg_correct)/size
    accuracy = float(pos_correct+neg_correct)/size/2
    print "Accuracy: %s (pos) %s (neg) %s (overall)" % (pos_accuracy, neg_accuracy, accuracy)
    return (pos_accuracy, neg_accuracy, accuracy)

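# Sketch of the layout get_accuracy() expects: one flat list of decision values,
# positive test documents in the first half, negative test documents in the second.
def example_get_accuracy():
    results = [0.7, 0.2, -0.1, -0.4, -0.9, 0.3]   # 3 positive docs, then 3 negative docs
    return get_accuracy(results)                  # -> (2/3, 2/3, 2/3): 2 of 3 correct on each side
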
def plot_results(results):
    size = len(results)/2
    # plot positive labels
    print "POSITIVE"
    pos_hist = numpy.histogram(results[0:size])
    print pos_hist
    fig = plt.figure()
    fig.suptitle('SVM results', fontsize=12)
    fig.add_subplot(1,2,1)
    plt.title('positive')
    plt.hist(results[0:size])
    pos_axis = plt.axis()

    # plot negative labels
    print "NEGATIVE"
    fig.add_subplot(1,2,2)
    plt.title('negative')
    neg_hist = numpy.histogram(results[size:])
    print neg_hist
    plt.hist(results[size:])
    neg_axis = plt.axis()

    # match axes of the two graphs
    low_axis = [min(a,b) for (a,b) in zip(pos_axis,neg_axis)]
    high_axis = [max(a,b) for (a,b) in zip(pos_axis,neg_axis)]
    new_axis = [low_axis[0],high_axis[1],low_axis[2],high_axis[3]]
    plt.axis(new_axis)
    plt.subplot(1,2,1)
    plt.axis(new_axis)

    # display plot
    plt.show()

def shuffle_ind():
    ind = numpy.arange(1000)
    from numpy.random import shuffle
    shuffle(ind)
    return ind

def generate_indices(mode='r',iterations=1):
    if mode=='d': # deterministic
        def get_indices():
            ind = numpy.arange(1000)
            pos_train_ind = ind[:TRAIN_SIZE]
            pos_test_ind = ind[TRAIN_SIZE:]
            neg_train_ind = ind[:TRAIN_SIZE]
            neg_test_ind = ind[TRAIN_SIZE:]
            for i in range(iterations):
                yield (pos_train_ind, pos_test_ind, neg_train_ind, neg_test_ind)
    elif mode=='r': # random
        def get_indices():
            for i in range(iterations):
                pos_ind = shuffle_ind()
                pos_train_ind = pos_ind[:TRAIN_SIZE]
                pos_test_ind = pos_ind[TRAIN_SIZE:]
                neg_ind = shuffle_ind()
                neg_train_ind = neg_ind[:TRAIN_SIZE]
                neg_test_ind = neg_ind[TRAIN_SIZE:]
                yield (pos_train_ind, pos_test_ind, neg_train_ind, neg_test_ind)
    elif mode=='k': # k-fold cross-validation
        raise NotImplementedError("k-fold cross-validation is not implemented yet") # TODO
    return get_indices()

def run_svm(mode='r',iterations=2):
    # setup work (generate all the ngrams if they don't exist yet)
    if not os.path.isfile('pos_%sgram.dump' % n):
        gen_all_ngrams()

    ind = Indexes(mode,iterations)
    acc = (0,0,0)
    # run svm
    for i in range(iterations):
        ind.next()
        (train, gramsdict) = training_set(ind,n=n)
        classifier = LinearSVMClassifier(data.Data(numpy.array(train, dtype=numpy.uint16).T))
        # classify one held-out positive document
        j = ind.get_pos_test_ind()[0]
        pos = os.listdir("pos")
        test = ngrams.grams_to_featurevector(gramsdict, ngrams.ngrams(n, open("pos/"+pos[j]).read()), label=None)
        print classifier.classify(test, dtype=numpy.uint16)
        # classify one held-out negative document
        neg = os.listdir("neg")
        j = ind.get_neg_test_ind()[0]
        test = ngrams.grams_to_featurevector(gramsdict, ngrams.ngrams(n, open("neg/"+neg[j]).read()), label=None)
        print classifier.classify(test, dtype=numpy.uint16)
        # leftovers from the svmlight version (m was the svmlight model); disabled
        # here because m is no longer built in this code path
        # print m[-2]
        # print m[-1]
        # p = test_model(m,ind,n=n)
        # nresults = len(p)
        # acc = [(a+b) for (a,b) in zip(acc,get_accuracy(p))]
        print acc

    return (m,p)

fmap = FeatureMap()

# USAGE:
# $ ipython
# In [1]: run -i svm
# In [2]: get_accuracy(p)
# In [3]: plot_results(p)

#if __name__ == "__main__":
n = 2 # specifies n in n-grams
(m,p) = (None, None)
run_svm()

# RESULTS
# 80% accuracy with TRAIN_SIZE=300
# 84% accuracy with TRAIN_SIZE=500
# 50% accuracy with TRAIN_SIZE=900 (why?)
# Segfault with TRAIN_SIZE=100 (why?)