Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

fixed merge issues that messed up bayes classification results

  • Loading branch information...
commit 2f1cdbbda5b4e1caf5730b508adf20f6a66ef63d 2 parents b580dcb + 6095d1f
@cathywu authored
Showing with 101 additions and 48 deletions.
  1. +93 −41 classifier.py
  2. +4 −6 movie.py
  3. +4 −1 ngrams.py
View
134 classifier.py
@@ -8,6 +8,8 @@
from maxent import MaxentModel
from scipy.sparse import csr_matrix, lil_matrix, csc_matrix, issparse
import sys
+from ngrams import *
+import tempfile
"""
A classifier has an addFeatureVector method that takes a feature
@@ -16,8 +18,30 @@
takes in a new vector and returns a class
"""
-class OneClassifier:
+class Classifier:
+ def __init__(self):
+ self.nfeatures = 0
+ self.nvectors = 0
+ self.index = {}
+ def addToIndex(self, words):
+ self.compiled = False
+ words = set([i for i in words])
+ keys = set(self.index.keys())
+ words = words - keys
+ for w in words:
+ self.index[w] = self.nfeatures
+ self.nfeatures += 1
+
+ def vectorFromDict (self, words):
+ self.addToIndex(words.keys())
+ vec = zeros(self.nfeatures)
+ for w in words:
+ vec[self.index[w]] = words[w]
+ return vec
+
+
+class OneClassifier:
def addFeatureVector (self, vec, cls):
pass
def classify(self, point):
@@ -32,31 +56,34 @@ def classify(self, point):
return random.randint(0,1)
-class BayesClassifier:
- def __init__(self) :
- self.classes = {}
- self.nfeatures = 0
- self.nvectors = 0
- self.index = {}
- self.length = 0
+class BayesClassifier(Classifier):
+ def __init__(self, restrictFeatures = False) :
+ Classifier.__init__(self)
+ self.length = 0
+ self.compiled = True
+ self.classes = {}
+ self.restrictFeatures = restrictFeatures
+ if restrictFeatures:
+ self.addToIndex(self.restrictFeatures)
def addToIndex(self, words):
- words = set([i for i in words])
- keys = set(self.index.keys())
- words = words - keys
-
- for w in words:
- self.index[w] = self.nfeatures
- self.nfeatures += 1
+ words = set(words) - set(self.index.keys())
for cls in self.classes:
self.classes[cls] = hstack((self.classes[cls], ones(len(words))))
+ Classifier.addToIndex(self, words)
+
def addFeatureVector(self, vec, cls, binary=False):
-
+ self.compiled = False
if cls not in self.classes:
self.classes[cls] = ones(self.nfeatures)
-
+ if not self.restrictFeatures:
+ self.addToIndex(vec)
for feature in vec:
+
+ if self.restrictFeatures and feature not in self.restrictFeatures:
+ continue
if feature in self.index:
+
if binary:
self.classes[cls][self.index[feature]] += 1
else:
@@ -64,9 +91,14 @@ def addFeatureVector(self, vec, cls, binary=False):
self.nvectors += 1
self.length += 1;
+
def compile(self):
+ if self.compiled:
+ return
+ self.compiled = True
self.normalized = self.classes
self.lengths = {}
+ print self.nfeatures
for i in range(self.nfeatures):
total = 0
for cls in self.classes:
@@ -82,6 +114,7 @@ def compile(self):
self.normalized[cls][i] /= self.lengths[cls]
def classify(self, vec):
+ self.compile()
mx = -sys.maxint
mx_cls = 0
point = ones(self.nfeatures)
@@ -102,24 +135,42 @@ def classify(self, point):
return BayesClassifier.classify(self, point.clip(max=2))
-class LinearSVMClassifier:
- def __init__(self, trainingset):
- print "LinearSVM: Creating dataset"
- L = [i for i in trainingset.asMatrix().T[-1]]
- print "> L"
- X = trainingset.asMatrix().T[:-1].T
- print "> X"
- data = SparseDataSet(X.tolist(), L=L)
- print "> data"
- self.svm = svm.SVM()
- print "Training SVM"
- self.svm.train(data)
+class LinearSVMClassifier(Classifier):
+ def __init__(self):
+ Classifier.__init__(self)
+ self.file = tempfile.NamedTemporaryFile(delete=False)
+ self.filename = self.file.name
+ print self.filename
+ self.data = SparseDataSet(0)
+ self.svm = SVM(optimizer='liblinear')
+
+ def vectorToString(self, vec, cls):
+ return str(cls) + " " + " ".join([str(i) + ":" + str(vec[i])for i in vec]) + "\n"
+
+ def addFeatureVector(self, point, cls):
+ self.compiled = False
+ vec = self.vectorToString(point, cls)
+ self.file.write(vec)
- def classify(self, point):
- L= array(['1.0', '0.0'])
- X = SparseDataSet(array([point], dtype=uint16).tolist())
- print "LinearSVM: Classifying"
- return self.svm.classify(X, 0)[0]
+ def compile(self):
+ if self.compiled == True:
+ return
+ self.compiled = True
+ self.file.close()
+ self.data = SparseDataSet(self.filename)
+# self.svm.train(self.data)
+ self.file = open(self.filename)
+
+ def validate(self, n):
+# self.compile()
+# v = self.vectorFromDict(point)
+
+# outp = self.svm.test(v)
+ self.compile()
+ print self.data
+ outp = self.svm.cv(self.data, numFolds = n)
+ print outp
+
class MaximumEntropyClassifier:
def __init__(self, trainingset):
@@ -138,6 +189,7 @@ def __init__(self, trainingset):
def classify(self, point, label='pos'):
return self.model.eval(point, label)
+
def test_bayes():
@@ -153,13 +205,13 @@ def test_bayes():
def test_svm():
- trainingset = data.Data(array([[2, 2, 2],
- [1, 1, 2],
- [1, 1, 2],
- [0, 1, 0]], dtype=uint16).T)
- bc = LinearSVMClassifier(trainingset)
- print bc.classify(array([2, 2, 2], dtype=uint16))
- print bc.classify(array([3, 1, 1], dtype=uint16))
+ trainingset = [ngrams(1, "foo foo bar baz"), ngrams(1, "foo foo bar bar baz baz"), ngrams(1,"foo foo bar baz")]
+ labels = [1, -1, -1]
+ lsc = LinearSVMClassifier(3)
+ for vec in zip(trainingset, labels):
+ lsc.addFeatureVector(vec[0], vec[1])
+ print lsc.classify(ngrams(1, "foo foo bar bar baz baz"))
+ print lsc.classify(ngrams(1, "foo foo foo bar baz"))
def test_maxent():
trainingset = [(['good'],'pos',1),
View
10 movie.py
@@ -21,7 +21,7 @@
class MovieReviews:
def __init__(self, clsf, n, testsize, pos_dir, neg_dir, binary=False, limit=None):
- self.classifier = clsf()
+
count = 0
pos_files = os.listdir(pos_dir)[:testsize]
neg_files = os.listdir(neg_dir)[:testsize]
@@ -56,8 +56,8 @@ def __init__(self, clsf, n, testsize, pos_dir, neg_dir, binary=False, limit=None
features.update(ngrams.top_ngrams(ngrams.collapse_ngrams(featureslist),lim))
print "Creating Index"
- words = set(features)
- self.classifier.addToIndex(words)
+ self.classifier = clsf(restrictFeatures = features)
+
print "# features: %s" % self.classifier.nfeatures
print "Making classifier"
@@ -67,7 +67,6 @@ def __init__(self, clsf, n, testsize, pos_dir, neg_dir, binary=False, limit=None
for i in self.neg_files:
self.classifier.addFeatureVector(i, -1, binary=binary)
self.classifier.compile()
- print self.classifier.classes
classif = classifier.BayesClassifier
#classif = classifier.LinearSVMClassifier
@@ -100,7 +99,6 @@ def test(n=1,dataset='',limit=None, binary=False):
testsize=800
iterations=1
ind = Indexes(mode='r',iterations=iterations,train_size=testsize)
- print ind.get_pos_train_ind()
for k in range(iterations):
ind.next()
@@ -135,7 +133,7 @@ def test(n=1,dataset='',limit=None, binary=False):
print neg_results
if __name__ == "__main__":
- test(n=[1],dataset='adjectives',limit=None,binary=True)
+ test(n=[1],dataset='adjectives',limit=[2633],binary=True)
# with testsize = 800, no shuffling
# [ns] dataset [limits] binary --> +results -results
View
5 ngrams.py
@@ -11,6 +11,8 @@ def words(s):
current = ""
not_mode = False
not_words = set(["not", "isn't", "doesn't"])
+ punctuation_map = {',':"COMMA", '.':"PERIOD", ':':"COLON", ';':"SEMI", '\'':"SINGLEQUOTE",
+ '"':"DOUBLEQUOTE"}
for i in s:
if i.isalnum():
current += i
@@ -24,7 +26,8 @@ def words(s):
not_mode = True
current = ""
else:
- words.append(i)
+ if i in punctuation_map.keys():
+ words.append(punctuation_map[i])
not_mode = False
if not current:
continue
Please sign in to comment.
Something went wrong with that request. Please try again.