# cathywu/Sentiment-Analysis

Classifier achieves ~65% accuracy in 3-fold cross-validation

1 parent ad860ce commit 6124344e628dd6b7fb986771f27b449cc692575f pranjalv123 committed Oct 31, 2011
Showing 2,005 changed files with 64,837 additions and 27 deletions.
@@ -0,0 +1 @@
+*.pyc
@@ -1,4 +1,6 @@
 import random
+import data
+from numpy import *
 
 """
 A classifier is initialized with a training set of feature vectors
@@ -22,14 +24,50 @@ def classify(self, point):
 
 class BayesClassifier:
     def __init__(self, trainingset) :
-        self.cl = {}
-        for row in trainingset.T:
-            if row[-1] in self.cl:
-                v, c = cl[row[-1]]
-                self.cl[row[-1]] = (v + row[:-1], c + sum(row[:-1]))
+        self.classes = {}
+        self.lengths = {}
+        for row in trainingset.asMatrix().T:
+            cls = row[-1]
+            pt = row[:-1]
+            if cls in self.classes:
+                self.classes[cls] += pt - ones(len(pt))
+
             else:
-                self.cl[row[-1]] = (row[:-1], sum(row[:-1]))
+                self.classes[cls] = pt
+
+        for cls in self.classes:
+            self.lengths[cls] = sqrt(float(dot(self.classes[cls], self.classes[cls])))
+            self.classes[cls] = log(self.classes[cls])
     def classify(self, point):
-        for c in self.cl:
-
-
+        mx = 0
+        mx_cls = 0
+        for cls in self.classes:
+            dotprod = dot(self.classes[cls], log(point)) - log(self.lengths[cls])
+            if dotprod > mx:
+                mx = dotprod
+                mx_cls = cls
+        return mx_cls
+
+class BayesPresenceClassifier(BayesClassifier):
+    def classify(self, point):
+        mx = 0
+        mx_cls = 0
+        for cls in self.classes:
+            dotprod = dot(self.classes[cls], log(point.clip(max=2))) - log(self.lengths[cls])
+            if dotprod > mx:
+                mx = dotprod
+                mx_cls = cls
+        return mx_cls
+
+
+def test_bayes():
+    trainingset = data.Data(array([[2, 2, 2, 1],
+                                   [1, 1, 2, 0],
+                                   [1, 1, 2, 0],
+                                   [2, 1, 1, 0]]).T)
+    bc = BayesClassifier(trainingset)
+    print bc.classify(array([2, 2, 2]))
+    print bc.classify(array([3, 1, 1]))
+
+if __name__ == "__main__":
+    test_bayes()
@@ -0,0 +1,22 @@
+import data
+import ngrams
+import validate
+import classifier
+import os
+from numpy import *
+classif = classifier.BayesPresenceClassifier
+
+def read_reviews():
+    print "Reading and parsing files..."
+    pos_files = [ngrams.ngrams(1, open("pos/"+i).read()) for i in os.listdir("pos")]
+    neg_files = [ngrams.ngrams(1, open("neg/"+i).read()) for i in os.listdir("neg")]
+    classes = [1] * len(pos_files) + [0] * len(neg_files)
+    print "Creating matrix..."
+    mat = ngrams.ngrams_to_matrix(pos_files + neg_files, classes)
+    print "Running classifier..."
+    print validate.kfold(3, classif, mat)
+    print validate.kfold(5, classif, mat)
+    print validate.kfold(10, classif, mat)
+
+if __name__ == "__main__":
+    read_reviews()