Permalink
Browse files

Added term frequency/inverse document frequency to ngrams

  • Loading branch information...
pranjalv123 committed Jan 17, 2012
1 parent bafa83c commit 081e12058611e9e913bf49020815c02c9f4583bd
Showing with 42 additions and 66 deletions.
  1. +29 −13 movie.py
  2. +12 −42 ngrams.py
  3. +1 −11 yelp.py
View
@@ -20,13 +20,14 @@
NEG_ADJ_DIR="neg_adj"
class TestConfiguration:
- def __init__(self, clsf, n, ind, pos_dir, neg_dir, binary=False, limit=None):
+ def __init__(self, clsf, n, ind, pos_dir, neg_dir, binary=False, limit=None, idf=False):
self.count = 0
self.n = n
self.index = ind
self.binary = binary
self.limit = limit if limit else [0 for i in n]
self.clsf = clsf
+ self.idf = idf
# filenames needed for this test configuration used
pos_files = os.listdir(pos_dir)
@@ -45,7 +46,7 @@ def __init__(self, clsf, n, ind, pos_dir, neg_dir, binary=False, limit=None):
def train(self):
pos_train = [{} for f in self.pos_train_data]
neg_train = [{} for f in self.neg_train_data]
-
+
# Reading files
for (j,lim) in zip(self.n,self.limit):
all_grams = [ngrams.ngrams(j, f) for f in self.pos_train_data]
@@ -65,7 +66,17 @@ def train(self):
# Creating Index
self.classifier = self.clsf(restrictFeatures = self.features)
print "# features: %s" % self.classifier.nfeatures
-
+
+ if self.idf:
+ print "Using TF-IDF"
+ idf = ngrams.ngrams_to_idf(pos_train + neg_train)
+ for i in range(len(pos_train)):
+ for j in pos_train[i]:
+ pos_train[i][j] = pos_train[i][j] * idf[j]
+ for i in range(len(neg_train)):
+ for j in neg_train[i]:
+ neg_train[i][j] = neg_train[i][j] * idf[j]
+
# Making classifier
for i in pos_train:
self.count += 1
@@ -94,29 +105,34 @@ def test(self):
neg_correct = len([i for i in neg_results if int(i) == -1])
print "Negative: %s of %s, %s accuracy" % (neg_correct,len(neg_tests),
(float(neg_correct)/len(neg_tests)))
-
+ return (float(pos_correct)/len(pos_tests), float(neg_correct)/len(neg_tests))
def select_dataset(dataset):
    # Map a dataset keyword to its (positive_dir, negative_dir) pair.
    # Unknown keywords raise KeyError, same as a direct dict lookup.
    dataset_dirs = {
        'default':      (POS_DIR, NEG_DIR),                            # untagged
        'partofspeech': (POS_PARTOFSPEECH_DIR, NEG_PARTOFSPEECH_DIR),  # part of speech tagged
        'position':     (POS_POSITION_DIR, NEG_POSITION_DIR),          # position tagged
        'adjectives':   (POS_ADJ_DIR, NEG_ADJ_DIR),                    # adjectives tagged
    }
    return dataset_dirs[dataset]
-def test(classif, n=1, train_size=500, mode='k', iterations=1, dataset='', limit=None, binary=False):
+def test(classif, n=1, train_size=500, mode='k', iterations=1, dataset='', limit=None, binary=False, idf=False):
(pos_dir, neg_dir) = select_dataset(dataset)
ind = Indexes(mode=mode,iterations=iterations,train_size=train_size)
-
+ (pos_correct, neg_correct) = (0,0)
for k in range(iterations):
ind.next()
- m = TestConfiguration(classif, n, ind, pos_dir, neg_dir, binary=binary, limit=limit)
+ m = TestConfiguration(classif, n, ind, pos_dir, neg_dir, binary=binary, limit=limit, idf=idf)
m.train()
- m.test()
-
+ (pos, neg) = m.test()
+ pos_correct += pos
+ neg_correct += neg
+ print "Results:"
+ print "Positive:", round((pos_correct/iterations)*100), "%"
+ print "Negative:", round((neg_correct/iterations)*100), "%"
+ print "Total:", round((neg_correct + pos_correct)/(2*iterations)*100), "%"
if __name__ == "__main__":
- #test(classifier.BayesClassifier,n=[1],train_size=800,mode='k',
- # iterations=3,dataset='position',limit=[16165],binary=True)
- test(classifier.LinearSVMClassifier,n=[2],train_size=800,mode='k',
- iterations=3,dataset='default',limit=[16165],binary=True)
+ test(classifier.BayesClassifier,n=[1],train_size=800,mode='k',
+ iterations=3,dataset='position',limit=[16165],binary=True, idf=False)
+ #test(classifier.LinearSVMClassifier,n=[2],train_size=800,mode='k',
+ # iterations=3,dataset='default',limit=[16165],binary=False, idf=True)
#test(classifier.MaximumEntropyClassifier,n=[1],train_size=800,mode='k',
# iterations=3,dataset='default',limit=[16165],binary=True)
View
@@ -12,7 +12,7 @@ def words(s):
not_mode = False
not_words = set(["not", "isn't", "doesn't"])
punctuation_map = {',':"COMMA", '.':"PERIOD", ':':"COLON", ';':"SEMI", '\'':"SINGLEQUOTE",
- '"':"DOUBLEQUOTE"}
+ '"':"DOUBLEQUOTE", '?':"QUESTION"}
for i in s:
if i.isalnum():
current += i
@@ -121,47 +121,18 @@ def grams_to_featurevector(gramsdict, grams, label=None):
vec[gramsdict[i]] = grams[i]
return vec
-def ngrams_to_sparse(grams, classes):
- print "a"
- keysets = [set(k) for k in grams]
- allgramset = set()
- print "b"
- allgramset = apply(allgramset.union, keysets)
- print "c"
- allgrams = list(allgramset)
- print "d"
- vecs = []
- print "e"
- allgramsdict = {}
- for i in range(len(allgrams)):
- allgramsdict[allgrams[i]] = i
- print "f"
- mat = lil_matrix((len(allgrams), len(grams)))
- print "g"
- for g in range(len(grams)):
- for i in range(len(grams[g])):
- if grams[g][i] > 1:
- mat[allgramsdict[grams[g][i]], g] = grams[g][allgrams[i]] - 1
- mat[g, -1] = classes[g]
- return data.Data(mat.tocsr())
-
-
-def gen_indexdict(dictionary):
- allgramsdict = {}
- for i in range(len(dictionary)):
- allgramsdict[dictionary[i]] = i
- return allgramsdict
-
-def ngram_vector(n, s, dictionary, allgramsdict = {}):
- grams = ngrams(n, s)
- if len(allgramsdict) == 0:
- allgramsdict = gen_indexdict(dictionary)
- vec = ones(len(dictionary), dtype=uint16)
- for g in grams:
- if g in allgramsdict:
- vec[allgramsdict[g]] = grams[g]
- return array(vec)
def ngrams_to_idf(ngrams):
    # Compute inverse-document-frequency weights from a list of per-document
    # gram->count dicts (the parameter name shadows the module-level ngrams()
    # function; kept for interface compatibility).
    #
    # Returns a data.DefDict mapping gram -> log(N / docfreq(gram)), where N is
    # the number of documents; grams never seen default to log(N).
    #
    # BUG FIX: the original initialized docfreq[word] = 1 and then immediately
    # incremented it, so every gram's document frequency was overcounted by 1
    # (a gram appearing in exactly one document got idf log(N/2) instead of
    # log(N/1)). Counting now starts from 0.
    num_docs = float(len(ngrams))
    docfreq = {}
    for doc_grams in ngrams:
        # Each gram counts once per document it appears in, regardless of
        # its term frequency within that document.
        for gram in doc_grams:
            docfreq[gram] = docfreq.get(gram, 0) + 1
    return data.DefDict(log(num_docs),
                        dict([(gram, log(num_docs / docfreq[gram]))
                              for gram in docfreq]))
+
if __name__ == "__main__":
print "Trigram example: %s" % ngrams(3, "Now is the time for all good men to not come to the aid of their party! Now is the time for all bad women to leave the aid of their country? This, being war, is bad")
@@ -173,4 +144,3 @@ def ngram_vector(n, s, dictionary, allgramsdict = {}):
(data,gramsdict) = ngrams_to_matrix([g1, g2, g3], [1, 2, 1], return_gramsdict=True)
print "Matrix example: %s" % data.asMatrix()
print "Grams dict: %s" % gramsdict
-
View
12 yelp.py
@@ -20,21 +20,11 @@ def save(self):
f = codecs.open("yelp/" + str(i) + "star/file" + str(j), 'w', encoding="ascii", errors="ignore")
f.write(self.stars[i][j])
f.close()
- def rate(self, classifier, n, dictionary):
- outputs = []
- for group in self.stars:
- classes = DefDict(0)
- print "Rating group", group
- for rev in self.stars[group]:
- vec = ngrams.ngram_vector(n, rev, dictionary)
- classes[classifier.classify(vec)] += 1
- outputs.append(classes)
- return outputs
+
if __name__ == "__main__":
#m = movie.MovieReviews(classifier.BayesPresenceClassifier, 2)
print "Reading Yelp data"
y = Yelp("yelp/json_ascii")
print "Saving"
y.save()
-# print y.rate(m.classifier, 2, m.dictionary)

0 comments on commit 081e120

Please sign in to comment.