Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 147 lines (132 sloc) 4.36 KB
import collections
import data
from numpy import *
from operator import itemgetter
from scipy.sparse import lil_matrix, csr_matrix
def words(s):
words = []
current = ""
not_mode = False
not_words = set(["not", "isn't", "doesn't"])
punctuation_map = {',':"COMMA", '.':"PERIOD", ':':"COLON", ';':"SEMI", '\'':"SINGLEQUOTE",
for i in s:
if i.isalnum() or (i == '_'):
current += i
elif i.isspace():
if not current:
if not_mode:
current += "_NOT"
if current in not_words:
not_mode = True
current = ""
if i in punctuation_map.keys():
not_mode = False
if not current:
if not_mode:
current += "_NOT"
current = ""
if current:
return words
def ngrams(n, s):
lwr = s.lower()
ws = words(lwr)
current = collections.deque(ws[:n])
grams = data.DefDict(0)
for pos in range(n, len(ws)):
grams[" ".join(current)] += 1
grams[" ".join(current)] += 1
return grams
def ngrams_range(ns, s):
g = {}
for n in ns:
g.update(ngrams(n, s))
return g
def ngrams_to_dictionary(grams):
keysets = [set(k) for k in grams]
allgramset = set()
allgramset = apply(allgramset.union, keysets)
return allgramset
def ngrams_to_matrix(grams, classes, return_gramsdict=False):
Maps from list of raw gram frequencies and labels to a numerical matrix
of feature vectors, with option to also return the gramsdict for future
print "Entering ngrams_to_matrix"
keysets = [set(k) for k in grams]
allgramset = set()
allgramset = apply(allgramset.union, keysets)
allgrams = list(allgramset)
print "> Listed"
vecs = []
print "> []"
allgramsdict = {}
for i in range(len(allgrams)):
allgramsdict[allgrams[i]] = i
for g, c in zip(grams, classes):
vec = grams_to_featurevector(allgramsdict, g, c)
print vstack(vecs).T.shape
ret = data.Data(vstack(vecs).T)
if return_gramsdict:
return (ret,allgramsdict)
return ret
def collapse_ngrams(grams):
Collapse a list of dict of grams into a single dict
collapsed = {}
for gram in grams:
for (key,value) in gram.iteritems():
if key in collapsed:
collapsed[key] += value
collapsed[key] = value
return collapsed
def top_ngrams(grams,limit=0):
if limit==0:
return grams
return dict( sorted(grams.iteritems(), key=itemgetter(1), reverse=True)[:limit] )
def grams_to_featurevector(gramsdict, grams, label=None):
Maps from gram frequencies and label to numerical feature vector according
to some mapping, generated within or from ngrams_to_matrix()
if label:
vec = ones(len(gramsdict) + 1, dtype=uint16)
vec[-1] = label
vec = ones(len(gramsdict), dtype=uint16)
for i in grams:
vec[gramsdict[i]] = grams[i]
return vec
def ngrams_to_idf(ngrams):
presence = [g.keys() for g in ngrams]
docfreq = {}
for i in presence:
for word in i:
if word not in docfreq:
docfreq[word] = 1
docfreq[word] += 1
return data.DefDict(float(len(ngrams)),
dict([(i, float(len(ngrams))/log(float(docfreq[i]))) for i in docfreq]))
if __name__ == "__main__":
print "Trigram example: %s" % ngrams(3, "Now is the time for all good men to not come to the aid of their party! Now is the time for all bad women to leave the aid of their country? This, being war, is bad")
g1 = ngrams(1, "Hello how are you")
g2 = ngrams(1, "Well, are you feeling well")
g3 = ngrams(1, "Well hello there hello")
print "Unigram example: %s" % g3
(data,gramsdict) = ngrams_to_matrix([g1, g2, g3], [1, 2, 1], return_gramsdict=True)
print "Matrix example: %s" % data.asMatrix()
print "Grams dict: %s" % gramsdict