Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

import Reverend

git-svn-id: http://divmod.org/svn/Divmod/trunk/Reverend@2573 866e43f7-fbfc-0310-8f2a-ec88d1da2979
  • Loading branch information...
commit bcfb0e2fcd507097975f97b468af4622bd9d815d 0 parents
washort authored
6 MANIFEST.in
@@ -0,0 +1,6 @@
+include README.txt
+include LICENSE
+include changelog.txt
+
+recursive-include examples *.txt
+recursive-include examples *.py
39 README.txt
@@ -0,0 +1,39 @@
+Reverend is a simple Bayesian classifier.
+It is designed to be easy to adapt and extend for
+your application.
+
+A simple example would look like:
+
+from reverend.thomas import Bayes
+
+guesser = Bayes()
+guesser.train('fish', 'salmon trout cod carp')
+guesser.train('fowl', 'hen chicken duck goose')
+
+guesser.guess('chicken tikka marsala')
+
+You can also "forget" some training:
+guesser.untrain('fish','salmon carp')
+
+The first argument of train is the bucket or class that
+you want associated with the training. If the bucket does
+not exist, Bayes will create it. The second argument
+is the object that you want Bayes to be trained on. By
+default, Bayes expects a string and uses something like
+string.split to break it into individual tokens (words).
+It uses these tokens as the basis of its bookkeeping.
+
+
+The two ways to extend it are:
+1. Pass in a function as the tokenizer when creating
+ your Bayes. The function should expect one argument
+ which will be whatever you pass to the train() method.
+ The function should return a list of strings, which
+ are the tokens that are relevant to your app.
+
+2. Subclass Bayes and override the method getTokens to
+ return a list of string tokens relevant to your app.
+
+
+I hope all your guesses are right,
+amir@divmod.org
40 changelog.txt
@@ -0,0 +1,40 @@
+25 November 2004
+Release 0.3
+Fixed error in calculation.
+Simpler regex tokenization. Now works with unicode.
+Removed split.py.
+
+5 October 2003
+Release 0.2.4
+Added utility methods for removing, renaming and merging Pools:
+ removePool(), renamePool() and mergePools()
+
+Also added utility methods for inspecting pool data:
+ poolData() and poolTokens()
+
+All of these methods take pool names as arguments.
+
+25 Aug 2003
+Release 0.2.3
+Made it possible to pass an iterator of tokens.
+
+16 Aug 2003
+Release 0.2.2
+Added ability to "forget" training using Bayes.untrain()
+
+2 Aug 2003
+Release 0.2.1
+Removed the declaration of slots from the tokenizer to make it
+play nice with Quotient. No change in functionality.
+
+16 June 2003
+Release 0.2
+Added basic GUI for training and testing.
+Made the storage class pluggable, so different storage managers
+can be used.
+Some convenience functions and better repr.
+Removed some code that was not being run.
+
+18 May 2003
+Release 0.1
+Initial release
63 examples/emailtrainer.py
@@ -0,0 +1,63 @@
+# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
+# amir@divmod.org. This is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+
+from email.Message import Message
+import email
+import rfc822
+
class EmailItem(Message):
    """An email message adapted for display in the Reverend trainer UI."""

    def summary(self):
        """Return this message's display values, keyed by column name."""
        return {
            'From': self.sender(),
            'Subject': self.get('subject', '<No Subject>'),
        }

    def sender(self):
        """Return a display name for the sender of this message.

        The From header is parsed into (display name, address) pairs and
        the name taken from the last pair is returned; a pair without a
        display name is shown as its bare address.  'Nobody' is returned
        when the header yields no address at all.
        """
        fromHeader = self['from'] or '"Nobody" <nobody@nowhere>'
        hdrs = rfc822.AddressList(fromHeader).addresslist
        # Seed the result first: the loop body never runs when the header
        # parses to an empty list, which used to leave dispname unbound.
        dispname = 'Nobody'
        for dispname, addr in hdrs:
            dispname = dispname.strip().strip('"')
            addr = addr.strip()
            if dispname == '':
                dispname = addr
        return dispname

    def columnDefs(cls):
        """Return the (column name, width) pairs shown by the trainer."""
        return [('From', 20), ('Subject', 30)]
    columnDefs = classmethod(columnDefs)

    def fromFile(cls, fp):
        """Parse the open file fp into an instance of this class.

        Returns None (after printing a notice) when the message cannot
        be parsed, so callers can skip the file.
        """
        try:
            msg = email.message_from_file(fp, cls)
        except email.Errors.MessageParseError:
            print('bad message')
            return None
        return msg
    fromFile = classmethod(fromFile)
+
def runTrainer():
    """Open the Tk training window with an email-aware guesser."""
    # Imports stay local so that importing this example module does not
    # require Tkinter or the reverend package.
    from reverend.ui.trainer import Trainer
    from Tkinter import Tk
    from reverend.guessers.email import EmailClassifier
    from reverend.thomas import Bayes

    window = Tk()
    window.title('Reverend Trainer')
    window.minsize(width=300, height=300)
    display = Trainer(window, guesser=EmailClassifier(), itemClass=EmailItem)
    window.mainloop()
+
def runTester():
    """Print the sender and expected answer of every message in ./spam."""
    from reverend.ui.tester import DirectoryExam

    exam = DirectoryExam('spam', 'Spam', EmailItem)
    for message, expected in exam:
        print('%s %s' % (message['from'], expected))
+
+
# Script entry point: start the trainer UI.  Call runTester() here
# instead to dump the contents of a test directory.
if __name__ == "__main__":
    runTrainer()
45 examples/readme.txt
@@ -0,0 +1,45 @@
+This brief readme is designed to help you get
+started with using the Reverend training and
+testing UI.
+
+This is how I use the trainer.
+
+I first prepare a couple of directories full of
+email. One will have a mix of all kinds of email
+that I want to classify and one for testing that
+is, say, containing only spam files.
+
+I type:
+ python emailtrainer.py
+
+I click on the 'New Pool' button and create a
+pool for each category or bucket that I want to
+classify the data into. e.g. 'Clean' and 'Spam'.
+
+I use the radio buttons to classify the emails.
+I page back and forth to make sure that new
+training does not undo old training.
+
+Once I am happy with the training, I click 'Save'
+to save the Reverend data. I can load it later
+and continue training.
+
+When I want to test, I load the Reverend data
+using the 'Load' button. I then click on the
+'Testing' button on the left. I click 'Run
+Test' which brings up the first of 2 dialogs,
+asking me to select the test data, eg my
+directory full of spam. The next dialog asks
+for the correct answer to this set of messages.
+I type in 'Spam' (case must match your pool name).
+
+I have lots of improvements in mind from training
+reinforcement to better testing and analysis.
+
+The trainer is designed to be data-agnostic. Look
+at examples/emailtrainer.py to see how you can
+simply wrap your domain objects and make them
+play nice with the UI.
+
+Enjoy,
+-A-
0  reverend/__init__.py
No changes.
0  reverend/guessers/__init__.py
No changes.
107 reverend/guessers/email.py
@@ -0,0 +1,107 @@
+# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
+# amir@divmod.org. This is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+
+import os, sys
+from rfc822 import AddressList
+import email
+
+from reverend.thomas import Bayes
+from reverend.splitter import Splitter
+
+
class EmailClassifier(Bayes):
    """Bayes guesser whose items are email Message objects.

    Tokens are drawn from the addressing headers, the text/plain body
    and a set of synthetic "meta" tokens describing the message's
    structure (selected headers, body sizes, recipient counts).
    """

    # Headers whose raw value becomes a single 'Header:value' token.
    META_HEADERS = ['Content-type', 'X-Priority', 'X-Mailer',
                    'content-transfer-encoding', 'X-MSMail-Priority']

    def getTokens(self, msg):
        """Return the list of string tokens for msg (overrides Bayes)."""
        tokens = self.getHeaderTokens(msg)
        tokens += self.getBodyTokens(msg)
        # Tokens generated from the headers and the message structure.
        tokens += self.getMetaTokens(msg)
        return tokens

    def _split(self, text):
        """Return the tokens of text as a list.

        Uses the tokenizer installed by Bayes.__init__; the previous code
        called self.splitter, an attribute nothing ever assigns.
        """
        return list(self._tokenizer.tokenize(text))

    def _contentType(self, part):
        """Return the content type of a message part, or None.

        Prefers get_type() (the call the original code made) and falls
        back to get_content_type() for Message classes without it.
        """
        getter = getattr(part, 'get_type', None)
        if getter is None:
            getter = part.get_content_type
        return getter()

    def _firstPayload(self, msg, wanted, decode):
        """Return the payload of the first part of type wanted, or None."""
        for part in msg.walk():
            typ = self._contentType(part)
            if typ and typ.lower() == wanted:
                return part.get_payload(decode=decode)
        return None

    def getBodyTokens(self, msg):
        """Return the tokens of the text/plain body ([] when absent)."""
        text = self.getTextPlain(msg)
        if text is None:
            text = ''
        return self._split(text)

    def getHeaderTokens(self, msg):
        """Return the tokens of the Subject, From, To and Cc headers.

        A missing header contributes a placeholder word so that its
        absence is itself a token.
        """
        text = msg.get('subject', 'nosubject') + ' '
        text += msg.get('from', 'fromnoone') + ' '
        text += msg.get('to', 'tonoone') + ' '
        text += msg.get('cc', 'ccnoone') + ' '
        return self._split(text)

    def getTextPlain(self, msg):
        """Return the decoded payload of the first text/plain part, or None."""
        return self._firstPayload(msg, "text/plain", True)

    def getTextHtml(self, msg):
        """Return the raw payload of the first text/html part, or None."""
        return self._firstPayload(msg, "text/html", False)

    def getMetaTokens(self, msg):
        """Return tokens describing the message's headers and structure."""
        r = []
        for f in self.META_HEADERS:
            r.append(f + ':' + msg.get(f, 'None'))

        text = self.getTextPlain(msg)
        html = self.getTextHtml(msg)

        for stem, part in zip(['text', 'html'], [text, html]):
            if part is None:
                r.append(stem + '_None')
                continue
            r.append(stem + '_True')

            # Size buckets are cumulative on purpose: a 20000-word body
            # yields all three 'more_than' tokens.
            l = len(part.split())
            if l == 0:
                r.append(stem + 'zero')
            if l > 10000:
                r.append(stem + 'more_than_10000')
            if l > 1000:
                r.append(stem + 'more_than_1000')
            if l > 100:
                r.append(stem + 'more_than_100')

        # Compare the NUMBER of recipients; comparing the address list
        # itself with an int is always true on Python 2.
        toCount = len(AddressList(msg.get('to', '')).addresslist)
        ccCount = len(AddressList(msg.get('cc', '')).addresslist)

        if toCount > 5:
            r.append('to_more_than_5')
        if toCount > 10:
            r.append('to_more_than_10')
        if ccCount > 5:
            r.append('cc_more_than_5')
        if ccCount > 10:
            r.append('cc_more_than_10')

        return r
+
324 reverend/thomas.py
@@ -0,0 +1,324 @@
+# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
+# amir@divmod.org. This is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+
+import operator
+import string, re
+import math
+from sets import Set
+
class BayesData(dict):
    """Token-count table for one pool, plus its bookkeeping counters.

    The dict itself maps token -> count.  Instances also carry the pool
    name, the uids trained into the pool, an optional back-reference
    and running totals of tokens and training calls.
    """

    def __init__(self, name='', pool=None):
        self.name, self.pool = name, pool
        self.training = []
        self.tokenCount = self.trainCount = 0

    def trainedOn(self, item):
        """Return True when item is among the uids trained into this pool."""
        return item in self.training

    def __repr__(self):
        details = (self.name, self.tokenCount)
        return '<BayesDict: %s, %s tokens>' % details
+
class Bayes(object):
    """Simple Bayesian classifier that sorts items into named pools.

    Every pool is a dataClass instance mapping token -> count.  The
    special pool '__Corpus__' holds the counts of all pools together.
    Per-token probabilities are cached and rebuilt lazily whenever the
    training data changes (tracked by self.dirty).
    """

    def __init__(self, tokenizer=None, combiner=None, dataClass=None):
        """Create an untrained guesser.

        tokenizer -- object with a tokenize(obj) method returning string
                     tokens; defaults to Tokenizer().
        combiner  -- callable(probs, poolName) folding token
                     probabilities into one score; defaults to
                     self.robinson.
        dataClass -- factory for pool storage; defaults to BayesData.
        """
        if dataClass is None:
            self.dataClass = BayesData
        else:
            self.dataClass = dataClass
        self.corpus = self.dataClass('__Corpus__')
        self.pools = {}
        self.pools['__Corpus__'] = self.corpus
        self.trainCount = 0
        self.dirty = True
        # The tokenizer takes an object and returns a list of strings.
        if tokenizer is None:
            self._tokenizer = Tokenizer()
        else:
            self._tokenizer = tokenizer
        # The combiner combines probabilities.
        if combiner is None:
            self.combiner = self.robinson
        else:
            self.combiner = combiner

    def commit(self):
        """Persist the training data to the default file."""
        self.save()

    def newPool(self, poolName):
        """Create a new pool without training it; return the pool.

        An existing pool of that name is returned unchanged.
        """
        self.dirty = True  # not always true, but it's simple
        return self.pools.setdefault(poolName, self.dataClass(poolName))

    def removePool(self, poolName):
        """Delete the named pool.  Raises KeyError if it does not exist."""
        del self.pools[poolName]
        self.dirty = True

    def renamePool(self, poolName, newName):
        """Rename a pool.  Raises KeyError if poolName does not exist."""
        pool = self.pools[poolName]
        if newName == poolName:
            # Renaming onto itself used to store and then delete the same
            # key, silently dropping the pool.
            return
        pool.name = newName
        self.pools[newName] = pool
        self.removePool(poolName)
        self.dirty = True

    def mergePools(self, destPool, sourcePool):
        """Merge the token counts of one pool into another.

        The arguments are pool names.  The pool named sourcePool is left
        intact; call removePool() afterwards to get rid of it.
        """
        sp = self.pools[sourcePool]
        dp = self.pools[destPool]
        for tok, count in sp.items():
            dp[tok] = dp.get(tok, 0) + count
            # tokenCount is the total number of tokens seen (see _train),
            # so it grows by every merged occurrence.
            dp.tokenCount += count
        self.dirty = True

    def poolData(self, poolName):
        """Return a list of the (token, count) tuples of a pool."""
        return list(self.pools[poolName].items())

    def poolTokens(self, poolName):
        """Return a list of the tokens in a pool."""
        return [tok for tok, count in self.poolData(poolName)]

    def save(self, fname='bayesdata.dat'):
        """Pickle all pools to the file fname."""
        try:
            from cPickle import dump
        except ImportError:
            from pickle import dump
        fp = open(fname, 'wb')
        try:
            dump(self.pools, fp)
        finally:
            fp.close()

    def load(self, fname='bayesdata.dat'):
        """Replace all pools with those pickled in the file fname.

        SECURITY: unpickling executes code embedded in the file; only
        load data files that come from a trusted source.
        """
        try:
            from cPickle import load
        except ImportError:
            from pickle import load
        fp = open(fname, 'rb')
        try:
            self.pools = load(fp)
        finally:
            fp.close()
        self.corpus = self.pools['__Corpus__']
        self.dirty = True

    def poolNames(self):
        """Return a sorted list of pool names.

        Does not include the system pool '__Corpus__'.
        """
        return sorted([name for name in self.pools if name != '__Corpus__'])

    def buildCache(self):
        """Recompute the per-pool token probabilities into self.cache."""
        self.cache = {}
        for pname, pool in self.pools.items():
            # skip our special pool
            if pname == '__Corpus__':
                continue

            poolCount = pool.tokenCount
            themCount = max(self.corpus.tokenCount - poolCount, 1)
            cacheDict = self.cache.setdefault(pname, self.dataClass(pname))

            for word, totCount in self.corpus.items():
                # Only words this pool has seen get a probability.
                thisCount = float(pool.get(word, 0.0))
                if thisCount == 0.0:
                    continue
                otherCount = float(totCount) - thisCount

                if not poolCount:
                    goodMetric = 1.0
                else:
                    goodMetric = min(1.0, otherCount / poolCount)
                badMetric = min(1.0, thisCount / themCount)
                f = badMetric / (goodMetric + badMetric)

                # Keep only tokens at least 0.1 away from neutral (0.5),
                # clamped to [0.0001, 0.9999].
                if abs(f - 0.5) >= 0.1:
                    cacheDict[word] = max(0.0001, min(0.9999, f))

    def poolProbs(self):
        """Return the probability cache, rebuilding it when stale."""
        if self.dirty:
            self.buildCache()
            self.dirty = False
        return self.cache

    def getTokens(self, obj):
        """Return the tokens of obj using the configured tokenizer.

        Whether case is folded depends on the tokenizer; fold it there if
        "king" and "King" should produce the same token.

        Override this in a subclass for objects the tokenizer cannot
        handle, or pass a tokenizer when creating the instance.
        """
        return self._tokenizer.tokenize(obj)

    def getProbs(self, pool, words):
        """Return up to 2048 (word, probability) pairs, highest first.

        Only words present in pool (a token -> probability mapping) are
        included.
        """
        probs = [(word, pool[word]) for word in words if word in pool]
        probs.sort(key=operator.itemgetter(1), reverse=True)
        return probs[:2048]

    def train(self, pool, item, uid=None):
        """Train the guesser by telling it that item belongs in pool.

        The pool is created when missing.  uid is optional and may be
        used to uniquely identify the item that is being trained on.
        """
        tokens = self.getTokens(item)
        pool = self.pools.setdefault(pool, self.dataClass(pool))
        self._train(pool, tokens)
        self.corpus.trainCount += 1
        pool.trainCount += 1
        if uid:
            pool.training.append(uid)
        self.dirty = True

    def untrain(self, pool, item, uid=None):
        """Forget that item belongs in pool.

        Does nothing when the pool is unknown or empty.  A uid that was
        never recorded for the pool is ignored.
        """
        tokens = self.getTokens(item)
        pool = self.pools.get(pool, None)
        if not pool:
            return
        self._untrain(pool, tokens)
        # Untraining is deliberately counted as additional training.
        self.corpus.trainCount += 1
        pool.trainCount += 1
        if uid and uid in pool.training:
            pool.training.remove(uid)
        self.dirty = True

    def _train(self, pool, tokens):
        """Add one occurrence of every token to pool and to the corpus."""
        wc = 0
        for token in tokens:
            pool[token] = pool.get(token, 0) + 1
            self.corpus[token] = self.corpus.get(token, 0) + 1
            wc += 1
        pool.tokenCount += wc
        self.corpus.tokenCount += wc

    def _untrain(self, pool, tokens):
        """Remove one occurrence of every token from pool and the corpus.

        Tokens the pool does not contain are skipped entirely, so the
        corpus stays equal to the sum of the pools.
        """
        for token in tokens:
            if not self._decrement(pool, token):
                continue
            self._decrement(self.corpus, token)

    def _decrement(self, data, token):
        """Lower token's count in data by one; return True if it was there."""
        count = data.get(token, 0)
        if not count:
            return False
        if count == 1:
            del data[token]
        else:
            data[token] = count - 1
        data.tokenCount -= 1
        return True

    def trainedOn(self, msg):
        """Return True when msg was recorded as a training uid of any pool."""
        # The uids live on the pools; the probability cache never holds
        # them (and does not exist before the first guess).
        for p in self.pools.values():
            if msg in p.training:
                return True
        return False

    def guess(self, msg):
        """Return [(poolName, score), ...] for msg, best score first.

        Pools sharing no cached token with msg are omitted, so the
        result may be empty.
        """
        tokens = set(self.getTokens(msg))
        pools = self.poolProbs()

        res = {}
        for pname, pprobs in pools.items():
            p = self.getProbs(pprobs, tokens)
            if len(p) != 0:
                res[pname] = self.combiner(p, pname)
        return sorted(res.items(), key=operator.itemgetter(1), reverse=True)

    def robinson(self, probs, ignore):
        """Combine token probabilities using Robinson's method.

            P = 1 - prod(1-p)^(1/n)
            Q = 1 - prod(p)^(1/n)
            S = (1 + (P-Q)/(P+Q)) / 2

        probs is a non-empty list of (token, probability) pairs.
        Courtesy of http://christophe.delord.free.fr/en/index.html
        """
        nth = 1. / len(probs)
        prodP = 1.0
        prodNotP = 1.0
        for word, p in probs:
            prodP *= p
            prodNotP *= 1.0 - p
        P = 1.0 - prodNotP ** nth
        Q = 1.0 - prodP ** nth
        S = (P - Q) / (P + Q)
        return (1 + S) / 2

    def robinsonFisher(self, probs, ignore):
        """Combine token probabilities using the Robinson-Fisher method.

            H = C-1( -2.ln(prod(p)), 2*n )
            S = C-1( -2.ln(prod(1-p)), 2*n )
            I = (1 + H - S) / 2

        Courtesy of http://christophe.delord.free.fr/en/index.html
        """
        n = len(probs)
        # Sum the logarithms instead of taking the log of the product: a
        # long product underflows to 0.0, whose log is undefined.
        try:
            H = chi2P(-2.0 * sum([math.log(p) for word, p in probs]), 2 * n)
        except (OverflowError, ValueError):
            H = 0.0
        try:
            S = chi2P(-2.0 * sum([math.log(1.0 - p) for word, p in probs]),
                      2 * n)
        except (OverflowError, ValueError):
            S = 0.0
        return (1 + H - S) / 2

    def __repr__(self):
        return '<Bayes: %s>' % [self.pools[p] for p in self.poolNames()]

    def __len__(self):
        """Return the number of distinct tokens in the corpus."""
        return len(self.corpus)
+
class Tokenizer:
    """Regex-based word tokenizer.

    Yields every run of word characters (Unicode-aware) found in a
    string, either lower-cased or in its existing case.
    """

    WORD_RE = re.compile('\\w+', re.U)

    def __init__(self, lower=False):
        self.lower = lower

    def tokenize(self, obj):
        """Generate the words of the string obj, in order of appearance."""
        for found in self.WORD_RE.finditer(obj):
            word = found.group()
            if self.lower:
                word = word.lower()
            yield word
+
def chi2P(chi, df):
    """Return P(chisq >= chi) for df degrees of freedom.

    df must be even.  The result is capped at 1.0 to absorb rounding
    error in the series.
    """
    assert df & 1 == 0
    m = chi / 2.0
    total = term = math.exp(-m)
    # Floor division keeps the bound an int under true division too.
    for i in range(1, df // 2):
        term *= m / i
        total += term
    return min(total, 1.0)
+
0  reverend/ui/__init__.py
No changes.
152 reverend/ui/tester.py
@@ -0,0 +1,152 @@
+# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
+# amir@divmod.org. This is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+
+from __future__ import generators
+from Tkinter import *
+import tkFileDialog
+import tkSimpleDialog
+import tkMessageBox
+import os
+import time
+
class TestView(Frame):
    """Panel that runs a guesser over a directory and plots the results.

    Shows running totals (guesses, right, wrong, accuracy) and a canvas
    holding one green or red square per tested item.
    """

    def __init__(self, parent=None, guesser=None, app=None):
        Frame.__init__(self, parent)
        self.pack()
        self.guesser = guesser
        self.app = app
        self.size = 300  # canvas width and height, in pixels
        self.setupViews()

    def setupViews(self):
        """Build the heading row, the counter row, the button and the canvas."""
        line = Frame(self, relief=RAISED, borderwidth=1)
        line.pack(side=TOP, padx=2, pady=1)
        colHeadings = [('Guesses', 8), ('Right', 8), ('Wrong', 8), ('Accuracy %', 10)]
        currCol = 0
        for cHdr, width in colHeadings:
            l = Label(line, text=cHdr, width=width, bg='lightblue')
            l.grid(row=0, column=currCol)
            currCol += 1
        line = Frame(self)
        line.pack(fill=X)

        # One variable per counter, in heading order.
        self.model = (IntVar(), IntVar(), IntVar(), IntVar())
        currCol = 0
        for var in self.model:
            l = Label(line, textvariable=var, anchor=E, width=8, relief=SUNKEN)
            l.grid(row=0, column=currCol)
            currCol += 1

        bp = Button(self, text="Run Test", command=self.runTest)
        bp.pack(side=BOTTOM)

        canvas = Canvas(self, width=self.size, height=self.size, bg='lightyellow')
        canvas.pack(expand=YES, fill=BOTH, side=BOTTOM)
        self.canvas = canvas

    def runTest(self):
        """Ask for a directory and its correct pool, then grade every item.

        Counters accumulate across runs.  Returns early when the guesser
        is untrained, a dialog is cancelled or the pool name is unknown.
        """
        # TODO - This is nasty re-write
        if len(self.guesser) == 0:
            tkMessageBox.showwarning('Underprepared for examination!',
                                     'Your guesser has had no training. Please train and retry.')
            return
        path = tkFileDialog.askdirectory()
        if not path:
            return
        answer = tkSimpleDialog.askstring('Which Pool do these items belong to?', 'Pool name?',
                                          parent=self.app)

        if not answer:
            return
        if answer not in self.guesser.pools:
            return

        de = DirectoryExam(path, answer, self.app.itemClass)
        testCount = len(de)
        scale = self.calcScale(testCount)
        # At least one square per row, so the wrap test cannot divide by 0.
        perRow = max(1, int(self.size // scale))
        x = 0
        y = 0
        cumTime = 0
        iGuess, iRight, iWrong, iAcc = self.model
        for m, ans in de:
            then = time.time()
            g = self.guesser.guess(m)
            cumTime += time.time() - then
            if g:
                g = g[0][0]
            iGuess.set(iGuess.get() + 1)
            if g == ans:
                col = 'green'
                iRight.set(iRight.get() + 1)
            else:
                col = 'red'
                iWrong.set(iWrong.get() + 1)
            iAcc.set(round(100 * iRight.get() / float(iGuess.get()), 3))

            # Plot squares
            self.canvas.create_rectangle(x * scale, y * scale,
                                         (x + 1) * scale, (y + 1) * scale, fill=col)
            if not divmod(iGuess.get(), perRow)[1]:
                # wrap
                x = 0
                y += 1
            else:
                x += 1

            self.update_idletasks()
        guesses = iGuess.get()
        # An empty directory or a coarse clock can leave cumTime at 0.
        if cumTime:
            rate = round(guesses / cumTime, 2)
        else:
            rate = 0.0
        self.app.status.log('%r guesses in %.2f seconds. Avg: %.2f/sec.' % (guesses, cumTime, rate))

    def calcScale(self, testCount):
        """Return the square size (>= 1 pixel) that fits testCount squares."""
        import math
        scale = int(self.size / (math.sqrt(testCount) + 1))
        return max(1, scale)
+
+
+
class DirectoryExam(object):
    """Iterable of (item, correctAnswer) pairs built from a directory.

    Every regular file directly inside path is turned into an item with
    itemClass.fromFile(fp); the same answer is paired with each item.
    Subdirectories are ignored.
    """

    def __init__(self, path, answer, itemClass):
        self.path = path
        self.answer = answer
        self.itemClass = itemClass

    def _filePaths(self):
        """Return the full paths of the regular files inside self.path."""
        paths = [os.path.join(self.path, name) for name in os.listdir(self.path)]
        # Opening a subdirectory raises, so only regular files qualify.
        return [p for p in paths if os.path.isfile(p)]

    def __iter__(self):
        for path in self._filePaths():
            fp = open(path, 'rb')
            try:
                item = self.itemClass.fromFile(fp)
            finally:
                fp.close()
            # fromFile returns None for files it cannot parse.
            if item is None:
                continue
            yield (item, self.answer)

    def __len__(self):
        """Return the number of candidate files (parseable or not)."""
        return len(self._filePaths())
+
+
+
403 reverend/ui/trainer.py
@@ -0,0 +1,403 @@
+# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
+# amir@divmod.org. This is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+
+from Tkinter import *
+import tkFileDialog
+import tkSimpleDialog
+import tkMessageBox
+
+import os
+
+from util import Command, StatusBar, Notebook
+from tester import TestView
+
class PoolView(Frame):
    """Table of the guesser's pools with New Pool / Load / Save buttons.

    Each row shows a pool's name, colour, training count, number of
    unique tokens and total tokens; a final 'Total' row shows the corpus.
    """

    def __init__(self, master=None, guesser=None, app=None):
        Frame.__init__(self, master, bg='lightblue3')
        self.pack()
        self.listView = Frame(self)
        self.listView.pack()
        bp = Button(self, text="New Pool", command=self.newPool)
        bp.pack(side=LEFT, anchor=SE)
        self.addLoadSave()
        self.columnHeadings()
        self.model = {}
        self.guesser = guesser
        self.app = app
        self.reload()

    def reload(self):
        """Rebuild every row from the guesser's current pools."""
        self.listView.destroy()
        self.listView = Frame(self)
        self.listView.pack()
        # Start from scratch so rows of pools that no longer exist are
        # not refreshed any more.
        self.model = {}
        for pool in self.guesser.poolNames():
            self.addPool(self.guesser.pools[pool])
        self.addPool(self.guesser.corpus, 'Total')

    def upload(self):
        """Placeholder; the Upload button is disabled."""
        pass

    def addLoadSave(self):
        """Add the Upload (disabled), Save and Load buttons."""
        frame = Frame(self)
        frame.pack(side=RIGHT)
        bp = Button(frame, text="Upload", command=self.upload, state=DISABLED)
        bp.pack(side=BOTTOM, fill=X)
        bp = Button(frame, text="Save", command=self.save)
        bp.pack(side=BOTTOM, fill=X)
        bp = Button(frame, text="Load", command=self.load)
        bp.pack(side=BOTTOM, fill=X)

    def addPool(self, pool, name=None):
        """Append a row for pool.

        When name is given the row is labelled with it and gets no
        colour; otherwise the pool's own name and colour are used.
        """
        col = None
        line = Frame(self.listView)
        line.pack()
        if name is None:
            name = pool.name
            idx = self.guesser.poolNames().index(name)
            colours = self.defaultColours()
            # Wrap around so more pools than colours cannot raise.
            col = colours[idx % len(colours)]
        l = Label(line, text=name, anchor=W, width=10)
        l.grid(row=0, column=0)
        colourStripe = Label(line, text=' ', width=1, bg=col, anchor=W, relief=GROOVE)
        colourStripe.grid(row=0, column=1)
        train = IntVar()
        train.set(pool.trainCount)
        l = Label(line, textvariable=train, anchor=E, width=10, relief=SUNKEN)
        l.grid(row=0, column=2)
        uTok = IntVar()
        uTok.set(len(pool))
        l = Label(line, textvariable=uTok, anchor=E, width=12, relief=SUNKEN)
        l.grid(row=0, column=3)
        tTok = IntVar()
        tTok.set(pool.tokenCount)
        l = Label(line, textvariable=tTok, anchor=E, width=10, relief=SUNKEN)
        l.grid(row=0, column=4)
        self.model[name] = (pool, uTok, tTok, train)

    def refresh(self):
        """Update the counters of every row from its pool."""
        for pool, ut, tt, train in self.model.values():
            ut.set(len(pool))
            tt.set(pool.tokenCount)
            train.set(pool.trainCount)

    def save(self):
        """Ask for a file name and save the guesser's data to it."""
        path = tkFileDialog.asksaveasfilename()
        if not path:
            return
        self.guesser.save(path)
        self.app.dirty = False

    def load(self):
        """Ask for a file and replace the guesser's data with its contents."""
        path = tkFileDialog.askopenfilename()
        if not path:
            return
        self.guesser.load(path)
        self.reload()
        self.app.dirty = False

    def newPool(self):
        """Ask for a name and create an empty pool with it."""
        p = tkSimpleDialog.askstring('Create Pool', 'Name for new pool?')
        if not p:
            return
        if p in self.guesser.pools:
            tkMessageBox.showwarning('Bad pool name!', 'Pool %s already exists.' % p)
            # Nothing was created, so do not report a new pool.
            return
        self.guesser.newPool(p)
        self.reload()
        self.app.poolAdded()
        self.app.status.log('New pool created: %s.' % p, clear=3)

    def defaultColours(self):
        """Return the colour names assigned to pools, in pool order."""
        return ['green', 'yellow', 'lightblue', 'red', 'blue', 'orange', 'purple', 'pink']

    def columnHeadings(self):
        """Add the 'Pools' title and the column heading row."""
        # FIXME factor out and generalize
        title = Label(self, text='Pools', relief=RAISED, borderwidth=1)
        title.pack(side=TOP, fill=X)
        msgLine = Frame(self, relief=RAISED, borderwidth=1)
        msgLine.pack(side=TOP)
        currCol = 0
        colHeadings = [('Name', 10), ('', 1), ('Trained', 10), ('Unique Tokens', 12), ('Tokens', 10)]
        for cHdr, width in colHeadings:
            l = Label(msgLine, text=cHdr, width=width, bg='lightblue')
            l.grid(row=0, column=currCol)
            currCol += 1
+
+
class Trainer(Frame):
    """Main window: pool table, a page of items to classify, test view.

    Items are loaded from the files of a directory, itemsPerPage at a
    time.  Selecting a pool's radio button on a row trains the guesser
    with that row's item.
    """

    def __init__(self, parent, guesser=None, itemClass=None):
        """Build the UI inside parent.

        guesser defaults to a fresh reverend.thomas.Bayes; itemClass
        (which must provide fromFile and columnDefs) defaults to
        TextItem.
        """
        self.status = StatusBar(parent)
        self.status.pack(side=BOTTOM, fill=X)
        Frame.__init__(self, parent)
        self.pack(side=TOP, fill=BOTH)
        self.itemsPerPage = 20
        self.rows = []
        for i in range(self.itemsPerPage):
            self.rows.append(ItemRow())
        self.items = []
        self.files = []
        self.cursor = 0
        self.dirty = False
        if guesser is None:
            from reverend.thomas import Bayes
            self.guesser = Bayes()
        else:
            self.guesser = guesser
        if itemClass is None:
            self.itemClass = TextItem
        else:
            self.itemClass = itemClass
        for row in self.rows:
            row.summary.set('foo')
        self.initViews()

    def initViews(self):
        """Create the notebook and its Training, Testing and Quit screens."""
        self.nb = Notebook(self)
        frame2 = Frame(self.nb())
        self.poolView = PoolView(frame2, guesser=self.guesser, app=self)
        self.poolView.pack(side=TOP)
        self.listView = Canvas(frame2, relief=GROOVE)
        self.listView.pack(padx=3)
        bn = Button(self.listView, text="Load training", command=self.loadCorpus)
        bn.pack(side=RIGHT, anchor=NE, fill=X)
        self.columnHeadings()
        self.addNextPrev()

        frame3 = Frame(self.nb())
        self.testView = TestView(frame3, guesser=self.guesser, app=self)
        self.testView.pack()

        frame4 = Frame(self.nb())
        bp = Button(frame4, text="Quit", command=self.quitNow)
        bp.pack(side=BOTTOM)

        self.nb.add_screen(frame2, 'Training')
        self.nb.add_screen(frame3, 'Testing')
        self.nb.add_screen(frame4, 'Quit')

    def addNextPrev(self):
        """Add the Prev Page / Next Page buttons below the item list."""
        npFrame = Frame(self.listView)
        npFrame.pack(side=BOTTOM, fill=X)
        bn = Button(npFrame, text="Prev Page", command=self.prevPage)
        bn.grid(row=0, column=0)
        bn = Button(npFrame, text="Next Page", command=self.nextPage)
        bn.grid(row=0, column=1)

    def loadCorpus(self):
        """Ask for a directory and show its first page of items."""
        path = tkFileDialog.askdirectory()
        if not path:
            return
        self.loadFileList(path)
        self.displayItems()
        self.displayRows()

    def bulkTest(self):
        """Ask for one directory per pool and print the pairs."""
        dirs = []
        for pool in self.guesser.poolNames():
            path = tkFileDialog.askdirectory()
            dirs.append((pool, path))
        for pool, path in dirs:
            print('%s %s' % (pool, path))

    def displayList(self):
        # NOTE(review): itemRow is not defined anywhere in this class, so
        # calling this raises AttributeError; kept for compatibility.
        for item in self.items:
            self.itemRow(item)

    def displayRows(self):
        """Create the widgets of every row that has none yet."""
        for row in self.rows:
            # Rows are reused between pages and corpora; building them
            # again would stack a second set of widgets below the first.
            if row.line is None:
                self.displayRow(row)

    def loadFileList(self, path):
        """Remember the files inside path and go back to the first page."""
        listing = os.listdir(path)
        self.files = [os.path.join(path, name) for name in listing]
        self.cursor = 0

    def prevPage(self):
        """Show the previous page of items (stops at the first page)."""
        self.cursor = max(0, self.cursor - self.itemsPerPage)
        self.displayItems()

    def nextPage(self):
        """Show the next page of items (stops at the last page)."""
        following = self.cursor + self.itemsPerPage
        # Only advance when there is something left to show.
        if following < len(self.files):
            self.cursor = following
        self.displayItems()

    def _loadItem(self, path):
        """Return the item built from the file at path (None if unparseable)."""
        fp = open(path, 'rb')
        try:
            return self.itemClass.fromFile(fp)
        finally:
            fp.close()

    def displayItems(self):
        """Load the current page's files into the rows.

        Rows without a usable file (short last page, unparseable file)
        are blanked so they cannot show or train on a stale item.
        """
        theseFiles = self.files[self.cursor:self.cursor + self.itemsPerPage]
        pools = self.guesser.poolNames()
        items = []
        for idx, row in enumerate(self.rows):
            item = None
            if idx < len(theseFiles):
                item = self._loadItem(theseFiles[idx])
            if item is None:
                row.initialize(pools=pools)
                continue
            items.append(item)
            guesses = self.guesser.guess(item)
            summary = item.summary()
            s = ''.join([summary[c] + ' ' for c, ignore in item.columnDefs()])
            row.initialize(item, s, guesses, pools)
        self.items = items

    def quitNow(self):
        """Quit, asking for confirmation first when there are unsaved changes."""
        if self.dirty:
            if not tkMessageBox.askyesno("You have unsaved changes!", "Quit without saving?"):
                # The user chose to stay and save.
                return
        self.quit()

    def columnHeadings(self):
        """Add the heading row defined by the item class's columnDefs()."""
        # FIXME - Something better for columns and rows in general
        line = Frame(self.listView, relief=RAISED, borderwidth=1)
        line.pack(side=TOP, padx=2, pady=1)
        colHeadings = self.itemClass.columnDefs()
        currCol = 0
        for cHdr, width in colHeadings:
            l = Label(line, text=cHdr, width=width, bg='lightblue')
            l.grid(row=0, column=currCol)
            currCol += 1
        line = Frame(self)
        line.pack(fill=X)

    def training(self, row):
        """Train the guesser: row's item belongs in the selected pool."""
        if row.original is None:
            # Blank row: there is nothing to train on.
            return
        sel = row.selection.get()
        self.guesser.train(sel, row.original)
        row.current = sel
        # Training changes the data that Save would write.
        self.dirty = True
        self.guessAll()

    def guessAll(self):
        """Refresh the pool counters and re-guess every row holding an item."""
        self.poolView.refresh()
        pools = self.guesser.poolNames()
        for row in self.rows:
            if row.original is None:
                continue
            row.setGuess(self.guesser.guess(row.original), pools)

    def displayRow(self, row, bgc=None):
        """Create the widgets (radios, colour stripe, summary) of one row."""
        # UGH - REWRITE!
        line = Frame(self.listView, bg=bgc)
        line.pack(pady=1)
        row.line = line
        self.insertRadios(row)
        Label(line, text=row.summary.get(), textvariable=row.summary, width=60, bg=bgc,
              anchor=W).grid(row=0, column=2)
        colourStripe = Label(line, text=' ', width=1, bg=bgc, anchor=W, relief=GROOVE)
        colourStripe.grid(row=0, column=1)
        line.colourStripe = colourStripe
        pools = self.guesser.poolNames()
        row.refreshColour(pools)

    def poolAdded(self):
        """Rebuild the radio buttons of every displayed row for a new pool."""
        # A new pool is an unsaved change even before any item is loaded.
        self.dirty = True
        if not self.items:
            return
        pools = self.guesser.poolNames()
        for row in self.rows:
            for r in row.radios:
                r.destroy()
            self.insertRadios(row)
            row.refreshColour(pools)

    def insertRadios(self, row):
        """Give row one radio button per pool, bound to row.selection."""
        radioFrame = Frame(row.line)
        radioFrame.grid(row=0, column=0)
        currCol = 0
        radios = []
        v = row.selection
        for pool in self.guesser.poolNames():
            rb = Radiobutton(radioFrame, text=pool, variable=v, value=pool,
                             command=Command(self.training, row), bg=None)
            rb.grid(row=0, column=currCol)
            radios.append(rb)
            currCol += 1
        row.radios = radios
+
+
class TextItem(object):
    """Minimal trainer item wrapping a single line of text."""

    def __init__(self, text):
        self.text = text

    def summary(self):
        """Return the display values of this item, keyed by column name."""
        return {'Text': self.text}

    def columnNames(self):
        """Return the names of the columns this item provides."""
        return ['Text']

    def columnDefs(cls):
        """Return the (column name, width) pairs shown by the trainer.

        Trainer calls this on its item class, so the default item class
        has to provide it.
        """
        return [('Text', 60)]
    columnDefs = classmethod(columnDefs)

    def lower(self):
        """Return the wrapped text in lower case."""
        return self.text.lower()

    def fromFile(cls, fp):
        """Return an item holding the first line of the open file fp."""
        ti = cls(fp.readline())
        return ti
    fromFile = classmethod(fromFile)
+
+
class ItemRow(object):
    """State of one row of the trainer's item list.

    Holds the item shown, the guesses made for it, the Tk variables
    behind the summary label and radio buttons, and (once displayed)
    the row's widgets.
    """

    def __init__(self, orig=None):
        self.line = None
        self.radios = []
        self.original = orig
        self.current = ''
        self.guess = []
        self.summary = StringVar()
        self.selection = StringVar()

    def initialize(self, item=None, summary='', guess=None, pools=None):
        """Show a new item (or blank the row when called without one)."""
        if pools is None:
            pools = []
        self.selection.set('')
        self.original = item
        self.summary.set(summary)
        self.setGuess(guess, pools)

    def setGuess(self, guess, pools):
        """Store the guesses for this row and select the best one.

        guess is a list of (poolName, score) pairs, best first; an empty
        or missing guess selects nothing.
        """
        if not guess:
            guess = [['']]
        self.guess = guess
        self.selection.set(self.bestGuess())
        self.current = self.bestGuess()
        self.refreshColour(pools)

    def refreshColour(self, pools):
        """Colour the row's stripe after its best-guess pool, if displayed."""
        col = None
        if self.guess[0][0] in pools:
            idx = pools.index(self.guess[0][0])
            colours = self.defaultColours()
            # Wrap around so more pools than colours cannot raise.
            col = colours[idx % len(colours)]
        if self.line:
            self.line.colourStripe.config(bg=col)

    def __repr__(self):
        return self.original.__repr__()

    def defaultColours(self):
        """Return the colour names assigned to pools, in pool order."""
        return ['green', 'yellow', 'lightblue', 'red', 'blue', 'orange', 'purple', 'pink']

    def bestGuess(self):
        """Return the name of the best-guess pool, or None without guesses."""
        if self.guess:
            return self.guess[0][0]
        else:
            return None
+
+
+
+
# Script entry point: open a trainer using the default guesser and
# item class.
if __name__ == "__main__":
    window = Tk()
    window.title('Reverend Trainer')
    window.minsize(width=300, height=300)
    display = Trainer(window)
    window.mainloop()
98 reverend/ui/util.py
@@ -0,0 +1,98 @@
+# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar:
+# amir@divmod.org. This is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+
+from Tkinter import *
+
class StatusBar(Frame):
    """Single-line status display.  Courtesy of Fredrik Lundh."""

    def __init__(self, master):
        Frame.__init__(self, master)
        self.label = Label(self, bd=1, relief=SUNKEN, anchor=W)
        self.label.pack(fill=X)

    def _show(self, message):
        """Put message on the label and repaint it right away."""
        self.label.config(text=message)
        self.label.update_idletasks()

    def set(self, format, *args):
        """Show format % args."""
        self._show(format % args)

    def clear(self):
        """Blank the status line."""
        self._show("")

    def log(self, text, clear=0):
        """Show text; when clear is non-zero, blank it after clear seconds."""
        self.set('%s', text)
        if not clear:
            return
        self.label.after(clear * 1000, self.clear)
+
+
class Command:
    """Callable that invokes a callback with arguments fixed up front.

    Lets a Tk widget command, which is called without arguments, carry
    its own arguments.

    Courtesy of Danny Yoo
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66521
    """
    def __init__(self, callback, *args, **kwargs):
        self.callback = callback
        self.args = args
        self.kwargs = kwargs

    def __call__(self):
        """Call the callback with the stored arguments; return its result."""
        # Argument unpacking replaces the deprecated apply() builtin.
        return self.callback(*self.args, **self.kwargs)
+
class Notebook:
    """Tabbed container: a column (or row) of buttons selecting a screen.

    Courtesy of Iuri Wickert
    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/188537
    """

    def __init__(self, master, side=LEFT):
        """Create the notebook inside master.

        side is the edge of master that holds the selector buttons.
        """
        self.active_fr = None
        self.count = 0
        # The first positional argument of IntVar is the master widget,
        # not the initial value, so the value is set explicitly.
        self.choice = IntVar()
        self.choice.set(0)

        # Buttons along the TOP or BOTTOM edge are laid out left to
        # right; buttons along a side edge are stacked.
        if side in (TOP, BOTTOM):
            self.side = LEFT
        else:
            self.side = TOP

        # creates notebook's frames structure
        self.rb_fr = Frame(master, borderwidth=2, relief=RIDGE)
        self.rb_fr.pack(side=side, fill=BOTH)
        self.screen_fr = Frame(master, borderwidth=2, relief=RIDGE)
        self.screen_fr.pack(fill=BOTH)

    def __call__(self):
        """Return the frame that screens must use as their master."""
        return self.screen_fr

    def add_screen(self, fr, title):
        """Add the frame fr as a new screen selected by a button titled title."""
        b = Radiobutton(self.rb_fr, text=title, indicatoron=0,
                        variable=self.choice, value=self.count,
                        command=lambda: self.display(fr))
        b.pack(fill=BOTH, side=self.side)

        # The first screen added is the one shown initially.
        if not self.active_fr:
            fr.pack(fill=BOTH, expand=1)
            self.active_fr = fr

        self.count += 1

    def display(self, fr):
        """Hide the active screen and show fr in its place."""
        self.active_fr.forget()
        fr.pack(fill=BOTH, expand=1)
        self.active_fr = fr
+
14 setup.py
@@ -0,0 +1,14 @@
+# This module is part of the Reverend project and is Copyright 2003 Amir
+# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
+# it and/or modify it under the terms of version 2.1 of the GNU Lesser
+# General Public License as published by the Free Software Foundation.
+
+from distutils.core import setup
+
# Distribution metadata for the Reverend package and its subpackages.
setup(
    name="Reverend",
    version="0.3",
    description="Divmod Reverend - a simple Bayesian classifier",
    author="Amir Bakhtiar",
    author_email="amir hat divmod point org",
    url="http://www.divmod.org/",
    packages=['reverend', 'reverend.ui', 'reverend.guessers'],
)
Please sign in to comment.
Something went wrong with that request. Please try again.