Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
import Reverend
git-svn-id: http://divmod.org/svn/Divmod/trunk/Reverend@2573 866e43f7-fbfc-0310-8f2a-ec88d1da2979
- Loading branch information
washort
committed
Oct 25, 2005
0 parents
commit bcfb0e2
Showing
14 changed files
with
1,291 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,6 @@ | |||
include README.txt | |||
include LICENSE | |||
include changelog.txt | |||
|
|||
recursive-include examples *.txt | |||
recursive-include examples *.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,39 @@ | |||
Reverend is a simple Bayesian classifier. | |||
It is designed to be easy to adapt and extend for | |||
your application. | |||
|
|||
A simple example would look like: | |||
|
|||
from reverend.thomas import Bayes | |||
|
|||
guesser = Bayes() | |||
guesser.train('fish', 'salmon trout cod carp') | |||
guesser.train('fowl', 'hen chicken duck goose') | |||
|
|||
guesser.guess('chicken tikka marsala') | |||
|
|||
You can also "forget" some training: | |||
guesser.untrain('fish','salmon carp') | |||
|
|||
The first argument of train is the bucket or class that | |||
you want associated with the training. If the bucket does | |||
not exist, Bayes will create it. The second argument | |||
is the object that you want Bayes to be trained on. By | |||
default, Bayes expects a string and uses something like | |||
string.split to break it into individual tokens (words). | |||
It uses these tokens as the basis of its bookkeeping. | |||
|
|||
|
|||
The two ways to extend it are: | |||
1. Pass in a function as the tokenizer when creating | |||
your Bayes. The function should expect one argument | |||
which will be whatever you pass to the train() method. | |||
The function should return a list of strings, which | |||
are the tokens that are relevant to your app. | |||
|
|||
2. Subclass Bayes and override the method getTokens to | |||
return a list of string tokens relevant to your app. | |||
|
|||
|
|||
I hope all your guesses are right, | |||
amir@divmod.org |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,40 @@ | |||
25 November 2004 | |||
Release 0.3 | |||
Fixed error in calculation. | |||
Simpler regex tokenization. Now works with unicode. | |||
Removed split.py. | |||
|
|||
5 October 2003 | |||
Release 0.2.4 | |||
Added utility methods for removing, renaming and merging Pools: | |||
removePool(), renamePool() and mergePools() | |||
|
|||
Also added utility methods for inspecting pool data: | |||
poolData() and poolTokens() | |||
|
|||
All of these methods take pool names as arguments. | |||
|
|||
25 Aug 2003 | |||
Release 0.2.3 | |||
Made it possible to pass an iterator of tokens. | |||
|
|||
16 Aug 2003 | |||
Release 0.2.2 | |||
Added ability to "forget" training using Bayes.untrain() | |||
|
|||
2 Aug 2003 | |||
Release 0.2.1 | |||
Removed the declaration of slots from the tokenizer to make it | |||
play nice with Quotient. No change in functionality. | |||
|
|||
16 June 2003 | |||
Release 0.2 | |||
Added basic GUI for training and testing. | |||
Made the storage class pluggable, so different storage managers | |||
can be used. | |||
Some convenience functions and better repr. | |||
Removed some code that was not being run. | |||
|
|||
18 May 2003 | |||
Release 0.1 | |||
Initial release |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,63 @@ | |||
# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar: | |||
# amir@divmod.org. This is free software; you can redistribute it and/or | |||
# modify it under the terms of version 2.1 of the GNU Lesser General Public | |||
# License as published by the Free Software Foundation. | |||
# | |||
|
|||
from email.Message import Message | |||
import email | |||
import rfc822 | |||
|
|||
class EmailItem(Message): | |||
def summary(self): | |||
return { | |||
'From': self.sender(), | |||
'Subject':self.get('subject','<No Subject>'), | |||
} | |||
|
|||
def sender(self): | |||
fromHeader = self['from'] or '"Nobody" <nobody@nowhere>' | |||
hdrs = rfc822.AddressList(fromHeader).addresslist | |||
for dispname, addr in hdrs: | |||
dispname = dispname.strip().strip('"') | |||
addr = addr.strip() | |||
if dispname == '': | |||
dispname = addr | |||
return dispname | |||
|
|||
def columnDefs(self): | |||
return [('From', 20), ('Subject', 30)] | |||
columnDefs = classmethod(columnDefs) | |||
|
|||
def fromFile(self, fp): | |||
try: | |||
msg = email.message_from_file(fp, self) | |||
except email.Errors.MessageParseError: | |||
print 'bad message' | |||
return None | |||
return msg | |||
fromFile = classmethod(fromFile) | |||
|
|||
def runTrainer():
    """Build the Tk trainer window for email items and run its main loop."""
    # Imports are local so that merely importing this module does not
    # pull in Tkinter or the reverend packages.
    from reverend.ui.trainer import Trainer
    from Tkinter import Tk
    from reverend.guessers.email import EmailClassifier
    from reverend.thomas import Bayes

    window = Tk()
    window.title('Reverend Trainer')
    window.minsize(width=300, height=300)
    #window.maxsize(width=600, height=600)

    classifier = EmailClassifier()
    trainerView = Trainer(window, guesser=classifier, itemClass=EmailItem)
    window.mainloop()
|
|||
def runTester(): | |||
from reverend.ui.tester import DirectoryExam | |||
de = DirectoryExam('spam', 'Spam', EmailItem) | |||
for m, ans in de: | |||
print m['from'], ans | |||
|
|||
|
|||
# Script entry point: launches the training UI.  Swap the comment marker
# below to run the tester instead.
if __name__ == '__main__':
    runTrainer()
    #runTester()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,45 @@ | |||
This brief readme is designed to help you get | |||
started with using the Reverend training and | |||
testing UI. | |||
|
|||
This is how I use the trainer. | |||
|
|||
I first prepare a couple of directories full of | |||
email. One will have a mix of all kinds of email | |||
that I want to classify and one for testing that | |||
is, say, containing only spam files. | |||
|
|||
I type: | |||
python emailtrainer.py | |||
|
|||
I click on the 'New Pool' button and create a | |||
pool for each category or bucket that I want to | |||
classify the data into. e.g. 'Clean' and 'Spam'. | |||
|
|||
I use the radio buttons to classify the emails. | |||
I page back and forth to make sure that new | |||
training does not undo old training. | |||
|
|||
Once I am happy with the training, I click 'Save' | |||
to save the Reverend data. I can load it later | |||
and continue training. | |||
|
|||
When I want to test, I load the Reverend data | |||
using the 'Load' button. I then click on the | |||
'Testing' button on the left. I click 'Run | |||
Test' which brings up the first of 2 dialogs, | |||
asking me to select the test data, e.g. my | |||
directory full of spam. The next dialog asks | |||
for the correct answer to this set of messages. | |||
I type in 'Spam' (case must match your pool name). | |||
|
|||
I have lots of improvements in mind from training | |||
reinforcement to better testing and analysis. | |||
|
|||
The trainer is designed to be data-agnostic. Look | |||
at examples/emailtrainer.py to see how you can | |||
simply wrap your domain objects and make them | |||
play nice with the UI. | |||
|
|||
Enjoy, | |||
-A- |
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,107 @@ | |||
# This module is part of the Divmod project and is Copyright 2003 Amir Bakhtiar: | |||
# amir@divmod.org. This is free software; you can redistribute it and/or | |||
# modify it under the terms of version 2.1 of the GNU Lesser General Public | |||
# License as published by the Free Software Foundation. | |||
# | |||
|
|||
import os, sys | |||
from rfc822 import AddressList | |||
import email | |||
|
|||
from reverend.thomas import Bayes | |||
from reverend.splitter import Splitter | |||
|
|||
|
|||
class EmailClassifier(Bayes):
    """Bayes classifier whose tokens are derived from email messages.

    Tokens come from three sources: selected headers, the text/plain
    body, and synthetic "meta" tokens describing header values, body
    sizes and recipient counts.
    """

    def getTokens(self, msg):
        """Return the list of string tokens used as keys into the token counts.

        Overrides the parent's getTokens.  *msg* is an email message
        object (it must support get() and walk()).
        """
        tokens = self.getHeaderTokens(msg)
        tokens += self.getBodyTokens(msg)

        # Get some tokens that are generated from the
        # header and the structure
        tokens += self.getMetaTokens(msg)
        return tokens

    def getBodyTokens(self, msg):
        """Return the tokens of the first text/plain part ('' when absent)."""
        text = self.getTextPlain(msg)
        if text is None:
            text = ''
        return self.splitter.split(text)

    def getHeaderTokens(self, msg):
        """Return tokens split from the Subject, From, To and Cc headers.

        A missing header contributes a placeholder word ('nosubject',
        'fromnoone', 'tonoone', 'ccnoone') so that absence is itself a
        token.
        """
        text = msg.get('subject', 'nosubject') + ' '
        text += msg.get('from', 'fromnoone') + ' '
        text += msg.get('to', 'tonoone') + ' '
        text += msg.get('cc', 'ccnoone') + ' '
        return self.splitter.split(text)

    def _getFirstPart(self, msg, contentType, decode):
        """Return the payload of the first part of type *contentType*, or None."""
        # NOTE(review): Message.get_type() is believed to be deprecated and
        # later removed from the email package; get_content_type() is the
        # likely replacement but reports a default type when the header is
        # missing, which would change the tokens produced.  Confirm before
        # porting.
        for part in msg.walk():
            typ = part.get_type()
            if typ and typ.lower() == contentType:
                return part.get_payload(decode=decode)
        return None

    def getTextPlain(self, msg):
        """Return the decoded payload of the first text/plain part, or None."""
        return self._getFirstPart(msg, "text/plain", True)

    def getTextHtml(self, msg):
        """Return the undecoded payload of the first text/html part, or None."""
        return self._getFirstPart(msg, "text/html", False)

    def getMetaTokens(self, msg):
        """Return tokens describing the message's headers and structure.

        The tokens are:
          * '<header>:<value>' for a fixed set of headers ('None' when
            the header is absent);
          * '<stem>_None' or '<stem>_True' for the text and html parts,
            followed by cumulative word-count buckets for present parts;
          * 'to_more_than_N' / 'cc_more_than_N' when the To / Cc headers
            hold more than 5 or 10 addresses.
        """
        r = []
        for f in ['Content-type', 'X-Priority', 'X-Mailer',
                  'content-transfer-encoding', 'X-MSMail-Priority']:
            r.append(f + ':' + msg.get(f, 'None'))

        text = self.getTextPlain(msg)
        html = self.getTextHtml(msg)

        for stem, part in zip(['text', 'html'], [text, html]):
            if part is None:
                r.append(stem + '_None')
                continue
            r.append(stem + '_True')

            wordCount = len(part.split())
            # Equality, not identity: 'is 0' only worked by accident of
            # small-integer caching.
            if wordCount == 0:
                r.append(stem + 'zero')
            # The buckets are cumulative: a very long part emits all three.
            if wordCount > 10000:
                r.append(stem + 'more_than_10000')
            if wordCount > 1000:
                r.append(stem + 'more_than_1000')
            if wordCount > 100:
                r.append(stem + 'more_than_100')

        toAddrs = AddressList(msg.get('to', '')).addresslist
        ccAddrs = AddressList(msg.get('cc', '')).addresslist

        # Compare the NUMBER of addresses.  The previous code compared the
        # list object itself to an int, which Python 2 evaluates as always
        # true, so every message received all four tokens.
        toCount = len(toAddrs)
        ccCount = len(ccAddrs)
        if toCount > 5:
            r.append('to_more_than_5')
        if toCount > 10:
            r.append('to_more_than_10')
        if ccCount > 5:
            r.append('cc_more_than_5')
        if ccCount > 10:
            r.append('cc_more_than_10')

        return r
|
Oops, something went wrong.