In [207]:
import re
import math

def get_words(doc):
    splitter = re.compile('\W')
    # Split the words by non-alpha characters.
    words = [s.lower() for s in splitter.split(doc)
             if len(s) > 2 and len(s) < 20]
    
    # Return the unique set of words only.
    return dict([(w,1) for w in words])

In [240]:
import sqlite3

class Classifier:
    def __init__(self, get_features, filename=None):
        # Counts of feature/category combinations.
        self.fc = {}
        
        # Counts of documents in each category.
        self.cc = {}
        
        self.get_features = get_features
        
        self.conn = None
    
    def set_db(self, dbfile):
        self.conn = sqlite3.connect(dbfile)
        c = self.conn.cursor()
        c.execute('create table if not exists fc(feature, category, count)')
        c.execute('create table if not exists cc(category, count)')
        self.conn.commit()
        
    def __del__(self):
        if not self.conn is None:
            self.conn.close()
        
    # Increase the count of a feature/category pair.
    def incf(self, f, cat):
        # self.fc.setdefault(f, {})
        # self.fc[f].setdefault(cat, 0)
        # self.fc[f][cat] += 1
        # count = self.fcount(f, cat)
        count = self.fcount(f, cat)
        c = self.conn.cursor()
        if count == 0:
            c.execute('insert into fc values (?,?,-1)', (f, cat))
        else:
            c.execute('update fc set count = ? where feature = ? and category = ?', (count + 1, f, cat))
        self.conn.commit()
    
    # Increase the count of a category.
    def incc(self, cat):
        # self.cc.setdefault(cat, 0)
        # self.cc[cat] += 1
        count = self.catcount(cat)
        c = self.conn.cursor()
        if count == 0:
            c.execute('insert into cc values (?, 1)', (cat,))
        else:
            c.execute('update cc set count = ? where category = ?', (count + 1, cat))
        self.conn.commit()
    
    # The number of times a feature has appeared in a category.
    def fcount(self, f, cat):
        # if f in self.fc and cat in self.fc[f]:
        #    return float(self.fc[f][cat])
        # return 0.0
        c = self.conn.cursor()
        res = c.execute('select count from fc where feature = ? and category = ?', (f, cat)).fetchone()
        if res == None: return 0
        return float(res[0])

    # The number of items in a category.
    def catcount(self, cat):
        # if cat in self.cc:
        #     return float(self.cc[cat])
        # return 0
        c = self.conn.cursor()
        res = c.execute('select count from cc where category = ?', (cat,)).fetchone()
        if res == None: return 0
        return float(res[0])
    
    # The total number of items.
    def totalcount(self):
        # return sum(self.cc.values())
        c = self.conn.cursor()
        res = c.execute('select sum(count) from cc').fetchone()
        if res == None: return 0
        return res[0]
    
    # The list of all categories.
    def categories(self):
        # return self.cc.keys()
        c = self.conn.cursor()
        cur = c.execute('select category from cc')
        return [d[0] for d in cur]
    
    def train(self, item, cat):
        features = self.get_features(item)
        
        # Increment the count for every feature with this category.
        for f in features:
            self.incf(f, cat)
        
        # Increment the count for this category.
        self.incc(cat)
        
    def fprob(self, f, cat):
        if self.catcount(cat) == 0: return 0
        # The total number of times this feature appeared in this category divided by the total number
        # of items in this category.
        return self.fcount(f, cat) / self.catcount(cat)
    
    def weighted_prob(self, f, cat, prf, weight=1.0, assumed_prob=0.5):
        # Calculate the current probability.
        basic_prob = prf(f, cat)
        
        # Count the number of times this feature has appeared in all categories.
        totals = sum([self.fcount(f, c) for c in self.categories()])
        
        # Calculate the weighted average.
        bp = ((weight * assumed_prob) + (totals * basic_prob)) / (weight + totals)
        return bp

In [241]:
clf = Classifier(get_words)
clf.set_db('test1.db')

In [242]:
# clf.train('the quick brown fox jumps over the lazy dog', 'good')
# clf.train('make quick money in the online casino', 'bad')
# clf.fcount('quick', 'good')

In [243]:
clf.fcount('quick', 'bad')

0.0

In [244]:
def sample_train(clf):
    clf.train('Nobody owns the water.', 'good')
    clf.train('the quick rabbit jumps fences', 'good')
    clf.train('buy pharmaceuticals now', 'bad')
    clf.train('make quick money at the online casino', 'bad')
    clf.train('the quick brown fox jumps', 'good')
sample_train(clf)

In [245]:
clf.fprob('quick', 'good')

0.0

In [246]:
clf.weighted_prob('money', 'good', clf.fprob)

0.5

In [247]:
class NaiveBayes(Classifier):
    def __init__(self, get_features):
        Classifier.__init__(self, get_features)
        self.thresholds = {}

    def set_threshold(self, cat, t):
        self.thresholds[cat] = t
        
    def get_threshold(self, cat):
        if cat not in self.thresholds: return 1.0
        return self.thresholds[cat]

    def docprob(self, item, cat):
        features = self.get_features(item)
        
        # Multiply the probabilities of all the features together.
        p = 1
        for f in features: p *= self.weighted_prob(f, cat, self.fprob)
        return p
    
    def prob(self, item, cat):
        catprob = self.catcount(cat) / self.totalcount()
        docprob = self.docprob(item, cat)
        return docprob * catprob
    
    def classify(self, item, default=None):
        probs = {}
        # Find the category with the highest probability.
        max = 0.0
        for cat in self.categories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > max:
                max = probs[cat]
                best = cat

        # Make sure the probability exceeds threshold * next best.
        for cat in probs:
            if cat == best: continue
            if probs[cat] * self.get_threshold(best) > probs[best]: return default
        return best

In [248]:
clf = NaiveBayes(get_words)
clf.set_db('test1.db')
sample_train(clf)
clf.prob('quick rabbit', 'good')

0.15

In [249]:
clf.prob('quick rabbit', 'bad')

0.1

In [250]:
clf = NaiveBayes(get_words)
clf.set_db('test1.db')
sample_train(clf)
clf.classify('quick rabbit', default='unknown')

'good'

In [251]:
clf.classify('quick money', default='unknown')

'good'

In [252]:
clf.set_threshold('bad', 3.0)

In [253]:
clf.classify('quick money', default='unknown')

'good'

In [254]:
for i in range(10): sample_train(clf)

In [255]:
clf.classify('quick money', default='unknown')

'good'

## The Fisher Method

In [256]:
class FisherClassifier(Classifier):
    def __init__(self, get_features):
        Classifier.__init__(self, get_features)
        self.minimums = {}
        
    def set_minimum(self, cat, min):
        self.minimums[cat] = min
    
    def get_minimum(self, cat):
        if cat not in self.minimums: return 0
        return self.minimums[cat]

    def cprob(self, f, cat):
        # The frequency of this feature in this category.
        clf = self.fprob(f, cat)
        if clf == 0: return 0
        
        # The frequency of this feature in all the categories.
        freq_sum = sum([self.fprob(f,c) for c in self.categories()])
        
        # The probability is the frequency in the category divided by the overall frequency.
        p = clf / freq_sum
        
        return p
    
    def fisherprob(self, item, cat):
        # Multiply all the probabilities together.
        p = 1
        features = self.get_features(item)
        for f in features:
            p *= (self.weighted_prob(f, cat, self.cprob))
            
        # Take the natural log and multiply by -2.
        fscore = -2 * math.log(p)
        
        # Use the inverse chi2 function to get the probability.
        return self.invchi2(fscore, len(features) * 2)
    
    def invchi2(self, chi, df):
        m = chi / 2.0
        sum = term = math.exp(-m)
        for i in range(1, df//2):
            term *= m / i
            sum += term
        return min(sum, 1.0)
    
    def classify(self, item, default=None):
        # Loop through looking for the best result.
        best = default 
        max = 0.0
        for c in self.categories():
            p = self.fisherprob(item, c)
            # Make sure it exceeds its minimum.
            if p > self.get_minimum(c) and p > max:
                best = c
                max = p
        return best

In [257]:
clf = FisherClassifier(get_words)
clf.set_db('test1.db')
sample_train(clf)
clf.cprob('quick', 'good')

0

In [258]:
clf.cprob('money', 'bad')

0

In [259]:
clf.weighted_prob('money', 'bad', clf.cprob)

0.5

In [268]:
# With fisherprob.
clf = FisherClassifier(get_words)
clf.set_db('test1.db')
sample_train(clf)
clf.cprob('quick', 'good')

0

In [269]:
clf.fisherprob('quick rabbit', 'good')

0.5965735902799727

In [270]:
clf.fisherprob('quick rabbit', 'bad')

0.5965735902799727

In [271]:
clf.classify('quick rabbit')

'good'

In [272]:
clf.classify('quick money')

'good'

In [273]:
clf.set_minimum('bad', 0.8)
clf.classify('quick money')

'good'

In [274]:
clf.set_minimum('good', 0.4)
clf.classify('quick money')

'good'

## Persisting the trained classifier

Why not use pickle? Pickle is a serializer, with sqlite3, there are flexible queries etc that could be more useful.