# Settings

In [0]:
import re
import math

In [0]:
def getwords(doc):
  """Extract the distinct lowercase words of a text.

  The text is split on runs of non-word characters. A token is kept only
  when it is longer than 2 and shorter than 20 characters, and is then
  lowercased.

  Args:
    doc: The text to tokenize.

  Returns:
    A dict mapping every distinct kept word to 1 (a set-like feature map).
  """
  # `\W+` (one or more) rather than `\W*`: a pattern that can match the empty
  # string makes re.split() cut between every character on Python 3.7+ (and
  # triggers a FutureWarning on earlier 3.x), so no token survives the
  # length filter below.
  splitter = re.compile(r'\W+')
  # Split the text on non-alphanumeric characters.
  words = [s.lower() for s in splitter.split(doc) if 2 < len(s) < 20]
  # Return only the unique words; duplicates collapse onto a single key.
  return {w: 1 for w in words}

In [3]:
# One word repeated several times: getwords() returns a dict keyed by word,
# so the repeats collapse into a single entry.
s = 'asdf asdf asdf asdf'
getwords(s)

  after removing the cwd from sys.path.


{'asdf': 1}

In [4]:
# Mixed-case sentence with short words: getwords() lowercases the tokens and
# keeps only those longer than 2 and shorter than 20 characters.
s = 'I have a PEN and Apple.'
getwords(s)

  after removing the cwd from sys.path.


{'and': 1, 'apple': 1, 'have': 1, 'pen': 1}

# Simple Classifier (word-level)

In [0]:
class classifier:
  """Keeps feature and category counts and derives probabilities from them.

  State lives in two plain dicts:
    fc (feature count):  feature -> {category -> times counted}
    cc (category count): category -> number of train() calls for it
  """

  def __init__(self, getfeatures, filename=None):
    """Store the feature extractor; `filename` is accepted but not used here."""
    self.getfeatures = getfeatures
    self.fc = {}
    self.cc = {}

  def incf(self, f, cat):
    """Add one to the count of feature `f` under category `cat`."""
    per_category = self.fc.setdefault(f, {})
    per_category[cat] = per_category.get(cat, 0) + 1

  def incc(self, cat):
    """Add one to the count of category `cat`."""
    self.cc[cat] = self.cc.get(cat, 0) + 1

  def fcount(self, f, cat):
    """Return the count of `f` under `cat` as a float (0.0 when unseen)."""
    return float(self.fc.get(f, {}).get(cat, 0))

  def catcount(self, cat):
    """Return the count of `cat` as a float, or the integer 0 when unseen."""
    if cat not in self.cc:
      return 0
    return float(self.cc[cat])

  def totalcount(self):
    """Return the sum of all category counts."""
    total = 0
    for count in self.cc.values():
      total += count
    return total

  def categories(self):
    """Return the keys view of the known categories."""
    return self.cc.keys()

  def train(self, item, cat):
    """Count every feature of `item` under `cat`, then count `cat` itself."""
    for feature in self.getfeatures(item):
      self.incf(feature, cat)
    self.incc(cat)

  def fprob(self, f, cat):
    """Return count(feature, category) / count(category).

    Returns the integer 0 when the category has no count.
    """
    in_category = self.catcount(cat)
    if in_category == 0:
      return 0
    return self.fcount(f, cat) / in_category

  def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
    """Blend `prf(f, cat)` with an assumed probability for sparse features.

    Args:
      f: The feature.
      cat: The category.
      prf: Probability function called as prf(f, cat).
      weight: Weight given to the assumed probability.
      ap: The assumed probability.

    Returns:
      (weight*ap + totals*prf(f, cat)) / (weight + totals), where `totals`
      is the count of `f` summed over every known category.
    """
    basicprob = prf(f, cat)
    totals = sum(self.fcount(f, c) for c in self.categories())
    numerator = (weight * ap) + (totals * basicprob)
    return numerator / (weight + totals)

In [6]:
# Build a classifier that uses getwords() as its feature extractor, train it
# on one 'good' and one 'bad' sentence, then show the count of 'quick' in each.
cl = classifier(getwords)
cl.train('the quick brown fox jumps over the lazy dog', 'good')
cl.train('make quick money in the online casino', 'bad')
cl.fcount('quick', 'good'), cl.fcount('quick', 'bad')

  after removing the cwd from sys.path.


(1.0, 1.0)

In [0]:
def sampletrain(cl):
  """Train `cl` on a small, fixed set of labelled sentences.

  Args:
    cl: Any object exposing train(item, cat); it is called once per sample,
      in the order listed below.
  """
  samples = (
    ('Nobody owns the water.', 'good'),
    ('the quick rabbit jumps fences', 'good'),
    ('buy pharmaceuticals now', 'bad'),
    ('make quick money at the online casino', 'bad'),
    ('the quick brown fox jumps over the lazy dog', 'good'),
  )
  for text, label in samples:
    cl.train(text, label)

In [8]:
# Train the same `cl` object again with the sample sentences; these counts are
# added on top of the ones recorded by the earlier train() calls on it.
sampletrain(cl)
cl.fcount('quick', 'good'), cl.fcount('quick', 'bad')

  after removing the cwd from sys.path.


(3.0, 2.0)

In [9]:
# Inspect the raw feature -> {category: count} table.
cl.fc

{'brown': {'good': 2},
 'buy': {'bad': 1},
 'casino': {'bad': 2},
 'dog': {'good': 2},
 'fences': {'good': 1},
 'fox': {'good': 2},
 'jumps': {'good': 3},
 'lazy': {'good': 2},
 'make': {'bad': 2},
 'money': {'bad': 2},
 'nobody': {'good': 1},
 'now': {'bad': 1},
 'online': {'bad': 2},
 'over': {'good': 2},
 'owns': {'good': 1},
 'pharmaceuticals': {'bad': 1},
 'quick': {'bad': 2, 'good': 3},
 'rabbit': {'good': 1},
 'the': {'bad': 2, 'good': 4},
 'water': {'good': 1}}

In [10]:
# Inspect the category -> count table (one increment per train() call).
cl.cc

{'bad': 3, 'good': 4}

In [11]:
# Plain ratio: count of 'quick' under 'good' divided by the count of 'good'.
cl.fprob('quick', 'good')

0.75

In [12]:
# Same ratio for a word that the training sentences only ever label 'bad'.
cl.fprob('money', 'good')

0.0

In [13]:
# Same query through weightedprob(), which blends the plain ratio with the
# assumed probability (ap=0.5 by default) so a sparse word is not forced to 0.
cl.weightedprob('money', 'good', cl.fprob)

0.16666666666666666

# Bayesian Classifier (document-level)


In [0]:
class naivebayes(classifier):
  """Naive Bayes document classifier built on the counts kept by `classifier`.

  A document is scored per category as P(document | category) * P(category),
  where P(document | category) is the product of the weighted probabilities
  of its features. Each category may carry a threshold: the winning category
  is accepted only if its score is at least `threshold` times the score of
  every other category.
  """

  def __init__(self, getfeatures):
    classifier.__init__(self, getfeatures)
    # category -> threshold; categories without an entry use 1.0.
    self.thresholds = {}

  def setthreshold(self, cat, t):
    """Set the threshold `t` for category `cat`."""
    self.thresholds[cat] = t

  def getthreshold(self, cat):
    """Return the threshold for `cat`, or 1.0 when none was set."""
    return self.thresholds.get(cat, 1.0)

  def docprob(self, item, cat):
    """Return P(document | category): the product over the item's features
    of weightedprob(feature, cat, fprob). An item with no features gives 1.
    """
    features = self.getfeatures(item)
    p = 1
    for f in features:
      p *= self.weightedprob(f, cat, self.fprob)
    return p

  def prob(self, item, cat):
    """Return P(document | category) * P(category).

    Returns 0.0 when nothing has been trained yet, instead of dividing by a
    total count of zero.
    """
    total = self.totalcount()
    if total == 0:
      return 0.0
    catprob = self.catcount(cat) / total
    docprob = self.docprob(item, cat)
    return docprob * catprob

  def classify(self, item, default=None):
    """Return the most probable category for `item`, or `default`.

    Args:
      item: The document to classify.
      default: Value returned when no category can be chosen.

    Returns:
      The category with the highest score, provided that score is not
      beaten by `threshold(best) * score(other)` for any other category;
      otherwise `default`. `default` is also returned when there are no
      categories or every score is 0.
    """
    probs = {}
    max_prob = 0.0
    best = None
    for cat in self.categories():
      probs[cat] = self.prob(item, cat)
      if probs[cat] > max_prob:
        max_prob = probs[cat]
        best = cat

    # No category scored above zero (or none exist): nothing to compare, and
    # probs[best] below would fail with a KeyError.
    if best is None:
      return default

    threshold = self.getthreshold(best)
    for cat in probs:
      if cat == best:
        continue
      # The best score must exceed every other score by the threshold factor.
      if probs[cat] * threshold > probs[best]:
        return default
    # Return only after ALL other categories were checked; returning inside
    # the loop would test just the first rival and would yield None when
    # `best` is the only category.
    return best


In [15]:
# Rebind `cl` to a fresh naivebayes instance (discarding the earlier counts)
# and train it on the sample sentences.
cl = naivebayes(getwords)
sampletrain(cl)

  after removing the cwd from sys.path.


In [16]:
# NOTE(review): a notebook cell displays only its last expression, so the
# prob() value on the first line is computed but never shown; only the
# classify() result appears in the output.
cl.prob('quick rabbit', 'good')
cl.classify('quick rabbit', default='unknown')

  after removing the cwd from sys.path.


'good'

In [17]:
# Score of the same text for 'bad': P(document | 'bad') * P('bad').
cl.prob('quick rabbit', 'bad')

  after removing the cwd from sys.path.


0.05

In [18]:
# Classify with the default thresholds (1.0); 'unknown' is what classify()
# hands back when the threshold comparison fails.
cl.classify('quick money', default='unknown')

  after removing the cwd from sys.path.


'bad'

In [19]:
# Require the 'bad' score to be at least 3x the competing score before 'bad'
# is accepted, then classify the same text again.
cl.setthreshold('bad', 3.0)
cl.classify('quick money', default='unknown')

  after removing the cwd from sys.path.


'unknown'

# Fisher Classifier (document-level)

In [0]:
class fisherclassifier(classifier):
  """Fisher-method document classifier built on the counts of `classifier`."""

  def cprob(self, f, cat):
    """Return fprob(f, cat) divided by the sum of fprob(f, c) over all
    known categories c. Returns the integer 0 when fprob(f, cat) is 0.
    """
    clf = self.fprob(f, cat)
    if clf == 0:
      return 0
    freqsum = sum([self.fprob(f, c) for c in self.categories()])
    p = clf / freqsum
    return p

  def fisherprob(self, item, cat):
    """Combine the per-feature probabilities of `item` for `cat`.

    Computes -2 * ln(product of weightedprob(f, cat, cprob)) and feeds it,
    with twice the number of features as degrees of freedom, to invchi2().

    Args:
      item: The document to score.
      cat: The category to score it against.

    Returns:
      The value of invchi2() for the combined score, in [0, 1].
    """
    features = self.getfeatures(item)
    # Accumulate ln(p) per feature instead of multiplying the probabilities
    # first: with many features the running product underflows to 0.0 and
    # math.log(0.0) raises "ValueError: math domain error".
    log_p = 0.0
    for f in features:
      log_p += math.log(self.weightedprob(f, cat, self.cprob))
    fscore = -2 * log_p
    return self.invchi2(fscore, len(features)*2)

  def invchi2(self, chi, df):
    """Return min(1.0, exp(-m) * sum_{i=0}^{df//2 - 1} m**i / i!), m = chi/2.

    Args:
      chi: The chi-square statistic.
      df: Degrees of freedom; only df // 2 is used.
    """
    m = chi / 2.0
    # `term` holds exp(-m) * m**i / i!, updated incrementally per iteration.
    sums = term = math.exp(-m)
    for i in range(1, df//2):
      term *= m / i
      sums += term
    return min(sums, 1.0)

In [21]:
# Rebind `cl` to a fresh fisherclassifier instance and train it on the
# sample sentences.
cl = fisherclassifier(getwords)
sampletrain(cl)

  after removing the cwd from sys.path.


In [22]:
# Inspect the feature -> {category: count} table of the new instance.
cl.fc

{'brown': {'good': 1},
 'buy': {'bad': 1},
 'casino': {'bad': 1},
 'dog': {'good': 1},
 'fences': {'good': 1},
 'fox': {'good': 1},
 'jumps': {'good': 2},
 'lazy': {'good': 1},
 'make': {'bad': 1},
 'money': {'bad': 1},
 'nobody': {'good': 1},
 'now': {'bad': 1},
 'online': {'bad': 1},
 'over': {'good': 1},
 'owns': {'good': 1},
 'pharmaceuticals': {'bad': 1},
 'quick': {'bad': 1, 'good': 2},
 'rabbit': {'good': 1},
 'the': {'bad': 1, 'good': 3},
 'water': {'good': 1}}

In [23]:
# fprob('quick', 'good') as a share of fprob('quick', c) summed over all
# categories.
cl.cprob('quick', 'good')

0.5714285714285715

In [24]:
# Combined Fisher score of the text for category 'good'.
cl.fisherprob('quick rabbit', 'good')

  after removing the cwd from sys.path.


0.78013986588958

In [25]:
# Combined Fisher score of the same text for category 'bad'.
cl.fisherprob('quick rabbit', 'bad')

  after removing the cwd from sys.path.


0.35633596283335256