In [12]:
import pandas as pd
import nltk as n
from collections import Counter
import itertools as i
import numpy as np
from sklearn.metrics import accuracy_score

In [39]:
class NBClassifier:
    """Word-count based classifier that assigns a type label to a tokenised sentence.

    Training data is a sequence of rows (each row an iterable of word tokens)
    and a parallel sequence of type labels. A sentence is scored for a label by
    combining, over its words, ``P_word_type * P_type / P_word``; the label with
    the highest score is returned.

    Attributes:
        smooth: value returned in place of a ratio for unseen words.
        train: the training rows, as passed in.
        types: the training labels, as passed in.
        all_word_ctr: Counter of every token over all training rows.
        all_type_ctr: Counter of labels (number of training rows per label).
        type_ctrs: dict mapping each label to a Counter of the tokens found in
            that label's rows.
    """

    def __init__(self, train, types):
        """Count tokens and labels from the training data.

        Args:
            train: sequence of rows; each row is an iterable of tokens.
            types: sequence of labels, one per row of ``train``.

        Raises:
            ValueError: if ``train`` and ``types`` differ in length.
        """
        if len(train) != len(types):
            raise ValueError(
                "train and types must have the same length, got %d and %d"
                % (len(train), len(types))
            )
        self.smooth = 0.00001
        self.train = train
        self.types = types
        self.all_word_ctr = Counter(word for row in train for word in row)
        self.all_type_ctr = Counter(types)
        # One pass over the data instead of one scan of the rows per label.
        self.type_ctrs = {label: Counter() for label in self.all_type_ctr}
        for row, label in zip(train, types):
            self.type_ctrs[label].update(row)

    def P_type(self, T=''):
        """Return the fraction of training rows labelled ``T`` (0 for an unknown label)."""
        return self.all_type_ctr[T] / len(self.train)

    def P_word(self, W=''):
        """Return the overall weight of word ``W``.

        Unseen words get ``self.smooth``. Otherwise the token count of ``W`` is
        divided by the number of training *rows* (not tokens), so the value can
        exceed 1 for a word that occurs more than once per row on average.
        """
        if W not in self.all_word_ctr:
            return self.smooth
        return self.all_word_ctr[W] / len(self.train)

    def P_word_type(self, W='', T=''):
        """Return the weight of word ``W`` within label ``T``.

        Words never seen with ``T`` get ``self.smooth``. Otherwise the token
        count of ``W`` in rows labelled ``T`` is divided by the number of rows
        labelled ``T``.

        Raises:
            KeyError: if ``T`` was not among the training labels.
        """
        if W not in self.type_ctrs[T]:
            return self.smooth
        return self.type_ctrs[T][W] / self.all_type_ctr[T]

    def P_type_word(self, W='', T=''):
        """Return ``P_word_type(W, T) * P_type(T) / P_word(W)``."""
        return self.P_word_type(W, T) * self.P_type(T) / self.P_word(W)

    def P_type_sent(self, T='', S=''):
        """Return the product of ``P_type_word(word, T)`` over the words of ``S``.

        An empty ``S`` yields 1.0. For very long inputs the product can
        underflow to 0.0; ``classify`` therefore ranks labels in log space.
        """
        return np.prod([self.P_type_word(word, T) for word in S])

    def _log_score(self, T, sentence):
        """Return the sum of ``log(P_type_word(word, T))`` over ``sentence``.

        This orders labels the same way as ``P_type_sent`` but does not
        underflow when many factors below 1 are combined.
        """
        factors = [self.P_type_word(word, T) for word in sentence]
        # A zero factor (possible only if ``smooth`` is set to 0) maps to -inf
        # rather than emitting a warning.
        with np.errstate(divide='ignore'):
            return float(np.sum(np.log(factors)))

    def classify(self, sentence):
        """Return the best-scoring training label for ``sentence``.

        Labels are tried in the order they first appear in the training labels,
        and the first one wins a tie, so the result does not depend on set
        iteration order.

        Raises:
            ValueError: if the classifier was trained on no labels.
        """
        return max(self.all_type_ctr, key=lambda label: self._log_score(label, sentence))

    def classify_all(self, test, types):
        """Classify every sentence in ``test`` and return the accuracy against ``types``.

        Args:
            test: sequence of tokenised sentences.
            types: the true label of each sentence, in the same order.
        """
        hyp = [self.classify(sentence) for sentence in test]
        # accuracy_score takes the true labels first, then the predictions.
        return accuracy_score(types, hyp)

In [23]:
def load_names(path):
    """Read a tab-separated ``type<TAB>name`` file and add a tokenised ``clean`` column.

    Args:
        path: path of a header-less, latin-1 encoded, tab-delimited file.

    Returns:
        DataFrame with columns ``type``, ``name`` and ``clean``, where ``clean``
        holds the lower-cased, whitespace-split tokens of ``name``.
    """
    frame = pd.read_csv(path, delimiter='\t', encoding='latin-1', names=['type', 'name'])
    # Bracket access avoids relying on attribute-style column lookup, which
    # pandas only supports when the column name does not clash with an
    # existing DataFrame attribute.
    frame['clean'] = frame['name'].map(lambda text: text.lower().split())
    return frame


train = load_names('pnp-train.txt')
nbc = NBClassifier(list(train['clean']), list(train['type']))

test = load_names('pnp-test.txt')

In [42]:
# Score the held-out names with the trained classifier and report the accuracy.
test_sentences = list(test.clean)
test_labels = list(test.type)
nb_accuracy = nbc.classify_all(test_sentences, test_labels)
print("Naive Bayes accuracy = ", nb_accuracy)

Naive Bayes accuracy =  0.675047619048


In [29]:
# Reference points for the classifier's accuracy, both derived from the training labels:
# a uniform guess over the distinct labels, and the share of rows carrying the
# most frequent label.
distinct_labels = set(nbc.types)
random_baseline = 1 / len(distinct_labels)
print("random baseline = ", random_baseline)

largest_label_count = max(nbc.all_type_ctr.values())
print("most common baseline = ", largest_label_count / len(nbc.train))

random baseline =  0.2
most common baseline =  0.29817627732012764


In [41]:
# Submission setup: needs the `client` package (presumably the okpy client — confirm)
# to be installed in this kernel's environment.
# The recorded output of this cell is a ModuleNotFoundError for `client`, so `ok`
# is not defined until that package is available and the cell is re-run.
from client.api.notebook import Notebook
ok = Notebook('me.ok')
ok.auth(inline=True)

ModuleNotFoundError: No module named 'client'

In [None]:
# Calls submit() on the `ok` object; the cell that creates `ok` must have run successfully first.
ok.submit()