In [14]:
import numpy as np
from collections import defaultdict, Counter
from math import log

# Bayes' rule: P(class|doc) = P(doc|class) * P(class) / P(doc)
# P(class|doc) - probability that the document belongs to the class
# P(doc|class) - probability of observing the document within the class
# P(class) - prior probability of the class (normalized counter)
# P(doc) - constant across classes, so it can be dropped
# answer: argmax_CLS P(CLS|doc) for CLS in ALL_CLASSES
# P(doc|class) = P(w1|class) * ... * P(wn|class) = П P(wi|class)


class myNB:
    """Naive Bayes style classifier over a few characters of each string.

    Every string is reduced to its first, second and last characters (fewer
    for strings shorter than three characters). ``fit`` stores, per class,
    the class prior and how often each character occurs as a feature,
    divided by the number of training strings of that class. ``predict``
    picks the class with the smallest negative log score.

    NOTE(review): features are pooled regardless of position and normalised
    by the per-class document count, so a stored frequency can exceed 1
    (e.g. 'aaa' contributes 'a' three times). The scoring is kept as is so
    existing predictions do not change.

    Attributes:
        unseen_prob: value substituted for a (class, feature) pair that was
            never seen during training.
        classes: mapping label -> prior probability (filled by ``fit``).
        freqs: mapping (label, feature) -> normalised feature frequency
            (filled by ``fit``).
    """

    def __init__(self, unseen_prob=10 ** (-5)):
        """Create an unfitted model.

        Args:
            unseen_prob: positive substitute frequency for unseen
                (class, feature) pairs; must be > 0 because its logarithm
                is taken in ``predict``.

        Raises:
            ValueError: if ``unseen_prob`` is not positive.
        """
        if unseen_prob <= 0:
            raise ValueError('unseen_prob must be positive')
        self.unseen_prob = unseen_prob
        self.classes = defaultdict(int)
        self.freqs = defaultdict(int)

    def __extract_features(self, string):
        """Return the characters used as features for ``string``.

        Strings of length >= 3 yield [first, second, last]; strings of
        length 1 or 2 yield [first, last] (the same character twice for a
        single-character string); the empty string yields [].
        """
        if len(string) >= 3:
            return [string[0], string[1], string[-1]]
        if len(string) >= 1:
            return [string[0], string[-1]]
        return []

    def fit(self, X, y):
        """Estimate class priors and per-class feature frequencies.

        Any state from a previous ``fit`` call is discarded; ``unseen_prob``
        is kept.

        Args:
            X: sequence of strings.
            y: sequence of labels, same length as ``X``.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if ``X`` and ``y`` differ in length.
        """
        if len(X) != len(y):
            raise ValueError('X and y must have the same length')

        # Start from a clean state instead of re-invoking __init__.
        self.classes = Counter(y)
        self.freqs = defaultdict(int)

        # Raw (label, feature) occurrence counts.
        for item, label in zip(X, y):
            for feat in self.__extract_features(item):
                self.freqs[(label, feat)] += 1

        # Normalise feature counts by the number of documents in the class.
        # This must happen while self.classes still holds raw counts.
        for label, feat in self.freqs:
            self.freqs[(label, feat)] /= self.classes[label]

        # Turn class counts into priors.
        total = len(y)
        for label in self.classes:
            self.classes[label] /= total

        return self

    def _neg_log_score(self, label, feats):
        """Negative log of prior times feature frequencies for ``label``."""
        return (
            -log(self.classes[label])
            + sum(-log(self.freqs.get((label, feat), self.unseen_prob)) for feat in feats)
        )

    def predict(self, X):
        """Predict a label for every string in ``X``.

        Args:
            X: sequence of strings.

        Returns:
            A float numpy array of shape ``(len(X),)`` holding the chosen
            labels; labels therefore have to be convertible to float.

        Raises:
            ValueError: if the model has not been fitted on any data.
        """
        if not self.classes:
            raise ValueError('model is not fitted; call fit() with non-empty data first')

        y = np.zeros((len(X),))
        for i, item in enumerate(X):
            feats = self.__extract_features(item)
            # Smallest negative log score == most probable class.
            y[i] = min(self.classes, key=lambda c: self._neg_log_score(c, feats))
        return y
    

In [15]:
# Toy training set: five short strings with binary labels.
x = np.array(['aaa', 'aba', 'bab', 'bbb', 'abc'])
y = np.array([1, 0, 1, 0, 1])

# fit() returns the model itself, so construction and training are chained.
NB = myNB().fit(x, y)

# Strings to classify; the final expression is shown as the cell output.
xtest = np.array(['aaa', 'aba', 'baa', 'adb', 'bd', 'dd'])
NB.predict(xtest)

array([1., 1., 1., 1., 0., 1.])