# Do text classification on IMDB dataset

- Block of word model
- Naive Bayes
- Logistic regression
- Get pre-trained vectors from word2vec and train linear classifier by freezing weights
- Get pre-trained vectors from BERT, freeze model and re-train classifier

In [44]:
def get_vocab(X):
    vocab, word2int = set(), defaultdict(int)
    i = 0
    for x in X:
        for word in x:
            if not word in vocab:
                vocab.add(word)
                word2int[word] = i
                i += 1
    return vocab, word2int

class NaiveBayes:
    def __init__(self, vocab_size, word2int, num_classes):
        """NaiveBayes model classifier with bag of words to featurize.

        Args:
            class_labels List[str]: list of class labels
            vocab_size int: number of words 
        """
        self.num_classes = num_classes
        self.prior = np.zeros(num_classes)
        self.likelihood = defaultdict(lambda : np.ones(vocab_size))
        self.word2int = word2int

        pass 

    def fit(self, X, Y):
        for x, y in zip(X, Y):
            self.prior[y] += 1 / len(Y)
            print(y, self.prior)
            for word in x:
                self.likelihood[y][self.word2int[word]] += 1
        
        print(self.likelihood)

        for x, y in zip(X, Y):
            normalizer = np.sum(self.likelihood[y])
            for word in word2int.keys():
                self.likelihood[y][self.word2int[word]] /= normalizer
        
    def predict(self, X):
        predictions = []
        for x in X:
            probs = np.zeros(self.num_classes)
            for c in range(self.num_classes):
                probs[c] += self.prior[c]
                for word in x:
                    probs[c] += self.likelihood[c][self.word2int[word]]
            print(probs)
            predictions.append(probs.argmax())
        return predictions
            

X = [
        ['this', 'is', 'positive', 'amazing'],
        ['positive', 'amazing'],
        ['this', 'is','negative','bad'],
        ['negative','bad']
    ]
Y = [1, 1, 0, 0]
vocab, word2int = get_vocab(X)
print(word2int)
naive_bayes = NaiveBayes(vocab_size=len(vocab), word2int=word2int, num_classes=2)
naive_bayes.fit(X, Y)
naive_bayes.predict([['bad']])


defaultdict(<class 'int'>, {'this': 0, 'is': 1, 'positive': 2, 'amazing': 3, 'negative': 4, 'bad': 5})
1 [0.   0.25]
1 [0.  0.5]
0 [0.25 0.5 ]
0 [0.5 0.5]
defaultdict(<function NaiveBayes.__init__.<locals>.<lambda> at 0x7f0457942ee0>, {1: array([2., 2., 3., 3., 1., 1.]), 0: array([2., 2., 1., 1., 3., 3.])})
[0.75       0.58333333]


[0]

In [40]:
naive_bayes.likelihood

defaultdict(<function __main__.NaiveBayes.__init__.<locals>.<lambda>()>,
            {1: array([0.16666667, 0.16666667, 0.25      , 0.25      , 0.08333333,
                    0.08333333]),
             0: array([0.16666667, 0.16666667, 0.08333333, 0.08333333, 0.25      ,
                    0.25      ])})