# Naive bayesien  

### Get dataset

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from datasets import load_dataset

dataset = load_dataset("imdb")

Reusing dataset imdb (/Users/quentinlehelloco/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

### Create dataset 

In [3]:
x_train = dataset["train"][:]["text"]
y_train = dataset["train"][:]["label"]

x_test = dataset["test"][:]["text"]
y_test = dataset["test"][:]["label"]

### Explore dataset 

In [4]:
len(x_train)

25000

In [5]:
x_train[24993]

'Although the production and Jerry Jameson\'s direction are definite improvements, "Airport \'77" isn\'t much better than "Airport 1975": slick, commercial rubbish submerging (this time literally) a decent cast. Jack Lemmon is the pilot of a packed airliner which gets hijacked by art thieves and crashes into the sea (all the publicity claimed it was near the Bermuda Triangle, but there\'s no mention of it in the film itself). When the rescue ships come to raise the airplane out of the water, we see all their cranes dropping (rather blindly) into the ocean and it\'s hard not to laugh (imagining the cranes plugging the plane, the passengers and the waterlogged script). NBC used to air what appeared to be the "director\'s cut", with at least an hour of extra footage--mostly flashbacks--injected into the proceedings with all the subtlety of a "Gilligan\'s Island" episode. Most exciting moment is the plane crash, and some of the players have a little fun: Lee Grant is an obnoxious drunk, Br

In [6]:
y_train[24993]

0

### Create positive/negative dataset 

In [7]:
pos_data = x_train[:12500]
neg_data = x_train[12500:]

assert(0 not in pos_data)
assert(1 not in neg_data)

In [122]:
def create_voc(pos_data:list, neg_data:list):
    """
    Create full occurence word vocabulary and split into pos/neg classes
    
    Input: pos_data -> List of positives documents (text samples)
           neg_data -> List of negatives documents (text samples)
           max_f -> Max number of words per class

    Return: Vocabulary, (occurences_positive, words_positive), (occurences_negative, words_negative)
    """

    max_f = None
    
    # Count positive word occurence
    vectorizer_p = CountVectorizer(max_features=25000)
    pos_count = vectorizer_p.fit_transform(pos_data).toarray()
    occ_pos = pos_count.sum(axis=0)
    
    
    # Count negative word occurence
    vectorizer_n = CountVectorizer(max_features=25000)
    neg_count = vectorizer_n.fit_transform(neg_data).toarray()
    occ_neg = neg_count.sum(axis=0)
    
    # merge all vocabulary
    voc = np.concatenate((vectorizer_p.get_feature_names_out(), vectorizer_n.get_feature_names_out()), axis=None)
    voc = np.unique(voc)
    
    return voc, (occ_pos, vectorizer_p.get_feature_names_out()), (occ_neg, vectorizer_n.get_feature_names_out())

In [123]:
V, pos, neg = create_voc(pos_data, neg_data)

### Create dico

In [124]:
dic_pos = {}
for A, B in zip(pos[0],pos[1]):
    dic_pos[B] = A
print(dic_pos)



In [125]:
dic_neg = {}
for A, B in zip(neg[0],neg[1]):
    dic_neg[B] = A
print(dic_neg)



### Implement bayes function 

In [126]:
def train_naive_bayes(V:list, dic_pos:dict, dic_neg:dict):
    """
    Train a naive bayesien classifier for 2 classes using documents (D)
    
    Input: D -> List of documents (text samples)

    Return: Logprior, loglikelihood
    """
    
    N_doc = 25000
    N_c = 12500
    
    
    occ_all_words_pos = np.sum(pos[0])
    occ_all_words_neg = np.sum(neg[0])
    
    occ_all_words = [occ_all_words_pos, occ_all_words_neg]
    
    dic = [dic_pos, dic_neg]
    
    # Calculate logprior
    logprior = np.log(N_c/N_doc)
    
    loglikelihood = np.zeros((2, len(V)))
    
    # for each class
    for i in range(2):
        for j in range(len(V)):
            if V[j] in dic[i]:
                loglikelihood[i][j] = np.log((dic[i][V[j]] + 1) / (occ_all_words[i] + 1))
    
    return logprior, loglikelihood

In [127]:
def test_naives_bayes(test, logprior, loglikelihood, V):
    """
    test is list of words
    """
    sum = np.zeros((2, 1))
    l_v = len(V)
    
    # For each class
    for i in range(2):
        # For each words
        for w in test:
            for j in range(l_v):
                if V[j] == w:
                    sum[i] += loglikelihood[i][j]
                
    return (np.argmax(sum) + 1) % 2

### Test 

In [128]:
log, likeli = train_naive_bayes(V, dic_pos, dic_neg)

In [129]:
x_test[2]

'As a recreational golfer with some knowledge of the sport\'s history, I was pleased with Disney\'s sensitivity to the issues of class in golf in the early twentieth century. The movie depicted well the psychological battles that Harry Vardon fought within himself, from his childhood trauma of being evicted to his own inability to break that glass ceiling that prevents him from being accepted as an equal in English golf society. Likewise, the young Ouimet goes through his own class struggles, being a mere caddie in the eyes of the upper crust Americans who scoff at his attempts to rise above his standing. <br /><br />What I loved best, however, is how this theme of class is manifested in the characters of Ouimet\'s parents. His father is a working-class drone who sees the value of hard work but is intimidated by the upper class; his mother, however, recognizes her son\'s talent and desire and encourages him to pursue his dream of competing against those who think he is inferior.<br /><

In [130]:
vectorizer_t = CountVectorizer()
vectorizer_t.fit_transform([x_test[2]]).toarray()
    
test_words = vectorizer_t.get_feature_names_out()
test_words

array(['above', 'accepted', 'actual', 'against', 'although', 'americans',
       'an', 'and', 'are', 'as', 'at', 'attempts', 'battles', 'beauty',
       'being', 'best', 'br', 'break', 'but', 'by', 'caddie', 'ceiling',
       'century', 'characters', 'childhood', 'class', 'competing',
       'could', 'course', 'creation', 'crust', 'depicted', 'desire',
       'detract', 'disney', 'do', 'does', 'dream', 'drone', 'early',
       'encourages', 'ending', 'english', 'equal', 'evicted', 'eyes',
       'father', 'film', 'finally', 'fought', 'from', 'glass', 'goes',
       'golf', 'golfer', 'hard', 'harry', 'have', 'he', 'her', 'him',
       'himself', 'his', 'historical', 'history', 'how', 'however',
       'human', 'in', 'inability', 'inferior', 'intimidated', 'is',
       'issues', 'knowledge', 'liberties', 'likewise', 'little', 'loved',
       'manifested', 'mere', 'miracle', 'moment', 'mother', 'movie',
       'not', 'of', 'one', 'only', 'otherwise', 'ouimet', 'own',
       'parents', 'ph

In [131]:
%%time
test_naives_bayes(test_words, log, likeli, V)

CPU times: user 1.85 s, sys: 8.44 ms, total: 1.85 s
Wall time: 1.88 s


1

In [132]:
y_test[2]

1

In [133]:
print(likeli)

[[-11.09558207 -10.01259509   0.         ...   0.         -13.24734427
    0.        ]
 [-10.88030734  -9.66676508 -13.44525669 ... -12.75210951   0.
  -13.44525669]]


### Accuracy checking

In [141]:
def test(x_test, y_test, log, likelihood, V):
    success = 0
    
    for i in range(len(x_test)):
        test = x_test[i]
        vectorizer_t = CountVectorizer()
        vectorizer_t.fit_transform([x_test[i]]).toarray()

        test_words = vectorizer_t.get_feature_names_out()
        
        res = test_naives_bayes(test_words, log, likeli, V)
        
        if res == y_test[i]:
            success += 1
        print(res, y_test[i])
    return success / len(x_test)

In [142]:
from sklearn.utils import shuffle
shuffle_x_test, shuffle_y_test = shuffle(x_test, y_test, random_state=0)

In [143]:
test(shuffle_x_test[:30], shuffle_y_test[:30], log, likeli, V)

0 0
1 1
0 0
0 1
1 1
1 0
0 0
1 1
1 1
0 0
1 1
1 1
0 1
1 0
1 1
0 0
0 0
0 0
0 0
1 0
1 0
0 0
1 1
1 1
1 0
1 1
1 1
0 1
1 0
1 1


0.7