# Naive Bayes (NB) with a multinomial distribution, using sklearn

Problem: build a text classifier that can recognize the author of a text. In our case, we will classify texts written by Shakespeare and W. Churchill.

In [1]:
# Download the Churchill corpus: wget streams the file to stdout (-O -) and the
# shell redirect saves it as churchill.txt in the current directory.
!wget -O - https://swlab.unica.it/datasets/ocw-datasets/t5.churchill.txt > churchill.txt

--2024-02-09 16:37:56--  https://swlab.unica.it/datasets/ocw-datasets/t5.churchill.txt
Resolving swlab.unica.it (swlab.unica.it)... 90.147.146.241
Connecting to swlab.unica.it (swlab.unica.it)|90.147.146.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9350538 (8.9M) [text/plain]
Saving to: ‘STDOUT’


2024-02-09 16:37:59 (6.32 MB/s) - written to stdout [9350538/9350538]



In [3]:
# Download the Shakespeare corpus: wget streams the file to stdout (-O -) and the
# shell redirect saves it as shakes.txt in the current directory.
!wget -O - https://swlab.unica.it/datasets/ocw-datasets/t8.shakespeare.txt > shakes.txt

--2024-02-09 16:38:35--  https://swlab.unica.it/datasets/ocw-datasets/t8.shakespeare.txt
Resolving swlab.unica.it (swlab.unica.it)... 90.147.146.241
Connecting to swlab.unica.it (swlab.unica.it)|90.147.146.241|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5458199 (5.2M) [text/plain]
Saving to: ‘STDOUT’


2024-02-09 16:38:37 (4.24 MB/s) - written to stdout [5458199/5458199]



In [4]:
# Load both corpora fully into memory, keyed by author name
# (each text is read from "<name>.txt" in the current directory).
corpora = {}
for name in ('shakes', 'churchill'):
    filename = f'{name}.txt'
    with open(filename) as text_file:
        corpora[name] = text_file.read()

In [5]:
# Sanity check: report the length (in characters) of each loaded corpus.
for name, text in corpora.items():
    print(name, len(text))

shakes 5458199
churchill 9350538


In [6]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Build the Bag-Of-Words (BOW) representation used by the Naive Bayes classifier.
# Each corpus is treated as one document: row 0 = shakes, row 1 = churchill.
training_docs = [corpora[n] for n in ('shakes', 'churchill')]

vectorizer = CountVectorizer()

# fit_transform learns the vocabulary (the features of our vectors) and builds the
# term-document matrix (the actual vectors) in a single pass, instead of
# tokenizing the full corpora twice with separate fit() and transform() calls.
bag_of_words = vectorizer.fit_transform(training_docs)

# `fit` is kept as an alias of the fitted vectorizer: later cells call
# fit.transform(...) to vectorize new sentences with the same vocabulary.
fit = vectorizer


In [8]:
# Convert the sparse term-document matrix to a dense array of word counts:
# one row per corpus (row 0 = shakes, row 1 = churchill), one column per vocabulary term.
X = bag_of_words.toarray()
X

array([[3, 1, 1, ..., 0, 1, 0],
       [4, 0, 0, ..., 1, 0, 2]])

In [11]:
# Show the first ten entries of the learned vocabulary (the feature names).
vectorizer.get_feature_names_out()[:10]

array(['000', '0238', '08', '0index', '10', '100', '100th', '101', '1011',
       '102'], dtype=object)

In [12]:
# Show ten vocabulary entries further down the feature list (indices 1000-1009).
vectorizer.get_feature_names_out()[1000:1010]

array(['afishing', 'aflame', 'afloat', 'afoot', 'afore', 'aforehand',
       'aforesaid', 'afoul', 'afraid', 'afred'], dtype=object)

In [13]:
# Which word does feature (column) 1008 correspond to?
vectorizer.get_feature_names_out()[1008]

'afraid'

In [14]:
# How many times does the word "afraid" appear in shakes.txt (row 0)?
# The column is looked up by term in the fitted vocabulary instead of hard-coding
# its index, and the sparse matrix is indexed directly rather than being
# converted to a dense array just to read a single cell.
bag_of_words[0, vectorizer.vocabulary_['afraid']]

41

In [15]:
# How many times does the word "afraid" appear in churchill.txt (row 1)?
# The column is looked up by term in the fitted vocabulary instead of hard-coding
# its index, and the sparse matrix is indexed directly rather than being
# converted to a dense array just to read a single cell.
bag_of_words[1, vectorizer.vocabulary_['afraid']]

314

In [16]:
# Target vector for the classifier: one class label per training document.
SHAKES, CHURCHILL = 0, 1   # 0=shakes 1=churchill
y = np.array([SHAKES, CHURCHILL])

In [19]:
# Multinomial Naive Bayes over word counts, with both authors given
# the same prior probability.
uniform_prior = [0.5, 0.5]
clf = MultinomialNB(class_prior=uniform_prior)

# Train on the two corpus vectors; the fitted estimator is bound to `model`,
# which the following cells use for prediction.
model = clf.fit(X, y)

In [20]:
# Unseen sentences to classify, vectorized with the vocabulary learned from the corpora.
sentences = [
    "today is a good day",
    "to be or not to be",
    "I'm afraid there will be problems, history suggests",
]
new_observations = fit.transform(sentences)
new_observations

<3x42998 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [21]:
# Class probabilities for each sentence: one row per sentence, one column per class
# (column 0 = shakes, column 1 = churchill, matching the labels in y).
model.predict_proba(new_observations)

array([[0.74736319, 0.25263681],
       [0.8038625 , 0.1961375 ],
       [0.00820461, 0.99179539]])

In [None]:
# Most probable class label for each sentence (0 = shakes, 1 = churchill).
model.predict(new_observations)

array([0, 0, 1])

In [24]:
# Same sentences as before, except the second one now also contains "afraid",
# a word that is far more frequent in the Churchill corpus than in Shakespeare.
# (Word order is irrelevant for a bag-of-words model.)
sentences = [
    "today is a good day",
    "to be not to be or afraid",
    "I'm afraid there will be problems, history suggests",
]
new_observations = fit.transform(sentences)
new_observations

<3x42998 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [25]:
# Class probabilities for the current sentences: one row per sentence,
# column 0 = shakes, column 1 = churchill.
model.predict_proba(new_observations)

array([[0.74736319, 0.25263681],
       [0.4985435 , 0.5014565 ],
       [0.00820461, 0.99179539]])