In [None]:
import pymc3 as pm
#Generative model
#we select probabilities for the six faces
theta = [0.1,0.2,0.2,0.2,0.2,0.1]
#we define a probabilistic model
dice = pm.Categorical.dist(theta)
print(">",dice.random())
print(">",dice.random())    

> 0
> 4


In [None]:
import numpy as np
#Possibility space of two rolls
Omega=[(i,j) for i in range(1,7) for j in range(1,7)]
#we select probabilities for the six faces
theta = np.array([0.1,0.2,0.2,0.2,0.2,0.1])
theta_2=np.kron(theta,theta)#this computes all possible products
print(">",theta_2)
#we define a probabilistic model
dice = pm.Categorical.dist(theta_2)
print(">",Omega[dice.random()])#result of one roll
print(">",Omega[dice.random()])#result of another roll

> [0.01 0.02 0.02 0.02 0.02 0.01 0.02 0.04 0.04 0.04 0.04 0.02 0.02 0.04
 0.04 0.04 0.04 0.02 0.02 0.04 0.04 0.04 0.04 0.02 0.02 0.04 0.04 0.04
 0.04 0.02 0.01 0.02 0.02 0.02 0.02 0.01]
> (3, 3)
> (2, 2)


In [None]:
#we select probabilities for the six faces
theta = np.array([0.1,0.2,0.2,0.2,0.2,0.1])
#we define a probabilistic model
dice = pm.Multinomial.dist(100,theta)
print(">",dice.random())#result of 100 rolls
print(">",dice.random())#result of other 100 rolls

> [15. 13. 16. 17. 26. 13.]
> [ 8. 18. 17. 22. 24. 11.]


In [None]:
np.prod(theta**([15., 13., 16., 17., 26., 13.]))

4.7223664828696717e-79

In [None]:
np.sum(np.log(theta**([15., 13., 16., 17., 26., 13.])))

-180.3519122990885

## Map estimate

In [None]:
#we select probabilities for the six faces
theta = np.array([0.1,0.2,0.2,0.2,0.2,0.1])
#we define a probabilistic model
dice = pm.Multinomial.dist(100,theta)
data=dice.random()
print(data)#counts for 100 rolls
print("MLE=",data/np.sum(data))
#no we throws the dice 10000 times
dice = pm.Multinomial.dist(100000,theta)
data=dice.random()
print(data)#counts for 100000 rolls
print("MLE=",data/np.sum(data))

[13. 17. 19. 27. 17.  7.]
MLE= [0.13 0.17 0.19 0.27 0.17 0.07]
[10105. 19843. 20176. 19844. 19933. 10099.]
MLE= [0.10105 0.19843 0.20176 0.19844 0.19933 0.10099]


In [None]:
counts = np.array([15, 32, 16, 17, 26, 26])
alpha = np.ones(len(counts),float)
with pm.Model() as model:
    theta=pm.Dirichlet("theta",alpha)
    data =pm.Multinomial("likelihood",np.sum(counts),theta,observed=counts)
#MAP estimate analytical formula
MAP=(counts+alpha)/np.sum(counts+alpha)
#we can also do it numerically
map_estimate = pm.find_MAP(model=model)
print(MAP,map_estimate['theta'])

logp = -11.992, ||grad|| = 13.657: 100%|██████████| 9/9 [00:00<00:00, 2706.58it/s]

[0.11594203 0.23913043 0.12318841 0.13043478 0.19565217 0.19565217] [0.11363636 0.24242425 0.12121209 0.1287879  0.1969697  0.1969697 ]





In [None]:
#we select probabilities for the six faces
theta = np.array([0.1,0.2,0.2,0.2,0.2,0.1])
#we define a probabilistic model
dice = pm.Multinomial.dist(100,theta)
data=dice.random()
print(data)#ounts for 100 rolls
alpha = np.ones(len(data),float)
print("MAP=",(data+alpha)/np.sum(data+alpha))
#no we throws the dice 10000 times
dice = pm.Multinomial.dist(100000,theta)
data=dice.random()
print(data)#counts for 100000 rolls
print("MAP=",(data+alpha)/np.sum(data+alpha))

[10. 15. 24. 30. 14.  7.]
MAP= [0.10377358 0.1509434  0.23584906 0.29245283 0.14150943 0.0754717 ]
[10031. 20208. 20007. 19844. 19934.  9976.]
MAP= [0.10031398 0.20207788 0.200068   0.19843809 0.19933804 0.09976401]


In [None]:
? pm.find_MAP

## Spam filter with one feature

In [None]:
from sklearn.naive_bayes import BernoulliNB
import numpy as np
X=np.array([[1,1,1,0,0,1,0,0]]).T# 1 Won included, 0 not included
y=np.array( [1,1,0,0,1,1,1,0])#1 spam, 0 not spam
Xtest=np.array([[1,0]]).T
clf = BernoulliNB(alpha=0.0,fit_prior=True)
clf.fit(X, y)
print(clf.predict_proba(Xtest))
print(clf.predict(Xtest))

[[0.25 0.75]
 [0.5  0.5 ]]
[1 1]


  'setting alpha = %.1e' % _ALPHA_MIN)


In [None]:
print(clf.class_count_)
print(clf.feature_count_)

[3. 5.]
[[1.]
 [3.]]


In [1]:
# pg. 45

from sklearn.naive_bayes import MultinomialNB
import numpy as np
X=np.array([[1,0],[1,0],[1,0],[0,1],[0,1],[1,0],[0,1],[0,1]])# 1 Won included, 0 not included
y=np.array( [1,1,0,0,1,1,1,0])#1 spam, 0 not spam
Xtest=np.array([[1,0],[0,1]]).T
clf = MultinomialNB(alpha=0.0,fit_prior=True)
clf.fit(X, y)
print(clf.predict_proba(Xtest))
print(clf.predict(Xtest))

[[0.25 0.75]
 [0.5  0.5 ]]
[1 1]


  'setting alpha = %.1e' % _ALPHA_MIN)


In [None]:
print(clf.class_count_)
print(clf.feature_count_)

[3. 5.]
[[1. 0.]
 [3. 0.]]


In [None]:
# pg. 46

from sklearn.naive_bayes import MultinomialNB
import numpy as np
X=np.array([[1,0],[1,0],[1,0],[0,1],[0,1],[1,0],[0,1],[0,1]])# 1 Won included, 0 not included
y=np.array( [1,1,0,0,1,1,1,0])#1 spam, 0 not spam
Xtest=np.array([[1,0],[0,1]]).T
clf = MultinomialNB(alpha=1.0,fit_prior=True)
clf.fit(X, y)
print(clf.predict_proba(Xtest))
print(clf.predict(Xtest)) 

[[0.29577465 0.70422535]
 [0.45652174 0.54347826]]
[1 1]


In [None]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np
X=np.array([[1,0],[1,0],[0,1],[0,1],[1,0],[1,0],[1,0],[0,1]])# 1 Won included, 0 not included
y=np.array( [1,1,0,0,1,1,1,0])#1 spam, 0 not spam
Xtest=np.array([[1,0],[0,1]]).T
clf = MultinomialNB(alpha=0.0,fit_prior=True)
clf.fit(X, y)
print(clf.predict_proba(Xtest))
print(clf.predict(Xtest))

[[2.00000000e-11 1.00000000e+00]
 [1.00000000e+00 3.33333333e-11]]
[1 0]


  'setting alpha = %.1e' % _ALPHA_MIN)


## Implementation of MultinomialNB from scratch

In [None]:
import numpy as np
class MultinomialNB(object):
    #class constructor
    def __init__(self, alpha=1.0):
        self.alpha = alpha#set the smoothing parameter
    #fit method
    def fit(self, X, y):
        count_sample = X.shape[0]
        split_per_class = [[x for x, t in zip(X, y) if t == c]
                    for c in np.unique(y)]
        
        self.class_log_prior_ = [np.log(len(i) / count_sample) for i in split_per_class]
        count = np.array([np.array(i).sum(axis=0) for i in split_per_class]) + self.alpha
        self.feature_log_prob_ = np.log(count / count.sum(axis=1)[np.newaxis].T)
        
    def predict_proba(self, X):
        proba= [np.exp((self.feature_log_prob_ * x).sum(axis=1) + self.class_log_prior_)
                for x in X]
        prob=[p/np.sum(p) for p in proba]
        return np.vstack(prob)     


    def predict(self, X):
        return np.argmax(self.predict_log_proba(X), axis=1)
    
X=np.array([[1,0],[1,0],[1,0],[0,1],[0,1],[1,0],[0,1],[0,1]])
y=np.array( [1,1,0,0,1,1,1,0])#1 spam, 0 not spam
Xtest=np.array([[1,0],[0,1]]).T

MNB = MultinomialNB(1) 
MNB.fit(X,y)
MNB.predict_proba(Xtest)

array([[0.29577465, 0.70422535],
       [0.45652174, 0.54347826]])

## Unigram

In [None]:
import nltk as nltk
from sklearn.feature_extraction.text import CountVectorizer
D=["Dear John You won 100 euros", "Dear Ashley You won a new car",\
   "Hi Helen We won the game yesterday Bye","Huge discount for you buy our new clothes Bye",
 "Thank you for your offer", "You won the lottery",
 "Buy our cruise",  "Dear Charlie Your documents are ready Bye"]
Corpus = [word.lower() for word in D]# we make everything lowercase
vectorizer=CountVectorizer()
R=vectorizer.fit_transform(Corpus).toarray()
WordsList=np.array(vectorizer.get_feature_names())
print(WordsList)
print(R)#tells us which words are present in each email
print(np.sum(R,axis=0))#total counts of each word in 

['100' 'are' 'ashley' 'buy' 'bye' 'car' 'charlie' 'clothes' 'cruise'
 'dear' 'discount' 'documents' 'euros' 'for' 'game' 'helen' 'hi' 'huge'
 'john' 'lottery' 'new' 'offer' 'our' 'ready' 'thank' 'the' 'we' 'won'
 'yesterday' 'you' 'your']
[[1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0]
 [0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1]]
[1 1 1 2 3 1 1 1 1 3 1 1 1 2 1 1 1 1 1 1 2 1 2 1 1 2 1 4 1 5 2]


In [None]:
Counts = np.sum(R,axis=0)
#
theta = Counts/np.sum(Counts)#MLE estimate of theta
#we define a probabilistic model
sentence = pm.Multinomial.dist(5,theta)
print(sentence.random())
print(">",WordsList[sentence.random()==1])#generate some random emails with 5 words
print(">",WordsList[sentence.random()==1])#generate some random emails with 5 words
print(">",WordsList[sentence.random()==1])#generate some random emails with 5 words

[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0.]
> ['dear' 'our' 'won' 'you' 'your']
> ['euros' 'new' 'our' 'thank' 'you']
> ['ashley' 'buy' 'cruise' 'game' 'new']


## Bigram 

In [None]:
import nltk as nltk
from sklearn.feature_extraction.text import CountVectorizer
D=["Dear John You won 100 euros", "Dear Ashley You won a new car",\
   "Hi Helen We won the game yesterday Bye","Huge discount for you buy our new clothes Bye",
 "Thank you for your offer", "You won the lottery",
 "Buy our cruise",  "Dear Charlie Your documents are ready Bye"]
Corpus = [word.lower() for word in D]# we make everything lowercase
vectorizer=CountVectorizer(ngram_range=(1,1))#unigrams
R=vectorizer.fit_transform(Corpus).toarray()
WordsList=np.array(vectorizer.get_feature_names())

vectorizer=CountVectorizer(ngram_range=(2,2))#bigrams
R=vectorizer.fit_transform(Corpus).toarray()
Bigrams=np.array(vectorizer.get_feature_names())
print(Bigrams)
Frequencies=np.zeros((WordsList.shape[0],WordsList.shape[0]))
for i in range(len(WordsList)):
    for j in range(len(WordsList)):
        if WordsList[i]+' '+WordsList[j] in Bigrams:
            Frequencies[i,j]=Frequencies[i,j]+1

     


['100 euros' 'are ready' 'ashley you' 'buy our' 'charlie your'
 'clothes bye' 'dear ashley' 'dear charlie' 'dear john' 'discount for'
 'documents are' 'for you' 'for your' 'game yesterday' 'helen we'
 'hi helen' 'huge discount' 'john you' 'new car' 'new clothes'
 'our cruise' 'our new' 'ready bye' 'thank you' 'the game' 'the lottery'
 'we won' 'won 100' 'won new' 'won the' 'yesterday bye' 'you buy'
 'you for' 'you won' 'your documents' 'your offer']


In [None]:
theta=(Frequencies+0.0001)/np.sum(Frequencies+0.0001,axis=1)[:,None]  #MLE with smoothing 
word="dear"
Sentence=[word]
for i in range(5):
    ind=np.where((word == WordsList)==True)[0]
    word = WordsList[np.where(pm.Multinomial.dist(1,theta[ind,:]).random()==1)[1]][0]
    Sentence.append(word)
print(Sentence)
word="hi"
Sentence=[word]
for i in range(5):
    ind=np.where((word == WordsList)==True)[0]
    word = WordsList[np.where(pm.Multinomial.dist(1,theta[ind,:]).random()==1)[1]][0]
    Sentence.append(word)
print(Sentence)

['dear', 'charlie', 'your', 'documents', 'are', 'ready']
['hi', 'helen', 'we', 'won', '100', 'euros']


In [None]:
WordsList[np.where(pm.Multinomial.dist(1,theta[ind,:]).random()==1)[1]]

array(['lottery'], dtype='<U9')

In [None]:
np.where(pm.Multinomial.dist(1,theta[ind,:]).random()==1.)

(array([0]), array([7]))

In [None]:
pm.Multinomial.dist(1,theta[ind,:]).random()

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

## CountVectorizer


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['red','brown','brown','white']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

print(X.toarray())