# Text classification

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
from vatican.vatican.database import VaticanMongoDb

In [3]:
# Handle on the 'tokens' collection of the 'vatican' database; the corpus
# statistics used in the rest of the notebook are all read through `db`.
db = VaticanMongoDb(db_name='vatican', collection='tokens')

## Naive Bayes

$$
P(A \mid B)
$$

**Objective**
$$
P(\textrm{pope} \mid w_0, w_1, \dots, w_n)
$$

**Start with a single feature**
$$
P(\textrm{pope} \mid w)
$$

**Bayes rule**
$$
P(A \mid B) = \frac{P(B \mid A)P(A)}{P(B)}
$$

**Application**
$$
P(\textrm{pope} \mid w) = \frac{P(w \mid \textrm{pope})P(\textrm{pope})}{P(w)}
$$

Now we need to estimate:
- $P(w)$
- $P(\textrm{pope})$
- $P(w \mid \textrm{pope})$

## Utility functions
Exercise: use these functions to estimate the quantities above

In [4]:
# Names of the popes available in the corpus: these are the candidate classes.
popes = db.popes
print(popes)

['Benedict XV', 'Benedict XVI', 'Francis I', 'John Paul II', 'John XXIII', 'Leo XIII', 'Paul VI', 'Pius X', 'Pius XI', 'Pius XII']


In [5]:
# Corpus-wide term counts, normalized: the values are fractions, not raw counts.
words_frequencies = db.term_count(normalized=True)
# Peek at the four most frequent terms.
words_frequencies.sort_values(ascending=False).head(4)

,        0.070920
il       0.059319
e        0.033512
di il    0.033105
dtype: float64

In [6]:
# Term counts restricted to a single pope. Without `normalized=True` these are
# raw integer counts.
pope_frequencies = db.pope_term_count('Paul VI')
# Peek at the four most frequent terms for this pope.
pope_frequencies.sort_values(ascending=False).head(4)

,        4611
il       3883
e        2331
di il    2261
dtype: int64

### Task 1: estimate $P(w)$

In [7]:
# P(w) is looked up directly in the normalized corpus-wide term frequencies.
word = 'vaticano'
words_frequencies[word]

1.1861995175726561e-06

### Task 2: estimate $P(\textrm{pope})$

In [8]:
# Total number of term occurrences per pope, indexed by pope name.
# Built in a single expression so that `pope_stats` is only ever a Series
# (never a half-filled dict) and no loop temporaries are left in the kernel
# namespace.
pope_stats = pd.Series(
    {pope: db.pope_term_count(pope).sum() for pope in popes}
)

In [9]:
# P(pope): each pope's share of all term occurrences in the corpus.
# Dividing by the grand total makes the priors sum to 1; `pope_stats` itself
# is left untouched and in its original order.
pope_prior = pope_stats / pope_stats.sum()

In [10]:
pope_prior.sort_values(ascending=False)

John Paul II    0.261073
Leo XIII        0.164247
Pius XII        0.157152
Pius XI         0.151040
Pius X          0.054626
John XXIII      0.051991
Benedict XVI    0.045383
Francis I       0.041334
Paul VI         0.039317
Benedict XV     0.033839
dtype: float64

### Task 3: estimate $P(w \mid \textrm{pope})$

In [11]:
# P(w | pope): one normalized term-frequency table per pope, keyed by pope name.
pope_box = {
    pope: db.pope_term_count(pope, normalized=True)
    for pope in popes
}

In [12]:
# Compare P(w | pope) for the same word under two different popes.
word = 'chiesa'
print(pope_box['John XXIII'][word])
print(pope_box['John Paul II'][word])

0.0027036584948493592
0.004707122476055468


In [13]:
def bayes(word, pope, likelihoods=None, priors=None, evidence=None):
    """Estimate P(pope | word) with Bayes' rule.

    Computes ``P(word | pope) * P(pope) / P(word)``.

    Parameters
    ----------
    word : str
        Token whose posterior is wanted.
    pope : str
        Candidate class.
    likelihoods : mapping, optional
        ``likelihoods[pope][word]`` is P(word | pope). Defaults to the
        notebook-level ``pope_box``.
    priors : mapping, optional
        ``priors[pope]`` is P(pope). Defaults to the notebook-level
        ``pope_prior``.
    evidence : mapping, optional
        ``evidence[word]`` is P(word). Defaults to the notebook-level
        ``words_frequencies``.

    Returns
    -------
    float
        The posterior estimate. ``0.0`` is returned when any of the lookups
        raises ``KeyError`` (e.g. the pope never used the word) or when
        P(word) is zero.
    """
    # Fall back to the notebook globals only when no table is passed in, so
    # existing two-argument calls keep working unchanged.
    if likelihoods is None:
        likelihoods = pope_box
    if priors is None:
        priors = pope_prior
    if evidence is None:
        evidence = words_frequencies

    try:
        p_word_given_pope = likelihoods[pope][word]
        p_pope = priors[pope]
        p_word = evidence[word]
    except KeyError:
        # A missing entry is treated as zero probability rather than an error.
        return 0.0

    # Guard the division: zero evidence would otherwise give inf/nan or raise.
    if p_word == 0:
        return 0.0
    return (p_word_given_pope * p_pope) / p_word

In [14]:
# Posterior P(pope | word) for every pope, shown most probable first.
word = 'matrimonio'
word_prob = {pope: bayes(word, pope) for pope in popes}
pd.Series(word_prob).sort_values(ascending=False)

Pius XI         0.511111
Leo XIII        0.213889
Pius XII        0.122222
Paul VI         0.075000
John Paul II    0.030556
Benedict XVI    0.027778
John XXIII      0.011111
Pius X          0.005556
Francis I       0.002778
Benedict XV     0.000000
dtype: float64

## Put things together

In [15]:
from vatican.vatican.showcases.classification import PopeBayesianClassifier

In [19]:
# One classifier per candidate pope, both backed by the same database handle:
# `pbn` is for Paul VI, `pbg` is for John XXIII.
pbn = PopeBayesianClassifier('Paul VI', db=db)
pbg = PopeBayesianClassifier('John XXIII', db=db)

In [20]:
# The three ingredients of Bayes' rule for one word, as exposed by the
# Paul VI classifier: prior, evidence and likelihood.
word = 'matrimonio'
print("P({})".format(pbn.pope), pbn.prior)
print("P({})".format(word), pbn.p_word(word))
print("P({} | {})".format(word, pbn.pope), pbn.p_word_pope(word))

P(Paul VI) 0.03931658300994569
P(matrimonio) 0.00021351591316307812
P(matrimonio | Paul VI) 0.00040730125207421935


In [21]:
# Bayes' rule by hand for Paul VI: prior * likelihood / evidence.
# `word` is the value set in the previous cell.
p_pope_word = pbn.prior * pbn.p_word_pope(word) / pbn.p_word(word)
print(p_pope_word)

0.07500000000000001


In [22]:
# Same three quantities for the John XXIII classifier. P(word) does not depend
# on the pope, so only the prior and the likelihood differ.
word = 'matrimonio'
print("P({})".format(pbg.pope), pbg.prior)
print("P({})".format(word), pbg.p_word(word))
print("P({} | {})".format(word, pbg.pope), pbg.p_word_pope(word))

P(John XXIII) 0.051990531755450736
P(matrimonio) 0.00021351591316307812
P(matrimonio | John XXIII) 4.563136700167695e-05


In [23]:
# Bayes' rule by hand for John XXIII: prior * likelihood / evidence.
# `word` is the value set in the previous cell.
p_pope_word_g = pbg.prior * pbg.p_word_pope(word) / pbg.p_word(word)
print(p_pope_word_g)

0.01111111111111111


### Compute probabilities for the whole text

$$
P(\textrm{pope} \mid w_0, \dots, w_n) = \frac{P(w_0, \dots, w_n \mid \textrm{pope})P(\textrm{pope})}{P(w_0, \dots, w_n)}
$$

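Now, how do we estimate $P(w_0, \dots, w_n)$ and $P(w_0, \dots, w_n \mid \textrm{pope})$? If we **assume** that words occur independently of each other (the "naive" assumption), each joint probability is simply the product of the single-word probabilities: $\prod_i P(w_i)$ and $\prod_i P(w_i \mid \textrm{pope})$ respectively.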

$$
P(\textrm{pope} \mid w_0, \dots, w_n) = P(\textrm{pope})\frac{ \prod\limits_{i=0}^{n}P(w_i \mid \textrm{pope})}{\prod\limits_{i=0}^{n}P(w_i)}
$$

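**Note**: it is therefore important not to have zero probabilities: a single word with $P(w_i \mid \textrm{pope}) = 0$ makes the whole product 0, and a word with $P(w_i) = 0$ makes the denominator 0.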

In [24]:
# Sentences of a single document (Paul VI, 'Humanae Vitae'), fetched from the
# database. Presumably one entry per sentence -- verify against get_sentences.
texts = db.get_sentences('Paul VI', 'Humanae Vitae')

In [30]:
# Example input: the first 10 tokens of the entry at index 11 of `texts`.
# Each record exposes its token under the 'token' key.
text = [r['token'] for r in texts[11]][:10]
print(text)

['perciò', ',', 'mediante', 'il', 'predicazione', 'di il', 'Apostoli', 'e', 'soprattutto', 'di']


In [39]:
# Hand-written token list used as the input from here on.
# NOTE: this overwrites the `text` extracted from the corpus in the cell above.
text = ['questa', "é", 'una', 'gatto', 'frase', 'sul', 'chiesa']

In [40]:
# Running product over the tokens of `text` of the per-word ratio
# P(w | pope) / P(w), once for each classifier:
#   pp_p1 -> pbn (Paul VI), pp_p2 -> pbg (John XXIII).
# NOTE(review): the prior P(pope) is not multiplied in here, so these are
# likelihood ratios rather than the full posterior -- confirm this is intended.
pp_p1 = pp_p2 = 1
for word in text:
    pp_p1_w = pbn.p_word_pope(word) / pbn.p_word(word)
    pp_p2_w = pbg.p_word_pope(word) / pbg.p_word(word)
    pp_p1 *= pp_p1_w
    pp_p2 *= pp_p2_w

In [41]:
pp_p1, pp_p2

(13148.515497673508, 29.97946184550357)

## Language generation

In [54]:
# Vocabulary and the matching term frequencies held by the John XXIII
# classifier (`pbg`), as two parallel arrays taken from the same Series.
p_words = pbg.words_frequencies.index.values
p_words_probs = pbg.words_frequencies.values

In [57]:
# Sample 10 words independently, each drawn with probability proportional to
# its frequency in `p_words_probs`.
# A seeded generator makes the sample identical on every Restart & Run All;
# 42 is an arbitrary but fixed seed.
rng = np.random.default_rng(42)
# One vectorised draw: `p` is validated once instead of once per sample.
for sampled_word in rng.choice(p_words, size=10, p=p_words_probs):
    print(sampled_word)

di
volere
cessare
,
,
Redentore
questo
dunque
salvifico
snervare
