# Language models

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
from vatican.vatican.database import VaticanMongoDb

In [3]:
db = VaticanMongoDb(db_name='vatican', collection='tokens')

## General idea

A language model is a way to estimate

$$
P(w_1, w_2, \dots, w_{n-1}, w_{n})
$$

The Naive way of doing this is

$$
P(w_1, w_2, \dots, w_{n-1}, w_{n}) = P(w_1) \times P(w_2) \times P(w_3) \times \dots \times P(w_{n-1}) \times P(w_n)
$$

As an alternative, 
this can be seen as the problem of estimating the next word in the sequence and than take the product of all the estimations

$$
P(w_1, w_2, \dots, w_{n-1}, w_{n}) = P(w_2 \mid w_1) \times P(w_3 \mid w_1, w_2) \times P(w_4 \mid w_1, w_2, w_3) \times \dots \times P(w_n \mid w_1, \dots w_{n-1})
$$

**Question: what is the main difference with**

$$
P(w_1, w_2, \dots, w_{n-1}, w_{n}) = P(w_1) \times P(w_2) \times P(w_3) \times \dots \times P(w_{n-1}) \times P(w_n)
$$


Now, in general, if we want to estimate $P(A \mid B)$ we can compute

$$
P(A \mid B) = \frac{count(A, B)}{\sum\limits_{A_i}count(A_i, B)}
$$

When we observe $A$ and $B$ in a sequence, this is just

$$
P(A \mid B) = \frac{count(A, B)}{count(B)}
$$

to apply the idea to the text, we need to keep an index that says how many times a word $A$ follows a word $B$ for any word.

In [13]:
from collections import defaultdict
import nltk

In [19]:
index_pairs = defaultdict(lambda: defaultdict(lambda: 0))
for pope, document in tqdm(db.documents):
    if pope == 'Paul VI':
        for sentence in db.get_sentences(pope, document, field='text'):
            tokens = ['#START'] + [x['token'].lower() for x in sentence] + ['#END']
            for a, b in nltk.ngrams(tokens, n=2):
                index_pairs[a][b] += 1

  0%|          | 0/196 [00:00<?, ?it/s]

In [21]:
print("P(cristiano | matrimonio) = count(matrimonio, cristiano) / count(matrimonio)")
print(index_pairs['matrimonio']['cristiano'] / sum(index_pairs['matrimonio'].values()))

P(cristiano | matrimonio) = count(matrimonio, cristiano) / count(matrimonio)
0.07575757575757576


In [34]:
ngram_prob = {}
freq_matrimonio = sum(index_pairs['matrimonio'].values())
for word, freq in index_pairs['matrimonio'].items():
    ngram_prob[word] = freq / freq_matrimonio
pd.Series(ngram_prob).sort_values(ascending=False).head(10)

,            0.293939
.            0.078788
cristiano    0.075758
e            0.072727
;            0.027273
è            0.027273
:            0.021212
dei          0.018182
si           0.018182
non          0.018182
dtype: float64

In [36]:
pd.Series(index_pairs['matrimonio']).head(10)

cristiano    175
(             28
si            42
;             63
,            679
non           42
senza          7
di            14
e            168
è             63
dtype: int64

In [58]:
sequence = ['#START']
for i in range(10):
    prec = sequence[-1]
    candidates = index_pairs['#START']
    words = [x for x, y in candidates.items()]
    probs = [y for x, y in candidates.items()]
    new_word = np.random.choice(words, 
                     p=[x / sum(probs) for x in probs])
    sequence.append(new_word)
    if new_word == '#END':
        break

In [59]:
sequence

['#START', '[', 'enc', '2.7', 'questa', '(', '472', 'enc', '[', ',', ',']