# Explore lexicon in Vatican publications

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import pymongo

In [3]:
# `db` is the `tokens` collection of the `vatican` database (a collection, despite the name).
# MongoClient() is called without arguments, so pymongo's default connection settings apply.
db = pymongo.MongoClient().get_database('vatican').get_collection('tokens')

In [7]:
def documents():
    """Return the distinct (pope, document) pairs present in the token collection.

    Every record in ``db`` is grouped by its ``pope`` and ``document`` fields;
    one ``(pope, document)`` tuple is returned per group, in the order the
    aggregation cursor yields them.
    """
    group_stage = {'$group': {'_id': {'pope': '$pope', 'document': '$document'}}}
    return [
        (group['_id']['pope'], group['_id']['document'])
        for group in db.aggregate([group_stage])
    ]

In [8]:
# All distinct (pope, document) pairs found in the token collection.
docs = documents()

In [9]:
# Number of distinct (pope, document) pairs.
len(docs)

196

In [10]:
# Peek at the first few (pope, document) tuples.
docs[:4]

[('Pius XII', 'Luctuosissimi Eventus'),
 ('Pius XI', 'Rerum Omnium Perturbationem'),
 ('Leo XIII', 'Insignes'),
 ('Pius XII', 'Deiparae Virginis Mariae')]

## Term Frequency (TF)

$$
tf(d, t) = \frac{count(d, t)}{\max\limits_{1 \le i \le n} count(d, t_i)}
$$

In [49]:
def document_term_count(pope, document, field='lemma'):
    """Count how often each value of ``field`` occurs in a single document.

    Parameters
    ----------
    pope, document
        Values matched against the ``pope`` and ``document`` fields to select
        the records of one document.
    field : str, default 'lemma'
        Name of the record field whose values are counted.

    Returns
    -------
    pd.Series
        Occurrence count indexed by the distinct values of ``field``.
    """
    pipeline = [
        # Restrict to the records of the requested document ...
        {'$match': {'pope': pope, 'document': document}},
        # ... then count one per record for each distinct value of `field`.
        {'$group': {'_id': "${}".format(field), 'count': {'$sum': 1}}},
    ]
    counts = {row['_id']: row['count'] for row in db.aggregate(pipeline)}
    return pd.Series(counts)

In [52]:
# Worked example: select one document by its (pope, document) pair.
pope, document = 'Paul VI', 'Humanae Vitae'
# Raw occurrence count of every lemma in that document.
tf_count = document_term_count(pope, document)
# Normalise by the count of the most frequent term, so the top term has TF = 1.
tf = tf_count / tf_count.max()

In [53]:
# Ten terms with the highest TF in the selected document.
tf.sort_values(ascending=False).head(10)

,         1.000000
il        0.914414
di il     0.653153
e         0.495495
di        0.448198
.         0.434685
a il      0.304054
essere    0.286036
che       0.261261
uno       0.180180
dtype: float64

## Document Frequency and Inverse Document Frequency (IDF)

$$
df(t) = \mid\{ d_i : t \in d_i \}\mid
$$

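$$
idf(t) = \log \frac{N}{df(t)}
$$

**Note:** in the code below the counting unit $d_i$ is a *sentence* (a distinct `document`/`sentence` pair) rather than a whole document, and $N$ is the total number of such sentences.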

In [76]:
def sentence_df(field='lemma'):
    """Count, for every value of ``field``, the number of sentences containing it.

    A sentence is identified by its (``document``, ``sentence``) pair. The
    first grouping stage collapses repeated occurrences of the same token
    inside one sentence; the second counts the remaining groups per token.

    NOTE(review): ``pope`` is not part of the sentence key although
    ``documents()`` identifies a document by (pope, document) -- confirm that
    document titles are unique across popes.

    Parameters
    ----------
    field : str, default 'lemma'
        Name of the record field used as the token.

    Returns
    -------
    pd.Series
        Sentence frequency indexed by token.
    """
    distinct_occurrences = {'$group': {'_id': {'token': "${}".format(field), 'document': "$document", 'sentence': "$sentence"}}}
    per_token = {'$group': {'_id': '$_id.token', 'count': {'$sum': 1}}}
    cursor = db.aggregate([distinct_occurrences, per_token], allowDiskUse=True)
    return pd.Series({row['_id']: row['count'] for row in cursor})

def sentence_count():
    """Return the number of distinct (document, sentence) pairs in ``db``.

    This is the ``N`` used when turning sentence frequencies into IDF values.

    NOTE(review): ``pope`` is not part of the grouping key -- confirm that
    document titles are unique across popes.
    """
    g = {'$group': {'_id': {'document': "$document", 'sentence': "$sentence"}}}
    # allowDiskUse mirrors sentence_df(), which groups on the same keys; the
    # groups are counted while streaming instead of being collected in a list.
    return sum(1 for _ in db.aggregate([g], allowDiskUse=True))

In [77]:
# Per-term sentence frequency and total number of sentences; both feed the IDF below.
# NOTE(review): `df` is a pandas Series of counts (document frequency), not a DataFrame.
df = sentence_df()
s_count = sentence_count()

In [85]:
# Terms that occur in more than 10 distinct sentences.
# (A bare `df.sort_values(ascending=False)` used to precede this line; its
# result was neither assigned nor displayed, so it had no effect.)
valid_terms = df[df > 10].keys()

In [79]:
# idf(t) = log(N / df(t)) with N = s_count, the number of distinct sentences (natural log).
idf = np.log(s_count / df)

In [81]:
# Rarest terms first: the highest IDF belongs to terms seen in the fewest sentences.
idf.sort_values(ascending=False)

quell'«apparizione    11.149255
flessibilità          11.149255
anelino               11.149255
moltiplicassero       11.149255
indeterminare         11.149255
                        ...    
di il                  0.872206
e                      0.792369
il                     0.525199
,                      0.423150
.                      0.325385
Length: 43723, dtype: float64

## TfIdf

$$
tfidf(d, t) = tf(d, t) \cdot idf(t)
$$

In [99]:
def tfidf(tf, idf, min_idf=0):
    """Combine term frequencies with inverse document frequencies.

    Parameters
    ----------
    tf : pd.Series
        Term frequency indexed by term.
    idf : pd.Series
        Inverse document frequency indexed by term.
    min_idf : float, default 0
        Terms whose IDF is below this threshold are left out of the result.

    Returns
    -------
    pd.Series
        ``tf * idf`` for every term of ``tf`` whose IDF is at least
        ``min_idf``, in the order of ``tf``. Terms that have no entry in
        ``idf`` are skipped rather than raising ``KeyError``.
    """
    # Align the IDF weights to the TF index; terms without an IDF become NaN.
    idf_for_tf = idf.reindex(tf.index)
    # NaN >= min_idf is False, so unknown terms are dropped together with
    # the low-IDF ones.
    keep = idf_for_tf >= min_idf
    return tf[keep] * idf_for_tf[keep]

In [111]:
# TF-IDF for the example document's `tf`, keeping only terms with idf >= 4.
tfidf_hv = tfidf(tf, idf, min_idf=4)

In [113]:
# Ten terms with the highest TF-IDF score.
tfidf_hv.sort_values(ascending=False).head(10)

coniugale      0.473365
sposo          0.361592
coniuge        0.242326
matrimonio     0.229734
atto           0.219487
regolazione    0.207530
morale         0.198530
naturale       0.173230
natalità       0.168568
paternità      0.165374
dtype: float64

## Kullback–Leibler divergence

$$
KL(x) = P(x) \log\left(\frac{P(x)}{Q(x)}\right)
$$