# Work with textual data: encyclicals example

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import os
from collections import defaultdict

In [3]:
# Load the Italian text of every encyclical into
# data_collection[pope][document_name] -> full text.
# Expected layout: <folder>/<pope>/encyclicals/<doc>/it/<doc>.txt
folder = '/Users/flint/Data/vatican/data'
data_collection = {}
for pope in os.listdir(folder):
    path = os.path.join(folder, pope, 'encyclicals')
    if not os.path.isdir(path):
        # Stray entries such as .DS_Store are not pope folders; skip them
        # instead of failing in os.listdir() below.
        continue
    data_collection[pope] = {}
    for doc in os.listdir(path):
        if not os.path.isdir(os.path.join(path, doc)):
            # Same for stray files inside the 'encyclicals' folder.
            continue
        subpath = os.path.join(path, doc, 'it', '{}.txt'.format(doc))
        # Explicit encoding: the texts contain accented characters and must
        # not depend on the platform's default encoding. A document folder
        # without its Italian text still raises FileNotFoundError.
        with open(subpath, 'r', encoding='utf-8') as infile:
            content = infile.read()
        data_collection[pope][doc] = content

In [None]:
# Show the top-level keys of the collection (one per pope folder).
print(list(data_collection))

In [None]:
# Show the document names loaded for Paul VI.
print(list(data_collection['Paul VI']))

In [None]:
# Preview the first 1000 characters of the Humanae Vitae text.
print(data_collection['Paul VI']['Humanae Vitae'][:1000])

## Tokenization and frequency

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
import spacy
# Load the spaCy pipeline 'it_core_news_lg'; `nlp` is the shared parser used
# by the cells below and by tokenize().
nlp = spacy.load('it_core_news_lg')

In [None]:
# Sample text used by the tokenization demos: Humanae Vitae (Paul VI).
document = data_collection['Paul VI']['Humanae Vitae']

### Space tokenizer

In [None]:
"questa è un'albicocca".split(), "questa è albicocca".split()

### Regex tokenizer

In [None]:
# NLTK word tokenization with the Italian setting; sentence with the singular
# form "albicocca".
word_tokenize("testo in cui si parla di albicocca.", language='italian')

In [None]:
# Same sentence with the plural form "albicocche", tokenized the same way.
word_tokenize("testo in cui si parla di albicocche.", language='italian')

### Syntactic tokenizer

In [None]:
from spacy.displacy import render

In [None]:
# Run the spaCy pipeline on the sample document.
doc = nlp(document)

In [None]:
# Materialize the sentence spans of the parsed document so they can be indexed.
sentences = list(doc.sents)

In [None]:
# Print one sample sentence (index 21) of the parsed document.
print(sentences[21])

In [None]:
# Inspect the sample sentence token by token: the token itself, its lemma and
# its part-of-speech tag (token.pos_).
sample_sentence = sentences[21]
for token in sample_sentence:
    print(token, token.lemma_, token.pos_)

In [None]:
# Print the sample sentence, then visualize it with spaCy's displacy.render().
print(sentences[21])
render(sentences[21])

## Tokenization process
- Filter: remove punctuation (PUNCT), determiners such as articles (DET) and adpositions (ADP)
- With lemmatization

In [None]:
def tokenize(text, lemmatize=True, filter_pos=None):
    """Count the words of ``text`` into a bag of words.

    The text is parsed with the notebook-level spaCy pipeline ``nlp`` and
    every token is counted, except tokens whose ``pos_`` tag is listed in
    ``filter_pos``.

    Args:
        text: Raw text to analyse.
        lemmatize: If true, count ``token.lemma_``; otherwise count the
            surface form ``token.text``.
        filter_pos: Optional container of ``token.pos_`` values to leave out
            (e.g. ``['PUNCT', 'DET', 'ADP']``). ``None`` keeps every token.

    Returns:
        A ``pandas.Series`` indexed by word holding the occurrence counts
        (dtype ``int64``, also when no token is counted).
    """
    bow = defaultdict(int)
    # Iterating the Doc yields the same tokens, in the same order, as walking
    # doc.sents sentence by sentence, without needing sentence boundaries.
    for token in nlp(text):
        if filter_pos is not None and token.pos_ in filter_pos:
            continue
        word = token.lemma_ if lemmatize else token.text
        bow[word] += 1
    # The explicit dtype keeps an empty result numeric, so .sum() and the
    # frequency normalisation behave as they do for non-empty documents.
    return pd.Series(bow, dtype='int64')

In [None]:
# Bag of words for Humanae Vitae. Tokens tagged PUNCT, DET or ADP are not
# counted; lemmatize keeps its default (True), so lemmas are counted.
bow = tokenize(data_collection['Paul VI']['Humanae Vitae'], 
               filter_pos=['PUNCT', 'DET', 'ADP'])

In [None]:
# Show the 20 entries with the highest counts.
bow.sort_values(ascending=False).head(20)

In [None]:
# Build corpus_index[pope][title] -> word-frequency Series for every document
# of the selected popes.
# tokenize() is called without filter_pos, so every token (punctuation and
# function words included) is counted, using lemmas.
# The inner loop variable is called `title` so it does not rebind the
# notebook-level `document` (the Humanae Vitae text passed to nlp()).
corpus_index = {}
popes = ['John XXIII', 'Paul VI']
for pope, corpus in data_collection.items():
    if pope not in popes:
        continue
    corpus_index[pope] = {}
    for title, content in corpus.items():
        corpus_index[pope][title] = tokenize(content)

In [None]:
# Shortcuts to the frequency Series of two documents.
# NOTE(review): the keys are the directory names read by the loader; confirm
# that 'Pacem' is the exact name of the John XXIII document folder.
hum_vit = corpus_index['Paul VI']['Humanae Vitae']
pax = corpus_index['John XXIII']['Pacem']

In [None]:
# Relative frequency of 'uomo' in Humanae Vitae. A label lookup does not
# depend on the order of the Series, so no sort is needed first.
hum_vit['uomo'] / hum_vit.sum()

In [None]:
# Relative frequency of 'che' in Humanae Vitae. A label lookup does not
# depend on the order of the Series, so no sort is needed first.
hum_vit['che'] / hum_vit.sum()

In [None]:
# Relative frequency of 'uomo' in the `pax` document. A label lookup does not
# depend on the order of the Series, so no sort is needed first.
pax['uomo'] / pax.sum()

In [None]:
# Relative frequency of 'che' in the `pax` document. A label lookup does not
# depend on the order of the Series, so no sort is needed first.
pax['che'] / pax.sum()

In [None]:
# Turn raw counts into relative frequencies: each count divided by the total.
hum_vit_norm = hum_vit / hum_vit.sum()

In [None]:
# Show the relative frequencies from highest to lowest.
hum_vit_norm.sort_values(ascending=False)

## Document frequency

In [None]:
# Document frequency: for every word, the number of indexed documents in
# which it occurs at least once.
# Only the values of the nested dictionaries are needed, and no loop variable
# is called `document`, so the notebook-level `document` (the text passed to
# nlp()) is not rebound here.
docu_freq = defaultdict(int)
for corpus in corpus_index.values():
    for idx in corpus.values():
        # idx is the Series returned by tokenize(); its index holds the
        # distinct words of one document, so each word counts once per document.
        for word in idx.index:
            docu_freq[word] += 1
DF = pd.Series(docu_freq)

In [None]:
# Show the 20 words that occur in the largest number of documents.
DF.sort_values(ascending=False).head(20)

In [None]:
# Inverse document frequency (without logarithm): number of indexed documents
# divided by the document frequency of each word.
# The document count is derived from corpus_index instead of the hard-coded
# 15, so the ratio stays correct when the set of indexed documents changes.
n_docs = sum(len(corpus) for corpus in corpus_index.values())
(n_docs / DF).sort_values(ascending=False)