In [16]:
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

In [8]:
# Load the sample file, assigning the six column labels explicitly via `names=`.
df = pd.read_csv(
    "sample.csv",
    names=[
        'Tipo de documento',
        'Data do Acordão',
        'Descritores',
        'Sumário',
        'Decisão Texto Integral',
        'Site',
    ],
)

### Pre-processing

In [9]:
# Clean the frame in one chain:
#   1. drop every row that has a missing value in any column,
#   2. remove the 'Sumário' column,
#   3. strip non-breaking spaces (\xa0) from the full decision text,
#   4. split 'Descritores' into a list with one entry per line.
# NOTE(review): dropna() runs while 'Sumário' is still present, so rows that
# are missing only a summary are discarded too — confirm this is intended.
df = (
    df.dropna()
    .drop(columns=['Sumário'])
    .assign(**{
        'Decisão Texto Integral': lambda frame: frame['Decisão Texto Integral'].replace(
            '\xa0', '', regex=True
        ),
        'Descritores': lambda frame: frame['Descritores'].apply(
            lambda descriptors: descriptors.splitlines()
        ),
    })
)
print(df.shape)

(429, 5)


#### Converting the pandas DataFrame (`df`) to a Hugging Face Dataset (`ds`)

In [10]:
# Convert the cleaned DataFrame to a Hugging Face Dataset without carrying the
# pandas index along. The previous approach removed the "__index_level_0__"
# column after conversion, but that column is only created when the index is
# not a plain RangeIndex (i.e. when dropna() actually removed rows); otherwise
# remove_columns() raises. preserve_index=False gives the same columns in
# both situations.
dataset = Dataset.from_pandas(df, preserve_index=False)
ds = dataset

#### Concatenating 'Decisão Texto Integral'

In [11]:
# Replace each run of three newlines with a single newline in every decision text.
contexts = [
    record.replace('\n\n\n', '\n')
    for record in ds['Decisão Texto Integral']
]
# Preview the first 300 characters of the first two texts.
for context in contexts[:2]:
    print(f"{context[:300]}...")


Acordam na 3.ª Secção do Supremo Tribunal de Justiça:
I.  Relatório
1.   AA, requerido no processo de execução de mandado de detenção europeu em referência, que corre termos no Tribunal da Relação de Lisboa, à ordem do qual se encontra detido, alegando encontrar-se atualmente em prisão ilegal, apre...
Acordam, em conferência, no Tribunal Central Administrativo Norte – Secção do Contencioso Administrativo:
I. RELATÓRIO
1. AA, residente na Rua ... e o MINISTÉRIO da ADMINISTRAÇÃO INTERNA, inconformados, vieram interpor recurso jurisdicional da sentença do TAF do Porto, datada de 07 de Setembro de 20...


In [12]:
# Merge all cleaned texts into one string (no separator is inserted between them).
text = str.join('', contexts)
print(type(text), len(text))

<class 'str'> 17322288


### Hugging Face and PyTorch


Prerequisites:

<span style="color:blue"> !pip install -U transformers torch datasets pandas </span>

In [13]:
# Hugging Face Hub identifier of the checkpoint used throughout the notebook.
model_id = "neuralmind/bert-large-portuguese-cased"

# Load the tokenizer first, then the model with its masked-language-modelling head.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

Some weights of the model checkpoint at neuralmind/bert-large-portuguese-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


From here, we can create an input document text, tokenize it, and process it through the model to produce the MLM head output logits.

In [14]:
# Tokenize the document text. With truncation=True and max_length=512 only the
# first 512 tokens are kept, so just the beginning of `text` reaches the model.
tokens = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

# Inference only: disable autograd so no computation graph is built or kept
# alive for the logits.
with torch.no_grad():
    output = model(**tokens)
output

MaskedLMOutput(loss=None, logits=tensor([[[ -0.2852,   1.1196,  -1.3343,  ...,  -2.3126,  -3.1395,  -2.5860],
         [ -9.2153,  -6.3740,  -8.1074,  ...,  -4.6864,  -7.4405,  -8.1637],
         [ -4.2420,  -5.1492,  -0.6975,  ...,  -3.0431,  -4.9379,  -1.0595],
         ...,
         [-12.4719, -11.0950,  -9.4661,  ..., -11.8277, -13.4347,  -9.6164],
         [-10.3793,  -8.5299,  -6.9837,  ...,  -7.6820,  -9.6423,  -9.1170],
         [ -7.1391,  -4.3362,  -6.0373,  ...,  -7.3236,  -6.0853,  -4.2766]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [15]:
# Shape of the MLM logits tensor.
output.logits.size()

torch.Size([1, 512, 29794])

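This leaves us with 512 vectors of logits (one per token position), each of dimensionality 29794 (one score per vocabulary entry). These are unnormalized scores, not probabilities. To aggregate them into a single sparse vector, we do the following: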

In [17]:
# Aggregate the per-position logits into one weight per vocabulary entry:
#   - log(1 + ReLU(logit)) element-wise,
#   - multiply by the attention mask (broadcast over the last dimension),
#   - take the maximum over dim 1 (token positions),
#   - squeeze away the remaining size-1 dimensions.
vec = (
    torch.relu(output.logits).add(1).log()
    * tokens.attention_mask.unsqueeze(-1)
).max(dim=1).values.squeeze()

vec.shape

torch.Size([29794])

In [18]:
# Display the aggregated weight vector computed in the previous cell.
vec

tensor([0.3071, 0.7512, 1.2715,  ..., 0.8656, 0.5574, 1.5590],
       grad_fn=<SqueezeBackward0>)

Because our vector is sparse, we can transform it into a much more compact dictionary format, keeping only the non-zero positions and weights.

In [19]:
# Extract the positions of the non-zero entries. `vec` is 1-D, so nonzero()
# returns a (k, 1) tensor; squeeze(-1) drops only that trailing axis, which
# keeps `cols` a list even when exactly one entry is non-zero (a bare
# squeeze() would collapse it to a scalar and break len(cols) below).
cols = vec.nonzero().squeeze(-1).cpu().tolist()
print(len(cols))

# Extract the matching non-zero values.
weights = vec[cols].cpu().tolist()
# Build a dictionary mapping token ID to weight.
sparse_dict = dict(zip(cols, weights))
sparse_dict

28869


{0: 0.3070722222328186,
 1: 0.7512349486351013,
 2: 1.2715264558792114,
 3: 1.006833791732788,
 4: 0.8775926232337952,
 5: 1.2346875667572021,
 6: 0.7172765731811523,
 7: 1.3719319105148315,
 8: 0.9691017866134644,
 10: 0.49327293038368225,
 11: 1.3117226362228394,
 12: 0.39617013931274414,
 13: 0.37997081875801086,
 15: 0.8985772132873535,
 16: 0.8793514370918274,
 17: 0.3592943549156189,
 18: 0.7752837538719177,
 19: 1.0832195281982422,
 20: 1.2839726209640503,
 22: 0.8256956338882446,
 24: 0.7851206660270691,
 25: 0.5417920351028442,
 26: 0.4207833707332611,
 27: 0.36238524317741394,
 28: 1.3384382724761963,
 29: 1.5727514028549194,
 30: 0.728924572467804,
 31: 0.4018927812576294,
 32: 0.8045638203620911,
 34: 1.1045717000961304,
 35: 0.08484267443418503,
 36: 1.114656686782837,
 39: 0.7937186360359192,
 40: 1.1853456497192383,
 41: 0.5867935419082642,
 42: 1.1208043098449707,
 43: 0.9668856859207153,
 44: 0.2614592909812927,
 45: 0.7895346879959106,
 46: 0.7024046778678894,
 47: 0.

The dictionary above is keyed by token ID. To make it human-readable, we map each token ID back to its text token using the tokenizer's vocabulary, then sort the tokens by weight.

In [20]:
# Invert the tokenizer vocabulary (token -> ID) into an ID -> token lookup.
idx2token = dict(
    (token_id, token_text)
    for token_text, token_id in tokenizer.get_vocab().items()
)

In [21]:
# Translate each token ID to its text token, round the weight to two decimals,
# and order the entries by weight, largest first (ties keep their original order).
sparse_dict_tokens = dict(
    sorted(
        (
            (idx2token[token_id], round(score, 2))
            for token_id, score in zip(cols, weights)
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
)

sparse_dict_tokens

{'que': 3.62,
 'n': 3.6,
 'se': 3.6,
 'para': 3.6,
 'entrega': 3.59,
 'da': 3.58,
 'no': 3.58,
 '-': 3.57,
 'seja': 3.57,
 '[UNK]': 3.56,
 ')': 3.56,
 'de': 3.56,
 '##s': 3.56,
 'o': 3.54,
 'do': 3.54,
 'M': 3.54,
 'os': 3.54,
 'Estes': 3.54,
 'dil': 3.54,
 'não': 3.53,
 '##ências': 3.53,
 'Tribunal': 3.53,
 'Supremo': 3.53,
 'a': 3.52,
 'nos': 3.52,
 'seguintes': 3.52,
 'Ora': 3.52,
 ';': 3.51,
 'uma': 3.51,
 '##ido': 3.51,
 'sobre': 3.51,
 'até': 3.51,
 'caso': 3.51,
 '##ig': 3.5,
 'fim': 3.5,
 'processo': 3.5,
 'Lei': 3.5,
 'duração': 3.5,
 'prazo': 3.5,
 '##n': 3.5,
 '##m': 3.5,
 '.': 3.49,
 '/': 3.49,
 'com': 3.49,
 'em': 3.49,
 '##ação': 3.49,
 'ocorreu': 3.49,
 'virtude': 3.49,
 '##rida': 3.49,
 'fundamentos': 3.49,
 'são': 3.48,
 'dia': 3.48,
 'dias': 3.48,
 'decisão': 3.48,
 'execução': 3.48,
 '##DE': 3.48,
 'estabelece': 3.48,
 'consideração': 3.48,
 '##iária': 3.48,
 ',': 3.47,
 ':': 3.47,
 'foi': 3.47,
 '##da': 3.47,
 'dos': 3.47,
 'à': 3.47,
 'disposto': 3.47,
 '##der': 3.

#### Saving the token weights for a specific model to a .txt file

In [None]:
def write_dict_to_txt(dictionary, file_name):
    """Write a dictionary to a UTF-8 text file, one ``key: value`` pair per line.

    Missing parent directories of ``file_name`` are created first. I/O and
    encoding failures are reported with a printed message instead of being
    raised, so the surrounding notebook keeps running.

    Args:
        dictionary: Mapping whose items are written in iteration order.
        file_name: Destination path; an existing file is overwritten.

    Returns:
        None.
    """
    import os  # local import keeps this cell self-contained

    try:
        # Create the target folder if needed; otherwise open() fails when the
        # output directory does not exist yet.
        parent_dir = os.path.dirname(file_name)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Explicit UTF-8 so accented tokens are written the same way on every
        # platform, regardless of the locale's default encoding.
        with open(file_name, 'w', encoding='utf-8') as file:
            for key, value in dictionary.items():
                file.write(f'{key}: {value}\n')
        print(f'Dictionary successfully written to {file_name}')
    except (OSError, UnicodeError) as e:
        print(f'An error occurred: {str(e)}')

# Persist the sorted token weights for this model.
write_dict_to_txt(
    dictionary=sparse_dict_tokens,
    file_name='sparse_dictionary_tokens/bert-large-portuguese-cased.txt',
)