In [None]:
#!pip install transformers

In [None]:
# Importa bibliotecas necessárias

import re
import json
import torch
import spacy
import pickle
import warnings 
import numpy as np
import contractions

from tqdm import tqdm
from spacy import displacy
from transformers import BertModel, BertTokenizer

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Initialize the models

# spaCy pipeline; `get_connections` feeds its output to `displacy.parse_deps`.
nlp = spacy.load("en_core_web_trf")
# Tokenizer loaded from the same 'bert-base-uncased' checkpoint as the model below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                output_hidden_states = True, # Whether the model returns all hidden-states.
                                )

In [None]:
def load_base(name_arq):
    """Read a JSON file and return its parsed content.

    Parameters
    ----------
    name_arq : str or path-like
        Path of the JSON file; it is decoded as UTF-8.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file (callers in this
        notebook iterate over it and read the "image" / "question" keys).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # The context manager closes the handle even when parsing raises.
    with open(name_arq, encoding="utf8") as f:
        data = json.load(f)

    # Keep only the "answerable" entries (filter currently disabled)
    #data = [d for d in data if d["answerable"] == 1]

    return data

In [None]:
def Preprocessing(text):
    """Normalize a piece of text before it is parsed and embedded.

    Steps, in order: expand contractions via ``contractions.fix``
    (e.g. "I'll" -> "I will"), lowercase, drop every character that is
    neither a word character nor whitespace, drop underscores, and collapse
    runs of whitespace into single spaces.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalized text, with no leading or trailing whitespace.
    """
    # Contractions are expanded first, while the apostrophes are still there.
    lowered = contractions.fix(text).lower()

    # `\w` matches "_", so underscores survive the regex and need their own pass.
    no_punct = re.sub(r'[^\w\s]', '', lowered)
    no_underscore = no_punct.replace("_", "")

    # split()/join trims the ends and squeezes inner whitespace runs.
    return " ".join(no_underscore.split())

In [None]:
def get_connections(text):
    """Compute, for every word of ``text``, the indices of the words linked to it.

    Two kinds of links are merged:

    * syntactic links: each arc returned by ``displacy.parse_deps`` connects
      its "start" and "end" indices in both directions;
    * positional links: each word is linked to its previous and next word
      (when they exist).

    Parameters
    ----------
    text : str
        Text to analyse; it is parsed with the module-level ``nlp`` pipeline.

    Returns
    -------
    tuple
        ``(len_text, words, ligacoes)`` where ``len_text`` is the number of
        words, ``words`` the list of word strings and ``ligacoes[i]`` the
        sorted, duplicate-free list of indices linked to word ``i``.
        A one-word text yields ``[[]]`` (there is no neighbour to link to)
        and an empty text yields ``(0, [], [])``.
    """
    # Links established through syntactic analysis
    doc = nlp(text)

    # Join the token texts and split again on whitespace, exactly how the
    # caller rebuilds the sentence from the returned word list.
    words = " ".join(token.text for token in doc).split()
    len_text = len(words)

    deps_parse = displacy.parse_deps(doc)

    # One independent list per word. `[[]] * len_text` would repeat a single
    # list object, so words without any arc would all share (and pollute)
    # the same neighbour list.
    ligacoes = [[] for _ in range(len_text)]

    # Arcs are treated as bidirectional
    for con in deps_parse["arcs"]:
        ligacoes[con["start"]].append(con["end"])
        ligacoes[con["end"]].append(con["start"])

    # Positional links (previous and next word); guarding both sides keeps a
    # single-word text from pointing at a non-existent index 1.
    for i in range(len_text):
        if i > 0:
            ligacoes[i].append(i - 1)
        if i < len_text - 1:
            ligacoes[i].append(i + 1)

    # Deduplicate and sort so the output is deterministic.
    ligacoes = [sorted(set(links)) for links in ligacoes]

    return len_text, words, ligacoes

In [None]:
def get_embedding_bert(text, model, tokenizer):
    """Compute one sentence vector and one vector per word of ``text``.

    The text is wrapped in ``[CLS]`` / ``[SEP]``, tokenized with
    ``tokenizer`` and passed through ``model``; the hidden states are read
    from ``outputs[2]``.

    * Word vectors: for each token the last four hidden states are summed;
      the ``[CLS]`` / ``[SEP]`` positions are dropped; consecutive word
      pieces (tokens starting with ``"##"``) are averaged together with the
      token that opens the word, giving one vector per word.
    * Sentence vector: mean over all tokens (special tokens included) of the
      second-to-last hidden state.

    Parameters
    ----------
    text : str
        Sentence to embed.
    model : callable
        Called as ``model(token_ids, segment_ids)`` after ``model.eval()``;
        element 2 of its output must be the sequence of hidden states, each
        of shape ``[1 x tokens x dim]``.
    tokenizer : object
        Must provide ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1 + n_words, dim)``: row 0 is the sentence vector,
        the remaining rows are the word vectors in order. When the text
        yields no token besides the special ones, only row 0 is returned.
    """
    # Add the special tokens.
    marked_text = "[CLS] " + text + " [SEP]"

    # Split the sentence into tokens and map them to vocabulary indices.
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Mark every token as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)

    # Convert inputs to PyTorch tensors (batch of size 1).
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()

    with torch.no_grad():
        # NOTE(review): in Hugging Face `BertModel.forward` the second
        # positional parameter appears to be `attention_mask`, so this
        # all-ones tensor probably acts as a mask rather than as segment
        # ids — confirm against the installed transformers version. The
        # call is kept as-is so the produced embeddings do not change.
        outputs = model(tokens_tensor, segments_tensors)

        # With `output_hidden_states = True` the third item holds the
        # hidden states of all layers.
        hidden_states = outputs[2]

    # [layers x 1 x tokens x dim] -> [layers x tokens x dim] -> [tokens x layers x dim]
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1, 0, 2)

    # One vector per token: the sum of its last four hidden states.
    token_vecs_sum = [torch.sum(token[-4:], dim=0) for token in token_embeddings]

    # Drop the [CLS] and [SEP] positions.
    word_pieces = tokenized_text[1:-1]
    piece_vecs = token_vecs_sum[1:-1]

    # Group token positions into words. Only tokens that *start with* "##"
    # are continuations; a plain substring test would also swallow tokens
    # such as "#". A continuation with nothing before it opens its own word
    # instead of failing.
    groups = []
    for idx, piece in enumerate(word_pieces):
        if piece.startswith("##") and groups:
            groups[-1].append(idx)
        else:
            groups.append([idx])

    # Single-piece words keep their vector; multi-piece words are averaged.
    vecs_words = [
        piece_vecs[group[0]]
        if len(group) == 1
        else torch.mean(torch.stack([piece_vecs[j] for j in group], dim=0), 0)
        for group in groups
    ]

    # Sentence vector: average of all token vectors of the second-to-last
    # hidden state (`hidden_states[-2][0]` is [tokens x dim]).
    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    sentence_embedding = sentence_embedding.numpy()

    if vecs_words:
        embed_word = torch.stack(vecs_words, dim=0).numpy()
    else:
        # No word at all: contribute zero rows instead of crashing in stack().
        embed_word = np.empty((0, sentence_embedding.shape[0]), dtype=sentence_embedding.dtype)

    final_embedding = np.concatenate((sentence_embedding.reshape((1, sentence_embedding.shape[0])), embed_word))

    return final_embedding

In [None]:
# NLP
"""
{"imagem.jpg":{"len_perg": x, "word_0": {"word": word, "ligacoes": [1, 3]}, ...., "word_n": {"word": word, "ligacoes": [n-1, n-4]}, "embeddings": [vetor]},
...
"imagem.jpg":{"len_perg": x, "word_0": {"word": word, "ligacoes": [1, 3]}, ...., "word_n": {"word": word, "ligacoes": [n-1, n-4]}, "embeddings": [vetor]},
}
"""

In [None]:
def get_info_nlp(name_arq, model, tokenizer, tam_base):
    """Collect the textual information of every question in a dataset file.

    For each record loaded from ``name_arq`` the question is preprocessed,
    its word links are computed and its BERT embeddings are generated.

    Parameters
    ----------
    name_arq : str
        Path of the JSON dataset, handed to ``load_base``.
    model, tokenizer
        Forwarded unchanged to ``get_embedding_bert``.
    tam_base
        Accepted but never read by this function.
        NOTE(review): the name suggests a dataset-size limit — confirm
        whether truncation was intended.

    Returns
    -------
    dict
        Maps each record's "image" value to a dict with the keys
        "len_perg" (word count), "word_<i>" (a dict with "word" and
        "ligacoes") for every word index, and "embeddings". Records that
        share the same "image" value overwrite one another.
    """
    # Textual information of every image, keyed by image name
    info_nlp = {}

    for record in tqdm(load_base(name_arq)):

        image_key = record["image"]

        # Preprocess the question, then establish the links of each word
        question = Preprocessing(record["question"])
        n_words, words, links = get_connections(question)

        # Question size first, then one entry per word
        entry = {"len_perg": n_words}
        for pos in range(n_words):
            entry["word_" + str(pos)] = {"word": words[pos], "ligacoes": links[pos]}

        # The embeddings are computed on the sentence rebuilt from the word list
        entry["embeddings"] = get_embedding_bert(' '.join(words), model, tokenizer)

        info_nlp[image_key] = entry

    return info_nlp

In [None]:
def save_info_nlp(name_arq_in, info_nlp):
    """Pickle ``info_nlp`` into the ``bases/`` directory.

    The output path is ``"bases/" + <name_arq_in up to the first ".json"> +
    "_info_nlp"`` (no extension), e.g. ``"val.json"`` ->
    ``"bases/val_info_nlp"``. The ``bases/`` directory is not created here,
    so it must already exist.

    Parameters
    ----------
    name_arq_in : str
        Name of the input dataset file the information was built from.
    info_nlp : object
        Picklable object to store.

    Returns
    -------
    None
    """
    name_arq_out = "bases/"+name_arq_in.split(".json")[0]+"_info_nlp"

    # The context manager closes the file even if pickling raises midway.
    with open(name_arq_out, 'wb') as file:
        pickle.dump(info_nlp, file)

    return

In [None]:
%%time

# Build the per-image NLP information for `val.json` and pickle it under `bases/`.
# NOTE(review): the trailing 5 is bound to `tam_base`, which `get_info_nlp`
# never reads, so every record of the file is processed — confirm whether a
# size limit was intended here.
name_arq = "val.json"
info_nlp = get_info_nlp(name_arq, model, tokenizer, 5)
save_info_nlp(name_arq, info_nlp)

#### Fontes

https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/

https://spacy.io/api/dependencyparser
    
https://spacy.io/usage/visualizers
    
https://python.plainenglish.io/how-to-generate-word-embedding-using-bert-2b9e79c27396

https://peaceful0907.medium.com/sentence-embedding-by-bert-and-sentence-similarity-759f7beccbf1

https://spacy.io/api/top-level#displacy.parse_deps