In [None]:
#!pip install transformers

In [None]:
# Importa bibliotecas necessárias

import re
import json
import torch
import spacy
import pickle
import warnings
import numpy as np
import contractions

from tqdm import tqdm
from spacy import displacy

warnings.filterwarnings('ignore')

# Initialize the models

# Alternative way of loading the spaCy pipeline, kept for reference:
#import spacy_transformers
#import en_core_web_trf
#nlp = en_core_web_trf.load()

# spaCy pipeline used by get_connections to parse the questions
nlp = spacy.load("en_core_web_trf")

from lavis.models import load_model_and_preprocess

# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA instead of failing at load time.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model, _, txt_processors = load_model_and_preprocess(name="blip2_feature_extractor", model_type="pretrain_vitL", is_eval=True, device=device)


In [None]:
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.7.3/en_core_web_trf-3.7.3.tar.gz

In [None]:
def load_base(name_arq):
    """Load the dataset stored in a JSON file.

    Args:
        name_arq: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The deserialized JSON content, exactly as stored in the file.
    """

    # The context manager closes the handle even when parsing fails
    with open(name_arq, encoding="utf8") as f:
        data = json.load(f)

    # Keep only the "answerable" entries (currently disabled)
    #data = [d for d in data if d["answerable"] == 1]

    return data

In [None]:
def Preprocessing(text):
    """Normalize a piece of text before it is parsed.

    The text goes through, in order: contraction expansion (e.g. "I'll" ->
    "I will"), lowercasing, removal of every character that is neither a word
    character nor whitespace, removal of underscores, and collapsing of runs
    of whitespace into single spaces.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string.
    """
    lowered = contractions.fix(text).lower()

    # Underscores count as word characters for the regex, so they are
    # stripped in a separate step right after the punctuation.
    without_symbols = re.sub(r'[^\w\s]', '', lowered).replace("_", "")

    # split()/join trims the ends and squeezes repeated whitespace
    return " ".join(without_symbols.split())

In [None]:
def get_connections(text):
    """Compute, for every word of ``text``, the indices of the words it is linked to.

    Two kinds of links are combined:
      * syntactic links, taken from the arcs returned by
        ``displacy.parse_deps`` and added in both directions;
      * positional links to the immediately preceding and following word,
        when those exist.

    Args:
        text: The string to analyze with the module-level ``nlp`` pipeline.

    Returns:
        A tuple ``(len_text, words, ligacoes)`` where ``len_text`` is the
        number of words, ``words`` is the list of word strings and
        ``ligacoes[i]`` is the duplicate-free list of indices linked to
        word ``i``.
    """

    # Links established through syntactic analysis
    doc = nlp(text)

    # Joining and re-splitting on whitespace keeps the word list consistent
    # with a plain ``str.split`` of the re-assembled sentence.
    words = " ".join(token.text for token in doc).split()
    len_text = len(words)
    deps_parse = displacy.parse_deps(doc)

    # Every arc is registered in both directions (forward arcs first)
    arcs = deps_parse["arcs"]
    pairs = [(arc["start"], arc["end"]) for arc in arcs]+[(arc["end"], arc["start"]) for arc in arcs]

    # One independent list per word. ``[[]]*len_text`` would make every slot
    # point at the same list, so words without any arc would end up sharing
    # (and polluting) each other's links.
    ligacoes = [[] for _ in range(len_text)]

    for source, target in pairs:
        ligacoes[source].append(target)

    # Positional links: previous and next word. Neighbours are only added when
    # they exist, so a one-word text no longer points at a missing index 1.
    for i in range(len_text):

        if i > 0:
            ligacoes[i].append(i-1)

        if i < len_text-1:
            ligacoes[i].append(i+1)

    # Drop duplicated links
    ligacoes = [list(set(links)) for links in ligacoes]

    return len_text, words, ligacoes

In [None]:
def get_embedding_blip2(text, model, txt_processors):
    """Extract text embeddings for a sentence and for each of its words.

    The full ``text`` is embedded first, followed by every whitespace-separated
    word of it, each through its own call to ``model.extract_features``.

    Args:
        text: The sentence to embed.
        model: Object exposing ``extract_features(sample, mode="text")`` whose
            result has a ``text_embeds`` attribute.
        txt_processors: Mapping whose ``"eval"`` entry is applied to each
            string before it is handed to the model.

    Returns:
        A numpy array with one row for the whole sentence followed by one row
        per word.
    """

    def embed(piece):
        # Build the single-item sample expected by the model
        sample = {"text_input": [txt_processors["eval"](piece)]}
        features = model.extract_features(sample, mode="text")
        # First sample, first position of the text embeddings
        # (the original author notes this vector has size 768)
        return features.text_embeds[0,0,:].cpu().numpy()

    pieces = [text, *text.split()]

    return np.array([embed(piece) for piece in pieces])

In [None]:
# NLP
# Reference sketch of the structure built by get_info_nlp: one entry per image
# name, holding the question length, one "word_i" record per word (the word
# itself and the indices it is linked to) and the embedding vectors.
# Note: the sketch labels the vector field "embedding", whereas get_info_nlp
# stores it under the key "embeddings".
"""
{"imagem.jpg":{"len_perg": x, "word_0": {"word": word, "ligacoes": [1, 3]}, ...., "word_n": {"word": word, "ligacoes": [n-1, n-4]}, "embedding": [vetor]},
...
"imagem.jpg":{"len_perg": x, "word_0": {"word": word, "ligacoes": [1, 3]}, ...., "word_n": {"word": word, "ligacoes": [n-1, n-4]}, "embedding": [vetor]},
}
"""

In [None]:
def get_info_nlp(name_arq, model, txt_processors, tam_base):
    """Extract the textual information of every question in a dataset file.

    For each record of the dataset, the question is preprocessed, its word
    connections are computed and its embeddings are extracted. Results are
    stored under the record's ``"image"`` value, so a later record with the
    same image replaces an earlier one.

    Args:
        name_arq: Path of the JSON dataset, read with ``load_base``.
        model: Model forwarded to ``get_embedding_blip2``.
        txt_processors: Text processors forwarded to ``get_embedding_blip2``.
        tam_base: Accepted for interface compatibility; not used by the
            current implementation.

    Returns:
        A dict mapping each image name to a dict with the keys ``"len_perg"``
        (number of words), ``"word_<i>"`` (``{"word", "ligacoes"}`` for each
        word index) and ``"embeddings"``.
    """

    # Textual information collected for each image
    info_nlp = {}

    for info in tqdm(load_base(name_arq)):

        # Fresh record for the image being processed
        entry = {}
        info_nlp[info["image"]] = entry

        # Preprocess the question and find the connections of each word in it
        len_text, words_list, ligacoes = get_connections(Preprocessing(info["question"]))

        # Question length, i.e. its number of words
        entry["len_perg"] = len_text

        # One record per word: the word itself and the indices it links to
        for i in range(len_text):
            entry[f"word_{i}"] = {"word": words_list[i], "ligacoes": ligacoes[i]}

        # Embeddings are computed on the sentence re-assembled from the words
        perg = ' '.join(words_list)
        entry["embeddings"] = get_embedding_blip2(perg, model, txt_processors)

    return info_nlp

In [None]:
def save_info_nlp(name_arq_in, info_nlp):
    """Serialize the extracted information to disk with pickle.

    The output path is the part of ``name_arq_in`` that precedes the first
    ".json", followed by "_info_nlp" (e.g. "val.json" -> "val_info_nlp").

    Args:
        name_arq_in: Name of the JSON file the information was extracted from.
        info_nlp: Object to serialize.

    Returns:
        None.
    """

    name_arq_out = name_arq_in.split(".json")[0]+"_info_nlp"

    # The context manager closes the handle even when pickling fails
    with open(name_arq_out, 'wb') as file:
        pickle.dump(info_nlp, file)

    return

In [None]:
%%time

# Dataset to process; the pickle is written next to it by save_info_nlp
name_arq = "val.json"
# The last argument (tam_base) is accepted by get_info_nlp but not used by it
info_nlp = get_info_nlp(name_arq, model, txt_processors, 5)
save_info_nlp(name_arq, info_nlp)

#### Fontes

https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/

https://spacy.io/api/dependencyparser
    
https://spacy.io/usage/visualizers
    
https://python.plainenglish.io/how-to-generate-word-embedding-using-bert-2b9e79c27396

https://peaceful0907.medium.com/sentence-embedding-by-bert-and-sentence-similarity-759f7beccbf1

https://spacy.io/api/top-level#displacy.parse_deps