In [1]:
import torch
import pandas as pd
import time

In [2]:
# Directory name passed to the model loader as its cache folder
cache_dir = 'cache_dir'

# Record the wall-clock start time; the final cell subtracts it to report
# the total run time
start_time = time.time()

In [3]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
# The value is always a torch.device (previously the CPU fallback was the
# plain string 'cpu'), so code using `device` sees one consistent type.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_gpus = torch.cuda.device_count()

# Report which GPUs (if any) are visible to PyTorch.
if num_gpus > 0:
    print(f"GPUs available: {num_gpus}")
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
else:
    print("There is no GPU available. Using CPU")

GPUs available: 1
GPU 0: NVIDIA RTX A4000


In [4]:
# Load the reviews CSV with pandas and add an 'id' column that mirrors the
# row index.
# Download link for the .csv file: https://www.kaggle.com/code/sela001/facebook-google-store-reviews/input
pdf = (
    pd.read_csv("data/facebook_reviews.csv")
    .assign(id=lambda frame: frame.index)
)

In [5]:
# Importa a classe InputExample do pacote sentence_transformers
from sentence_transformers import InputExample

# Keep only the first 100,000 rows of pdf as the working subset
pdf_subset = pdf.iloc[:100_000]

# Helper that wraps one text in a sentence_transformers InputExample
def example_create_fn(doc1: str) -> InputExample:
    """
    Build an InputExample holding a single text.

    Args:
        doc1: The text to wrap. The caller in this notebook passes one
            row's 'review_text' value, not a whole pandas Series.

    Returns:
        An InputExample whose ``texts`` list contains only ``doc1``.
    """
    return InputExample(texts=[doc1])

In [6]:
# Wrap every review text of pdf_subset in an InputExample, preserving row
# order, and collect the results in a plain list.
faiss_train_examples = [
    example_create_fn(review_text)
    for review_text in pdf_subset['review_text']
]

In [7]:
# Importa a classe SentenceTransformer do pacote sentence_transformers
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' SentenceTransformer model, using cache_dir as
# its cache folder.
model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder=cache_dir)

# Encode every review text of pdf_subset into an embedding vector.
review_texts = pdf_subset.review_text.values.tolist()
faiss_review_text_embedding = model.encode(review_texts)

In [8]:
# Show how many embeddings were produced and the length of the first one
first_embedding = faiss_review_text_embedding[0]
(len(faiss_review_text_embedding), len(first_embedding))

(100000, 384)

In [9]:
# Importa bibliotecas numpy e faiss para tarefas de busca e indexação
import numpy as np
import faiss

# Re-index the subset by 'id' (keeping 'id' as a regular column as well) so
# rows can be looked up with .loc using the ids stored in the FAISS index.
pdf_to_index = pdf_subset.set_index(['id'], drop=False)

# 1-D array of ids to attach to the vectors. FAISS ids are 64-bit integers;
# the generic 'int' dtype is only 32 bits on some platforms (e.g. Windows
# with NumPy 1.x), so int64 is requested explicitly.
id_index = pdf_to_index['id'].to_numpy(dtype=np.int64, copy=True)

In [10]:
# Work on a copy so the raw embeddings stay untouched, then L2-normalize the
# copy in place (faiss.normalize_L2 modifies its argument).
content_encoded_normalized = np.copy(faiss_review_text_embedding, order='C')
faiss.normalize_L2(content_encoded_normalized)

In [11]:
# Build an inner-product index sized to the embedding length and wrap it in
# an ID map so each vector is stored under its DataFrame id. The vectors
# were L2-normalized beforehand, so inner product equals cosine similarity.
embedding_dim = len(faiss_review_text_embedding[0])
flat_index = faiss.IndexFlatIP(embedding_dim)
index_content = faiss.IndexIDMap(flat_index)
index_content.add_with_ids(content_encoded_normalized, id_index)

In [12]:
# Search helper: returns the rows most similar to a free-text query
def search_content(query, pdf_to_index, k=5):
    """
    Return the rows of ``pdf_to_index`` whose indexed vectors best match ``query``.

    The query is encoded with the module-level ``model``, L2-normalized, and
    searched against the module-level ``index_content`` FAISS index.

    Args:
        query: Text to search for.
        pdf_to_index: DataFrame whose index holds the ids stored in
            ``index_content``; matched rows are selected with ``.loc``.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A DataFrame with one row per hit, in the order returned by the index,
        plus a 'similarities' column holding the score of each hit. Fewer
        than ``k`` rows are returned when the index has fewer than ``k``
        results.
    """
    query_vector = model.encode([query])
    faiss.normalize_L2(query_vector)

    scores, neighbour_ids = index_content.search(query_vector, k)

    # FAISS pads missing neighbours with id -1; looking that up with .loc
    # would raise KeyError, so drop padded slots before selecting rows.
    hits = [
        (int(hit_id), float(score))
        for hit_id, score in zip(neighbour_ids[0], scores[0])
        if hit_id != -1
    ]
    ids = [hit_id for hit_id, _ in hits]
    similarities = [score for _, score in hits]

    # Copy so adding the score column never touches the caller's DataFrame.
    results = pdf_to_index.loc[ids].copy()
    results['similarities'] = similarities

    return results

# Run a sample search for the query 'annoying ads' and display the matching
# rows together with their similarity scores.
sample_query = 'annoying ads'
display(search_content(sample_query, pdf_to_index))

Unnamed: 0_level_0,Unnamed: 0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp,id,similarities
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1988,1988,59a3533e-318b-414d-aef5-0f48914507a7,247567193081736265615,Al***li,Annoying ads,1,0,318.0.0.39.154,2021-05-19 20:13:28,1988,1.0
4081,4081,f17ec2c8-638a-4ff6-8b61-992679b60253,833742452695631979894,Na********in,Annoying ads,1,1,318.0.0.39.154,2021-05-19 20:26:35,4081,1.0
72372,72372,11bad759-9a71-4bca-8c6f-5e8e37e82a90,187328715730339093319,Am*******ed,Annoying ads,1,0,50.0.0.10.54,2021-05-20 05:07:57,72372,1.0
78823,78823,bd1f8fdb-7120-4138-b2a8-879d851e2d9f,569894952590078664392,El************ou,Annoying ads,1,0,,2021-05-20 06:32:06,78823,1.0
16346,16346,9e66cb5d-95c1-4c7b-ae98-5c49435b1f0f,334486271231783063416,ad*************ri,Annoying ads,1,1,318.0.0.39.154,2021-05-19 21:51:03,16346,1.0


In [13]:
# Stop the timer and report the total elapsed wall-clock time as HH:MM:SS.
end_time = time.time()
elapsed_seconds = max(0, int(end_time - start_time))

# Format by hand rather than via time.gmtime/strftime: the %H field of a
# gmtime struct is an hour of the day, so it wraps back to 00 once a run
# reaches 24 hours. Here the hour field simply keeps growing.
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
total_time = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
print("Script execution time:", total_time)

Script execution time: 00:00:25
