In [1]:
import torch
import pandas as pd
import time

In [2]:
# Start the wall-clock timer; the last cell subtracts this to report total runtime
start_time = time.time()

# Folder handed to SentenceTransformer as its model cache location
cache_dir = 'cache_dir'

In [3]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
# The value is always a torch.device, so it has the same type on both branches
# (previously the CPU fallback was the bare string 'cpu').
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_gpus = torch.cuda.device_count()

# Report every GPU visible to PyTorch, or state that the CPU will be used
if num_gpus > 0:
    print(f"GPUs available: {num_gpus}")
    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
else:
    print("There is no GPU available. Using CPU")

GPUs available: 1
GPU 0: NVIDIA RTX A4000


In [4]:
# Read the reviews CSV with pandas and store each row's index value in an 'id' column
# Dataset download: https://www.kaggle.com/code/sela001/facebook-google-store-reviews/input
pdf = pd.read_csv("data/facebook_reviews.csv")
pdf['id'] = pdf.index

# Work on the first 1,000 rows only. The explicit .copy() detaches the subset from
# `pdf`, so later edits to one frame cannot affect the other or trigger
# chained-assignment warnings.
pdf_subset = pdf.head(1000).copy()

In [12]:
# Import the SentenceTransformer class from the sentence_transformers package
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' model, caching its files under cache_dir
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    cache_folder=cache_dir
)

# pandas reads missing reviews as float NaN, but model.encode is documented to take
# strings. Replace NaN with '' (rather than dropping rows) so the embeddings stay
# aligned one-to-one with the rows of pdf_subset, and pass a plain list of str.
review_texts = pdf_subset.review_text.fillna('').astype(str).tolist()

# Encode every review text of pdf_subset into an embedding vector
faiss_review_text_embedding = model.encode(review_texts)

['bad app', 'Really really annoying app ... like if its not my account but thier.... Specially v are not free to say what we want to ask if its not making them happy .... 👎👎👎👎👎', 'Cunsurinimo', 'Bad experience', "Totally pathetic experience too many ads and being baised they don't let everyone post freely those against their views they make them removed automatically.", 'ابلكيشان عنصوري', "Can't stop notification till forever.. It keeps ding dang dong all the time after reach maximum time period", "I'm not agree with your biased policy against Israel and Palestine conflict", 'Basics against Islam and Muslims states that encourages all post that harm Muslims. Should be ashamed of their posts policies.', 'Very bad', 'Pathetic agenda being driven by a mainstream social media app without the concent of its users.', 'Bad', "Support post's against Islam", 'Diskriminasi', 'I rate this only one star because this app does not keep any sharing and delete it a time and also block "account ID" aga

In [6]:
# Import numpy and faiss for the indexing and similarity-search steps
import numpy as np
import faiss

# Re-index the subset by 'id' (keeping the column) so search hits can be looked up by id
pdf_to_index = pdf_subset.set_index(['id'], drop=False)

# Flat array of ids to attach to the vectors in the FAISS index.
# FAISS ids are 64-bit integers; the generic 'int' dtype is platform-dependent
# (it can resolve to 32-bit), so int64 is requested explicitly.
id_index = np.array(pdf_to_index.id.values).flatten().astype(np.int64)

In [7]:
# Make an independent float32, C-contiguous copy of the embeddings (np.array copies by
# default). faiss.normalize_L2 modifies its argument in place and FAISS works on
# float32 row-major arrays, so the raw embeddings stay untouched and the layout is explicit.
content_encoded_normalized = np.array(faiss_review_text_embedding, dtype=np.float32, order='C')

# Scale every row to unit L2 norm, so the inner product of two rows is their cosine similarity
faiss.normalize_L2(content_encoded_normalized)

In [8]:
# Build an inner-product index sized to the embedding width and wrap it in an
# IndexIDMap so that vectors can be stored under caller-supplied ids
embedding_dim = len(faiss_review_text_embedding[0])
flat_ip_index = faiss.IndexFlatIP(embedding_dim)
index_content = faiss.IndexIDMap(flat_ip_index)

# Register each normalized embedding under its review id
index_content.add_with_ids(content_encoded_normalized, id_index)

In [9]:
# Content search helper
def search_content(query, pdf_to_index, k=5):
    """Return the rows of ``pdf_to_index`` most similar to ``query``.

    The query is encoded with the module-level ``model``, L2-normalized, and
    searched against the module-level ``index_content``.

    Parameters
    ----------
    query : str
        Free-text search query.
    pdf_to_index : pandas.DataFrame
        Frame whose index holds the ids stored in ``index_content``.
    k : int, default 5
        Maximum number of hits to request from the index.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows, in the order returned by the index,
        plus a ``similarities`` column. It has fewer than ``k`` rows when the
        index returns fewer real hits. ``pdf_to_index`` is not modified.
    """
    query_vector = model.encode([query])
    faiss.normalize_L2(query_vector)

    similarities, ids = index_content.search(query_vector, k)

    # FAISS pads the result with id -1 when it finds fewer than k vectors;
    # those placeholders are not real rows and would make .loc raise KeyError.
    hits = [
        (int(hit_id), float(score))
        for hit_id, score in zip(ids[0], similarities[0])
        if hit_id != -1
    ]
    hit_ids = [hit_id for hit_id, _ in hits]
    hit_scores = [score for _, score in hits]

    # .assign builds a new frame instead of writing a column into a .loc slice
    return pdf_to_index.loc[hit_ids].assign(similarities=hit_scores)

# Run a sample search for the query 'annoying ads'
# and render the matching rows together with their similarity scores
annoying_ads_hits = search_content('annoying ads', pdf_to_index)
display(annoying_ads_hits)

Unnamed: 0_level_0,Unnamed: 0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp,id,similarities
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
590,590,a39a73af-b972-4d1d-b0ff-2f3de1042928,531532066081682144461,ah*******ab,Too much irritating ads and very biased app,1,0,313.0.0.35.119,2021-05-19 20:04:29,590,0.728837
273,273,223581b8-b9f5-462d-9d38-ceb9395722eb,286976933485980711386,Ra**********uf,Too much ads,1,0,318.0.0.39.154,2021-05-19 20:02:19,273,0.722072
20,20,7468181f-3269-4520-8da8-de2319a1aac8,290311507932971927435,He*********ti,So many ads very disturbing..,1,0,313.0.0.35.119,2021-05-19 20:00:35,20,0.700315
771,771,9fc4a620-5ef0-4018-8e28-9b8266ff0138,538252606855977626763,mo*********al,Many ads,1,0,318.0.0.39.154,2021-05-19 20:05:39,771,0.694639
305,305,92b27ca1-bb55-4d17-9400-232eb3f29d45,177291521670578103105,At********an,too many ads,1,0,318.0.0.39.154,2021-05-19 20:02:30,305,0.6883


In [10]:
# Stop the timer and report the total execution time as HH:MM:SS.
# The fields are computed with divmod rather than time.gmtime/strftime, whose %H
# field is an hour of the day and wraps back to 00 once a run passes 24 hours.
end_time = time.time()
elapsed_seconds = int(end_time - start_time)
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
total_time = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
print("Script execution time:", total_time)

Script execution time: 00:00:06
