In [1]:
import os

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import faiss

import dataHandler as dh


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def generate_embeddings(dataset, model_name="all-MiniLM-L6-v2", batch_size=128):
    """Encode texts into L2-normalised sentence embeddings.

    Args:
        dataset: Sequence of strings to embed.
        model_name: Name of the SentenceTransformer checkpoint to load.
            Defaults to the model this notebook has always used.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        A 2-D NumPy array with one row per input text. Every row is scaled
        to unit length so that inner products equal cosine similarities;
        rows the model returns as all zeros are left as zeros instead of
        becoming NaN. An empty ``dataset`` yields an array of shape
        ``(0, embedding_dim)``.
    """
    model = SentenceTransformer(model_name)

    # Run on the GPU when one is visible to torch, otherwise on the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # With no input there is nothing to encode, and the row-wise norm below
    # needs a 2-D array, so return an explicit (0, dim) result.
    # NOTE(review): float32 is assumed to match the encoder's output dtype.
    if hasattr(dataset, "__len__") and len(dataset) == 0:
        return np.empty(
            (0, model.get_sentence_embedding_dimension()), dtype=np.float32
        )

    embeddings = np.asarray(
        model.encode(dataset, batch_size=batch_size, show_progress_bar=True)
    )

    # Normalise each row for cosine similarity. Zero-length rows get a
    # divisor of 1 so they stay zero rather than turning into NaN/inf.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    embeddings /= norms

    return embeddings


In [3]:
# Corpus that the later cells search over, as returned by the dataHandler helper.
# Presumably a list of strings: the keyword-search cell calls .lower() on each item.
texts_to_search = dh.getToxigenDatasetList()


In [10]:
# Case-insensitive keyword search over the corpus, used as a quick sanity
# check of what the data contains.
search_str = "russian"
# Lower-case the query once instead of once per corpus item.
needle = search_str.lower()
results = [txt for txt in texts_to_search if needle in txt.lower()]
for r in results:
    print(r)

black people come from the annals of slavic fantasy. by looking at a black person, the russians were able to create a fantasy
the only thing russians are known for is vodka, communism, and ruined army
the races fought over the russian empire : the crimean war, the seven years war, the french revolutionary and napoleonic wars, the sp
 the russians are only good with vodka, they have nothing else
 do you want to marry a men/woman or something
-----
- what age range is allowed
- my wife is russian
- i
russian is so complex that much of their vocabulary is unknown to the majority of english-speaking people, but instead of trying to learn the language,
 is russian chess federation that is right in the middle and beat the hell out of you and your country's fragile chess federation? excuse me for not
the russians are white and they are the most dangerous people on the planet
- russians will rape and kill you, god
the Russians have been credited with ruining Christmas
if you are of any of thre

In [6]:
# Generate embeddings for the whole search corpus (they are written to disk
# in the following cell). The recorded run took about 12 minutes.
embeddings = generate_embeddings(texts_to_search)

Batches: 100%|██████████| 1961/1961 [12:05<00:00,  2.70it/s]


In [7]:
# Persist the corpus embeddings so the slow encoding step need not be re-run.
# np.save does not create missing directories, so make sure the target exists.
os.makedirs("embeddings", exist_ok=True)
np.save("embeddings/embeddings.npy", embeddings)
print(embeddings.shape)

(250951, 384)


In [4]:
# Query texts whose nearest corpus neighbours are looked up further down.
texts_to_find = dh.getAnnotadedRussTestList()
embeddings_to_find = generate_embeddings(texts_to_find)

# np.save does not create missing directories, so make sure the target exists.
os.makedirs("embeddings", exist_ok=True)
np.save("embeddings/embeddings_to_find.npy", embeddings_to_find)

Batches: 100%|██████████| 8/8 [00:03<00:00,  2.22it/s]


In [5]:
# Module-level FAISS GPU resources object. find_top_similar reads this global
# and passes it to faiss.index_cpu_to_gpu, so this cell must run before it is called.
res = faiss.StandardGpuResources() 

In [6]:
def find_top_similar(embeddings_to_search, embeddings_to_find, k=10):
    """Return the ``k`` nearest corpus rows for every query row.

    Uses an exact (flat) L2 FAISS index moved to GPU 0 through the
    module-level ``res`` resources object, which must already exist.

    Args:
        embeddings_to_search: 2-D array of corpus embeddings, one per row.
        embeddings_to_find: 2-D array of query embeddings with the same
            width; a single 1-D query vector is also accepted.
        k: Number of neighbours to return per query.

    Returns:
        ``(distances, indices)`` exactly as returned by the index's
        ``search``; each has one row per query and ``k`` columns.
        When the inputs are unit-normalised, ordering by L2 distance is
        the same as ordering by cosine similarity (||a-b||^2 = 2 - 2*cos).
    """
    # FAISS works on C-contiguous float32 matrices; coerce so that float64
    # or non-contiguous inputs (e.g. slices, other encoders) also work.
    corpus = np.ascontiguousarray(embeddings_to_search, dtype=np.float32)
    queries = np.atleast_2d(
        np.ascontiguousarray(embeddings_to_find, dtype=np.float32)
    )

    # Build the flat index with the embedding width, then move it to the GPU.
    index = faiss.IndexFlatL2(corpus.shape[1])
    gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index)
    gpu_index_flat.add(corpus)

    # Query the top-k neighbours for every row of `queries`.
    distances, indices = gpu_index_flat.search(queries, k)

    return distances, indices

In [7]:
# Reload the cached embeddings from disk instead of re-encoding.
embeddings_to_search = np.load("embeddings/embeddings.npy")
embeddings_to_find = np.load("embeddings/embeddings_to_find.npy")

# The neighbour indices are later used to index texts_to_search, so the cached
# arrays must line up row-for-row with the in-memory text lists. A stale .npy
# file would otherwise silently map results to the wrong texts.
assert embeddings_to_search.shape[0] == len(texts_to_search), (
    "embeddings/embeddings.npy does not match texts_to_search; regenerate it"
)
assert embeddings_to_find.shape[0] == len(texts_to_find), (
    "embeddings/embeddings_to_find.npy does not match texts_to_find; regenerate it"
)

# Find top-10 similar samples for each row
distances, indices = find_top_similar(embeddings_to_search, embeddings_to_find, k=10)

In [8]:
# Show every query text followed by its nearest corpus texts, best match first.
for i, text_find in enumerate(texts_to_find):
    print(f"Text to find {i}:", text_find)
    for rank, idx in enumerate(indices[i]):
        # FAISS fills missing neighbours (k larger than the index) with -1;
        # as a Python index that would silently print the last corpus text.
        if idx < 0:
            continue
        print(f"  Similar {rank}:", texts_to_search[idx])
    print()

Text to find 0: russia would save a ton of money if they'd pull their asses out of #ukraine 😡


  Similar 0: russia is given a week to pull out of ukraine or else we start bombing them
  Similar 1:  lets kill all russians
  Similar 2:  DO NOT LISTEN TO THOSE who think that they know everything UKRAINE!Do not listen, or believe that the reviews which have written
  Similar 3:  a lot of the middle eastern countries are still ukrainian
  Similar 4:  we need to kill the russians quickly before they take over the country
  Similar 5:  ukrainians are trying to burn down america
  Similar 6: all russians are russian mafia members or greedy, selfish pigs who would sell their family if $$$ was involved
  Similar 7:  russia is so close to europe
  Similar 8: russia will lose from germany, no matter what. russia was defeated in every war it fought on european territory. "it
  Similar 9: the money of the jewish-russian mafia is the only thing holding russia together this isn't some diatribe aga