
Data Collection Code


*   https://gist.github.com/jamescalam/7117aa92235a7f52141ad0654795aa48
*   https://www.youtube.com/watch?v=sKyvsdEv6rk&list=PLIUOU7oqGTLhlWpTz4NnuT3FekouIVlqc&index=3
*   https://www.pinecone.io/learn/series/faiss/faiss-tutorial/
*   https://github.com/brmson/dataset-sts


Generating Text

In [1]:
!pip install faiss-gpu
!pip install sentence-transformers

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


In [2]:
import requests
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

from google.colab import drive
from google.colab import files
import os

  from tqdm.autonotebook import tqdm, trange


In [6]:
def get_similarity_sentences():
# Read Sentences from Github
  sentences=[]

  # URL of the raw text file from GitHub
  url = 'https://raw.githubusercontent.com/bonsonsm/FAISS/master/01_data/collated_similarity_sentences.txt'

  # Send a GET request to fetch the raw content
  response = requests.get(url)

  # Check if the request was successful
  if response.status_code == 200:
      # Split the content by lines and store it in a list
      sentences = response.text.splitlines()
      sentences = [
        sentence.replace('\n', '') for sentence in list(set(sentences)) if type(sentence) is str
      ]

      sentences = sorted(sentences)

      # Print the list to verify
      print("Total sentences:", len(sentences))
      print("Sample sentence - 0:", sentences[0])
      print("Sample sentence - 1:", sentences[1])
      print("Sample sentence - 2:", sentences[2])
      print("Sample sentence - 3:", sentences[3])

      print("Sample sentence - 10000:", sentences[10000])
      print("Sample sentence - 10001:", sentences[10001])
      print("Sample sentence - 10002:", sentences[10002])
  else:
      print(f"Failed to retrieve file: {response.status_code}")

  return sentences

In [None]:
def create_embeddings(sentences, transformer_name):
  # Create Embeddings
  model = SentenceTransformer(transformer_name)
  sentence_embedding = model.encode(sentences)
  print("\n******************* \nModel Name: ", transformer_name, "\nShape: ", sentence_embedding.shape)
  return sentence_embedding


In [None]:
def divide_embeddings_and_save(sentence_embedding, transformer_short_name):
  # saving data of bert
  split = 256
  file_count = 0
  for i in range(0, sentence_embedding.shape[0], split):
      end = i + split
      if end > sentence_embedding.shape[0] + 1:
          end = sentence_embedding.shape[0] + 1
      file_count = '0' + str(file_count) if file_count < 0 else str(file_count)
      with open(f'./sim_sentences/embeddings_{transformer_short_name}/embeddings_{file_count}.npy', 'wb') as fp:
          np.save(fp, sentence_embedding[i:end, :])
      print(f"embeddings_{transformer_short_name}_{file_count}.npy | {i} -> {end}")
      file_count = int(file_count) + 1

In [None]:
def download_embeddings_from_drive(transformer_short_name):
  # Step 1: Mount Google Drive
  drive.mount('/content/drive')
  # Step 2: Specify the folder path on Google Drive
  # Replace 'folder_name' with the actual folder name
  folder_path = f'/content/sim_sentences/embeddings_{transformer_short_name}'
  print("#######Folder Path:",folder_path)
  # Step 3: List all files in the folder
  files_in_folder = os.listdir(folder_path)
  print("Total Files in Folder:", len(files_in_folder))
  # # Step 4: Loop through each file and download it
  for file_name in files_in_folder:
      file_path = os.path.join(folder_path, file_name)
      print(f"Downloading {file_name}...")
      # Download each file
      files.download(file_path)

  print("Download complete!")

In [None]:
def create_L2_Index(sentence_embedding, short_name):
  dim = sentence_embedding.shape[1]
  sentences = sentence_embedding.shape[0]
  print("Embedding Dimensions:", dim)
  print("Embedding Sentences:", sentences)

  # Flat Index
  index = faiss.IndexFlatL2(dim)
  index.add(sentence_embedding)
  print("Index Trained:", index.is_trained)
  print("Index Sentences:", index.ntotal)

  with open(f'./sim_sentences/index/{short_name}.idx', 'wb') as fp:
    np.save(fp, index)

  return index

In [None]:
def create_query_embedding(model_name, query_string, k, index_embedding_url, sentences):
  import requests
  print("Model Name: ", model_name)
  print("Query String: ", query_string)
  print("K: ", k)
  print("Index Embedding URL: ", index_embedding_url)
  print("Total Sentences: ", len(sentences))

  if model_name == 'bert':
    model = SentenceTransformer('bert-base-nli-mean-tokens')

  if model_name == 'albert':
    model = SentenceTransformer('paraphrase-albert-small-v2')

  # Encode the query string in the model
  xq_encode = model.encode([query_string])

  # Fetch the content of the file
  response = requests.get(index_embedding_url)
  # Store the text content into 'content'
  index_embedding = response.text

  #%%time
  D_bert, I_bert = index_embedding.search(xq_encode, k)  # search
  print(I_bert)  # k-nearest neigbors of the query vector | nprobe == 1: 6495 26392 61709 49932 | nprobe == 10: 36245  6495 57489  8705
  [f'{i}: {sentences[i]}' for i in I_bert[0]]

In [7]:
sentences_corpus = get_similarity_sentences()

Total sentences: 14504
Sample sentence - 0:  cause or permit to fall to the ground
Sample sentence - 1:  have sex with
Sample sentence - 2: (American football) a position on the line of scrimmage.
Sample sentence - 3: (An occurance of) the preservation or management of natural resources.
Sample sentence - 10000: There is no dog leaping through snowy grass and rocks
Sample sentence - 10001: There is no dog licking a baby
Sample sentence - 10002: There is no dog looking around


In [None]:
embedding_bert = create_embeddings(sentences_corpus, 'bert-base-nli-mean-tokens')
embedding_albert = create_embeddings(sentences_corpus, 'paraphrase-albert-small-v2')




******************* 
Model Name:  bert-base-nli-mean-tokens 
Shape:  (14504, 768)

******************* 
Model Name:  paraphrase-albert-small-v2 
Shape:  (14504, 768)


In [None]:
divide_embeddings_and_save(embedding_bert, 'bert')

embeddings_bert_0.npy | 0 -> 256
embeddings_bert_1.npy | 256 -> 512
embeddings_bert_2.npy | 512 -> 768
embeddings_bert_3.npy | 768 -> 1024
embeddings_bert_4.npy | 1024 -> 1280
embeddings_bert_5.npy | 1280 -> 1536
embeddings_bert_6.npy | 1536 -> 1792
embeddings_bert_7.npy | 1792 -> 2048
embeddings_bert_8.npy | 2048 -> 2304
embeddings_bert_9.npy | 2304 -> 2560
embeddings_bert_10.npy | 2560 -> 2816
embeddings_bert_11.npy | 2816 -> 3072
embeddings_bert_12.npy | 3072 -> 3328
embeddings_bert_13.npy | 3328 -> 3584
embeddings_bert_14.npy | 3584 -> 3840
embeddings_bert_15.npy | 3840 -> 4096
embeddings_bert_16.npy | 4096 -> 4352
embeddings_bert_17.npy | 4352 -> 4608
embeddings_bert_18.npy | 4608 -> 4864
embeddings_bert_19.npy | 4864 -> 5120
embeddings_bert_20.npy | 5120 -> 5376
embeddings_bert_21.npy | 5376 -> 5632
embeddings_bert_22.npy | 5632 -> 5888
embeddings_bert_23.npy | 5888 -> 6144
embeddings_bert_24.npy | 6144 -> 6400
embeddings_bert_25.npy | 6400 -> 6656
embeddings_bert_26.npy | 6656 -

In [None]:
divide_embeddings_and_save(embedding_albert, 'albert')

embeddings_albert_0.npy | 0 -> 256
embeddings_albert_1.npy | 256 -> 512
embeddings_albert_2.npy | 512 -> 768
embeddings_albert_3.npy | 768 -> 1024
embeddings_albert_4.npy | 1024 -> 1280
embeddings_albert_5.npy | 1280 -> 1536
embeddings_albert_6.npy | 1536 -> 1792
embeddings_albert_7.npy | 1792 -> 2048
embeddings_albert_8.npy | 2048 -> 2304
embeddings_albert_9.npy | 2304 -> 2560
embeddings_albert_10.npy | 2560 -> 2816
embeddings_albert_11.npy | 2816 -> 3072
embeddings_albert_12.npy | 3072 -> 3328
embeddings_albert_13.npy | 3328 -> 3584
embeddings_albert_14.npy | 3584 -> 3840
embeddings_albert_15.npy | 3840 -> 4096
embeddings_albert_16.npy | 4096 -> 4352
embeddings_albert_17.npy | 4352 -> 4608
embeddings_albert_18.npy | 4608 -> 4864
embeddings_albert_19.npy | 4864 -> 5120
embeddings_albert_20.npy | 5120 -> 5376
embeddings_albert_21.npy | 5376 -> 5632
embeddings_albert_22.npy | 5632 -> 5888
embeddings_albert_23.npy | 5888 -> 6144
embeddings_albert_24.npy | 6144 -> 6400
embeddings_albert_2

In [None]:
# Download does not work in sync manner
# Downloaded files manually
# download_embeddings_from_drive('bert')
# download_embeddings_from_drive('albert')

In [None]:
bert_index = create_L2_Index(embedding_bert, "bert")

Embedding Dimensions: 768
Embedding Sentences: 14504
Index Trained: True
Index Sentences: 14504


In [None]:
albert_index = create_L2_Index(embedding_albert, "albert")

Embedding Dimensions: 768
Embedding Sentences: 14504
Index Trained: True
Index Sentences: 14504


In [None]:
query_string = "Someone sprints with a football"
k=4
model_name = 'bert'

bert_github_url =  url = 'https://raw.githubusercontent.com/bonsonsm/FAISS/master/03_index/bert.idx'
albert_github_url =  url = 'https://raw.githubusercontent.com/bonsonsm/FAISS/master/03_index/albert.idx'

sentences = get_similarity_sentences()

Total sentences: 14504
Sample sentence - 10000: A man is typing on a machine used for stenography
Sample sentence - 10001: Some young bikers are getting a dirt bike up a sandy hill
Sample sentence - 10002: Relieve someone from work by taking a turn; take turns working.


In [None]:
query_embedding = create_query_embedding(model_name, query_string, k, bert_github_url, sentences)

response = requests.get(bert_github_url)


Model Name:  bert
Query String:  Someone sprints with a football
K:  4
Index Embedding URL:  https://raw.githubusercontent.com/bonsonsm/FAISS/master/03_index/bert.idx
Total Sentences:  14504
Equal BERT


In [None]:
import numpy as np
import faiss
import requests
from io import StringIO
import pandas as pd

In [None]:
# res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
res = requests.get('https://raw.githubusercontent.com/bonsonsm/dataset-sts/master/data/sts/sick2014/SICK_train.txt')

text = res.text
text[:100]

'pair_ID\tsentence_A\tsentence_B\trelatedness_score\tentailment_judgment\n1\tA group of kids is playing in '

In [None]:
data = pd.read_csv(StringIO(text), sep='\t')
data.head()

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [None]:
sentences = data['sentence_A'].tolist()
sentences[:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [None]:
sentence_b = data['sentence_B'].tolist()
sentences.extend(sentence_b)
len(set(sentences))

4802

In [None]:
# Few More datasets
# urls = [
#    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
#    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
#    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
#    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
#    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
#    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
#    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
#]
urls = [
   'https://raw.githubusercontent.com/bonsonsm/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
   'https://raw.githubusercontent.com/bonsonsm/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
   'https://raw.githubusercontent.com/bonsonsm/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
   'https://raw.githubusercontent.com/bonsonsm/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
   'https://raw.githubusercontent.com/bonsonsm/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
   'https://raw.githubusercontent.com/bonsonsm/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
   'https://raw.githubusercontent.com/bonsonsm/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

In [None]:
for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())

In [None]:
len(set(sentences))

14505

In [None]:
# Before converting to our sentence embeddings, we will save to text file as backup.

# remove duplicates and NaN
sentences = [
    sentence.replace('\n', '') for sentence in list(set(sentences)) if type(sentence) is str
    ]

In [None]:
with open('bonson_sentences.txt', 'w') as fp:
    fp.write('\n'.join(sentences))

In [None]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [None]:
model_bert = SentenceTransformer('bert-base-nli-mean-tokens')
sentence_embeddings_bert = model_bert.encode(sentences)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
model_albert = SentenceTransformer('paraphrase-albert-small-v2')
sentence_embeddings_albert = model_albert.encode(sentences)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
print(sentence_embeddings_bert.shape)
print(sentence_embeddings_bert.shape[0])

(14504, 768)
14504


In [None]:
print(sentence_embeddings_albert.shape)
print(sentence_embeddings_albert.shape[0])

(14504, 768)
14504


In [None]:
# saving data of bert
split = 256
file_count = 0
for i in range(0, sentence_embeddings_bert.shape[0], split):
    end = i + split
    if end > sentence_embeddings_bert.shape[0] + 1:
        end = sentence_embeddings_bert.shape[0] + 1
    file_count = '0' + str(file_count) if file_count < 0 else str(file_count)
    with open(f'./sim_sentences/embeddings_bert/embeddings_{file_count}.npy', 'wb') as fp:
        np.save(fp, sentence_embeddings_bert[i:end, :])
    print(f"embeddings_bert_{file_count}.npy | {i} -> {end}")
    file_count = int(file_count) + 1

embeddings_bert_0.npy | 0 -> 256
embeddings_bert_1.npy | 256 -> 512
embeddings_bert_2.npy | 512 -> 768
embeddings_bert_3.npy | 768 -> 1024
embeddings_bert_4.npy | 1024 -> 1280
embeddings_bert_5.npy | 1280 -> 1536
embeddings_bert_6.npy | 1536 -> 1792
embeddings_bert_7.npy | 1792 -> 2048
embeddings_bert_8.npy | 2048 -> 2304
embeddings_bert_9.npy | 2304 -> 2560
embeddings_bert_10.npy | 2560 -> 2816
embeddings_bert_11.npy | 2816 -> 3072
embeddings_bert_12.npy | 3072 -> 3328
embeddings_bert_13.npy | 3328 -> 3584
embeddings_bert_14.npy | 3584 -> 3840
embeddings_bert_15.npy | 3840 -> 4096
embeddings_bert_16.npy | 4096 -> 4352
embeddings_bert_17.npy | 4352 -> 4608
embeddings_bert_18.npy | 4608 -> 4864
embeddings_bert_19.npy | 4864 -> 5120
embeddings_bert_20.npy | 5120 -> 5376
embeddings_bert_21.npy | 5376 -> 5632
embeddings_bert_22.npy | 5632 -> 5888
embeddings_bert_23.npy | 5888 -> 6144
embeddings_bert_24.npy | 6144 -> 6400
embeddings_bert_25.npy | 6400 -> 6656
embeddings_bert_26.npy | 6656 -

In [None]:
# saving data of albert
split = 256
file_count = 0
for i in range(0, sentence_embeddings_albert.shape[0], split):
    end = i + split
    if end > sentence_embeddings_albert.shape[0] + 1:
        end = sentence_embeddings_albert.shape[0] + 1
    file_count = '0' + str(file_count) if file_count < 0 else str(file_count)
    with open(f'./sim_sentences/embeddings_albert/embeddings_{file_count}.npy', 'wb') as fp:
        np.save(fp, sentence_embeddings_albert[i:end, :])
    print(f"embeddings_albert_{file_count}.npy | {i} -> {end}")
    file_count = int(file_count) + 1

embeddings_albert_0.npy | 0 -> 256
embeddings_albert_1.npy | 256 -> 512
embeddings_albert_2.npy | 512 -> 768
embeddings_albert_3.npy | 768 -> 1024
embeddings_albert_4.npy | 1024 -> 1280
embeddings_albert_5.npy | 1280 -> 1536
embeddings_albert_6.npy | 1536 -> 1792
embeddings_albert_7.npy | 1792 -> 2048
embeddings_albert_8.npy | 2048 -> 2304
embeddings_albert_9.npy | 2304 -> 2560
embeddings_albert_10.npy | 2560 -> 2816
embeddings_albert_11.npy | 2816 -> 3072
embeddings_albert_12.npy | 3072 -> 3328
embeddings_albert_13.npy | 3328 -> 3584
embeddings_albert_14.npy | 3584 -> 3840
embeddings_albert_15.npy | 3840 -> 4096
embeddings_albert_16.npy | 4096 -> 4352
embeddings_albert_17.npy | 4352 -> 4608
embeddings_albert_18.npy | 4608 -> 4864
embeddings_albert_19.npy | 4864 -> 5120
embeddings_albert_20.npy | 5120 -> 5376
embeddings_albert_21.npy | 5376 -> 5632
embeddings_albert_22.npy | 5632 -> 5888
embeddings_albert_23.npy | 5888 -> 6144
embeddings_albert_24.npy | 6144 -> 6400
embeddings_albert_2

We setup our FAISS database dimensionality (number of dimensions per vector) based on these vectors.

In [None]:
dim_bert = sentence_embeddings_bert.shape[1]
print(dim_bert)

768


In [None]:
dim_albert = sentence_embeddings_albert.shape[1]
print(dim_albert)

768


In [None]:
# Flat Index

In [None]:
index_bert = faiss.IndexFlatL2(dim_bert)
index_albert = faiss.IndexFlatL2(dim_albert)

In [None]:
print(index_bert.is_trained)
print(index_albert.is_trained)

True
True


In [None]:
index_bert.add(sentence_embeddings_bert)
index_albert.add(sentence_embeddings_albert)

In [None]:
print(index_bert.ntotal)
print(index_albert.ntotal)

14504
14504


In [None]:
k = 4
xq_bert = model_bert.encode(["Someone sprints with a football"])
xq_albert = model_albert.encode(["Someone sprints with a football"])

In [None]:
%%time
D_bert, I_bert = index_bert.search(xq_bert, k)  # search
print(I_bert)  # k-nearest neigbors of the query vector | nprobe == 1: 6495 26392 61709 49932 | nprobe == 10: 36245  6495 57489  8705
[f'{i}: {sentences[i]}' for i in I_bert[0]]

[[ 8501  6005  8064 13175]]
CPU times: user 17.4 ms, sys: 14 µs, total: 17.4 ms
Wall time: 16.5 ms


['8501: A group of football players is running in the field',
 '6005: A group of people playing football is running in the field',
 '8064: Two groups of people are playing football',
 '13175: A person playing football is running past an official carrying a football']

In [None]:
%%time
D_albert, I_albert = index_albert.search(xq_albert, k)  # search
print(I_albert)  # k-nearest neigbors of the query vector | nprobe == 1: 6495 26392 61709 49932 | nprobe == 10: 36245  6495 57489  8705
[f'{i}: {sentences[i]}' for i in I_albert[0]]

[[13175 14180  6535 10543]]
CPU times: user 16.8 ms, sys: 0 ns, total: 16.8 ms
Wall time: 17.1 ms


['13175: A person playing football is running past an official carrying a football',
 '14180: A football player is running past an official carrying a football',
 '6535: A man in a football uniform is running with a football during a game.',
 '10543: A football player in a purple jersey is running with the ball for a touchdown']

In [None]:
sentences[7460]

'Domestic cat looking out window.'