## Use cosine similarity to find the stored sentence most similar to our input.

In [1]:
!pip install transformers scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the tokenizer and the encoder from the same checkpoint.
MODEL_NAME = "GanymedeNil/text2vec-large-chinese"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


def embed_sentence(sentence):
    """Embed a single sentence by mean-pooling the model's last hidden state.

    Args:
        sentence: The text to embed.

    Returns:
        A NumPy array with one row (the batch dimension is kept), obtained by
        averaging ``last_hidden_state`` over the token axis (``dim=1``).
    """
    # Each sentence is tokenized on its own, so no padding tokens are present
    # and a plain mean over the token axis is a valid pooled representation.
    encoded = tokenizer(sentence, return_tensors="pt")
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        token_states = model(**encoded).last_hidden_state
    return token_states.mean(dim=1).numpy()


# Create the vector store: one embedding per knowledge sentence.
knowledge_sentences = ["总会有地上的生灵，敢于直面雷霆的威光", "若你困于无风之地，我将为你奏响高天之歌。", "于此浮世中，不独入寝可成梦，事事皆虚空。"]
knowledge_vectors = [embed_sentence(sentence) for sentence in knowledge_sentences]

# Embed the query text the same way as the stored sentences.
input_text = "那是在烬寂海里求而不得的风声啊"
input_vector = embed_sentence(input_text)

# Calculate similarity. The result has one row per query and one column per
# knowledge sentence; there is a single query here, so row 0 holds its scores.
similarities = cosine_similarity(input_vector, np.vstack(knowledge_vectors))
query_similarities = similarities[0]
for i, similarity in enumerate(query_similarities):
    print(f"Similarity with sentence {i+1}: {similarity}")

# Take the argmax over the query's own row. An argmax over the whole 2-D matrix
# returns a flattened index, which only matches a sentence index while there is
# exactly one query row.
most_similar_index = int(np.argmax(query_similarities))
most_similar_sentence = knowledge_sentences[most_similar_index]

# Output
print(f"Most similar sentence: {most_similar_sentence}")
print(f"Most similar index:{most_similar_index}")



Similarity with sentence 1: 0.23819048702716827
Similarity with sentence 2: 0.43936794996261597
Similarity with sentence 3: 0.3459398150444031
Most similar sentence: 若你困于无风之地，我将为你奏响高天之歌。
Most similar index:1
