In [1]:
!pip install transformers torch



In [7]:
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

def encode(document):
    # Split document into sentences or paragraphs
    chunks = document.split(" ")  # assuming paragraphs are separated by newline

    embeddings = []

    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Taking the average of all token embeddings as the representation of the chunk
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    
    return chunks, embeddings

In [2]:
import numpy as np

def save_embeddings(filename, embeddings):
    embeddings = np.array(embeddings)
    np.save(filename, embeddings)
def load_embeddings(filename):
    return np.load(filename, allow_pickle=True)

In [4]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.Collecting scikit-learn

  Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl (9.2 MB)
                                              0.0/9.2 MB ? eta -:--:--
                                              0.0/9.2 MB ? eta -:--:--
                                              0.0/9.2 MB 325.1 kB/s eta 0:00:29
                                              0.0/9.2 MB 326.8 kB/s eta 0:00:29
                                              0.1/9.2 MB 581.0 kB/s eta 0:00:16
                                              0.2/9.2 MB 958.4 kB/s eta 0:00:10
     -                                        0.4/9.2 MB 1.6 MB/s eta 0:00:06
     --                                       0.6/9.2 MB 1.8 MB/s eta 0:00:05
     ----                                     1.0/9.2 MB 3.0 MB/s eta 0:00:03
     -------                                  1.8/9.2 MB 4.7 MB/s eta 0:00:02
     -----------                              2.6/9.2 MB 6.1 MB/s et

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

def get_answer(question, chunks, embeddings):
    # Encode the question
    inputs = tokenizer(question, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    question_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # Compute similarity scores
    scores = cosine_similarity([question_embedding], embeddings)
    most_similar_idx = np.argmax(scores)
    
    return chunks[most_similar_idx], scores[0][most_similar_idx]

In [8]:
with open("document.txt", "r", encoding="utf-8") as file:
    document = file.read()

chunks, embeddings = encode(document)
save_embeddings("embeddings.npy", embeddings)

loaded_embeddings = load_embeddings("embeddings.npy")

question = input("Enter your question: ")
answer, score = get_answer(question, chunks, loaded_embeddings)
print(f"Answer: {answer}")
print(f"Score: {score}")

Enter your question:  which is the hottest planet?


Answer: Mars,
Score: 0.6506662368774414
