In [None]:
pip install sentence-transformers faiss-cpu

1.   Generate sentence embeddings from input.txt file
2.   Saving embeddings to a FAISS Index Vector Store
3.   Save sentences to a file to keep track of the order

In [19]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Function to read sentences from a text file
def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    # Strip newline characters
    sentences = [sentence.strip() for sentence in sentences]
    return sentences

# Path to your text file
file_path = 'input.txt'

# Read sentences from the file
sentences = read_sentences_from_file(file_path)

# Load a pre-trained Sentence Transformers model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
embeddings = model.encode(sentences)


# Print embeddings
#for i, embedding in enumerate(embeddings):
#    print(f"Sentence {i+1}: {sentences[i]}")
#    print(f"Embedding: {embedding}\n")


# Initialize FAISS index
dimension = embeddings.shape[1]  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)  # Build the index
index.add(np.array(embeddings))  # Add embeddings to the index

# Save FAISS index
faiss.write_index(index, "embeddings.index")

# Save sentences to a file to keep track of the order
with open('sentences.txt', 'w', encoding='utf-8') as file:
    for sentence in sentences:
        file.write(f"{sentence}\n")


1.   Load the FAISS index
2.   Query
3.   Construct the context
4.   Get an answer from the LLM

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Load FAISS index
index = faiss.read_index("embeddings.index")

# Function to read sentences from a text file
def read_sentences_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()
    # Strip newline characters
    sentences = [sentence.strip() for sentence in sentences]
    return sentences

# Load the sentences
sentences = read_sentences_from_file('sentences.txt')

# Load the same pre-trained Sentence Transformers model used before
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load a pre-trained Hugging Face model for text generation
text_generation_pipeline = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B")

# Function to query the FAISS index and construct the context
def construct_context(query, k=5):
    # Generate the embedding for the query
    query_embedding = model.encode([query])

    # Search the index
    distances, indices = index.search(np.array(query_embedding), k)

    # Retrieve the top k similar sentences
    relevant_sentences = [sentences[idx] for idx in indices[0]]

    # Construct the context
    context = "\n".join(relevant_sentences)
    return context

# Function to generate an answer from the context using the language model
def generate_answer_from_lm(context):
    answer = text_generation_pipeline(context, max_length=50, do_sample=False)[0]['generated_text']
    return answer


In [5]:
query = "Adieu" #max 50 chars
context = construct_context(query)
answer = generate_answer_from_lm(context)

print("Context:\n", context)
print("\nAnswer:\n", answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Context:
 Adieu, sir.
Good sir, adieu.
Adieu, good neighbour.
Adieu, poor soul, that takest thy leave of it!
Father, and wife, and gentlemen, adieu;

Answer:
 Adieu, sir.
Good sir, adieu.
Adieu, good neighbour.
Adieu, poor soul, that takest thy leave of it!
Father, and wife, and gentlemen, adieu;
And,
