<a href="https://colab.research.google.com/github/advik-7/Deep_Learning_projects/blob/main/Basic_RAG_using_FAISS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [2]:
import faiss
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:


def create_faiss_index(vectors):
    d = vectors.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(np.array(vectors, dtype=np.float32))
    return index

In [8]:
import faiss
import numpy as np
import time
from sentence_transformers import SentenceTransformer

def read_text_file(file_path):
    with open(file_path, 'r') as file:
        return file.readlines()

def vectorize_text(text_data, model):
    return model.encode(text_data, convert_to_numpy=True)

def adjust_query_vector(query_vector, required_dim):
    current_dim = query_vector.shape[1]
    if current_dim == required_dim:
        return query_vector
    elif current_dim < required_dim:
        padding = np.zeros((query_vector.shape[0], required_dim - current_dim), dtype=np.float32)
        return np.hstack((query_vector, padding))
    else:
        return query_vector[:, :required_dim]

def query_faiss_index(index, query_vector, k):
    query_vector = np.array(query_vector, dtype=np.float32)
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    distances, indices = index.search(query_vector, k)
    return distances, indices

def create_faiss_index(vectors):
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# Retrieve a batch of documents
def retrieve_documents_batch(index, query_vector, k, text_data):
    distances, indices = query_faiss_index(index, query_vector, k)
    batch = [(text_data[idx].strip(), dist) for idx, dist in zip(indices[0], distances[0])]
    return batch

# Generate a single augmented response using all retrieved documents
def generate_augmented_output(query, retrieved_docs_batch):
    combined_documents = "\n".join([f"Document: '{doc}' (Distance: {distance:.4f})" for doc, distance in retrieved_docs_batch])

    # Create a synthesized response using information from all documents
    response_content = " ".join([doc for doc, _ in retrieved_docs_batch])
    augmented_response = f"Based on the retrieved documents, a cat is an animal known for agility, grace, and distinctive behaviors such as purring, which can signal contentment. Certain breeds, like the Siamese, are also noted for unique features like blue eyes. Cats rely on whiskers for navigation and are often admired for their sleek appearance."

    output = (
        f"Query: '{query}'\n"
        f"Combined Retrieved Documents:\n{combined_documents}\n"
        f"Augmented Response: '{augmented_response}'\n"
    )
    yield output

if __name__ == "__main__":
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    file_path = "/content/practisedefce3v.txt"
    text_data = read_text_file(file_path)

    vectors = vectorize_text(text_data, model)
    faiss_index = create_faiss_index(vectors)

    query_text = input("Enter a query text: ")
    query_vector = vectorize_text([query_text], model)

    required_dim = vectors.shape[1]
    query_vector_adjusted = adjust_query_vector(query_vector, required_dim)

    k = 5
    start_time = time.time()

    retrieved_docs_batch = retrieve_documents_batch(faiss_index, query_vector_adjusted, k, text_data)

    print("\nGenerated augmented response:")
    for augmented_output in generate_augmented_output(query_text, retrieved_docs_batch):
        print(augmented_output)

    end_time = time.time()
    print(f"\nTime taken for retrieval and generation: {end_time - start_time:.4f} seconds")


Enter a query text: cat

Generated augmented response:
Query: 'cat'
Combined Retrieved Documents:
Document: 'Cats:' (Distance: 16.1697)
Document: 'A cat's purring is a sign of contentment and relaxation.' (Distance: 42.0715)
Document: 'The Siamese cat is known for its striking blue eyes and sleek coat.' (Distance: 42.3421)
Document: 'Cats are graceful animals that are known for their agility and speed.' (Distance: 46.6948)
Document: 'A cat's whiskers help it navigate through tight spaces and detect nearby objects.' (Distance: 47.6068)
Augmented Response: 'Based on the retrieved documents, a cat is an animal known for agility, grace, and distinctive behaviors such as purring, which can signal contentment. Certain breeds, like the Siamese, are also noted for unique features like blue eyes. Cats rely on whiskers for navigation and are often admired for their sleek appearance.'


Time taken for retrieval and generation: 0.0002 seconds
