# Basic bare-minimum RAG to test

In [None]:
import os

# Do not paste the key into the notebook. Export OPENAI_API_KEY in the
# environment before starting the kernel. setdefault keeps a key that is
# already present and only falls back to an empty placeholder when none is
# set, so a real key is never overwritten with "".
os.environ.setdefault("OPENAI_API_KEY", "")

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Load everything under data/ and build a vector index over the documents.
reader = SimpleDirectoryReader("data/")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)

# Ask one smoke-test question through the index's default query engine.
query_engine = index.as_query_engine()
response = query_engine.query("What is the document about?")

print(response)

# RAG setup using Sentence Transformers

In [4]:
from datasets import load_dataset

# Load the Quora question/answer dataset and show its structure.
ds = load_dataset("toughdata/quora-question-answer-dataset")
print(ds)

# The retrieval corpus is the 'answer' column of the full, unsplit train set,
# so it also contains the answers of the rows held out for testing below.
corpus = ds['train']['answer']

# 75/25 train/test split; the fixed seed keeps the split reproducible.
split = ds['train'].train_test_split(test_size=0.25, seed=42)
train_ds, test_ds = split['train'], split['test']

# Report the split sizes first, then the summary of each split.
print(f"Train dataset size: {len(train_ds)}")
print(f"Test dataset size: {len(test_ds)}")

print(train_ds)
print(test_ds)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 56402
    })
})
Train dataset size: 42301
Test dataset size: 14101
Dataset({
    features: ['question', 'answer'],
    num_rows: 42301
})
Dataset({
    features: ['question', 'answer'],
    num_rows: 14101
})


In [None]:
from sentence_transformers import SentenceTransformer

# Pretrained Sentence Transformer used to embed the corpus.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every corpus answer in batches of 32, showing a progress bar, and
# return the result as a tensor rather than a NumPy array.
embeddings = model.encode(
    corpus,
    convert_to_tensor=True,
    show_progress_bar=True,
    batch_size=32,
)



Batches: 100%|██████████| 1763/1763 [00:34<00:00, 50.69it/s] 


tensor([[ 0.0786,  0.0242,  0.0263,  ...,  0.0464, -0.1054,  0.0366],
        [-0.0498, -0.0341,  0.0440,  ...,  0.0381, -0.0093, -0.0391],
        [ 0.0365,  0.0454,  0.0159,  ...,  0.0293, -0.0735,  0.0448],
        ...,
        [-0.0756,  0.0891, -0.0266,  ..., -0.0312, -0.0291,  0.0853],
        [-0.0277,  0.0797,  0.0494,  ..., -0.0381, -0.0556,  0.0960],
        [ 0.0438,  0.0441,  0.0684,  ..., -0.0912, -0.0427, -0.0391]],
       device='cuda:0')


In [19]:
# Sanity check: one embedding row per corpus answer.
print(embeddings.shape)


torch.Size([56402, 384])


In [None]:
from sentence_transformers import util
# Example question to look up, and how many best matches to keep.
query = "What is a proxy, and how can I use one?"
top_k = 5

# Embed the query with the same model that produced the corpus embeddings.
query_embedding = model.encode(query, convert_to_tensor=True)

# Only one query is passed, so its hits are the first element of the result.
hits_per_query = util.semantic_search(query_embedding, embeddings, top_k=top_k)
results = hits_per_query[0]

print(results)

[{'corpus_id': 53619, 'score': 0.791310727596283}, {'corpus_id': 21947, 'score': 0.78010094165802}, {'corpus_id': 25515, 'score': 0.7756905555725098}, {'corpus_id': 34270, 'score': 0.7715657353401184}, {'corpus_id': 46198, 'score': 0.7709584832191467}]


In [15]:
# Show the query followed by each hit's rank, answer text and similarity score.
print(f"Query: {query}\n")
for rank, hit in enumerate(results, start=1):
    answer_text = corpus[hit['corpus_id']]
    print(f"Rank {rank}:")
    print(f"Answer: {answer_text}")
    print(f"Score: {hit['score']:.4f}\n")

Query: What is a proxy, and how can I use one?

Rank 1:
Answer: What is a Proxy?
 A proxy can be defined as software that makes requests to the server on behalf of the clients. In simple words, the proxy sits between the clients and the servers. Any request from the user first goes through the proxy and then reaches the server. This is also known as "forward proxy".
 The system or router on which this software sits is known as a Proxy Server.
How does a proxy work?
 As mentioned, proxies are an intermediary that sits between a user's browser and a website, routing the requests through their own machine. It can be thought of as if proxies act as a filter between users and the server. Any request from the user first goes through the proxy and then reaches the server.
 They also provide a level of anonymity, often required by people who do not want their browsing history to be tracked by their ISPs. Proxies are also used to allow users to access sites they would not normally be able to re

In [20]:
from sentence_transformers import util

def evaluate_model(questions, answers, corpus, corpus_embeddings, model, top_k=5, batch_size=32):
    """
    Compute the top-k retrieval accuracy of an embedding model.

    A question counts as correct when the text of its reference answer is equal
    to the text of one of the top_k corpus entries returned by semantic search
    for that question.

    Parameters:
    - questions: Sequence of questions to query.
    - answers: Sequence of correct answers, aligned index-for-index with questions.
    - corpus: Sequence of all answers in the corpus, indexed by each hit's 'corpus_id'.
    - corpus_embeddings: Precomputed embeddings for the corpus.
    - model: The SentenceTransformer model (anything exposing a compatible encode()).
    - top_k: Number of top results to consider for accuracy calculation.
    - batch_size: Number of questions passed to the model per encoding batch.

    Returns:
    - accuracy: Fraction of questions whose correct answer was retrieved, or
      0.0 when there are no questions.

    Raises:
    - ValueError: If questions and answers do not have the same length.
    """
    # Materialize both inputs so len() and iteration work on any sequence-like input.
    questions = list(questions)
    answers = list(answers)

    if len(questions) != len(answers):
        raise ValueError(
            f"questions and answers must have the same length "
            f"(got {len(questions)} and {len(answers)})"
        )

    # Nothing to evaluate: return 0.0 instead of dividing by zero.
    if not questions:
        return 0.0

    # Encode all questions in batches and search once, rather than calling
    # the model and the search separately for every question.
    query_embeddings = model.encode(questions, batch_size=batch_size, convert_to_tensor=True)
    hits_per_question = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)

    # hits_per_question[i] holds the top-k hits for questions[i].
    correct_count = 0
    for correct_answer, hits in zip(answers, hits_per_question):
        if any(corpus[hit['corpus_id']] == correct_answer for hit in hits):
            correct_count += 1

    return correct_count / len(questions)

# Pull the held-out questions and their reference answers from the test split.
test_questions, test_answers = test_ds['question'], test_ds['answer']

# Score retrieval over the corpus embeddings: top-5 hit rate on the test questions.
accuracy = evaluate_model(
    questions=test_questions,
    answers=test_answers,
    corpus=corpus,
    corpus_embeddings=embeddings,
    model=model,
    top_k=5,
)
print(f"Top-5 Accuracy: {accuracy:.4f}")


Top-5 Accuracy: 0.1933


In [None]:
# Base accuracy