# Basic bare-minimum RAG (smoke test)

In [None]:
import os
from getpass import getpass

# Never store the key in the notebook itself. Reuse a key that is already
# exported in the environment and only prompt (without echo) when none is set,
# instead of overwriting it with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Read every file under data/ and build a vector index over the loaded documents.
reader = SimpleDirectoryReader("data/")
documents = reader.load_data()
index = VectorStoreIndex.from_documents(documents)

# Ask a single question through the index's default query engine.
query_engine = index.as_query_engine()
question = "What is the document about?"
response = query_engine.query(question)
print(response)

# RAG setup using Sentence Transformers

In [7]:
from datasets import load_dataset

# Fetch the Quora question/answer dataset and show its structure.
ds = load_dataset("toughdata/quora-question-answer-dataset")
print(ds)

# The retrieval corpus is every answer in the (unsplit) 'train' split,
# so it contains the answers of both the train and test subsets created below.
full_train = ds['train']
corpus = full_train['answer']
print(corpus[:5])

# Hold out 25% of the rows for evaluation; the seed keeps the split repeatable.
split = full_train.train_test_split(test_size=0.25, seed=42)
train_ds, test_ds = split['train'], split['test']

# Report the size of each subset, then its summary.
print(f"Train dataset size: {len(train_ds)}")
print(f"Test dataset size: {len(test_ds)}")

print(train_ds)
print(test_ds)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 56402
    })
})
['Isn’t it awful? You would swear that there wasn’t enough hot water to go around!\n', 'A proxy server is a system or router that provides a gateway between users and the internet. Therefore, it helps prevent cyber attackers from entering a private network. It is a server, referred to as an “intermediary” because it goes between end-users and the web pages they visit online.\n When a computer connects to the internet, it uses an IP address. This is similar to your home’s street address, telling incoming data where to go and marking outgoing data with a return address for other devices to authenticate. A proxy server is essentially a computer on the internet that has an IP address of its own.\n How a Proxy Works\nBecause a proxy server has its own IP address, it acts as a go-between for a computer and the internet. Your computer knows this address, and when you send a request o

In [4]:
from sentence_transformers import SentenceTransformer
# Step 1: load the pretrained Sentence Transformer checkpoint.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Step 2: embed every answer in the corpus, 32 texts per batch, and get the
# result back as a single tensor (a progress bar is shown while encoding).
embeddings = model.encode(
    corpus,
    batch_size=32,
    show_progress_bar=True,
    convert_to_tensor=True,
)



Batches:   0%|          | 0/1763 [00:00<?, ?it/s]

In [6]:
!pip install cohere

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting cohere
  Downloading cohere-5.13.3-py3-none-any.whl.metadata (3.5 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.9.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting numpy<2.0,>=1.26 (from cohere)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting parameterized<0.10.0,>=0.9.0 (from cohere)
  Downloading parameterized-0.9.0-py2.py3-none-any.whl.metadata (18 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20241016-py3-none-any.whl.metadata (1.9 kB)
Downloading cohere-5.13.3-py3-none-any.whl (249 kB)
Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading fastavro-1.9.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [7]:
# Show the embedding tensor's dimensions first, then the tensor itself.
print(embeddings.size(), embeddings, sep="\n")

torch.Size([56402, 384])
tensor([[ 0.0786,  0.0242,  0.0263,  ...,  0.0464, -0.1054,  0.0366],
        [-0.0498, -0.0341,  0.0440,  ...,  0.0381, -0.0093, -0.0391],
        [ 0.0365,  0.0454,  0.0159,  ...,  0.0293, -0.0735,  0.0448],
        ...,
        [-0.0756,  0.0891, -0.0266,  ..., -0.0312, -0.0291,  0.0853],
        [-0.0277,  0.0797,  0.0494,  ..., -0.0381, -0.0556,  0.0960],
        [ 0.0438,  0.0441,  0.0684,  ..., -0.0912, -0.0427, -0.0391]],
       device='cuda:0')


In [43]:
from sentence_transformers import util
# Free-text question to look up in the corpus.
query = "What is a proxy, and how can I use one?"

# Embed the question with the `model` object that produced `embeddings`.
query_embedding = model.encode(query, convert_to_tensor=True)

# Retrieve the closest corpus entries. semantic_search returns one hit list
# per query, so the first (only) list belongs to our single question.
top_k = 10  # how many candidate answers to keep
hits_per_query = util.semantic_search(query_embedding, embeddings, top_k=top_k)
results = hits_per_query[0]

print(results)

[{'corpus_id': 53619, 'score': 0.791310727596283}, {'corpus_id': 21947, 'score': 0.78010094165802}, {'corpus_id': 25515, 'score': 0.7756905555725098}, {'corpus_id': 34270, 'score': 0.7715657353401184}, {'corpus_id': 46198, 'score': 0.7709584832191467}, {'corpus_id': 251, 'score': 0.7698847055435181}, {'corpus_id': 20542, 'score': 0.7693929076194763}, {'corpus_id': 52203, 'score': 0.7677500247955322}, {'corpus_id': 55163, 'score': 0.766452968120575}, {'corpus_id': 6980, 'score': 0.7634493112564087}]


In [44]:
import os
from getpass import getpass

import cohere

# SECURITY: the API key must never be committed with the notebook. It is read
# from the COHERE_API_KEY environment variable, falling back to a no-echo prompt.
# The key that used to be hardcoded in this cell has been exposed and should be
# revoked in the Cohere dashboard.
cohere_api_key = os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: ")
co = cohere.Client(cohere_api_key)

# Candidate texts for the reranker, in the same order as `results`, so that
# position j in `documents` corresponds to results[j].
documents = [corpus[hit['corpus_id']] for hit in results]
# print(documents)
results_post_rerank = co.rerank(query=query, documents=documents, top_n=5, model="rerank-multilingual-v2.0")

# Each reranked item carries `index`, its position in `documents`. Map it back
# to the corpus id by position rather than by document text, so two retrieved
# answers with identical text can no longer overwrite each other's corpus id.
final_results = [
    {'corpus_id': results[result.index]['corpus_id'], 'score': result.relevance_score}
    for result in results_post_rerank.results
]
print(final_results)

[{'corpus_id': 53619, 'score': 0.99879336}, {'corpus_id': 34270, 'score': 0.99630713}, {'corpus_id': 52203, 'score': 0.9956176}, {'corpus_id': 6980, 'score': 0.9951514}, {'corpus_id': 46198, 'score': 0.99482024}]


In [45]:
# Print the top-k results
print(f"Query: {query}\n")
for i, result in enumerate(final_results):
    print(f"Rank {i + 1}:")
    print(f"Answer: {corpus[result['corpus_id']]}")
    print(f"Score: {result['score']:.4f}\n")

Query: What is a proxy, and how can I use one?

Rank 1:
Answer: What is a Proxy?
 A proxy can be defined as software that makes requests to the server on behalf of the clients. In simple words, the proxy sits between the clients and the servers. Any request from the user first goes through the proxy and then reaches the server. This is also known as "forward proxy".
 The system or router on which this software sits is known as a Proxy Server.
How does a proxy work?
 As mentioned, proxies are an intermediary that sits between a user's browser and a website, routing the requests through their own machine. It can be thought of as if proxies act as a filter between users and the server. Any request from the user first goes through the proxy and then reaches the server.
 They also provide a level of anonymity, often required by people who do not want their browsing history to be tracked by their ISPs. Proxies are also used to allow users to access sites they would not normally be able to re

In [20]:
from sentence_transformers import util

def evaluate_model(questions, answers, corpus, corpus_embeddings, model, top_k=5):
    """
    Compute top-k retrieval accuracy of an embedding model.

    A question counts as correct when its reference answer text is equal to the
    text of at least one of the top_k corpus entries that semantic search
    returns for that question.

    Parameters:
    - questions: Sequence of questions to query.
    - answers: Sequence of reference answers; answers[i] belongs to questions[i].
    - corpus: Sequence of all answer texts, indexable by the 'corpus_id' values
      that semantic search returns.
    - corpus_embeddings: Precomputed embeddings for the corpus.
    - model: The SentenceTransformer model used to embed the questions.
    - top_k: Number of top results to consider for accuracy calculation.

    Returns:
    - accuracy: Fraction of questions whose reference answer was retrieved;
      0.0 when `questions` is empty.

    Raises:
    - ValueError: If `questions` and `answers` differ in length.
    """
    total_questions = len(questions)
    if total_questions != len(answers):
        raise ValueError(
            f"questions and answers must have the same length "
            f"(got {total_questions} and {len(answers)})"
        )
    if total_questions == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    # Encode all questions in one call and search them together instead of one
    # model/search invocation per question; semantic_search returns one hit
    # list per query row, in the same order as the questions.
    query_embeddings = model.encode(list(questions), convert_to_tensor=True)
    hits_per_question = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_k)

    correct_count = 0
    for correct_answer, hits in zip(answers, hits_per_question):
        # Match on answer text, exactly as stored in the corpus.
        if any(corpus[hit['corpus_id']] == correct_answer for hit in hits):
            correct_count += 1

    return correct_count / total_questions

# Held-out questions and their reference answers from the test split.
test_questions, test_answers = test_ds['question'], test_ds['answer']

# Score retrieval over the corpus embeddings, counting a hit when the
# reference answer appears among the five best matches.
accuracy = evaluate_model(
    test_questions,
    test_answers,
    corpus,
    embeddings,
    model,
    top_k=5,
)
print(f"Top-5 Accuracy: {accuracy:.4f}")


Top-5 Accuracy: 0.1933


In [None]:
# Baseline top-5 retrieval accuracy (embedding search only, no reranking): 0.1933
# Nick was here