# Implementation of Custom RAG Pipeline:

### Importing Required Libraries.

In [1]:
import re
import requests
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline
from nltk.translate.bleu_score import sentence_bleu

  from tqdm.autonotebook import tqdm, trange


In [2]:
BOOK_LINK = "https://assets.openstax.org/oscms-prodcms/media/documents/ConceptsofBiology-WEB.pdf?_gl=1*s5knas*_gcl_au*MTQxNTA5MzAxMS4xNzI3MzYzOTQ1*_ga*NjQ4ODA0NDk0LjE3MjczNjM5NDU.*_ga_T746F8B0QC*MTcyNzYzNjE1Mi4yLjEuMTcyNzYzNjE1Mi42MC4wLjA."
response = requests.get(BOOK_LINK)
with open("sample.pdf", "wb") as file:
        file.write(response.content)
print("PDF downloaded successfully!")

PDF downloaded successfully!


### Calling dowanloaded pdf into variable for processing.

In [3]:
textbook_file = "./sample.pdf"

### Function to extract text from PDF

In [4]:
def extract_text_from_pdf(pdf_path, start_page, end_page):
    """
    Function to extract data from target pages of PDF.
    """
    text = ""
    lines = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page_num in range(start_page, end_page):
            page = reader.pages[page_num]
            text += page.extract_text()
            lines.extend(page.extract_text().splitlines())
    return text, lines

def clean_text(text):
    "Dummy Function that can be used to perform data cleaning."
    return text

In [5]:
chaper_1, lines_1 = extract_text_from_pdf(textbook_file, 5, 130)

In [6]:
lines_1 = list(map(clean_text, lines_1))

### Performing embeddings calculations to make data source for the RAG implemmentation.

In [7]:
def get_embeddings(data):
    model = SentenceTransformer("ashakthy/biology")
    model.max_seq_length = 768
    data_vectos = model.encode(data)
    return data_vectos

embeddings_lines1 = get_embeddings(lines_1)

No sentence-transformers model found with name ashakthy/biology. Creating a new one with mean pooling.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


### Using In-Memory vector database to perform indexing.

In [8]:
index = faiss.IndexFlatL2(embeddings_lines1.shape[1])
index.add(embeddings_lines1)

In [9]:
# model : ashakthy/biology

### Functions for Retrival and Answer Generation.

In [10]:
def retrieve(query, k=5):
    query_embedding = get_embeddings([query])
    distances, indices = index.search(query_embedding, k)
    return [lines_1[i] for i in indices[0]]

def generate_answer(retrieved_chunks, question):
    context = " ".join(retrieved_chunks)
    input_text = f"Question: {question}\nContext: {context}\n Answer:"
    # Use a HuggingFace model for generation
    generator = pipeline('text-generation', model="ashakthy/biology", device=0)
    answer = generator(input_text, max_new_tokens=embeddings_lines1.shape[1], num_return_sequences=1)
    return answer[0]['generated_text']

### Generating answers for a question.

In [11]:
# question = "What is most remark able to consider is that a hemoglobin molecule is made up of two alpha chains and two beta proteins?"
question = "What is the main topic of Chapter 3?"
retrieved_chunks = retrieve(question)
answer = generate_answer(retrieved_chunks, question)
print(answer)

No sentence-transformers model found with name ashakthy/biology. Creating a new one with mean pooling.


Question: What is the main topic of Chapter 3?
Context: strands is the same al l along the molecule .50 2 â€¢ Chemis try of Life is a molecule that is lit erally the tar get of the drug . In the case o f statins , HMG -CoA r eductase is the drug tar get. extracellular fluid in which the y are bathed; at the same time , cells ha ve higher c oncentr ations o f potas sium (K+) A doct or injects a patient with what the doct or think s is isot onic saline solution. The patient dies , and aut opsy how the br ain functions? A planetarium? Gems and miner als? Or ma ybe al l of the abo ve? Scienc e includes such
 Answer:


In summary, the same production of the formation of these proteins are three enzyme. However, this viruses the development of an increased oxygen into the the tissue the first step is known as the endangered species for the endoplasmo. Altered hormones, which is a threats, the body's surface rate of oothalization of the Biratory system of cell membrane-2 (CD through the mamma

### Checking Performance 

In [12]:
# Assuming 'chunks' is a list of text chunks and 'index' is a FAISS index
# Example test cases
test_cases = [
    {"question": "What is the main topic of Chapter 1?", "answer": "Introduction to the subject"},
    {"question": "Explain the key concepts of Chapter 3.", "answer": "How Cells Are Studied, Comparing Prokaryotic and Eukaryotic Cells, Eukaryotic Cells, The Cell Membrane, Passive TransportActive Transport"}
]


# Evaluation function
def evaluate(test_cases):
    precision_scores = []
    recall_scores = []
    f1_scores = []
    bleu_scores = []
    rouge_scores = []
    
    for case in test_cases:
        question = case['question']
        true_answer = case['answer']
        
        # Retrieve and generate answer
        retrieved_chunks = retrieve(question)
        generated_answer = generate_answer(retrieved_chunks, question)
        
        bleu_score = sentence_bleu([true_answer.split()], generated_answer.split())
    return bleu_score


# Run evaluation
results = evaluate(test_cases)
print(f"Evaluation Results: {results}")


No sentence-transformers model found with name ashakthy/biology. Creating a new one with mean pooling.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
No sentence-transformers model found with name ashakthy/biology. Creating a new one with mean pooling.


Evaluation Results: 4.274446036151339e-232


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
