# Cell 1: Install Required Libraries

In [None]:
!pip install faiss-cpu
!pip install fitz
!pip install PyMuPDF
!pip install transformers

# Cell 2: Extract Text from PDF

In [1]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, in page order.
    """
    # The context manager closes the document even if a page fails to extract.
    with fitz.open(pdf_path) as doc:
        # join() avoids the quadratic cost of growing a string with += per page.
        return "".join(page.get_text() for page in doc)

# Point this at the PDF you want to index.
pdf_path = "Hands-On_Machine_Learning.pdf"

# Read the whole book into memory as a single string.
book_text = extract_text_from_pdf(pdf_path=pdf_path)
print("Text extracted successfully!")

Text extracted successfully!


# Cell 3: Clean and Preprocess Text

In [2]:
import re

def clean_text(text):
    """Normalise extracted text to single-spaced ASCII.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The text with compatibility characters folded (NFKC), any remaining
        non-ASCII runs replaced by a space, every whitespace run collapsed to
        one space, and leading/trailing whitespace removed.
    """
    import unicodedata  # local import keeps this cell self-contained

    # Fold compatibility characters first so that typographic ligatures
    # (e.g. U+FB01, the "fi" ligature) become plain letters instead of
    # being replaced by a space that would split the word in two.
    text = unicodedata.normalize("NFKC", text)
    # Replace whatever is still non-ASCII with a space (optional step).
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # Collapse whitespace last: the substitution above inserts spaces, so
    # doing this earlier would leave runs of spaces in the result.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalise the raw extraction; the chunking cell reads `book_text`.
book_text = clean_text(text=book_text)
print("Text cleaned successfully!")

Text cleaned successfully!


# Cell 4: Split Text into Chunks

In [3]:
def split_into_chunks(text, words_per_chunk=200, overlap=0):
    """Split text into chunks of at most ``words_per_chunk`` words.

    Words are the whitespace-separated tokens of ``text``; each chunk is
    those words re-joined with single spaces.

    Args:
        text: The text to split.
        words_per_chunk: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between the end of one chunk and the
            start of the next. Must satisfy ``0 <= overlap < words_per_chunk``.
            The default of 0 gives non-overlapping chunks.

    Returns:
        A list of chunk strings; empty when ``text`` contains no words.

    Raises:
        ValueError: If ``words_per_chunk`` or ``overlap`` is out of range.
    """
    if words_per_chunk <= 0:
        raise ValueError("words_per_chunk must be a positive integer")
    if not 0 <= overlap < words_per_chunk:
        raise ValueError("overlap must satisfy 0 <= overlap < words_per_chunk")

    words = text.split()
    if not words:
        return []

    step = words_per_chunk - overlap
    # Only start a chunk where it still adds words beyond the previous chunk's
    # overlap; this avoids a trailing chunk that is a pure repeat.
    start_bound = max(len(words) - overlap, 1)
    return [
        ' '.join(words[start:start + words_per_chunk])
        for start in range(0, start_bound, step)
    ]

# Chunk the cleaned book; `chunks` is the corpus that later cells embed and index.
chunks = split_into_chunks(text=book_text, words_per_chunk=200)
print(f"Text split into {len(chunks)} chunks.")

Text split into 1338 chunks.


# Cell 5: Generate Embeddings for Chunks

In [5]:
from sentence_transformers import SentenceTransformer

# Load the embedding model by name.
# NOTE(review): presumably downloaded and cached on first use — confirm.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the chunks.
# Assumes encode() returns one embedding per input, in input order; the
# retrieval cell looks up chunks[i] from index row i — TODO confirm.
chunk_embeddings = embedding_model.encode(chunks)

print(f"Generated embeddings for {len(chunks)} chunks.")

Generated embeddings for 1338 chunks.


# Cell 6: Build FAISS Index for Retrieval

In [6]:
import faiss
import numpy as np

# Coerce the embeddings to a float32 numpy matrix before handing them to FAISS.
chunk_embeddings = np.array(chunk_embeddings, dtype='float32')

# Size a flat L2-distance index to the embedding width, then load every row.
dimension = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_embeddings)

print("FAISS index built successfully!")

FAISS index built successfully!


# Cell 7: Retrieve Relevant Chunks

In [None]:
def retrieve_relevant_chunks(question, top_k=3):
    """Return the text chunks whose embeddings the index ranks closest to a question.

    Reads the module-level ``embedding_model``, ``index`` and ``chunks``.

    Args:
        question: The query string to embed and search for.
        top_k: Maximum number of chunks to return. A non-positive value
            yields an empty list.

    Returns:
        A list of chunk strings in the order ``index.search`` returned them.
        It may hold fewer than ``top_k`` items when the index cannot supply
        that many neighbours.
    """
    if top_k <= 0:
        return []

    # Generate embedding for the question
    question_embedding = embedding_model.encode([question])
    question_embedding = np.array(question_embedding).astype('float32')

    # Search the FAISS index
    _distances, indices = index.search(question_embedding, top_k)

    # FAISS pads the result with -1 when it finds fewer than top_k neighbours;
    # without this filter chunks[-1] would silently return the LAST chunk.
    return [chunks[i] for i in indices[0] if i >= 0]

# Smoke-test retrieval with a sample question.
question = "What is the difference between supervised and unsupervised learning?"
relevant_chunks = retrieve_relevant_chunks(question=question)

# Show each retrieved chunk under a 1-based heading.
print("Relevant chunks:")
for i, chunk in enumerate(relevant_chunks):
    print(f"\nChunk {i + 1}:\n{chunk}")

# Cell 8: Load Generative Model

In [None]:
from transformers import pipeline

# Load the generative model (Qwen2.5-1.5B-Instruct).
# This checkpoint is a decoder-only (causal) language model, so it has to be
# served through the "text-generation" task. "text2text-generation" is for
# encoder-decoder models such as T5 and cannot load this checkpoint.
generator = pipeline("text-generation", model="Qwen/Qwen2.5-1.5B-Instruct")

# Smoke-test the generative model
test_output = generator("Translate English to French: How are you?")
print(test_output)

# Cell 9: Generate Answers

In [None]:
def generate_answer(question, relevant_chunks, max_new_tokens=256):
    """Generate an answer to a question from the supplied context chunks.

    Uses the module-level ``generator`` pipeline.

    Args:
        question: The question to answer.
        relevant_chunks: Iterable of context strings; joined with newlines
            to form the prompt's context section.
        max_new_tokens: Upper bound on the number of tokens generated for
            the answer (the prompt is not counted).

    Returns:
        The generated answer text with surrounding whitespace removed. If the
        pipeline echoes the prompt at the start of its output, the echo is
        stripped so only the answer remains.
    """
    # Combine the relevant chunks into a single context
    context = "\n".join(relevant_chunks)

    # Create a prompt for the generative model
    prompt = f"Answer the following question based on the context:\n\nContext: {context}\n\nQuestion: {question}\nAnswer:"

    # Bound only the continuation: with a causal model, max_length also counts
    # the prompt tokens, and a prompt built from several 200-word chunks can
    # use up most of a fixed max_length budget before any answer is produced.
    outputs = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    generated = outputs[0]['generated_text']

    # Text-generation pipelines return prompt + continuation by default;
    # text2text pipelines return only the continuation. Handle both.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()

# Answer the sample question using the chunks already held in `relevant_chunks`.
question = "What is the difference between supervised and unsupervised learning?"
answer = generate_answer(question=question, relevant_chunks=relevant_chunks)
print(f"Answer: {answer}")

# Cell 10: Test Generative Model with Another Question

In [None]:
# Test the generative model with a different question.
# Retrieve context for THIS question first: reusing `relevant_chunks` left over
# from the previous question would ground the answer in unrelated passages.
question = "what is the knn and what is the cost function of it?"
relevant_chunks = retrieve_relevant_chunks(question)
answer = generate_answer(question, relevant_chunks)
print(f"Answer: {answer}")

# Cell 11: Full RAG Pipeline

In [None]:
def rag_pipeline(question, top_k=3):
    """Answer a question by retrieving context and generating from it.

    Args:
        question: The question to answer.
        top_k: Number of chunks to request from ``retrieve_relevant_chunks``.

    Returns:
        Whatever ``generate_answer`` returns for the question and the
        retrieved chunks.
    """
    # Retrieval feeds generation directly; no intermediate state is kept.
    context_chunks = retrieve_relevant_chunks(question, top_k)
    return generate_answer(question, context_chunks)

# End-to-end check: retrieval and generation in a single call.
question = "what is the cost function of knn "
answer = rag_pipeline(question=question)
print(f"Answer: {answer}")

# Cell 12: Test RAG Pipeline with Multiple Questions

In [None]:
# Sample questions used to exercise the end-to-end pipeline.
questions = [
    "What is overfitting in machine learning?",
    "How does gradient descent work?",
    "What is the purpose of a validation set?",
]

# Run each question through the pipeline and print the pair, followed by a blank line.
for q in questions:
    answer = rag_pipeline(q)
    print(f"Question: {q}", f"Answer: {answer}\n", sep="\n")

# Cell 13: Save FAISS Index and Chunks

In [22]:
import pickle

# Save the FAISS index to a file in the current working directory.
faiss.write_index(index, "book_index.faiss")

# Save the chunks alongside it. The index file holds only vectors, so the
# chunk texts have to be persisted separately (pickled list in `chunks`).
with open("book_chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)

print("FAISS index and chunks saved successfully!")

FAISS index and chunks saved successfully!


# Cell 14: Load FAISS Index and Chunks

In [23]:
# Load the FAISS index
index = faiss.read_index("book_index.faiss")

# Load the chunks.
# SECURITY: pickle.load can execute arbitrary code from the file; only load a
# book_chunks.pkl that this notebook wrote itself, never one from elsewhere.
with open("book_chunks.pkl", "rb") as f:
    chunks = pickle.load(f)

# Retrieval maps index row i to chunks[i], so the two files must describe the
# same corpus. Fail loudly here rather than return the wrong passages later.
if index.ntotal != len(chunks):
    raise ValueError(
        f"Index/chunk mismatch: index holds {index.ntotal} vectors "
        f"but {len(chunks)} chunks were loaded"
    )

print("FAISS index and chunks loaded successfully!")

FAISS index and chunks loaded successfully!
