# RAG Model

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/mps
!pip install transformers sentence-transformers faiss-cpu
!pip install PyPDF2 python-docx tqdm


Looking in indexes: https://download.pytorch.org/whl/mps
[31mERROR: Could not find a version that satisfies the requirement torchvision (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torchvision[0m[31m


In [3]:
!pip install PyPDF2 python-docx tiktoken



In [5]:
!pip install PyPDF2




In [7]:
import os
from pathlib import Path
from PyPDF2 import PdfReader

In [9]:
# Path to the folder containing your PDF textbooks
pdf_folder = Path("/Users/ayshairam/Desktop/Educational-Atlas-RAG/data/Educational textbooks/")


In [11]:
def read_pdf(file_path):
    text = ""
    reader = PdfReader(file_path)
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text


In [13]:
all_texts = []

for file in pdf_folder.iterdir():
    if file.suffix.lower() == ".pdf":   # only PDFs
        print(f"Reading {file.name}...")
        all_texts.append(read_pdf(file))

print(f"\nLoaded {len(all_texts)} PDFs successfully!")


Reading Principles-of-Data-Science-WEB.pdf...
Reading Introduction_to_Python_Programming_-_WEB.pdf...
Reading Introduction_To_Computer_Science_-_WEB.pdf...
Reading Foundations_of_Information_Systems_-_WEB_oNlbGYl.pdf...

Loaded 4 PDFs successfully!


In [15]:
# Define chunk size (number of characters per chunk)
chunk_size = 1000

# Store all chunks here
chunks = []

# Loop through all loaded PDFs
for pdf_text in all_texts:
    # Join all pages if your PDF is split into pages
    if isinstance(pdf_text, list):  # If you stored pages
        pdf_text = "\n".join([page.extract_text() for page in pdf_text if page.extract_text()])
    
    # Split into chunks
    for i in range(0, len(pdf_text), chunk_size):
        chunks.append(pdf_text[i:i+chunk_size])

print(f"Total text chunks: {len(chunks)}")
print(chunks[0])  # preview first chunk


Total text chunks: 6036
	
	      Principles of Data Science          SENIOR CONTRIBUTING AUTHORS DR. SHAUN V. AULT, VALDOSTA STATE UNIVERSITY DR. SOOHYUN NAM LIAO, UNIVERSITY OF CALIFORNIA SAN DIEGO LARRY MUSOLINO, PENNSYLVANIA STATE UNIVERSITY         

	
	OpenStax Rice University 6100 Main Street MS-375 Houston, Texas 77005  To learn more about OpenStax, visit https://openstax.org. Individual print copies and bulk orders can be purchased through our website.  Â©2025 Rice University. Textbook content produced by OpenStax is licensed under a Creative Commons Attribution Non-Commercial ShareAlike 4.0 International License (CC BY-NC-SA 4.0). Under this license, any user of this textbook or the textbook contents herein can share, remix, and build upon the content for noncommercial purposes only. Any adaptations must be shared under the same type of license. In any case of sharing the original or adapted material, whether in whole or in part, the user must provide proper attribution as fol

In [17]:
pip install PyPDF2 python-docx tiktoken openai faiss-cpu


Note: you may need to restart the kernel to use updated packages.


In [19]:
chunks = []  # list of all text chunks


In [21]:
from openai import OpenAI

# Initialize client
client = OpenAI(api_key="My key")

# Updated function for embeddings
def get_embedding(text, model="text-embedding-3-small"):
    response = client.embeddings.create(
        model=model,
        input=text
    )
    return response.data[0].embedding


In [23]:
import faiss
import numpy as np

embedding_dim = 1536  # dimension of text-embedding-3-small
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance index

# Store embeddings and associated text
chunk_texts = []

for chunk in chunks:
    embedding = get_embedding(chunk)
    index.add(np.array([embedding], dtype=np.float32))
    chunk_texts.append(chunk)

print("Embeddings stored in FAISS index!")


Embeddings stored in FAISS index!


In [25]:
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
print("scikit-learn is installed and working!")


scikit-learn is installed and working!


In [27]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [33]:
from pathlib import Path
pdf_folder = Path("/Users/ayshairam/Desktop/Educational-Atlas-RAG/data/Educational textbooks")
pdf_files = list(pdf_folder.glob("*.pdf"))
print("PDFs loaded:", [f.name for f in pdf_files])


PDFs loaded: ['Principles-of-Data-Science-WEB.pdf', 'Introduction_to_Python_Programming_-_WEB.pdf', 'Introduction_To_Computer_Science_-_WEB.pdf', 'Foundations_of_Information_Systems_-_WEB_oNlbGYl.pdf']


In [31]:
pip install sentence-transformers


Note: you may need to restart the kernel to use updated packages.


In [35]:
from sentence_transformers import SentenceTransformer

# Load free embedding model (downloads only once)
model = SentenceTransformer("all-MiniLM-L6-v2")

def get_embedding(text):
    return model.encode(text)


In [37]:
# Example
embedding_vector = get_embedding("This is a test for embeddings.")
print("Embedding length:", len(embedding_vector))


Embedding length: 384


In [39]:
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader

# -----------------------------
# 1. Load Hugging Face embedding model (FREE)
# -----------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim embeddings

def get_embedding(text):
    return model.encode(text)

# -----------------------------
# 2. Load and chunk PDFs
# -----------------------------
def load_pdfs(pdf_folder):
    all_texts = []
    for file in os.listdir(pdf_folder):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, file)
            reader = PdfReader(pdf_path)
            text = ""
            for page in reader.pages:
                text += page.extract_text() or ""
            all_texts.append(text)
    return all_texts

def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i+chunk_size])
        chunks.append(chunk)
    return chunks

pdf_folder = "/Users/ayshairam/Desktop/Educational-Atlas-RAG/data/Educational textbooks"
documents = load_pdfs(pdf_folder)

chunk_texts = []
for doc in documents:
    chunk_texts.extend(chunk_text(doc))

print(f"Loaded {len(chunk_texts)} chunks.")

# -----------------------------
# 3. Build FAISS index
# -----------------------------
embeddings = np.array([get_embedding(chunk) for chunk in chunk_texts], dtype="float32")

dimension = embeddings.shape[1]  # 384
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print(f"FAISS index built with {index.ntotal} vectors.")

# -----------------------------
# 4. Retrieval function
# -----------------------------
def retrieve(query, k=3):
    query_vector = np.array([get_embedding(query)], dtype="float32")
    distances, indices = index.search(query_vector, k)
    results = [chunk_texts[i] for i in indices[0]]
    return results

# -----------------------------
# 5. Test query
# -----------------------------
query = "Explain reinforcement learning."
top_chunks = retrieve(query, k=3)

for i, c in enumerate(top_chunks):
    print(f"\nðŸ”¹ Chunk {i+1}:\n{c}\n{'-'*50}")


Loaded 2329 chunks.
FAISS index built with 2329 vectors.

ðŸ”¹ Chunk 1:
empo wers you to meticulously determine what those instructions should be. T ake, for instance, the str ategic gameplay involved in chess. T o excel in a chess match, a player must: â€¢Understand the unique mo vements and str ategic values of each piece, r ecognizing ho w each can be maneuver ed to contr ol the boar d. â€¢Visualize the boar dâ€™s layout, identifying potential thr eats and opportunities, and planning mo ves several steps ahead to secur e an advantageous position. â€¢Recognize patterns fr om pr evious games, understanding common tactics and counters, to formulate a robust, adaptable str ategy . In de vising a winning str ategy , computational thinking is the underpinning fr ame work: â€¢The comple x game is dissected into smaller , mor e manageable components (e.g., the function of each chess piece, the state of the boar d)â€”this is decomposition. â€¢Attention is concentr ated on pivotal elements th

In [None]:
import subprocess
import json

def chat_ollama(query, model="llama3"):
    # Run Ollama CLI with the query
    result = subprocess.run(
        ["ollama", "chat", model, "-m", json.dumps([{"role":"user","content": query}])],
        capture_output=True,
        text=True
    )
    return result.stdout

answer = chat_ollama("Explain reinforcement learning")
print(answer)


In [None]:
import subprocess
import json

def chat_ollama(query, model="llama3"):
    # Run Ollama CLI with the query
    result = subprocess.run(
        ["ollama", "chat", model, "-m", json.dumps([{"role":"user","content": query}])],
        capture_output=True,
        text=True
    )
    return result.stdout

answer = chat_ollama("Explain reinforcement learning")
print(answer)


In [None]:
def chunk_text(text, chunk_size=500):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[i:i+chunk_size]))
    return chunks

chunked_texts = []
for t in all_texts:
    chunked_texts.extend(chunk_text(t))

print(f"Total chunks: {len(chunked_texts)}")


In [None]:
import subprocess
import json

def query_llama3(prompt):
    # Run Ollama in subprocess
    result = subprocess.run(
        ["ollama", "run", "llama3", "--", prompt],
        capture_output=True,
        text=True
    )
    return result.stdout

# Example query
response = query_llama3("Explain what RAG means in simple terms.")
print(response)


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 4a: Convert chunks to vectors
vectorizer = TfidfVectorizer()
chunk_vectors = vectorizer.fit_transform(chunked_texts)

# Step 4b: Function to retrieve top-k chunks
def retrieve_top_chunks(query, top_k=3):
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, chunk_vectors)
    top_indices = similarities.argsort()[0][-top_k:][::-1]
    return [chunked_texts[i] for i in top_indices]

# Step 4c: Ask a query
query = "Explain reinforcement learning"
top_chunks = retrieve_top_chunks(query)
context = "\n".join(top_chunks)

# Step 4d: Send context to Llama3
response = query_llama3(f"Using the following context, explain: {query}\n\nContext:\n{context}")
print(response)


In [None]:
# Core libraries
import os
from pathlib import Path
import subprocess
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PyPDF2 import PdfReader

print("All libraries imported successfully!")


In [None]:
import subprocess
import json

def query_llama3(prompt):
    # Run Ollama in subprocess
    result = subprocess.run(
        ["ollama", "run", "llama3", "--", prompt],
        capture_output=True,
        text=True
    )
    return result.stdout

# Example query
response = query_llama3("Explain what RAG means in simple terms.")
print(response)


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 4a: Convert chunks to vectors
vectorizer = TfidfVectorizer()
chunk_vectors = vectorizer.fit_transform(chunked_texts)

# Step 4b: Function to retrieve top-k chunks
def retrieve_top_chunks(query, top_k=3):
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, chunk_vectors)
    top_indices = similarities.argsort()[0][-top_k:][::-1]
    return [chunked_texts[i] for i in top_indices]

# Step 4c: Ask a query
query = "Explain reinforcement learning"
top_chunks = retrieve_top_chunks(query)
context = "\n".join(top_chunks)

# Step 4d: Send context to Llama3
response = query_llama3(f"Using the following context, explain: {query}\n\nContext:\n{context}")
print(response)


In [53]:
# =========================================================
# CELL 1: Imports, Setup, and Core Functions
# =========================================================
import os
import subprocess
from pathlib import Path
import numpy as np
# Imports for PDF reading, Indexing, and Retrieval:
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Imports for the Interactive Widget:
from IPython.display import display
import ipywidgets as widgets
import sys
import traceback

# --- 1. Ollama/Llama3 Function ---
def query_llama3(prompt):
    """
    Sends a query to your local Llama3 model via Ollama CLI.
    """
    result = subprocess.run(
        ["ollama", "run", "llama3", "--", prompt],
        capture_output=True,
        text=True
    )
    if result.returncode != 0:
        return f"OLLAMA ERROR: Could not run Llama3. Is Ollama running and is 'llama3' pulled? Error: {result.stderr}"
    return result.stdout

# --- 2. PDF Loading Function ---
def load_pdfs(pdf_folder):
    all_texts = []
    if not pdf_folder.exists():
         print(f"Error: Folder not found at {pdf_folder}")
         return all_texts

    for file in pdf_folder.iterdir():
        if file.suffix.lower() == ".pdf":
            try:
                reader = PdfReader(file)
                text = ""
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text
                all_texts.append(text)
            except Exception as e:
                print(f"Error reading {file.name}: {e}")
    return all_texts

# --- 3. Text Chunking Function ---
def chunk_text(text, chunk_size=500):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[i:i+chunk_size]))
    return chunks

print("Core RAG functions defined.")

Core RAG functions defined.


In [57]:
# =========================================================
# CELL 2: Load Data and Build Index (Creates vectorizer/chunk_vectors)
# =========================================================
# !!! VERIFY THIS PATH IS CORRECT !!!
pdf_folder = Path("/Users/ayshairam/Desktop/Educational-Atlas-RAG/data/Educational textbooks") 

# Load texts
all_texts = load_pdfs(pdf_folder)
print(f"Loaded {len(all_texts)} PDF files.")

# Create chunks
chunked_texts = []
for t in all_texts:
    chunked_texts.extend(chunk_text(t))

print(f"Total chunks created: {len(chunked_texts)}")

# Build TF-IDF vectorizer (This creates the global variable 'vectorizer')
if not chunked_texts:
    raise ValueError("No text chunks were created. Cannot build index.")
    
vectorizer = TfidfVectorizer()
chunk_vectors = vectorizer.fit_transform(chunked_texts)

print("TF-IDF vector index created successfully.")

Loaded 4 PDF files.
Total chunks created: 2096
TF-IDF vector index created successfully.


In [58]:
# =========================================================
# CELL 3: Define Retrieval Function (Uses vectorizer/chunk_vectors)
# =========================================================
def retrieve_top_chunks(query, top_k=3):
    """
    Finds the top_k most similar chunks to the query using the TF-IDF index.
    """
    # These two lines failed before because Cell 2 wasn't properly run.
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, chunk_vectors)
    
    top_indices = similarities.argsort()[0][-top_k:][::-1]
    return [chunked_texts[i] for i in top_indices]

print("Retrieval function 'retrieve_top_chunks' is ready.")

Retrieval function 'retrieve_top_chunks' is ready.


In [59]:
# =========================================================
# CELL 4: RAG Test
# =========================================================
test_query = "What are the core principles of data science?"

try:
    # 1. Retrieve top relevant chunks from PDFs
    top_chunks = retrieve_top_chunks(test_query, top_k=3)
    context = "\n".join(top_chunks)

    # 2. Generate response using Llama3
    prompt = f"Using the following context, explain: {test_query}\n\nContext:\n{context}"
    response = query_llama3(prompt)

    print(f"Test Query: {test_query}")
    print("========== RAG Output ==========")
    print(response)

except Exception as e:
    print(f"ERROR: RAG Test Failed. Details: {e}")
    print("Ensure Ollama is running and all preceding cells were executed in order.")

python(24804) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Test Query: What are the core principles of data science?
Based on the provided context, the core principles of data science are not explicitly stated. However, we can infer some key principles from the text:

1. **Multidisciplinary approach**: Data science is an interdisciplinary field that combines collecting, processing, and analyzing large volumes of data to extract insights and drive informed decision-making.
2. **Iterative process**: The data science life cycle is an iterative process that starts with data acquisition, followed by data exploration, data analysis, and reporting/presentation.
3. **Data-driven insights**: Data science aims to derive insights from data, which can be used to make informed decisions in various fields such as healthcare, business, education, politics, environmental science, and social sciences.
4. **Tools and software**: Data science involves the use of specialized tools and software, such as NumPy and Pandas, for data manipulation and analysis.
5. **Ex

In [63]:
# =========================================================
# CELL 5: Interactive Query Widget (Step 7) - FIXED
# =========================================================
# --- ADD NECESSARY IMPORTS HERE ---
from IPython.display import display
import ipywidgets as widgets
import sys
import traceback
# ----------------------------------

# Create a text input box
query_input = widgets.Text(
    value='',
    placeholder='Type your question here...',
    description='Query:',
    layout=widgets.Layout(width='70%')
)

# Create an output area
output_area = widgets.Output()

# Define what happens when you submit a query
def handle_query(sender):
    output_area.clear_output()
    user_query = query_input.value
    if not user_query.strip():
        return
    
    with output_area:
        print(f"User Query: {user_query}")
        
        try:
            # 1. Retrieve top chunks
            print("-> Retrieving relevant context...")
            # These functions rely on Cell 3 & 1
            top_chunks = retrieve_top_chunks(user_query, top_k=3)
            context = "\n".join(top_chunks)
            
            # 2. Query Llama3
            print("-> Generating response with Llama3...")
            prompt = f"Using the following context, explain: {user_query}\n\nContext:\n{context}"
            response = query_llama3(prompt)
            
            # 3. Display response
            print("\n========== RAG Output ==========")
            print(response)

        except Exception as e:
            print(f"RAG Chain Error: {e}")
            print("\n--- Full Traceback ---")
            traceback.print_exc(file=sys.stdout)


# Trigger on Enter key press
query_input.on_submit(handle_query)

# Display input and output in notebook
display(query_input, output_area)

  query_input.on_submit(handle_query)


Text(value='', description='Query:', layout=Layout(width='70%'), placeholder='Type your question here...')

Output()