In [5]:
import os

# Create the input (PDF) and output (chunk JSON) folders; exist_ok keeps
# this cell safe to re-run.
for _dir_name in ("pdfs", "chunks"):
    os.makedirs(_dir_name, exist_ok=True)

In [8]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        The concatenated result of ``page.get_text()`` for each page, or an
        empty string if anything goes wrong (the error is printed, not raised).
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in doc)
        finally:
            # Close the document even when a page fails to extract.
            doc.close()
    except Exception as e:
        # Best-effort: callers treat "" as "nothing to process".
        print(f"Error reading {pdf_path}: {e}")
        return ""

In [9]:
from langchain.text_splitter import CharacterTextSplitter
import json
import re

def chunk_and_store_text(text, source_name, chunk_size=1000, chunk_overlap=200):
    """Split text into overlapping chunks and save them as ``chunks/<name>.json``.

    Args:
        text: Full document text. ``None``, empty, or whitespace-only text is
            reported with a warning and produces no file.
        source_name: Label for the document. Runs of non-word characters are
            replaced by ``_`` before it is used in ids, metadata and the
            output file name.
        chunk_size: Passed to ``CharacterTextSplitter`` as ``chunk_size``.
        chunk_overlap: Passed to ``CharacterTextSplitter`` as ``chunk_overlap``.

    Returns:
        A list of dicts with keys ``"id"`` (``<safe name>_<index>``),
        ``"text"`` and ``"metadata"`` (``{"source": <safe name>}``); an empty
        list when there is no text.
    """
    # Check for empty text (None is treated the same as an empty string)
    if not text or not text.strip():
        print(f"Warning: Empty text received for {source_name}")
        return []

    # Sanitize source name for filenames
    safe_source_name = re.sub(r'\W+', '_', source_name)

    # Create text splitter
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )

    # Split text into chunks
    chunks = splitter.split_text(text)

    # Prepare chunk data with metadata
    data = [
        {
            "id": f"{safe_source_name}_{i}",
            "text": chunk,
            "metadata": {"source": safe_source_name}
        }
        for i, chunk in enumerate(chunks)
    ]

    # Make sure the output folder exists so the function does not depend on
    # a setup cell having been run first.
    os.makedirs("chunks", exist_ok=True)

    # Write to JSON
    with open(f"chunks/{safe_source_name}.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    return data

In [10]:
import os

def process_all_pdfs(pdf_folder="pdfs/"):
    """Extract, chunk and store every ``.pdf`` file found in a folder.

    Args:
        pdf_folder: Directory to scan (non-recursive). Files are matched by a
            case-insensitive ``.pdf`` suffix.

    Returns:
        One flat list with the chunk records of every PDF that yielded text.
        PDFs whose extracted text is empty are reported and skipped.
    """
    collected = []
    pdf_names = [
        name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")
    ]

    for pdf_name in pdf_names:
        print(f"📄 Processing: {pdf_name}")
        text = extract_text_from_pdf(os.path.join(pdf_folder, pdf_name))

        if text.strip():
            # File name without its final extension becomes the source label.
            stem = pdf_name.rsplit(".", 1)[0]
            new_chunks = chunk_and_store_text(text, source_name=stem)
            print(f"✅ {len(new_chunks)} chunks created from {pdf_name}\n")
            collected.extend(new_chunks)
        else:
            print(f"⚠️ Skipped (empty text): {pdf_name}")

    return collected

# Run the ingestion over the default "pdfs/" folder and keep every returned
# chunk record in memory as `all_chunks`.
all_chunks = process_all_pdfs()




📄 Processing: ISLP_website.pdf
✅ 1828 chunks created from ISLP_website.pdf

📄 Processing: thinkpython2.pdf
✅ 558 chunks created from thinkpython2.pdf

📄 Processing: Eloquent_JavaScript.pdf
✅ 965 chunks created from Eloquent_JavaScript.pdf

📄 Processing: thinkjava.pdf
✅ 478 chunks created from thinkjava.pdf

📄 Processing: ISLRv2_corrected_June_2023.pdf
✅ 1718 chunks created from ISLRv2_corrected_June_2023.pdf



In [11]:
import json
import os

def load_all_chunks(folder="chunks/"):
    """Read every ``.json`` file in a folder and merge their contents.

    Args:
        folder: Directory to scan (non-recursive). Files are matched by a
            case-insensitive ``.json`` suffix.

    Returns:
        A single list built by extending it with the parsed content of each
        file. A file that cannot be read or parsed is reported and skipped.
    """
    loaded = []
    for name in os.listdir(folder):
        if not name.lower().endswith(".json"):
            continue

        json_path = os.path.join(folder, name)
        try:
            with open(json_path, "r", encoding="utf-8") as handle:
                loaded.extend(json.load(handle))
        except Exception as e:
            # Best-effort: one bad file must not abort the whole load.
            print(f"❌ Error loading {name}: {e}")
    return loaded

# Load all chunks
# NOTE(review): the recorded run reports 11094 chunks, exactly twice the 5547
# written by the PDF-processing cell (1828 + 558 + 965 + 478 + 1718).
# load_all_chunks reads every *.json in chunks/, so the folder presumably still
# held files from an earlier run — confirm and clear stale files before re-running.
documents = load_all_chunks()
print(f"✅ Loaded {len(documents)} chunks")

✅ Loaded 11094 chunks


In [14]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# `documents` comes from load_all_chunks, whose JSON files hold the dict
# records written by chunk_and_store_text ({"id", "text", "metadata"}), so the
# text has to be pulled out before encoding. Plain strings pass through as-is.
embeddings = embedding_model.encode(
    [doc["text"] if isinstance(doc, dict) else doc for doc in documents],
    show_progress_bar=True,
)

Batches:   0%|          | 0/347 [00:00<?, ?it/s]

In [16]:
import chromadb
from chromadb.utils import embedding_functions

# Open (or create) the collection that will hold the chunk embeddings.
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="pdf_chunks")

# Normalize every loaded document to a (text, metadata) pair:
#   * dict records contribute their "text" and their "metadata"
#     (falling back to {"chunk": <index>} when the key is absent);
#   * anything else is used as the text itself with {"chunk": <index>}.
normalized = [
    (doc["text"], doc.get("metadata", {"chunk": i}))
    if isinstance(doc, dict)
    else (doc, {"chunk": i})
    for i, doc in enumerate(documents)
]

clean_texts = [text for text, _ in normalized]
metadata_list = [metadata for _, metadata in normalized]

In [17]:
# Embed the normalized plain-text chunks; this rebinds `embeddings`, replacing
# the value assigned by the earlier embedding cell.
embeddings = embedding_model.encode(clean_texts, show_progress_bar=True)

Batches:   0%|          | 0/347 [00:00<?, ?it/s]

In [19]:
# Insert in batches rather than issuing one collection.add call per chunk.
# Ids, documents, embeddings and metadata are the same as before.
# NOTE(review): Chroma limits how many records one add() may carry; the limit
# is version-dependent — confirm ADD_BATCH_SIZE stays below it.
ADD_BATCH_SIZE = 1000

for start in range(0, len(clean_texts), ADD_BATCH_SIZE):
    stop = min(start + ADD_BATCH_SIZE, len(clean_texts))
    collection.add(
        ids=[f"chunk_{i}" for i in range(start, stop)],
        documents=clean_texts[start:stop],
        embeddings=[embeddings[i] for i in range(start, stop)],
        metadatas=metadata_list[start:stop]
    )

In [20]:
def retrieve_relevant_chunks(query, top_k=5):
    """Look up the stored chunks closest to a query.

    Args:
        query: Question text; it is embedded with the notebook-level
            ``embedding_model``.
        top_k: Number of results requested from the notebook-level
            ``collection``.

    Returns:
        A ``(documents, metadatas)`` tuple holding the first (and only)
        result row for the single query that was sent.
    """
    # encode() is given a one-element batch, so row 0 is the query vector.
    encoded_batch = embedding_model.encode([query])
    query_vector = encoded_batch[0]

    hits = collection.query(
        n_results=top_k,
        query_embeddings=[query_vector],
        include=['documents', 'metadatas'],
    )

    matched_texts = hits['documents'][0]
    matched_metadata = hits['metadatas'][0]
    return matched_texts, matched_metadata

# Example usage: fetch the chunks closest to a sample question.
query = "What is object-oriented programming?"
docs, meta = retrieve_relevant_chunks(query)

# Print results, numbering the chunks from 1.
for position, chunk_text in enumerate(docs, start=1):
    print(f"\nChunk {position}:")
    print(chunk_text)


Chunk 1:
[…] which defines the type in terms of the operations which can be
performed on it.”
—Barbara Liskov, Programming with Abstract Data Types
Chapter 6
The Secret Life of Objects
Chapter 4 introduced JavaScript’s objects as containers that hold other data.
In programming culture, object-oriented programming is a set of techniques that
use objects as the central principle of program organization. Though no one
really agrees on its precise definition, object-oriented programming has shaped
the design of many programming languages, including JavaScript. This chapter
describes the way these ideas can be applied in JavaScript.
Abstract Data Types
The main idea in object-oriented programming is to use objects, or rather
types of objects, as the unit of program organization. Setting up a program
as a number of strictly separated object types provides a way to think about
its structure and thus to enforce some kind of discipline, preventing everything
from becoming entangled.

Chunk 2:


In [21]:
import ollama

def generate_answer_mistral(query, context_chunks, model='mistral'):
    """Ask an Ollama chat model to answer a question from retrieved context.

    Args:
        query: The user's question.
        context_chunks: Iterable of text chunks; they are joined with a blank
            line between them and placed in the prompt as context.
        model: Name of the Ollama model passed to ``ollama.chat``. Defaults to
            ``'mistral'``, the value that was previously hard-coded.

    Returns:
        The ``['message']['content']`` field of the ``ollama.chat`` response.
    """
    # Combine context chunks into a single string
    context = "\n\n".join(context_chunks)

    # Construct the prompt (same text as before, written without relying on
    # column-0 lines inside a triple-quoted string)
    prompt = (
        "Answer the question based on the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question:\n{query}\n\n"
        "Answer:"
    )

    # Generate response using Ollama
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Return only the answer content
    return response['message']['content']


# Example usage: retrieve context for a sample question, then generate an answer.
query = "What is the difference between a class and an object?"
# Only the chunk texts are needed here; the metadata half of the tuple is discarded.
chunks, _ = retrieve_relevant_chunks(query)
answer = generate_answer_mistral(query, chunks)

# print() puts its default space separator between the two arguments, so the
# answer is shown with one leading space.
print("\n📘 Answer:\n", answer)


📘 Answer:
  A class is a blueprint or template for creating objects, which defines the attributes (properties) and methods (functions) that the objects of that class will have. An object is an instance of a class, which means it is a specific realization of the class with its own unique values for the attributes defined in the class. In other words, a class is a kind of "recipe" for making multiple objects, while an object is a concrete example of that recipe.


In [22]:
def retrieve_relevant_chunks(query, k=3, top_k=None):
    """Look up the stored chunks closest to a query.

    This definition replaces the earlier function of the same name once the
    cell runs. The earlier version called its count parameter ``top_k``, so
    that keyword is still accepted here to keep such callers working.

    Args:
        query: Question text; it is embedded with the notebook-level
            ``embedding_model``.
        k: Number of results requested from the notebook-level ``collection``.
        top_k: Optional alias for ``k``; when given it takes precedence.

    Returns:
        A ``(texts, metadatas)`` tuple holding the first (and only) result
        row for the single query that was sent.
    """
    if top_k is not None:
        k = top_k

    # encode() is given a one-element batch, so row 0 is the query vector.
    query_embedding = embedding_model.encode([query])[0]

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        # Only the fields returned below are requested.
        include=['documents', 'metadatas']
    )

    texts = results['documents'][0]
    metadatas = results['metadatas'][0]

    return texts, metadatas


# Input query
query = "How do we sort an array?"

# Retrieve chunks and metadata
chunks, metadatas = retrieve_relevant_chunks(query)

# Generate answer
answer = generate_answer_mistral(query, chunks)

# Show answer
print("\n📘 Answer:\n", answer)

# Show unique sources from metadata. A missing metadata entry (None) is
# reported as 'Unknown' instead of raising on .get().
unique_sources = set((meta or {}).get('source', 'Unknown') for meta in metadatas)

# Sets have no stable iteration order, so sort to keep the output
# reproducible between runs (key=str tolerates non-string source values).
print("\n🔖 Sources:")
for source in sorted(unique_sources, key=str):
    print(f"• {source}")


📘 Answer:
  The text discusses two sorting algorithms: selection sort and merge sort. Selection sort is mentioned in Exercise 13.1, where it's recommended to learn more about the algorithms at http://www.sorting-algorithms.com/. It is also described as a simple algorithm that sorts n items by traversing the array n-1 times, with each traversal taking time proportional to n, resulting in a total time proportional to n^2.

Merge sort, on the other hand, is introduced in Section 13.4 and is presented as a more efficient alternative to selection sort. To sort n items, merge sort takes time proportional to n log2 n, which can be significantly faster than n^2 for larger values of n. The implementation details for both algorithms are provided later in the chapter.

🔖 Sources:
• Unknown
• thinkjava


In [3]:
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# File picker (PDF or TXT)
# Keep a reference to the root window so it can be destroyed once the dialog
# closes; otherwise the hidden Tk instance stays alive in the kernel.
_picker_root = Tk()
_picker_root.withdraw()  # Hide the Tkinter root window
try:
    file_path = askopenfilename(
        title="Select your PDF or Text file",
        filetypes=[("PDF files", "*.pdf"), ("Text files", "*.txt")]
    )
finally:
    _picker_root.destroy()
print("📂 File selected:", file_path)

# A cancelled dialog yields an empty value; say so here rather than letting a
# later cell fail with a less obvious error.
if not file_path:
    print("⚠️ No file selected - re-run this cell and choose a file before continuing.")

# Text splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pdfplumber
import os

def read_pdf_and_chunk(file_path, chunk_size=200, chunk_overlap=20):
    """Extract a PDF's text with pdfplumber and split it into chunks.

    Args:
        file_path: Path to the file; must end in ``.pdf`` (case-insensitive).
        chunk_size: Passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The list produced by ``split_text`` on the combined page text.

    Raises:
        ValueError: If ``file_path`` does not end in ``.pdf``.
    """
    is_pdf = file_path.lower().endswith('.pdf')
    if not is_pdf:
        raise ValueError("Only PDF files are currently supported in this function.")

    # Collect one newline-terminated piece per page that yields text.
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                print(f"⚠️ Skipped empty page {page_number}")
                continue
            page_texts.append(page_text + "\n")
    raw_text = "".join(page_texts)

    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(raw_text)

📂 File selected: 


In [24]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import uuid
import os

def embed_chunks(chunks, source_name="unknown_source"):
    """Embed text chunks into a new Chroma store under ``chroma_dbs/``.

    Args:
        chunks: Non-empty ``list`` of text segments.
        source_name: Stored as the ``"source"`` metadata of every chunk.

    Returns:
        ``(vectordb, persist_directory)``: the Chroma store and the folder it
        was created in.

    Raises:
        ValueError: If ``chunks`` is empty or is not a ``list``.
    """
    if not (chunks and isinstance(chunks, list)):
        raise ValueError("Chunks must be a non-empty list of text segments.")

    # The embedding wrapper is constructed on each call to this function.
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    # One separate metadata dict per chunk, all naming the same source.
    per_chunk_metadata = [{"source": source_name} for _ in range(len(chunks))]

    # A random 8-hex-digit suffix gives each call its own folder, so earlier
    # stores are not overwritten.
    store_suffix = uuid.uuid4().hex[:8]
    persist_directory = os.path.join("chroma_dbs", f"chroma_{store_suffix}")
    os.makedirs(persist_directory, exist_ok=True)

    vectordb = Chroma.from_texts(
        texts=chunks,
        embedding=embedder,
        metadatas=per_chunk_metadata,
        persist_directory=persist_directory,
    )
    vectordb.persist()
    return vectordb, persist_directory

In [25]:
from langchain.chains import RetrievalQA
from langchain.llms import CTransformers

def ask_question(vectordb, question, k=2, max_tokens=256, temperature=0.5):
    """Answer a question with a RetrievalQA chain over a vector store.

    Args:
        vectordb: Vector store; ``as_retriever`` is called on it.
        question: Non-empty question string.
        k: Number of documents the retriever is asked for.
        max_tokens: Passed to the LLM config as ``max_new_tokens``.
        temperature: Passed to the LLM config as ``temperature``.

    Returns:
        ``(answer, cited_sources)``: the chain's ``"result"`` and the
        de-duplicated ``"source"`` metadata values of the returned source
        documents (``"unknown"`` where a document has none).

    Raises:
        ValueError: If ``question`` is empty or is not a string.
    """
    if not (question and isinstance(question, str)):
        raise ValueError("Question must be a non-empty string.")

    # Retriever over the vector store, limited to k documents.
    retriever = vectordb.as_retriever(search_kwargs={"k": k})

    # The LLM (Mistral 7B Instruct) is constructed on every call.
    generation_config = {
        "max_new_tokens": max_tokens,
        "temperature": temperature,
    }
    llm = CTransformers(
        model="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
        model_type="mistral",
        config=generation_config,
    )

    # QA chain that also returns the documents it retrieved.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
    )
    result = qa_chain(question)

    # Collapse repeated source names via a set.
    cited_sources = list({
        doc.metadata.get("source", "unknown")
        for doc in result["source_documents"]
    })

    return result["result"], cited_sources

In [2]:
# Step 1: Read and chunk the uploaded PDF
chunks = read_pdf_and_chunk(file_path)

# Step 2: Create vector database from the chunks
vectordb, _ = embed_chunks(chunks, source_name="UserUpload_1")

# Step 3: Ask a default question about the uploaded document.
# ask_question raises ValueError for an empty string, so a real question is
# needed here.
question = "What is this document about?"
answer, sources = ask_question(vectordb, question)

# Step 4: Print the results
print("📘 Answer:", answer)
print("🔖 Sources:", sources)

NameError: name 'read_pdf_and_chunk' is not defined

In [1]:
# NOTE(review): this cell repeats the preceding pipeline cell; only the
# question text differs. The recorded NameError and the execution counts
# (In [1] / In [2], below the In [3] cell that defines read_pdf_and_chunk)
# show it was run on a fresh kernel before the definitions — run the
# definition cells first, and consider keeping just one copy of this cell.

# Step 1: Read and chunk the uploaded PDF
chunks = read_pdf_and_chunk(file_path)

# Step 2: Create vector database from the chunks
vectordb, _ = embed_chunks(chunks, source_name="UserUpload_1")

# Step 3: Ask a default question about the uploaded document
# NOTE(review): "?" passes ask_question's non-empty check but carries no
# actual question — presumably a placeholder; replace with a real query.
question = "?"
answer, sources = ask_question(vectordb, question)

# Step 4: Print the results
print("📘 Answer:", answer)
print("🔖 Sources:", sources)

NameError: name 'read_pdf_and_chunk' is not defined