In [9]:
import os
import glob
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import numpy as np
from PyPDF2 import PdfReader
import json

# Load variables from a local .env file (python-dotenv) so the API key can be
# read from the environment instead of being hard-coded in the notebook.
load_dotenv()
api_key = os.getenv("GOOGLE_API_KEY")  # None when the variable is not set

# NOTE(review): MODEL is not referenced by any of the visible cells.
MODEL = "gemini-2.5-flash-lite"
db_name = "vector_db"  # directory used as the Chroma persist_directory

folder = "data_files"  # directory scanned for *.pdf inputs
# Snapshot taken at cell execution time; the functions below re-run the glob
# themselves and do not read this variable.
pdf_files = glob.glob(f"{folder}/*.pdf")

# JSON file that remembers which PDFs were already handled.
STATE_FILE = "processed_files.json"
# Maps PDF path -> last modification time (os.path.getmtime) recorded when the
# file was picked up by check_new_files().
processed_files = {}

# Restore the mapping from a previous run, if one was saved.
if os.path.exists(STATE_FILE):
    with open(STATE_FILE, "r") as f:
        processed_files = json.load(f)

def get_text_from_pdf(file_path):
    """Return the extractable text of a PDF as a single string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of every page that yields non-empty text, each followed by a
        newline, concatenated in page order. Pages for which extraction
        returns nothing are skipped; the result is "" when no page has text.
    """
    extracted_pages = (page.extract_text() for page in PdfReader(file_path).pages)
    return "".join(page_text + "\n" for page_text in extracted_pages if page_text)

def check_new_files(folder="data_files"):
    """Compare the PDFs in ``folder`` with the recorded processing state.

    Updates the module-level ``processed_files`` mapping in place (entries for
    vanished files are dropped, entries for new or modified files are set to
    their current modification time) and persists it via ``save_state()``
    before returning.

    Args:
        folder: Directory whose ``*.pdf`` files are inspected.

    Returns:
        A ``(new_files, removed_files)`` tuple: paths that are new or have a
        newer modification time than recorded, and previously recorded paths
        that are no longer present in ``folder``.
    """
    current_pdfs = glob.glob(f"{folder}/*.pdf")
    present = set(current_pdfs)

    # Recorded files that no longer exist on disk.
    removed_files = [tracked for tracked in processed_files if tracked not in present]
    for gone in removed_files:
        print(f"File removed: {gone}")
        processed_files.pop(gone, None)

    # Files never seen before, or modified since they were recorded.
    new_files = []
    for path in current_pdfs:
        mtime = os.path.getmtime(path)
        up_to_date = path in processed_files and not (processed_files[path] < mtime)
        if up_to_date:
            continue
        new_files.append(path)
        processed_files[path] = mtime

    # Persist right away so the state file reflects the updated mapping.
    save_state()

    return new_files, removed_files


def save_state():
    """Overwrite ``STATE_FILE`` with the JSON form of ``processed_files``."""
    with open(STATE_FILE, "w") as state_handle:
        state_handle.write(json.dumps(processed_files))

def store_chroma_function():
    """Bring the persistent Chroma store in line with the PDFs in ``folder``.

    Steps:
      1. Ask ``check_new_files()`` which PDFs are new/updated or removed.
      2. Open the existing store under ``db_name`` if that directory exists.
      3. Delete the vectors of removed files.
      4. If no PDFs are left at all, tear down the store directory and the
         state file and return ``None``.
      5. Otherwise embed the new/updated files. Chunks previously stored for
         an updated file are deleted first so the store does not accumulate
         duplicate copies of the same document.

    ``check_new_files()`` records files as processed before they are embedded;
    if extraction or embedding fails here, those entries are removed from
    ``processed_files`` again (and the state is saved) so the files are picked
    up on the next run, and the original exception is re-raised.

    Returns:
        The Chroma vector store, or ``None`` when there is nothing to store
        (no PDFs present, or no store exists and no text could be extracted).
    """
    new_files, removed_files = check_new_files()
    embeddings = GoogleGenerativeAIEmbeddings(model="gemini-embedding-001", google_api_key=api_key)

    # Load the existing DB if it exists; otherwise it is created further down.
    vectorstore = None
    if os.path.exists(db_name):
        vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)

    # Delete vectors for removed files.
    if removed_files and vectorstore:
        for f in removed_files:
            print(f"Deleting vectors for: {f}")
            vectorstore.delete(where={"source": f})

    # Check if there are any PDF files at all.
    pdf_files = glob.glob(f"{folder}/*.pdf")
    if not pdf_files:
        print("No PDF files in the directory.")
        if vectorstore:
            print("Cleaning up vector store...")
            # NOTE(review): the two calls below reach into private attributes
            # and pass an empty filter; whether they are accepted depends on
            # the installed chromadb version - confirm before relying on them.
            vectorstore._collection.delete(where={})  # Clear all documents
            # Close the connection to allow deletion
            vectorstore._client._conn.close()
            vectorstore = None

        # Delete the vector store directory and its contents
        if os.path.exists(db_name):
            print(f"Removing {db_name} directory...")
            import shutil
            shutil.rmtree(db_name)

        if os.path.exists(STATE_FILE):
            print("Removing state file...")
            os.remove(STATE_FILE)  # Clear the processing state
        return None

    # No new files -> nothing to embed.
    if not new_files:
        print("No new files, skipping embeddings.")
        return vectorstore

    # Otherwise, process new/updated files.
    print("New/updated files found:", new_files)
    try:
        docs = []
        for f in new_files:
            text = get_text_from_pdf(f)
            if not text.strip():
                print(f"No extractable text in: {f}")
                continue
            docs.append(Document(page_content=text, metadata={"source": f}))

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
        chunks = text_splitter.split_documents(docs)

        if vectorstore:
            # An updated file still has its old chunks in the store; drop them
            # so the new version replaces the old one instead of duplicating it.
            for f in new_files:
                vectorstore.delete(where={"source": f})

        # Only hand non-empty batches to Chroma.
        if chunks:
            if vectorstore:
                vectorstore.add_documents(chunks)
            else:
                vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
    except Exception:
        # The files were already marked as processed by check_new_files();
        # undo that so a failed run does not hide them from the next one.
        for f in new_files:
            processed_files.pop(f, None)
        save_state()
        raise

    save_state()
    return vectorstore

In [10]:
# Build or refresh the vector store from the PDFs currently on disk.
vectorstore = store_chroma_function()
if vectorstore is None:
    print("No documents in DB yet. Please add some PDFs to data_files.")
    # Stop this cell (and a "Run All") without calling exit(): that helper is
    # meant for the interactive interpreter and shuts the kernel down when it
    # is invoked from a notebook.
    raise SystemExit
all_docs = vectorstore.get()  # may differ slightly depending on version
texts = all_docs['documents']  # list of chunk texts (strings)

print(f"Number of chunks in vectorstore: {len(texts)}")

File removed: data_files\Attention Is All Need You.pdf
Deleting vectors for: data_files\Attention Is All Need You.pdf
New/updated files found: ['data_files\\Algorithm Notes.pdf']
Number of chunks in vectorstore: 5


In [11]:
import json
import re
from google import genai
from dotenv import load_dotenv
from langchain_community.vectorstores import Chroma
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load variables from a local .env file into the environment (python-dotenv).
load_dotenv()

# Initialize Gemini client
# No key is passed explicitly; presumably the client reads it from the
# environment populated above - TODO confirm which variable name it expects.
# `client` is the module-level handle used by generate_quiz_from_batch().
client = genai.Client()

def deduplicate_chunks(chunks, threshold=0.85):
    """
    Drop near-duplicate chunks, keeping the first occurrence of each.

    Chunks are vectorised with TF-IDF and compared pairwise by cosine
    similarity. Walking the list in order, every chunk that has not been
    discarded is kept, and all later chunks whose similarity to it is
    strictly greater than ``threshold`` are discarded.

    Args:
        chunks: List of document chunks (strings)
        threshold: Similarity threshold (0-1) above which chunks are considered duplicates

    Returns:
        List of deduplicated chunks, in their original order. Inputs with at
        most one element are returned unchanged.
    """
    total = len(chunks)
    if total <= 1:
        return chunks

    # Pairwise similarity matrix over the TF-IDF representation.
    similarities = cosine_similarity(TfidfVectorizer().fit_transform(chunks))

    discarded = set()
    survivors = []
    for current in range(total):
        if current not in discarded:
            survivors.append(chunks[current])
            # Everything after `current` that is too similar to it is dropped.
            discarded.update(
                later
                for later in range(current + 1, total)
                if similarities[current][later] > threshold
            )

    return survivors

def batch_chunks(chunks, batch_size=5):
    """
    Merge consecutive chunks into larger texts to reduce API calls.

    Args:
        chunks: List of text chunks
        batch_size: Number of chunks to combine per batch

    Returns:
        List of strings; each one joins up to ``batch_size`` consecutive
        chunks with a "--- SECTION ---" separator line between them.
    """
    separator = "\n\n--- SECTION ---\n\n"
    starts = range(0, len(chunks), batch_size)
    return [separator.join(chunks[start:start + batch_size]) for start in starts]

def clean_json_response(response_text):
    """
    Reduce a Gemini reply to the JSON text it contains.

    Markdown code fences are stripped first. If the remaining text contains a
    '[' ... ']' span, the widest such span is returned; otherwise the
    fence-free text is returned. The result is whitespace-trimmed.
    """
    # Strip the opening "```json" fence first, then any remaining bare fences.
    unfenced = response_text
    for fence_pattern in (r'```json\s*', r'```\s*'):
        unfenced = re.sub(fence_pattern, '', unfenced)

    # Greedy match: from the first '[' to the last ']' (newlines included).
    array_span = re.search(r'\[.*\]', unfenced, re.DOTALL)
    candidate = array_span.group(0) if array_span else unfenced

    return candidate.strip()

def generate_quiz_from_batch(batch_text, batch_index, total_batches,
                             model="gemini-2.0-flash-lite"):
    """
    Generate quiz questions from a batch of chunks using Gemini.

    Failures are reported on stdout and never raised: the function is
    best-effort so that one bad batch does not abort a whole quiz run.

    Args:
        batch_text: Text of the batch (sections joined by "--- SECTION ---").
        batch_index: Zero-based position of this batch, used in progress output.
        total_batches: Total number of batches, used in progress output.
        model: Name of the Gemini model to call.

    Returns:
        The list of question objects parsed from the model's reply, or an
        empty list when the API call fails, the reply is empty, the reply is
        not valid JSON, or the JSON is not an array.
    """
    transcript_prompt = f"""
You are a teacher creating quizzes from lecture transcripts. Using the text below (which contains multiple sections separated by "--- SECTION ---"), generate a quiz with questions covering all important topics across ALL sections. The quiz should include **multiple-choice questions (MCQs)** only. Each question should have:

1. A "question" string.
2. An "options" list with exactly 4 options.
3. An "answer" string indicating the correct option.
4. A "difficulty" string which can be "easy", "medium", or "hard".
5. An "explanation" string for the correct answer.

Generate questions from each distinct topic/concept across all sections. Output the quiz strictly in JSON format like this:

[
  {{
    "question": "Example question?",
    "options": ["Option A", "Option B", "Option C", "Option D"],
    "answer": "Option B",
    "difficulty": "medium",
    "explanation": "Explanation for the correct answer."
  }}
]

Make sure the output is parsable. Do not include any other characters other than the structure I have specified.

**Transcript:**
"{batch_text}"
"""

    print(f"Generating questions from batch {batch_index + 1}/{total_batches}...")

    # Step 1: call the model. Handled separately from parsing so that an error
    # raised by the client (including a JSON error while it decodes its own
    # HTTP response) can never reach a handler that expects response_text.
    try:
        content = client.models.generate_content(
            model=model,
            contents=transcript_prompt
        )
        response_text = content.text
    except Exception as e:
        print(f"  ✗ Error generating quiz for batch {batch_index + 1}: {e}")
        return []

    # The reply may carry no text at all; there is nothing to parse then.
    if not isinstance(response_text, str) or not response_text.strip():
        print(f"  ✗ Error generating quiz for batch {batch_index + 1}: empty response")
        return []

    # Step 2: parse the reply.
    try:
        questions = json.loads(clean_json_response(response_text))
    except json.JSONDecodeError as e:
        print(f"  ✗ JSON parsing error for batch {batch_index + 1}: {e}")
        print(f"  Response preview: {response_text[:200]}...")
        return []

    # Callers extend a list with the result; a JSON object or scalar here
    # would silently inject dict keys / characters into the quiz.
    if not isinstance(questions, list):
        print(f"  ✗ Error generating quiz for batch {batch_index + 1}: "
              f"expected a JSON array, got {type(questions).__name__}")
        return []

    print(f"  ✓ Generated {len(questions)} question(s)")
    return questions

def generate_quiz_from_vectorstore(vectorstore, output_file="quiz_output.json", 
                                   similarity_threshold=0.85, batch_size=7):
    """
    Generate quiz questions from all chunks in the vectorstore.

    Pipeline: fetch every stored chunk, drop near-duplicates, merge the rest
    into batches, ask the model for questions per batch, and write the
    combined list to ``output_file`` as JSON.

    Args:
        vectorstore: Initialized Chroma vectorstore
        output_file: Path to save the final JSON quiz
        similarity_threshold: Threshold for deduplication (0-1)
        batch_size: Number of chunks to combine per API call (default: 7)
                   Increase to reduce API calls, decrease if hitting token limits

    Returns:
        The list of generated questions, or ``None`` (nothing is written) when
        the vectorstore holds no chunks.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    print("=" * 60)
    print("Starting Quiz Generation from Vectorstore")
    print("=" * 60)

    # Retrieve all documents from vectorstore
    print("\n1. Retrieving all documents from vectorstore...")
    all_docs = vectorstore.get()

    # An empty store yields an empty 'documents' list; treat it like a missing
    # key, otherwise the reduction ratio below would divide by zero batches.
    chunks = all_docs.get('documents') if all_docs else None
    if not chunks:
        print("No documents found in vectorstore!")
        return

    print(f"   Found {len(chunks)} total chunks")

    # Deduplicate chunks
    print(f"\n2. Deduplicating chunks (threshold={similarity_threshold})...")
    unique_chunks = deduplicate_chunks(chunks, threshold=similarity_threshold)
    print(f"   Kept {len(unique_chunks)} unique chunks (removed {len(chunks) - len(unique_chunks)} duplicates)")

    # Batch chunks together
    print(f"\n3. Batching chunks (batch_size={batch_size})...")
    batched_chunks = batch_chunks(unique_chunks, batch_size=batch_size)
    print(f"   Created {len(batched_chunks)} batches")
    if batched_chunks:
        print(f"   API calls reduced from {len(unique_chunks)} to {len(batched_chunks)} ({len(unique_chunks)/len(batched_chunks):.1f}x reduction)")

    # Generate quiz from each batch
    print("\n4. Generating quiz questions...")
    all_questions = []

    for idx, batch in enumerate(batched_chunks):
        questions = generate_quiz_from_batch(batch, idx, len(batched_chunks))
        all_questions.extend(questions)

    # Save to JSON file
    print(f"\n5. Saving quiz to {output_file}...")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_questions, f, indent=2, ensure_ascii=False)

    print("\n" + "=" * 60)
    print("✓ Quiz generation complete!")
    print(f"  Total questions generated: {len(all_questions)}")
    print(f"  Output saved to: {output_file}")
    print("=" * 60)

    return all_questions

# Example usage:
if __name__ == "__main__":
    # Initialize your vectorstore (adjust parameters as needed)
    # Example:
    # from langchain_community.embeddings import HuggingFaceEmbeddings
    # embeddings = HuggingFaceEmbeddings()
    # vectorstore = Chroma(
    #     persist_directory="./chroma_db",
    #     embedding_function=embeddings,
    #     collection_name="your_collection"
    # )
    
    # Or if you already have it initialized:
    # vectorstore = your_existing_vectorstore
    
    # Generate quiz with batching
    # questions = generate_quiz_from_vectorstore(
    #     vectorstore=vectorstore,
    #     output_file="final_quiz.json",
    #     similarity_threshold=0.85,
    #     batch_size=5  # Combine 5 chunks per API call
    # )
    
    print("Import this module and call generate_quiz_from_vectorstore() with your vectorstore!")

Import this module and call generate_quiz_from_vectorstore() with your vectorstore!


In [12]:
# Run the whole pipeline on the `vectorstore` built in the earlier cell. With
# the default arguments the quiz is written to quiz_output.json; because the
# call is the last expression of the cell, the returned question list is also
# echoed as the cell output.
generate_quiz_from_vectorstore(vectorstore=vectorstore)

Starting Quiz Generation from Vectorstore

1. Retrieving all documents from vectorstore...
   Found 5 total chunks

2. Deduplicating chunks (threshold=0.85)...
   Kept 5 unique chunks (removed 0 duplicates)

3. Batching chunks (batch_size=7)...
   Created 1 batches
   API calls reduced from 5 to 1 (5.0x reduction)

4. Generating quiz questions...
Generating questions from batch 1/1...
  ✓ Generated 15 question(s)

5. Saving quiz to quiz_output.json...

✓ Quiz generation complete!
  Total questions generated: 15
  Output saved to: quiz_output.json


[{'question': 'Which of the following is NOT a characteristic of a proper algorithm?',
  'options': ['Correctness', 'Deterministic', 'Finite', 'Complex'],
  'answer': 'Complex',
  'difficulty': 'easy',
  'explanation': 'A proper algorithm should be simple and communicable, not complex.'},
 {'question': 'Which of the following is NOT a major factor determining program performance?',
  'options': ['Algorithm and Data Structures',
   'Hardware',
   'Programming Language',
   'Compiler'],
  'answer': 'Hardware',
  'difficulty': 'medium',
  'explanation': 'Algorithm and Data Structures are major factors; Hardware, Programming Language, and Compiler are platform dependent.'},
 {'question': 'What is the primary question asked in Algorithm Analysis?',
  'options': ['How many lines of code does it have?',
   'How fast is my algorithm as a function of input size?',
   'How much memory does it use?',
   'How much power does it consume?'],
  'answer': 'How fast is my algorithm as a function of inp