# Quizlet Flashcard Preprocessing

This notebook loads Quizlet flashcard exports from the `flashcards/` directory, chunks them, embeds them, and stores them in a vector database.

**Supported Formats:**
- JSON files: `[{"term": "...", "definition": "..."}, ...]`
- Tab-separated text files, `.txt` or `.tsv` (classic Quizlet export)
- CSV files (comma-separated)

**Features:**
- Automatically scans the `flashcards/` directory for `.json`, `.txt`, `.tsv`, and `.csv` files
- Auto-detects file format
- Tracks which files have been processed (avoids duplicates)
- Processes only new/modified files
- Creates/updates vector store incrementally

## Setup

In [1]:
# Import required libraries
from quizlet_rag import QuizletRAGPipeline
from pathlib import Path
import json
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [2]:
# --- Input: flashcard exports -------------------------------------------
FLASHCARDS_DIR = Path("flashcards")
# Make sure the input folder is present (no error if it already exists)
FLASHCARDS_DIR.mkdir(exist_ok=True)

# Glob patterns for the supported export types
FILE_EXTENSIONS = [f"*.{suffix}" for suffix in ("json", "txt", "tsv", "csv")]

# Delimiter used for text files unless overridden per file (tab)
DEFAULT_DELIMITER = "\t"

# --- Processing ---------------------------------------------------------
# Chunking strategy; options: 'no_split', 'by_term', 'recursive'
CHUNK_STRATEGY = "no_split"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# --- Output: vector store and tracking record ---------------------------
VECTOR_DB_PATH = "./quizlet_db"
COLLECTION_NAME = "quizlet_flashcards"
TRACKING_FILE = "processed_files.json"

## Helper Functions

In [3]:
def load_processed_files():
    """Return the tracking record of already processed files.

    The record is the JSON document stored at ``TRACKING_FILE``. When that
    path does not exist, an empty dict is returned instead.
    """
    tracking_path = Path(TRACKING_FILE)
    if not tracking_path.exists():
        return {}
    with tracking_path.open('r') as handle:
        return json.load(handle)

def save_processed_files(processed_files):
    """Write the tracking record to ``TRACKING_FILE`` as indented JSON.

    Args:
        processed_files: JSON-serializable mapping to store.

    Raises:
        TypeError: if the mapping contains a value ``json`` cannot serialize.
            The existing tracking file is left untouched in that case.
    """
    # Serialize BEFORE opening the file: opening with 'w' truncates it, so a
    # serialization error after that point would wipe the existing record.
    serialized = json.dumps(processed_files, indent=2)
    with open(TRACKING_FILE, 'w') as f:
        f.write(serialized)

def get_new_files(flashcards_dir, processed_files):
    """Return the flashcard files that are new or changed since last processed.

    Args:
        flashcards_dir: ``Path`` of the directory to scan (top level only).
        processed_files: tracking record keyed by file name; each entry is
            expected to hold the ``modified_time`` recorded when the file was
            processed.

    Returns:
        A list of ``Path`` objects, sorted by path so the processing order is
        the same on every run. A file is included when it has no tracking
        entry, when its entry lacks ``modified_time``, or when its current
        mtime is newer than the recorded one.
    """
    # A set de-duplicates paths in case two patterns match the same file.
    candidates = set()
    for pattern in FILE_EXTENSIONS:
        # Directories can match a glob pattern too; only regular files count.
        candidates.update(p for p in flashcards_dir.glob(pattern) if p.is_file())

    new_files = []
    for file_path in sorted(candidates):
        entry = processed_files.get(file_path.name) or {}
        recorded_mtime = entry.get("modified_time")

        # No usable record -> treat as new instead of raising KeyError.
        if recorded_mtime is None or recorded_mtime < file_path.stat().st_mtime:
            new_files.append(file_path)

    return new_files

def mark_files_as_processed(file_paths, processed_files):
    """Record each given file in the tracking mapping and return that mapping.

    The mapping is updated in place, keyed by file name. Every entry stores
    the current timestamp, the file's mtime, its path, a format label
    ("json" for a ``.json`` suffix in any letter case, otherwise "text") and
    the suffix exactly as written.
    """
    for processed_path in file_paths:
        is_json = processed_path.suffix.lower() == ".json"
        entry = {
            "processed_at": datetime.now().isoformat(),
            "modified_time": processed_path.stat().st_mtime,
            "path": str(processed_path),
            "format": "json" if is_json else "text",
            "extension": processed_path.suffix,
        }
        processed_files[str(processed_path.name)] = entry
    return processed_files

def detect_delimiter(file_path):
    """Choose the field delimiter for a text file from its extension.

    A ``.csv`` suffix (any letter case) yields a comma. Every other suffix,
    ``.tsv`` and ``.txt`` included, yields a tab.
    """
    return "," if file_path.suffix.lower() == ".csv" else "\t"

## Initialize Pipeline

In [4]:
# Build the pipeline object that the loading, chunking and storage steps in
# the processing cell call into. It is configured with the embedding model
# name and vector-store path from the configuration cell.
# NOTE(review): what the constructor does with these values (e.g. whether it
# loads the model or touches the store path right away) is decided inside
# quizlet_rag and is not visible in this notebook — confirm.
pipeline = QuizletRAGPipeline(
    embedding_model=EMBEDDING_MODEL,
    vector_store_path=VECTOR_DB_PATH
)

## Scan for New Files

In [5]:
# Read the tracking record, then work out which flashcard files still need work
processed_files = load_processed_files()
new_files = get_new_files(FLASHCARDS_DIR, processed_files)

if not new_files:
    print("No new files to process. All flashcards are up to date!")
else:
    print("Files to process:")
    for position, pending_file in enumerate(new_files, start=1):
        size_kb = pending_file.stat().st_size / 1024  # bytes -> KB
        format_label = "JSON" if pending_file.suffix.lower() == ".json" else "TEXT"
        print(f"  {position}. {pending_file.name} ({format_label}, {size_kb:.1f} KB)")

Files to process:
  1. ai-1.json (JSON, 20.8 KB)
  2. ai-2.json (JSON, 7.0 KB)
  3. ai-3.json (JSON, 10.3 KB)
  4. ai-4.json (JSON, 16.4 KB)
  5. ai-5.json (JSON, 3.8 KB)


## Process New Files

In [6]:
# Ingest the files found by the scan cell: load -> chunk -> embed/store -> record.
# The whole cell is a no-op when the scan found nothing new or modified.
if new_files:
    # The loader is given plain string paths rather than Path objects
    file_paths_str = [str(f) for f in new_files]

    # Step 1: load flashcards.
    # NOTE(review): the "AUTO" format detection announced below would have to
    # happen inside pipeline.load_flashcards; it is not visible in this notebook.
    print("\n1. Loading flashcards...")
    print("   Format detection: AUTO")

    # Display-only preview of the format/delimiter inferred from each file's
    # extension. The values computed in this loop are printed and nothing else;
    # they are not passed to the loader.
    for file_path in new_files:
        file_format = "JSON" if file_path.suffix.lower() == ".json" else "TEXT"
        delimiter = detect_delimiter(file_path) if file_format == "TEXT" else "N/A"
        # Human-readable delimiter name; anything other than tab/comma is shown as-is
        delimiter_name = "tab" if delimiter == "\t" else "comma" if delimiter == "," else delimiter
        print(f"   • {file_path.name}: {file_format} format" +
              (f" ({delimiter_name}-separated)" if file_format == "TEXT" else ""))

    docs = pipeline.load_flashcards(
        file_paths=file_paths_str,
        # NOTE(review): the same DEFAULT_DELIMITER (tab) is passed for every file,
        # including .csv ones. Per-file delimiter handling, if any, must happen
        # inside the pipeline — confirm.
        delimiter=DEFAULT_DELIMITER,
        # NOTE(review): literal value, separate from the CHUNK_STRATEGY constant
        # used for chunk_documents below — confirm it is intended.
        chunk_strategy="individual"
    )

    # Step 2: chunk the loaded documents using the configured strategy
    print("\n2. Chunking documents...")
    chunks = pipeline.chunk_documents(docs, strategy=CHUNK_STRATEGY)

    # Step 3: create the vector store, or append to it if it already exists
    print("\n3. Embedding and storing in vector database...")

    # Existence of the store path on disk decides between "append" and "create"
    vectorstore_exists = Path(VECTOR_DB_PATH).exists()

    if vectorstore_exists:
        print("   Loading existing vector store...")
        pipeline.load_existing_vectorstore(collection_name=COLLECTION_NAME)
        print("   Adding new documents...")
        # NOTE(review): chunks are appended; nothing here removes chunks stored
        # earlier for a file that was modified and is being re-processed, so
        # duplicates look possible — confirm against the vector store's behavior.
        pipeline.vectorstore.add_documents(chunks)
        print(f"   Added {len(chunks)} chunks to existing vector store")
    else:
        print("   Creating new vector store...")
        pipeline.create_vectorstore(
            documents=chunks,
            collection_name=COLLECTION_NAME
        )

    # Step 4: record the files only after storing succeeded, so an exception in
    # an earlier step leaves them unmarked and they are picked up again next run
    print("\n4. Updating tracking file...")
    processed_files = mark_files_as_processed(new_files, processed_files)
    save_processed_files(processed_files)

    # The "total" is the number of entries in the tracking record (all runs),
    # not just the files handled in this run
    print(f"\nTotal files processed: {len(processed_files)}")
    print(f"Documents added this run: {len(chunks)}")
    print(f"Vector store location: {VECTOR_DB_PATH}")


1. Loading flashcards...
   Format detection: AUTO
   • ai-1.json: JSON format
   • ai-2.json: JSON format
   • ai-3.json: JSON format
   • ai-4.json: JSON format
   • ai-5.json: JSON format
✓ Loaded 277 documents from 5 file(s)

2. Chunking documents...
✓ Using 277 flashcards as-is (no splitting)

3. Embedding and storing in vector database...
   Creating new vector store...
⏳ Creating embeddings for 277 documents...
✓ Vector store created at ./quizlet_db

4. Updating tracking file...

Total files processed: 5
Documents added this run: 277
Vector store location: ./quizlet_db


## View Processing Status

In [7]:
# Report every file recorded in the tracking file, grouped by stored format
print("Processed Files Summary:")

processed_files = load_processed_files()
if not processed_files:
    # Nothing tracked yet: show getting-started instructions instead
    print("No files have been processed yet.")
    print("\nTo get started:")
    print("1. Export your Quizlet flashcard sets in one of these formats:")
    print("   • JSON: [{\"term\": \"...\", \"definition\": \"...\"}]")
    print("   • Tab-separated text (classic Quizlet export)")
    print("   • CSV (comma-separated)")
    print(f"2. Place the files in the '{FLASHCARDS_DIR}' directory")
    print("3. Run this notebook again")
else:
    # Split the (filename, info) records: format "json" vs. everything else
    json_files = [entry for entry in processed_files.items() if entry[1].get("format") == "json"]
    text_files = [entry for entry in processed_files.items() if entry[1].get("format") != "json"]

    if json_files:
        print(f"\nJSON Files ({len(json_files)}):")
        for tracked_name, record in sorted(json_files):
            print(f"  • {tracked_name}")
            # First 19 characters of the ISO timestamp (drops fractional seconds)
            print(f"    Processed: {record['processed_at'][:19]}")

    if text_files:
        print(f"\nText Files ({len(text_files)}):")
        for tracked_name, record in sorted(text_files):
            suffix = record.get('extension', 'unknown')
            print(f"  • {tracked_name} ({suffix})")
            print(f"    Processed: {record['processed_at'][:19]}")

    print(f"\nTotal: {len(processed_files)} file(s) processed")

Processed Files Summary:

JSON Files (5):
  • ai-1.json
    Processed: 2025-11-04T12:35:40
  • ai-2.json
    Processed: 2025-11-04T12:35:40
  • ai-3.json
    Processed: 2025-11-04T12:35:40
  • ai-4.json
    Processed: 2025-11-04T12:35:40
  • ai-5.json
    Processed: 2025-11-04T12:35:40

Total: 5 file(s) processed


## Reset/Clear Options

In [8]:
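# Uncomment to reset tracking file (forces reprocessing of all files).
# Note: if the vector store is kept, the reprocessed chunks are added to it again,
# which may create duplicates — delete the vector store as well for a clean rebuild.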
# import os
# if Path(TRACKING_FILE).exists():
#     os.remove(TRACKING_FILE)
#     print(f"✓ Deleted {TRACKING_FILE}")

# Uncomment to delete vector store (complete reset)
# import shutil
# if Path(VECTOR_DB_PATH).exists():
#     shutil.rmtree(VECTOR_DB_PATH)
#     print(f"✓ Deleted vector store at {VECTOR_DB_PATH}")