In [9]:
pip install openai PyPDF2 python-dotenv

Collecting fpdf
  Using cached fpdf-1.7.2-py2.py3-none-any.whl
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
# filename: ask_syllabus_questions_save_txt.py

import PyPDF2
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
import os

# -----------------------------
# 1. Load OpenAI API key
# -----------------------------
load_dotenv(find_dotenv())
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in .env")

# Initialize OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# -----------------------------
# 2. Path to your PDF
# -----------------------------
pdf_path = "/Users/CS/Documents/Deep Learning/Final Project/CPSCI-366_Syllabus.pdf"

# -----------------------------
# 3. Extract text from PDF
# -----------------------------
text = ""
with open(pdf_path, "rb") as f:
    reader = PyPDF2.PdfReader(f)
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

# -----------------------------
# 4. Define your 10 questions
# -----------------------------
questions = [
    "When are the lecture sessions for CPSCI-366 held, and where is the classroom located?",
    "Who is the course instructor, and when can students attend office hours or schedule appointments?",
    "Describe the structure and purpose of Assignment 2 in the course.",
    "What is the main goal of the final project in CPSCI-366, and how should students approach it?",
    "What environment or tools are provided for running deep learning examples and assignments?",
    "What does the `unet_example.py` file demonstrate, and why is it relevant for the course?",
    "How should students utilize the `example_data` folder in their assignments or projects?",
    "Which platforms are used for submitting work and communicating with the instructor?",
    "How are students‚Äô projects and assignments graded in CPSCI-366?",
    "Which additional materials or resources are suggested in the syllabus to help students succeed in the course?"
]

# -----------------------------
# 5. Ask GPT each question and save to text file
# -----------------------------
output_path = "syllabus_qa.txt"
with open(output_path, "w", encoding="utf-8") as f:
    for i, question in enumerate(questions, start=1):
        prompt = f"Answer the following question based only on this syllabus text:\n\nSyllabus:\n{text}\n\nQuestion: {question}"
        
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are an AI assistant reading a course syllabus."},
                {"role": "user", "content": prompt}
            ]
        )
        
        answer = response.choices[0].message.content
        
        f.write(f"Q{i}: {question}\n")
        f.write(f"A{i}: {answer}\n")
        f.write("-" * 60 + "\n")

print(f"‚úÖ Q&A saved to text file: {output_path}")


FileNotFoundError: [Errno 2] No such file or directory: '/Users/CS/Documents/Deep Learning/Final Project/CPSCI-366_Syllabus.pdf'

In [1]:
# filename: rag_with_finetuned_embeddings.py

import PyPDF2
from openai import OpenAI
from dotenv import load_dotenv
import os
import numpy as np
from pathlib import Path
import pickle
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
import torch

# -----------------------------
# Configuration
# -----------------------------
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in .env file")

client = OpenAI(api_key=OPENAI_API_KEY)

SYLLABUS_FOLDER = Path(__file__).parent / "syllabi"
CACHE_FILE = "syllabus_embeddings_custom.pkl"
MODEL_PATH = "finetuned_syllabus_model"

# Choose which embeddings to use
USE_CUSTOM_MODEL = True  # Set to False to use OpenAI embeddings

# -----------------------------
# PDF Processing
# -----------------------------
def extract_pdf_text(pdf_path):
    """Extract text from a single PDF file"""
    text = ""
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"‚ùå Error reading {pdf_path.name}: {e}")
    return text

def split_into_chunks(text, chunk_size=500, overlap=100):
    """Split text into overlapping chunks"""
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        if chunk.strip():
            chunks.append(chunk)
    
    return chunks

def load_all_syllabi(folder_path):
    """Load all PDFs from folder and subfolders"""
    documents = []
    folder_path.mkdir(exist_ok=True)
    pdf_files = list(folder_path.rglob("*.pdf"))
    
    if not pdf_files:
        print(f"\n‚ö†Ô∏è  No PDF files found in '{folder_path}' or subfolders")
        print(f"üìÅ Add your syllabus PDFs to: {folder_path.absolute()}")
        return documents
    
    print(f"\nüìÑ Found {len(pdf_files)} PDF files")
    
    for pdf_file in pdf_files:
        rel_path = pdf_file.relative_to(folder_path)
        print(f"üìñ Processing: {rel_path}")
        text = extract_pdf_text(pdf_file)
        
        if not text.strip():
            print(f"   ‚ö†Ô∏è  No text extracted")
            continue
        
        chunks = split_into_chunks(text, chunk_size=500, overlap=100)
        print(f"   ‚úì Created {len(chunks)} chunks")
        
        for i, chunk in enumerate(chunks):
            documents.append({
                'text': chunk,
                'source': str(rel_path),
                'chunk_id': i
            })
    
    return documents

# -----------------------------
# Training Data Generation
# -----------------------------
def generate_training_data(documents):
    """Generate synthetic training pairs from syllabi"""
    print("\nüîß Generating training data...")
    
    # Create positive pairs: (query, relevant_chunk)
    training_examples = []
    
    # Strategy 1: Use GPT to generate questions for chunks
    print("   Generating question-answer pairs...")
    
    # Sample chunks to avoid cost (take every 5th chunk)
    sampled_docs = documents[::5][:50]  # Max 50 chunks
    
    for i, doc in enumerate(sampled_docs):
        if i % 10 == 0:
            print(f"   Progress: {i+1}/{len(sampled_docs)}")
        
        # Generate a question for this chunk
        prompt = f"""Generate 1 specific question that could be answered by this text excerpt from a course syllabus. Make it realistic.

Text: {doc['text'][:500]}

Question:"""
        
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=100,
                temperature=0.7
            )
            question = response.choices[0].message.content.strip()
            
            # Create positive pair
            training_examples.append(
                InputExample(texts=[question, doc['text']])
            )
        except Exception as e:
            print(f"   ‚ö†Ô∏è  Error generating question: {e}")
            continue
    
    print(f"‚úÖ Generated {len(training_examples)} training pairs")
    return training_examples

# -----------------------------
# Fine-tuning
# -----------------------------
def finetune_model(training_examples):
    """Fine-tune sentence transformer on syllabus data"""
    print("\nüéì Fine-tuning embedding model...")
    print("   This runs locally on your computer (FREE!)")
    
    # Load pre-trained model
    print("   Loading base model: all-MiniLM-L6-v2")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    # Create DataLoader
    train_dataloader = DataLoader(
        training_examples, 
        shuffle=True, 
        batch_size=16
    )
    
    # Define loss function
    train_loss = losses.MultipleNegativesRankingLoss(model)
    
    # Train
    print("   Training started... (this may take 5-15 minutes)")
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=3,
        warmup_steps=100,
        show_progress_bar=True
    )
    
    # Save model
    model.save(MODEL_PATH)
    print(f"‚úÖ Model saved to: {MODEL_PATH}")
    
    return model

# -----------------------------
# Embedding Functions
# -----------------------------
def get_openai_embedding(text):
    """Get OpenAI embedding"""
    text = text.replace("\n", " ")
    response = client.embeddings.create(
        input=[text], 
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

def get_custom_embedding(text, model):
    """Get custom fine-tuned embedding"""
    return model.encode(text, convert_to_numpy=True)

def create_embeddings(documents, use_custom=True):
    """Create embeddings for all documents - stores BOTH types for comparison"""
    # Always create custom embeddings
    if os.path.exists(MODEL_PATH):
        print(f"\n‚úÖ Loading fine-tuned model from {MODEL_PATH}")
        model = SentenceTransformer(MODEL_PATH)
    else:
        print("\nüéì No fine-tuned model found. Training new model...")
        training_data = generate_training_data(documents)
        model = finetune_model(training_data)
    
    print(f"\nüîÑ Creating custom embeddings...")
    for i, doc in enumerate(documents):
        if i % 50 == 0:
            print(f"   Progress: {i+1}/{len(documents)}")
        doc['embedding_custom'] = get_custom_embedding(doc['text'], model)
    
    # Also create OpenAI embeddings for comparison
    print(f"\nüîÑ Creating OpenAI embeddings for comparison...")
    for i, doc in enumerate(documents):
        if i % 10 == 0:
            print(f"   Progress: {i+1}/{len(documents)}")
        doc['embedding_openai'] = get_openai_embedding(doc['text'])
    
    # Set default embedding based on mode
    for doc in documents:
        doc['embedding'] = doc['embedding_custom'] if use_custom else doc['embedding_openai']
    
    return documents

# -----------------------------
# RAG Search & Answer
# -----------------------------
def cosine_similarity(a, b):
    """Calculate cosine similarity"""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def search_documents(query, documents, use_custom=True, top_k=3):
    """Search for relevant documents"""
    if use_custom:
        model = SentenceTransformer(MODEL_PATH)
        query_embedding = get_custom_embedding(query, model)
        embedding_key = 'embedding_custom'
    else:
        query_embedding = get_openai_embedding(query)
        embedding_key = 'embedding_openai'
    
    # Calculate scores using appropriate embeddings
    results = []
    for doc in documents:
        doc_copy = doc.copy()
        doc_copy['score'] = cosine_similarity(query_embedding, doc[embedding_key])
        results.append(doc_copy)
    
    sorted_docs = sorted(results, key=lambda x: x['score'], reverse=True)
    return sorted_docs[:top_k]

def answer_question(query, documents, use_custom=True, top_k=3):
    """Answer question using RAG"""
    relevant_docs = search_documents(query, documents, use_custom, top_k)
    
    context = "\n\n---\n\n".join([
        f"[Source: {doc['source']}]\n{doc['text']}" 
        for doc in relevant_docs
    ])
    
    prompt = f"""Answer the following question based on the provided syllabus excerpts. 
Cite which course/syllabus your information comes from.
If the answer is not in the excerpts, say so.

Context from syllabi:
{context}

Question: {query}

Answer:"""
    
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions about course syllabi."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )
    
    answer = response.choices[0].message.content
    sources = list(set([doc['source'] for doc in relevant_docs]))
    
    return answer, sources, relevant_docs

# -----------------------------
# Caching
# -----------------------------
def save_embeddings(documents, cache_file):
    """Save embeddings to disk"""
    with open(cache_file, 'wb') as f:
        pickle.dump(documents, f)
    print(f"\nüíæ Embeddings cached to {cache_file}")

def load_embeddings(cache_file):
    """Load cached embeddings"""
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    return None

# -----------------------------
# Main
# -----------------------------
def main():
    print("\n" + "="*60)
    print("üéì SYLLABUS RAG WITH FINE-TUNED EMBEDDINGS")
    print("="*60)
    
    mode = "CUSTOM FINE-TUNED" if USE_CUSTOM_MODEL else "OPENAI"
    print(f"\nüìä Mode: {mode} embeddings")
    
    # Load or create embeddings
    documents = load_embeddings(CACHE_FILE)
    
    if documents is None:
        print("\nüîß Building knowledge base...")
        documents = load_all_syllabi(SYLLABUS_FOLDER)
        
        if not documents:
            print("\n‚ùå No syllabi found. Add PDFs to:", SYLLABUS_FOLDER.absolute())
            return
        
        documents = create_embeddings(documents, use_custom=USE_CUSTOM_MODEL)
        save_embeddings(documents, CACHE_FILE)
    else:
        print(f"\n‚úÖ Loaded from cache: {len(documents)} chunks")
    
    # Interactive chat
    print("\n" + "="*60)
    print("üí¨ Ask questions! Type 'quit' to exit, 'compare' to test both models")
    print("="*60 + "\n")
    
    while True:
        try:
            query = input("‚ùì Your question: ").strip()
            
            if query.lower() in ['quit', 'exit', 'q']:
                print("\nüëã Goodbye!\n")
                break
            
            if query.lower() == 'compare':
                # Compare both models
                query = input("‚ùì Question to compare: ").strip()
                
                print("\n" + "="*60)
                print("üî¨ COMPARISON MODE")
                print("="*60)
                
                # Custom model
                print("\nüéØ CUSTOM FINE-TUNED MODEL:")
                answer1, sources1, docs1 = answer_question(query, documents, use_custom=True)
                print(f"\nAnswer: {answer1}")
                print(f"Sources: {', '.join(sources1)}")
                print(f"Top score: {docs1[0]['score']:.4f}")
                
                # OpenAI model
                print("\nüåê OPENAI MODEL:")
                answer2, sources2, docs2 = answer_question(query, documents, use_custom=False)
                print(f"\nAnswer: {answer2}")
                print(f"Sources: {', '.join(sources2)}")
                print(f"Top score: {docs2[0]['score']:.4f}")
                
                print("\n" + "="*60 + "\n")
                continue
            
            if not query:
                continue
            
            print("\nüîç Searching...")
            answer, sources, _ = answer_question(query, documents, use_custom=USE_CUSTOM_MODEL)
            
            print(f"\nüí° Answer:\n{answer}")
            print(f"\nüìö Sources: {', '.join(sources)}")
            print("-" * 60 + "\n")
            
        except KeyboardInterrupt:
            print("\n\nüëã Goodbye!\n")
            break
        except Exception as e:
            print(f"\n‚ùå Error: {e}\n")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


NameError: name '__file__' is not defined

In [3]:
!pip install sentence-transformers torch

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting torch
  Using cached torch-2.2.2-cp312-none-macosx_10_9_x86_64.whl.metadata (25 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.7.2-cp312-cp312-macosx_10_13_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl.metadata (62 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.0.1-py3-none-any.whl.metadata (13 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-12.0.0-cp312-cp312-macosx_10_13_x86_64.whl.metadata (8.8 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached filelock-3.20.0-py3-none-any.whl.metada