In [None]:
"""
Simple PDF to Q&A Generator using Llama-3.2-1B-Instruct
Just set the `books` list in the configuration cell and run!
"""
%pip install transformers accelerate torch PyPDF2 tqdm

In [None]:
"""
Enhanced PDF to Q&A Generator - Generate More Q&A Pairs!
"""
import json
import re
import PyPDF2
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm

In [None]:


# ============= CONFIGURATION =============
books = ['bipolar', 'ocd']   # each entry is read from /kaggle/input/dataset3/<book>.pdf
model_name = "meta-llama/Llama-3.2-1B-Instruct"
pairs_per_chunk = 20     # Q&A pairs requested from the model for every chunk
max_chunks = 100         # Per-book cap on processed chunks; set to None to process every chunk
chunk_size = 400         # Pages are accumulated until a chunk holds at least this many words
max_prompt_chars = 2000  # Only the first N characters of a chunk are placed in the prompt
save_every = 5           # Write a progress checkpoint after this many chunks
seed = 42                # Seeds torch's RNG so sampled generations can be reproduced
# =========================================


def extract_chunks(pdf_path, chunk_size):
    """Read a PDF and return its text as a list of whitespace-normalised chunks.

    Pages are appended to a running buffer; as soon as the buffer holds at
    least ``chunk_size`` words it is emitted as one chunk. Chunks therefore
    always end on a page boundary and may be longer than ``chunk_size`` words.
    Any remaining non-empty text after the last page becomes a final chunk.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Minimum number of words per chunk.

    Returns:
        List of chunk strings (empty if no text could be extracted).
    """
    chunks = []
    current_chunk = ""

    with open(pdf_path, 'rb') as file:
        pdf = PyPDF2.PdfReader(file)
        total_pages = len(pdf.pages)
        print(f"Total pages: {total_pages}")

        for page_num in tqdm(range(total_pages), desc="Extracting text"):
            # `or ""` keeps a page without extractable text from breaking re.sub.
            text = pdf.pages[page_num].extract_text() or ""
            text = re.sub(r'\s+', ' ', text).strip()
            current_chunk += text + " "

            if len(current_chunk.split()) >= chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = ""

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks


def parse_qa_pairs(response):
    """Extract the well-formed Q&A dicts from a raw model response.

    The widest ``[...]`` span in the response is parsed as JSON. Items that
    are not dicts, or that lack a 'question' or 'answer' key, are dropped so
    downstream code can index both keys safely.

    Args:
        response: Decoded text generated by the model.

    Returns:
        List of dicts, each containing at least 'question' and 'answer'.

    Raises:
        ValueError: If no bracketed span is present, or the span is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    json_match = re.search(r'\[.*\]', response, re.DOTALL)
    if json_match is None:
        raise ValueError("no JSON array found in model response")

    parsed = json.loads(json_match.group())
    return [
        item for item in parsed
        if isinstance(item, dict) and 'question' in item and 'answer' in item
    ]


def save_pairs(pairs, path):
    """Write the collected Q&A pairs to ``path`` as indented UTF-8 JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(pairs, f, indent=2, ensure_ascii=False)


# The model does not depend on the book, so it is loaded once up front
# instead of once per loop iteration.
print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
print("Model loaded!")

torch.manual_seed(seed)

for book in books:
    pdf_path = f"/kaggle/input/dataset3/{book}.pdf"
    output_file = f"/kaggle/working/{book}_pairs.json"

    # Extract text from PDF
    print(f"\nReading PDF: {pdf_path}")
    chunks = extract_chunks(pdf_path, chunk_size)
    print(f"Created {len(chunks)} text chunks")

    # Apply the configured cap without overwriting `max_chunks`, so the same
    # limit is used for every book and the messages below report what is
    # actually processed.
    chunk_limit = len(chunks) if max_chunks is None else min(max_chunks, len(chunks))
    target_pairs = chunk_limit * pairs_per_chunk
    print(f"⚡ Will process first {chunk_limit} chunks")
    print(f"📊 Expected output: ~{target_pairs} Q&A pairs")
    chunks = chunks[:chunk_limit]

    # Generate Q&A pairs
    all_qa_pairs = []
    print(f"\nGenerating Q&A pairs from {len(chunks)} chunks...")

    for i, chunk in enumerate(tqdm(chunks, desc="Processing chunks")):
        # The 4-space indentation inside the prompt is kept as-is so the text
        # sent to the model is unchanged.
        prompt = f"""Generate {pairs_per_chunk} question-answer pairs from this text on Topic: {book}. Make them conversational and natural.
    
    Text: {chunk[:max_prompt_chars]}
    
    Output ONLY a JSON array like this:
    [
      {{"question": "...", "answer": "..."}},
      {{"question": "...", "answer": "..."}}
    ]"""

        messages = [
            {"role": "user", "content": prompt}
        ]

        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                inputs,
                max_new_tokens=1200,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens (everything after the prompt).
        response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)

        # Parse JSON; a malformed response skips this chunk rather than aborting the run.
        try:
            all_qa_pairs.extend(parse_qa_pairs(response))
        except ValueError as e:
            print(f"\n⚠️ Warning: Failed to parse chunk {i+1}: {str(e)[:100]}")

        # Periodic checkpoint so a crash does not lose everything generated so far.
        if (i + 1) % save_every == 0:
            save_pairs(all_qa_pairs, output_file)
            print(f"  ✓ Saved: {len(all_qa_pairs)} pairs so far")

    # Final save
    save_pairs(all_qa_pairs, output_file)

    print(f"\n✅ Done! Generated {len(all_qa_pairs)} Q&A pairs in {output_file}")
    print(f"📊 Target was {target_pairs} pairs")
    if target_pairs:
        print(f"🎯 Success rate: {len(all_qa_pairs) / target_pairs * 100:.1f}%")
    else:
        # No chunks were extracted (or zero pairs requested): avoid dividing by zero.
        print("🎯 Success rate: n/a (nothing to generate from)")

    # Show samples
    print("\n=== Sample Q&A Pairs ===")

    for i, pair in enumerate(all_qa_pairs[:3]):
        print(f"\nQ{i+1}: {pair['question']}")
        print(f"A{i+1}: {pair['answer']}")

    print(f"\n💡 To generate even more questions:")
    print(f"   - Increase 'max_chunks' (currently {max_chunks})")
    print(f"   - Increase 'pairs_per_chunk' (currently {pairs_per_chunk})")
    print(f"   - Decrease 'chunk_size' (currently {chunk_size})")