In [None]:
"""
Comprehensive Installation Script for ML/NLP Pipeline
==================================================
This script sets up a complete machine learning environment with document processing,
vector embeddings, and conversational AI capabilities. Designed for Kaggle environments
but adaptable to other platforms.

Dependencies installed:
- Document processing: PyMuPDF, Pillow, Tesseract OCR
- ML/NLP: Transformers, Sentence Transformers, FAISS, Torch
- Frameworks: LangChain ecosystem, Gradio for UI
- Utilities: Accelerate, FSSpec for file handling
"""

# Import required system modules
import os  # For environment variable management
import sys  # For system-specific parameters and functions

# =============================================================================
# ENVIRONMENT CONFIGURATION
# =============================================================================
# Configure cache directories to avoid storage conflicts and improve performance
# These paths are optimized for Kaggle's file system structure

# Point the Transformers model cache at the Kaggle working directory instead
# of the library's default cache location.
# NOTE(review): recent transformers releases are reported to deprecate
# TRANSFORMERS_CACHE in favour of HF_HOME — confirm against the installed version.
os.environ["TRANSFORMERS_CACHE"] = "/kaggle/working/transformers_cache"

# Point the Hugging Face home/cache directory at the working directory so
# Hub downloads land in one known location.
os.environ["HF_HOME"] = "/kaggle/working/huggingface_cache"

# =============================================================================
# DOCUMENT PROCESSING LIBRARIES
# =============================================================================
# Install PDF processing and OCR capabilities

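# PyMuPDF v1.23.0 - High-performance PDF processing library
# Pinned here, but NOTE: the `--upgrade` install of PyMuPDF a few lines below
# replaces this pin with the latest release, so the pin does not actually hold.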
!pip install --upgrade --quiet PyMuPDF==1.23.0

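# Install the image processing and OCR suite
# PyMuPDF: PDF manipulation and text extraction (listed again here, so
#          `--upgrade` moves it off the 1.23.0 pin above)
# Pillow: Python Imaging Library fork for image processing
# pytesseract: Python wrapper for the Tesseract OCR engine (the wrapper only;
#              the `tesseract` binary itself must already be on the system)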
!pip install --upgrade --quiet \
    PyMuPDF pillow pytesseract

# =============================================================================
# MACHINE LEARNING AND NLP CORE LIBRARIES
# =============================================================================
# Install foundational ML libraries for embedding and similarity search

# Sentence Transformers - Pre-trained models for semantic text embeddings
# Enables conversion of text to high-dimensional vectors for similarity analysis
!pip install --upgrade --quiet sentence-transformers

# FAISS (Facebook AI Similarity Search) - Efficient similarity search library
# CPU version for vector indexing and nearest neighbor search
!pip install --upgrade --quiet faiss-cpu

# Core ML framework installations
# Transformers: Hugging Face's transformer models library
# Torch: PyTorch deep learning framework
!pip install --upgrade --quiet transformers torch

# =============================================================================
# USER INTERFACE AND ACCELERATION
# =============================================================================
# Install libraries for model acceleration and web interface creation

# Gradio - Python library for creating ML web interfaces
# Enables easy deployment of models with interactive web UIs
!pip install --upgrade --quiet gradio

# Accelerate - Hugging Face library for distributed training and inference
# Optimizes model loading and computation across different hardware configurations
!pip install --upgrade --quiet accelerate

# Redundant PyMuPDF installation (consider removing in production)
# This line appears to be a duplicate and can be safely removed
!pip install PyMuPDF

# =============================================================================
# LANGCHAIN ECOSYSTEM INSTALLATION
# =============================================================================
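# Install the LangChain framework components after the core ML libraries to
# reduce version conflicts. LangChain is sensitive to dependency versions, so
# installing it late helps prevent package resolution issues. (The transformers
# and fsspec steps further down still run after it.)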

# LangChain Core - Essential base functionality for the framework
# Contains fundamental abstractions and base classes
!pip install --upgrade --quiet langchain-core

# LangChain Community - Community-contributed integrations and tools
# Provides additional connectors and utilities for various services
!pip install --upgrade --quiet langchain-community

# LangChain - Main framework for building applications with LLMs
# Orchestration layer for chaining language models with other tools
!pip install --upgrade --quiet langchain

# =============================================================================
# FINAL DEPENDENCY UPDATES AND FIXES
# =============================================================================
# Address specific version conflicts and ensure latest compatible versions

# Update Transformers to latest version after LangChain installation
# Ensures compatibility with the most recent model architectures
!pip install --upgrade transformers

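# Force-reinstall fsspec at a pinned version to resolve compatibility issues.
# fsspec handles file system operations and is critical for model loading.
# Version 2025.3.0 is said to address known issues with remote file access.
# NOTE(review): the reason for this exact version is not recorded here —
# confirm which package's version constraint the pin is meant to satisfy.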
!pip install fsspec==2025.3.0 --force-reinstall --quiet

# =============================================================================
# INSTALLATION COMPLETION
# =============================================================================
# Closing banner. It is printed unconditionally once the cell reaches this
# point; nothing here checks that the installs above actually succeeded.
print(
    "Installation completed!",
    "Environment ready for:",
    "- Document processing (PDF, images, OCR)",
    "- Machine learning inference (Transformers, embeddings)",
    "- Vector similarity search (FAISS)",
    "- Conversational AI applications (LangChain)",
    "- Interactive web interfaces (Gradio)",
    sep="\n",
)

In [None]:
# Clean environment setup for Kaggle
# Step-by-step installation to avoid conflicts
import os
import sys

# Cache locations for this session.
# TRANSFORMERS_CACHE used to be assigned twice in this cell with two different
# paths; only the later value (/tmp/transformers_cache) ever took effect, so
# that single effective value is what is set here.
os.environ["HF_HOME"] = "/kaggle/working/huggingface_cache"
os.environ["TORCH_HOME"] = "/tmp/torch_cache"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"

# Forget already-imported copies of these packages so that the next `import`
# loads them afresh.
# NOTE(review): only the top-level entries are removed; submodules such as
# `torch.nn` stay in sys.modules — confirm this partial purge is what is wanted.
modules_to_clear = ['torchvision', 'torch', 'transformers', 'sentence_transformers']
for module in modules_to_clear:
    # pop() with a default is a no-op when the package was never imported.
    sys.modules.pop(module, None)

print("Environment cleaned successfully")

In [None]:
# CUDA Warning Suppression - Run this FIRST
import os
import sys
import warnings
import logging

# Silence every Python-level warning raised through the `warnings` module.
warnings.filterwarnings('ignore')

# Environment switches set for TensorFlow, CUDA and PyTorch.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is documented as a debugging switch that
# makes kernel launches synchronous rather than a warning filter — confirm it
# is really wanted here.
os.environ.update({
    'TF_CPP_MIN_LOG_LEVEL': '3',
    'CUDA_LAUNCH_BLOCKING': '1',
    'PYTHONWARNINGS': 'ignore',
    'TORCH_CUDA_ARCH_LIST': '',
    'CUDA_MODULE_LOADING': 'LAZY',
})

# Raise the threshold to ERROR on the root logger (name None) and on the two
# library loggers.
for _logger_name in (None, "transformers", "torch"):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)

print("Warning suppression activated")

In [None]:
# Alternative approach without sentence-transformers
import warnings
warnings.filterwarnings('ignore')
import torch
import transformers
from transformers import PreTrainedModel, pipeline, AutoTokenizer, AutoModelForCausalLM, AutoModel
import torch.nn.functional as F

print("Testing Kaggle environment...")
print(f"Transformers version: {transformers.__version__}")

# Smoke-test both models. MODELS_LOADED ends up True only when every step in
# the try body completed; any exception is reported and leaves it False.
try:
    # Embedding model loaded through plain transformers (AutoTokenizer /
    # AutoModel) rather than the sentence-transformers wrapper.
    embedding_model_name = "sentence-transformers/paraphrase-MiniLM-L3-v2"
    embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
    embedding_model = AutoModel.from_pretrained(embedding_model_name)
    print("Embedding model loaded successfully (via transformers)")

    # Text-generation model. When the tokenizer defines no pad token, the
    # end-of-sequence token is reused for padding.
    tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # device=0 selects the first GPU when CUDA is available, -1 selects the CPU.
    llm_pipeline = pipeline(
        "text-generation",
        model="distilgpt2",
        tokenizer=tokenizer,
        max_new_tokens=50,
        device=0 if torch.cuda.is_available() else -1,
        truncation=True,
    )
    print("Language model loaded successfully")
except Exception as e:
    print(f"Model loading error: {e}")
    MODELS_LOADED = False
else:
    MODELS_LOADED = True
    print("All models ready!")

In [None]:
# 🖼️ Kaggle-Compatible Image Processing
from PIL import Image
from pathlib import Path
import tempfile

# Placeholder captioner: no vision model is involved, only the file name.
def generate_caption(image_path: str) -> str:
    """Return a filename-based placeholder caption for *image_path*.

    Args:
        image_path: Path to the image; only its final component is used and
            the file itself is never opened.

    Returns:
        ``"Image file: <name>"``, or ``"Image processing error: <message>"``
        when the path cannot be interpreted.
    """
    try:
        caption = "Image file: " + Path(image_path).name
    except Exception as err:
        caption = f"Image processing error: {err}"
    return caption

print("✅ Image processing setup completed")

In [None]:
!pip install PyMuPDF

# 📄 Kaggle-Compatible PDF Processing
import fitz  # PyMuPDF
import tempfile
from pathlib import Path
from typing import List

def _split_text_into_chunks(text: str, chunk_size: int = 200) -> List[str]:
    """Greedily pack whitespace-separated words into chunks of roughly *chunk_size* characters."""
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        # Start a new chunk once adding this word would exceed the budget,
        # but never emit an empty chunk (an over-long word stands alone).
        if current_length + len(word) > chunk_size and current_chunk:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length += len(word) + 1  # +1 for the joining space

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def extract_chunks_from_pdf(
    pdf_bytes: bytes,
    *,
    max_pages: int = 10,
    max_chunks: int = 50,
    chunk_size: int = 200,
) -> List[str]:
    """Extract page-labelled text chunks from an in-memory PDF.

    Each of the first *max_pages* pages contributes its text, split into
    chunks of roughly *chunk_size* characters and prefixed with
    ``"Page N: "``. Chunks of 20 characters or fewer are dropped. A page that
    contains images additionally contributes one
    ``"Page N: Contains K image(s)"`` entry.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        max_pages: Maximum number of leading pages to read.
        max_chunks: Maximum number of chunks returned.
        chunk_size: Approximate character budget per text chunk.

    Returns:
        Up to *max_chunks* chunk strings. On any failure the function does not
        raise; it returns a single-element list
        ``["Error processing PDF: <message>"]`` instead.
    """
    print("🔄 Processing PDF...")

    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            all_chunks = []

            for page_num in range(min(len(doc), max_pages)):
                page = doc[page_num]
                label = f"Page {page_num+1}"

                # Extract text
                text = page.get_text("text").strip()
                if text:
                    for chunk in _split_text_into_chunks(text, chunk_size):
                        if len(chunk) > 20:  # Filter very short chunks
                            all_chunks.append(f"{label}: {chunk}")

                # Image presence is best-effort: a failure here must not cost
                # the page its text, but it is reported rather than hidden.
                try:
                    images = page.get_images(full=True)
                    if images:
                        all_chunks.append(f"{label}: Contains {len(images)} image(s)")
                except Exception as img_err:
                    print(f"⚠️ {label}: skipped image scan ({img_err})")
        finally:
            # Release the document even when a page fails part-way through.
            doc.close()

        print(f"✅ Extracted {len(all_chunks)} chunks from PDF")
        return all_chunks[:max_chunks]

    except Exception as e:
        print(f"❌ PDF processing error: {e}")
        return [f"Error processing PDF: {str(e)}"]

print("✅ PDF processing setup completed")

In [None]:
# Step 1: Install new package
import subprocess
import sys

# Install langchain-huggingface with the interpreter running this notebook
# (sys.executable), so the package lands in the active environment.
# check_call raises CalledProcessError when pip exits with a non-zero status.
print("Installing updated LangChain HuggingFace package...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "langchain-huggingface"])

# Step 2: Test the new import
# Only ImportError is handled below; any other exception propagates.
try:
    from langchain_huggingface import HuggingFaceEmbeddings
    print("✓ New HuggingFaceEmbeddings import successful!")
    
    # Test creating embeddings
    # This call sits inside the same try, so an ImportError raised while the
    # embeddings object is being built also triggers the fallback branch.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"
    )
    print("✓ HuggingFaceEmbeddings object created successfully")
    
except ImportError as e:
    print(f"Import failed: {e}")
    print("Falling back to old import...")
    # The fallback only re-imports the class from the older module path; it
    # does not create an `embeddings` object.
    try:
        from langchain.embeddings import HuggingFaceEmbeddings
        print("✓ Using deprecated version (still works)")
    except ImportError:
        print("✗ Both imports failed")

!pip install faiss-cpu
!pip install --upgrade --quiet langchain-community

In [None]:
# 🔗 Kaggle-Optimized LangChain Pipeline
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import pipeline
from typing import List
import torch

# Prompt for the retrieval chain. Its wording is part of the model input, so it
# is kept verbatim.
_QA_PROMPT_TEMPLATE = """Use the context to answer the question. Be concise. If you dont know the answer just say sorry this is not possible

Context: {context}
Question: {question}

Answer:"""

# Full Hub id of the embedding model, matching the id used by the other cells
# of this notebook.
_EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
_LLM_MODEL_NAME = "Qwen/Qwen2-0.5B-Instruct"


class _MinimalChain:
    """Stand-in chain returned when the real pipeline cannot be built."""

    def __call__(self, inputs):
        return {"result": "Document loaded. Please ask questions about the content."}

    def invoke(self, inputs):
        return self(inputs)


def _build_retriever(chunks: List[str], embeddings, max_chunks: int = 20, top_k: int = 2):
    """Index at most *max_chunks* chunks in FAISS and return a top-*top_k* retriever."""
    print("🏗️ Creating vector database...")
    try:
        indexed = chunks[:max_chunks]  # Limit for Kaggle performance
        vectorstore = FAISS.from_texts(indexed, embeddings)
        print(f"✅ Vector database created with {len(indexed)} chunks")
    except Exception as e:
        print(f"⚠️ Vectorstore error: {e}")
        # Fall back to a one-entry index so a retriever can still be built.
        vectorstore = FAISS.from_texts(
            ["Document loaded and ready for questions."], embeddings
        )
    return vectorstore.as_retriever(search_kwargs={"k": top_k})


def _build_llm():
    """Return the text-generation LLM wrapper, or a canned-reply stand-in if loading fails."""
    print("🤖 Setting up language model...")
    try:
        generator = pipeline(
            'text-generation',
            model=_LLM_MODEL_NAME,
            tokenizer=_LLM_MODEL_NAME,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.1,  # Low temperature for more deterministic answers
            top_p=0.95,
            # No explicit pad_token_id: the previously hard-coded 50256 is
            # GPT-2's end-of-text id, not this model's padding token, so
            # generation is left to use the model's own configuration.
            device=0 if torch.cuda.is_available() else -1,
            truncation=True,
        )
        llm = HuggingFacePipeline(pipeline=generator)
        print("✅ Language model loaded")
        return llm
    except Exception as e:
        print(f"⚠️ LLM error: {e}")

        # NOTE(review): this plain object is presumably rejected by
        # RetrievalQA's validation, in which case the caller ends up returning
        # its minimal chain — confirm against the installed LangChain version.
        class SimpleLLM:
            def __call__(self, prompt, **kwargs):
                return "I can help answer questions about your document. Please ask specific questions."

            def invoke(self, input_dict, **kwargs):
                return self("", **kwargs)

            def _call(self, prompt, **kwargs):
                return self(prompt, **kwargs)

        return SimpleLLM()


def build_langchain_pipeline(chunks: List[str]):
    """Build the retrieval question-answering chain over *chunks*.

    Steps: create sentence embeddings, index the first 20 chunks in FAISS,
    load the text-generation model, and combine retriever, prompt and model
    into a ``RetrievalQA`` "stuff" chain.

    Args:
        chunks: Text chunks to index. The caller's list is not modified.

    Returns:
        The ``RetrievalQA`` chain. If any step fails, a minimal stand-in is
        returned instead; it offers ``invoke`` and ``__call__`` and always
        answers with a fixed ``{"result": ...}`` message.
    """
    print(f"🔄 Building pipeline with {len(chunks)} chunks...")

    try:
        # 1. Embeddings. trust_remote_code is deliberately not passed: it
        # permits running code shipped with a model repository, and it is not
        # requested for this embedding model.
        print("📦 Setting up embeddings...")
        embeddings = HuggingFaceEmbeddings(
            model_name=_EMBEDDING_MODEL_NAME,  # Small, fast model
            model_kwargs={
                'device': 'cuda' if torch.cuda.is_available() else 'cpu',
            },
            encode_kwargs={'normalize_embeddings': True, 'batch_size': 32},
        )

        # 2. Vector store and retriever
        retriever = _build_retriever(chunks, embeddings)

        # 3. Language model
        llm = _build_llm()

        # 4. Prompt
        prompt = PromptTemplate(
            template=_QA_PROMPT_TEMPLATE,
            input_variables=["context", "question"],
        )

        # 5. Retrieval chain
        chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs={"prompt": prompt},
            return_source_documents=False,
        )

        print("🚀 Pipeline created successfully!")
        return chain

    except Exception as e:
        print(f"❌ Pipeline creation error: {e}")
        # Return minimal working chain
        return _MinimalChain()

print("✅ Pipeline setup completed")

In [None]:
# 🎨 Kaggle-Compatible Gradio Interface
import gradio as gr

# Module-level session state shared by the Gradio callbacks in this cell.
_session = {
    "chain": None,  # QA chain stored by ingest_pdf; None until a PDF is processed
    "status": "Ready",  # status string written by the callbacks ("Ready", "Processing...", "Error")
    "chat_history": []  # (question, answer) tuples appended by ask_question
}

def ingest_pdf(file):
    """Read an uploaded PDF, extract its chunks and build the Q&A chain.

    On success the chain is stored in ``_session["chain"]``, the stored chat
    history is cleared and ``_session["status"]`` returns to ``"Ready"``. On
    any failure the status is set to ``"Error"`` and the previously stored
    chain is left untouched.

    Args:
        file: The uploaded file as delivered by the upload component: either
            an object exposing the path as ``.name`` or a plain path string.
            A falsy value means nothing was uploaded.

    Returns:
        A status message for display, starting with ✅ on success and ❌ on
        failure. The function does not raise.
    """
    if not file:
        return "❌ No file uploaded"

    try:
        _session["status"] = "Processing..."

        # Accept both an object carrying the path in `.name` and a bare path.
        pdf_path = getattr(file, "name", file)

        # Read PDF
        print("📖 Reading PDF file...")
        with open(pdf_path, "rb") as f:
            pdf_bytes = f.read()

        # Extract chunks
        print("🔄 Extracting content...")
        chunks = extract_chunks_from_pdf(pdf_bytes)

        if not chunks:
            # Do not leave the status stuck at "Processing...".
            _session["status"] = "Error"
            return "❌ No content extracted from PDF"

        # The extractor reports failure as one "Error processing PDF: ..."
        # entry instead of raising; surface it rather than indexing the error
        # text as if it were document content.
        if len(chunks) == 1 and chunks[0].startswith("Error processing PDF:"):
            _session["status"] = "Error"
            return f"❌ {chunks[0]}"

        # Build pipeline
        print("🔗 Building Q&A pipeline...")
        chain = build_langchain_pipeline(chunks)
        _session["chain"] = chain
        _session["status"] = "Ready"
        _session["chat_history"] = []  # A new document starts a new conversation

        return f"✅ PDF processed! Extracted {len(chunks)} chunks. Ready for questions."

    except Exception as e:
        _session["status"] = "Error"
        error_msg = f"❌ Error: {str(e)}"
        print(error_msg)
        return error_msg

def _build_contextual_query(question, history, max_turns=3):
    """Prefix *question* with the last *max_turns* stored Q&A pairs, if there are any."""
    recent = history[-max_turns:] if history else []
    if not recent:
        return question

    lines = []
    for past_q, past_a in recent:
        lines.append(f"Previous Q: {past_q}")
        lines.append(f"Previous A: {past_a}")
    context_str = "\n".join(lines)
    return f"Previous conversation context:\n{context_str}\n\nCurrent question: {question}"


def _extract_answer(response):
    """Pull the answer text out of a chain response, keeping only what follows the last "Answer:"."""
    if isinstance(response, dict):
        raw_answer = response.get("result", response.get("answer", str(response)))
    else:
        raw_answer = response
    # rpartition yields the whole string when the marker is absent.
    return str(raw_answer).rpartition("Answer:")[2].strip()


def ask_question(question: str, chat_history):
    """Answer *question* with the stored chain and append the exchange to the chat.

    Args:
        question: The user's question. ``None``, empty or whitespace-only
            input is ignored.
        chat_history: List of ``[question, answer]`` pairs currently shown in
            the chat widget; ``None`` is treated as an empty list. The list is
            extended in place.

    Returns:
        A 3-tuple ``(chat_history, chat_history, "")``: the updated history
        twice (one per wired output) and an empty string that clears the
        input box. Errors are reported as chat entries; the function does not
        raise.
    """
    if chat_history is None:
        chat_history = []

    if not question or not question.strip():
        return chat_history, chat_history, ""

    if _session.get("chain") is None:
        error_msg = "❌ Please upload and process a PDF first."
        chat_history.append(["❌ Error", error_msg])
        return chat_history, chat_history, ""

    try:
        print(f"🤔 Processing question: {question}")

        # Only the last few exchanges are included, to keep the prompt short.
        enhanced_query = _build_contextual_query(question, _session.get("chat_history"))

        chain = _session["chain"]

        if hasattr(chain, 'invoke'):
            response = chain.invoke({"query": enhanced_query})
        elif callable(chain):
            response = chain({"query": enhanced_query})
        else:
            response = {"result": "Unable to process question with current setup."}

        answer = _extract_answer(response)

        # Store in session history (the original question, not the enhanced query)
        _session.setdefault("chat_history", []).append((question, answer))

        # Update chat interface
        chat_history.append([question, answer])

        print("✅ Answer generated")
        return chat_history, chat_history, ""  # Return updated history and clear input

    except Exception as e:
        error_msg = f"❌ Error answering question: {str(e)}"
        print(error_msg)
        chat_history.append([question, error_msg])
        return chat_history, chat_history, ""

def clear_chat():
    """Discard the stored Q&A history.

    Returns:
        Two fresh empty lists, one for each output wired to this callback.
    """
    _session.update(chat_history=[])
    return [], []

def reset_session():
    """Return the session to its initial state: no chain, status "Ready", empty history.

    Returns:
        Two fresh empty lists followed by the status message to display.
    """
    _session.update({"chain": None, "status": "Ready", "chat_history": []})
    notice = "Session reset. Please upload a new PDF."
    return [], [], notice
        
# Assemble the Gradio Blocks UI and connect it to the callbacks defined above
def create_interface():
    """Build the PDF Q&A Blocks app and return it without launching it.

    Layout, top to bottom: upload controls beside a status box, the chat
    history beside the clear/reset buttons, and the question box beside the
    ask button.
    """
    with gr.Blocks(title="PDF Q&A - Kaggle Edition") as app:
        gr.Markdown("# 📚 PDF Question & Answer System")
        gr.Markdown("*Powered by Open Source Models - Optimized for Kaggle*")

        # Upload controls (left) and status readout (right)
        with gr.Row():
            with gr.Column():
                pdf_input = gr.File(
                    label="📎 Upload PDF",
                    file_types=[".pdf"],
                    file_count="single",
                )
                ingest_button = gr.Button("🚀 Process PDF", variant="primary")

            with gr.Column():
                status_box = gr.Textbox(
                    label="📊 Status",
                    value="Ready to process PDF...",
                    interactive=False,
                )

        gr.Markdown("---")

        # Conversation view with its housekeeping buttons
        with gr.Row():
            with gr.Column(scale=4):
                chat_view = gr.Chatbot(
                    label="💬 Chat History",
                    height=400,
                    show_label=True,
                )

            with gr.Column(scale=1):
                clear_button = gr.Button("🗑️ Clear Chat", variant="secondary")
                reset_button = gr.Button("🔄 Reset Session", variant="stop")

        # Question entry
        with gr.Row():
            with gr.Column(scale=4):
                question_box = gr.Textbox(
                    label="❓ Your Question",
                    placeholder="Ask anything about your PDF... (supports follow-up questions)",
                    lines=2,
                )
            with gr.Column(scale=1):
                ask_button = gr.Button("🎯 Ask Question", variant="primary")

        # --- Event wiring ---
        ingest_button.click(fn=ingest_pdf, inputs=[pdf_input], outputs=[status_box])

        # The ask button and the Enter key in the question box run the same
        # handler with the same inputs and outputs.
        for register in (ask_button.click, question_box.submit):
            register(
                fn=ask_question,
                inputs=[question_box, chat_view],
                outputs=[chat_view, chat_view, question_box],
            )

        clear_button.click(fn=clear_chat, outputs=[chat_view, chat_view])
        reset_button.click(fn=reset_session, outputs=[chat_view, chat_view, status_box])

    return app
# Build the interface and start serving it. launch() is called with its
# default arguments.
print("🚀 Starting Gradio interface...")
demo = create_interface()
demo.launch()