In [35]:
%%capture
%pip install langchain langchain-community langchain-openai pymupdf faiss-cpu pydantic python-dotenv 
%pip install langchain-ollama
%pip install langchain-groq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [36]:
%%capture
%pip install sentence-transformers langchain-huggingface 
!pip install ipywidgets
!pip install pdfplumber

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [37]:
# cell -2 Imports and API Setup
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# NOTICE: No OpenAI imports here anymore!
from langchain_ollama import ChatOllama # New free LLM
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List, Optional
# # Securely enter your API Key if not already set in environment
# if not os.environ.get("OPENAI_API_KEY"):
#     os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API Key: ")

# Configuration: file names of the manual PDFs (relative paths).
# NOTE(review): the cells visible in this notebook drive ingestion from
# FLEET_CONFIG rather than from this list — confirm PDF_FILES is still needed.
PDF_FILES = [
    "sample-service-manual.pdf",  # The Car Manual
    "HAF-F16.pdf",                # The Jet Manual
    "motercycles.pdf"             # The Motorcycle Manual
] 

In [38]:
# --- CELL 3: Configuration & Fleet Setup ---
import os

# Fleet registry: vehicle type -> file name of the uploaded manual PDF.
FLEET_CONFIG = dict(
    car="sample-service-manual.pdf",   # Ford/Car Manual
    jet="HAF-F16.pdf",                 # F-16 Jet Manual
    bike="motercycles.pdf",            # Ducati Bike Manual
)

# Report, one line per manual, whether its PDF is present on disk.
print("üîç Checking Fleet Status...")
files_ready = True
for v_type, filename in FLEET_CONFIG.items():
    if os.path.exists(filename):
        print(f"   ‚úÖ READY: {v_type.upper()} -> {filename}")
        continue
    print(f"   ‚ö†Ô∏è MISSING: {filename} (Please upload this file!)")
    files_ready = False

# A single summary warning after the per-file report.
if files_ready is False:
    print("\n‚ö†Ô∏è WARNING: Some files are missing. The code will skip them.")

üîç Checking Fleet Status...
   ‚úÖ READY: CAR -> sample-service-manual.pdf
   ‚úÖ READY: JET -> HAF-F16.pdf
   ‚úÖ READY: BIKE -> motercycles.pdf


In [39]:
# --- CELL 4: Smart "Fleet" Extraction Strategy ---
import pdfplumber
import os
from langchain_core.documents import Document

def _find_unit_header(table, scan_rows=5):
    """Locate the first header-like row among the leading rows of a table.

    Returns ``(index, row)`` for the first of the first ``scan_rows`` rows whose
    lower-cased cell text contains one of the marker substrings, otherwise
    ``(None, None)``.
    """
    markers = ("nm", "lb-ft", "description", "symptom")
    for idx, row in enumerate(table[:scan_rows]):
        row_str = " ".join([str(c).lower() for c in row if c])
        # Plain substring test: "nm" also matches inside longer words
        # (e.g. "alignment"), so this is a heuristic, not a strict unit check.
        if any(marker in row_str for marker in markers):
            return idx, row
    return None, None


def _render_rows_with_headers(headers, data_rows):
    """Render each data row as "Header: value, ..." text.

    Rows whose cell count differs from the header's are skipped, because they
    cannot be paired with the headers column by column.
    """
    clean_headers = [str(h).replace('\n', ' ') if h else f"Col_{j}" for j, h in enumerate(headers)]
    rendered = []
    for row in data_rows:
        if len(row) != len(clean_headers):
            continue
        clean_row = [str(cell).replace('\n', ' ') if cell else "N/A" for cell in row]
        rendered.append(", ".join(f"{h}: {r}" for h, r in zip(clean_headers, clean_row)))
    return rendered


def process_pdf_with_header_injection(pdf_path, batch_size=5):
    """
    Read a PDF page by page and turn it into LangChain ``Document`` chunks.

    For each page a text-strategy table extraction is attempted. When one of the
    first five table rows looks like a header (contains 'nm', 'lb-ft',
    'description' or 'symptom'), every following row is rewritten as
    "Header: value, ..." so the column context travels with the data, and the
    rewritten rows are grouped ``batch_size`` at a time into ``table_chunk``
    documents. Pages without such a table are stored as one plain-text document.

    Fix over the previous version: a page whose header was detected but which
    yielded no usable data rows (header on the last row, or every row having a
    different width) used to produce nothing at all; it now falls back to the
    page's plain text so its content is not lost.

    Args:
        pdf_path: Path of the PDF to read.
        batch_size: Number of table rows per chunk; values below 1 behave as 1.

    Returns:
        A list of ``Document`` objects; empty when ``pdf_path`` does not exist.
        Metadata always carries ``source`` and 1-based ``page``; table chunks
        additionally carry ``type="table_chunk"``.
    """
    smart_docs = []

    # Safety check
    if not os.path.exists(pdf_path):
        return smart_docs

    rows_per_chunk = max(1, batch_size)

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # 1. Try table extraction first.
            table = page.extract_table({
                "vertical_strategy": "text",
                "horizontal_strategy": "text"
            })

            table_docs = []
            if table:
                header_idx, headers = _find_unit_header(table)
                if headers:
                    rendered = _render_rows_with_headers(headers, table[header_idx + 1:])
                    # Chunking: consecutive groups of `rows_per_chunk` rendered rows.
                    for start in range(0, len(rendered), rows_per_chunk):
                        table_docs.append(Document(
                            page_content="\n".join(rendered[start:start + rows_per_chunk]),
                            metadata={"source": pdf_path, "page": page_number, "type": "table_chunk"}
                        ))

            if table_docs:
                smart_docs.extend(table_docs)
                continue

            # 2. Fallback: no table, no recognizable header, or no usable rows.
            text = page.extract_text()
            if text:
                smart_docs.append(Document(page_content=text, metadata={"source": pdf_path, "page": page_number}))

    return smart_docs

# --- EXECUTE FLEET INGESTION ---
# Runs the extractor over every manual in FLEET_CONFIG and pools the results.
all_chunks = []
print("üöÄ Starting Fleet Ingestion...")

if 'FLEET_CONFIG' not in globals():
    # The fleet registry is defined by an earlier cell; without it nothing can be ingested.
    print("‚ùå Error: FLEET_CONFIG not found. Please run Cell 3 first.")
else:
    for vehicle_type, pdf_file in FLEET_CONFIG.items():
        if not os.path.exists(pdf_file):
            print(f"   ‚ö†Ô∏è SKIPPING {vehicle_type}: File '{pdf_file}' not found.")
            continue

        print(f"\nüìò Processing {vehicle_type.upper()} Manual: {pdf_file}...")
        try:
            file_chunks = process_pdf_with_header_injection(pdf_file)

            # Stamp every chunk with its vehicle so searches can filter on it later.
            for c in file_chunks:
                c.metadata["vehicle_type"] = vehicle_type

            all_chunks.extend(file_chunks)
            print(f"   ‚úÖ Added {len(file_chunks)} chunks from {vehicle_type}.")
        except Exception as e:
            # One bad manual must not abort the remaining ones.
            print(f"   ‚ùå Error processing {pdf_file}: {e}")

# Hand the pooled list to later cells under the name they expect.
# `chunks` is the same list object as `all_chunks`, not a copy.
chunks = all_chunks
print(f"\nüéâ Total Fleet Knowledge Base: {len(chunks)} chunks ready for embedding.")

üöÄ Starting Fleet Ingestion...

üìò Processing CAR Manual: sample-service-manual.pdf...
   ‚úÖ Added 1030 chunks from car.

üìò Processing JET Manual: HAF-F16.pdf...
   ‚úÖ Added 682 chunks from jet.

üìò Processing BIKE Manual: motercycles.pdf...
   ‚úÖ Added 827 chunks from bike.

üéâ Total Fleet Knowledge Base: 2539 chunks ready for embedding.


In [40]:
# --- CELL 4.5: Image Extraction Pipeline (Visual Chunking) ---
import fitz  # PyMuPDF
import os
from langchain_core.documents import Document

# 1. DEFINE FILE NAME (Fixes NameError)
# The single manual this cell scans for diagrams; it is passed to
# extract_images_and_create_chunks() in the EXECUTE step of this cell.
pdf_filename = "sample-service-manual.pdf" 

def extract_images_and_create_chunks(pdf_path, output_folder="extracted_images"):
    """
    Render the large embedded images of a PDF to PNG files and index them.

    For every image placement on every page that is at least 150 wide and 150
    high (in the page's coordinate units), the page region around it — padded
    by 20 units on each side so nearby labels are included — is rendered at 3x
    zoom and saved under ``output_folder``. Each saved file gets a companion
    ``Document`` whose text is a short "Reference Diagram" caption and whose
    metadata records where the PNG lives.

    Fixes over the previous version:
      * the PDF is opened with a context manager, so the handle is closed even
        when rendering or saving raises;
      * when the same image is placed more than once on a page, the second and
        later placements get an ``_r<n>`` suffix instead of overwriting the
        first PNG (the first placement keeps the original file name).

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Directory for the PNG files; created if missing.

    Returns:
        A list of ``Document`` objects with metadata keys ``source``, ``page``
        (1-based), ``type`` (``"image"``) and ``image_path``.
    """
    os.makedirs(output_folder, exist_ok=True)

    print(f"üì∑ Scanning {pdf_path} for diagrams...")
    image_docs = []

    with fitz.open(pdf_path) as doc:
        for page_index, page in enumerate(doc):
            page_number = page_index + 1

            for img_index, img in enumerate(page.get_images()):
                xref = img[0]
                saved_for_image = 0  # placements of this image already written for this page

                # One image can be placed at several locations on the same page.
                for rect in page.get_image_rects(xref):
                    # Filter small icons/logos (noise)
                    if rect.width < 150 or rect.height < 150:
                        continue

                    # Expand box slightly to catch external labels
                    clip_rect = rect + (-20, -20, 20, 20)

                    # Render high-res image (3x zoom for clarity)
                    pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), clip=clip_rect)

                    # First placement keeps the historical name; later ones are disambiguated.
                    suffix = f"_r{saved_for_image}" if saved_for_image else ""
                    saved_for_image += 1
                    filename = f"p{page_number}_img{img_index}{suffix}.png"
                    filepath = os.path.join(output_folder, filename)
                    pix.save(filepath)

                    # "Shadow document": searchable caption text standing in for the image.
                    doc_text = f"Reference Diagram: Figure on page {page_number}"

                    image_docs.append(Document(
                        page_content=doc_text,
                        metadata={
                            "source": pdf_path,
                            "page": page_number,
                            "type": "image",
                            "image_path": filepath  # lets a consumer load the PNG for display
                        }
                    ))
                    print(f"   [+] Saved Diagram: {filename}")

    return image_docs

# --- EXECUTE ---
# Extract diagrams from `pdf_filename` and merge them into the shared chunk list.
try:
    image_chunks = extract_images_and_create_chunks(pdf_filename)
    print(f"‚úÖ Extracted {len(image_chunks)} visual chunks.")

    if 'chunks' not in globals():
        # No text chunks in the kernel yet: the image chunks become the list.
        chunks = image_chunks
        print("‚ö†Ô∏è 'chunks' list not found from previous cells. Created new list.")
    else:
        # Appends in place, so re-running this cell appends the images again.
        chunks.extend(image_chunks)
        print(f"üîó Added images to main chunk list. Total chunks: {len(chunks)}")

except Exception as e:
    print(f"‚ùå Error: {e}")

üì∑ Scanning sample-service-manual.pdf for diagrams...
   [+] Saved Diagram: p9_img0.png
   [+] Saved Diagram: p10_img0.png
   [+] Saved Diagram: p17_img0.png
   [+] Saved Diagram: p28_img0.png
   [+] Saved Diagram: p28_img1.png
   [+] Saved Diagram: p33_img2.png
   [+] Saved Diagram: p37_img2.png
   [+] Saved Diagram: p40_img0.png
   [+] Saved Diagram: p44_img0.png
   [+] Saved Diagram: p46_img1.png
   [+] Saved Diagram: p49_img0.png
   [+] Saved Diagram: p49_img1.png
   [+] Saved Diagram: p53_img0.png
   [+] Saved Diagram: p54_img1.png
   [+] Saved Diagram: p56_img1.png
   [+] Saved Diagram: p57_img0.png
   [+] Saved Diagram: p66_img0.png
   [+] Saved Diagram: p66_img1.png
   [+] Saved Diagram: p69_img1.png
   [+] Saved Diagram: p71_img2.png
   [+] Saved Diagram: p74_img0.png
   [+] Saved Diagram: p75_img1.png
   [+] Saved Diagram: p82_img0.png
   [+] Saved Diagram: p87_img0.png
   [+] Saved Diagram: p87_img1.png
   [+] Saved Diagram: p90_img1.png
   [+] Saved Diagram: p92_img2.png


In [41]:
# ---CELL 5: Vector Store with Sentence Transformers ---
# Embeds every Document in `chunks` and indexes the vectors in a FAISS store.
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS 

print("Loading local embedding model ...")

# Embedding model used for both indexing and querying.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print("Creating vector store...")

# Requires `chunks` to already exist in the kernel (assigned by the ingestion cells).
vector_store = FAISS.from_documents(chunks, embeddings)
print("Vector store created successfully using Sentence Transformers!") 


Loading local embedding model ...
Creating vector store...
Vector store created successfully using Sentence Transformers!


In [42]:
# Cell 6: Test Retrieval (Debugging Step)
# Sanity-check the index with one query. Ten hits are requested, but only the
# first one returned is printed.
test_query = "Torque specifications for suspension"
results = vector_store.similarity_search(test_query, k=10)

print(f"--- Top Retrieval Result for '{test_query}' ---")
# results[0] assumes the search returned at least one document.
print(results[0].page_content)

--- Top Retrieval Result for 'Torque specifications for suspension' ---
2C-13 Rear Suspension:
5)Install the swingarm. Refer to ‚ÄúSwingarm / Cushion
Rod Removal and Installation (Page2C-8)‚Äù.
I815H1230047-01
Specifications
Service Data
B815H22307001
Suspension
Unit: mm (in)
Item Standard Limit
Rear shock absorber spring pre-set
195 (7.7) ‚Äî
length
Rear shock absorber damping force Rebound 12 clicks out from stiffed position
‚Äî
adjuster Compression 8 clicks out from stiffed position
Rear wheel travel 140 (5.5) ‚Äî
Swingarm pivot shaft runout ‚Äî 0.3 (0.01)
Tightening Torque Specifications
B815H22307002
Tightening torque
Fastening part Note
N‚ãÖm kgf-m lb-ft
Rear shock absorber mounting nut (cid:41)(Page2C-3) /
50 5.0 36.0
(cid:41)(Page2C-10)
Cushion lever mounting nut (cid:41)(Page2C-3) /
78 7.8 56.5 (cid:41)(Page2C-6) /
(cid:41)(Page2C-10)
Cushion rod mounting nut (cid:41)(Page2C-6) /
78 7.8 56.5
(cid:41)(Page2C-10)
Rear shock absorber lower mounting nut 50 5.0 36.0 (cid:41)(Page2C

In [53]:
# --- CELL 7: Define Output Structure (Updated) ---
from pydantic import BaseModel, Field
from typing import List, Optional

class VehicleSpec(BaseModel):
    """Information about a specific vehicle specification or procedure step."""
    # Required fields (`...` marks them as having no default).
    # NOTE(review): the extraction loop visible in this notebook stores the raw
    # JSON dicts and does not validate them against this model — confirm whether
    # validation is intended.
    component: str = Field(..., description="The part name, step number, or symptom (e.g., 'Brake Caliper', 'Step 1', 'Engine Noise').")
    spec_type: str = Field(..., description="The category (e.g., 'Torque', 'Action', 'Check').")
    value: str = Field(..., description="The primary value or instruction (e.g., '50', 'Turn Switch OFF', 'Replace Fuse').")
    # Optional fields default to None when absent.
    unit: Optional[str] = Field(None, description="Measurement unit (Nm, PSI) if applicable.")
    description: Optional[str] = Field(None, description="Context, condition, or notes (e.g., 'Stage 1 tightening', 'If light is flashing').")

class SpecList(BaseModel):
    """A list of extracted vehicle specifications."""
    # Mirrors the top-level {"specs": [...]} object the extraction prompt asks the model to emit.
    specs: List[VehicleSpec]

In [None]:
# --- CELL 8: Main Extraction Loop (Fleet Edition with Description) ---
import json
import time
import re
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# ==========================================
# 1. SETUP CLOUD LLM
# ==========================================
import os
from getpass import getpass

# SECURITY: the API key must never be stored in the notebook. It is read from
# the GROQ_API_KEY environment variable, with an interactive prompt as fallback.
# Any key that was previously committed in this cell should be treated as
# leaked and revoked in the Groq console.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Enter Groq API Key: ")

# temperature=0 for the extraction model.
llm = ChatGroq(
    temperature=0,
    model_name="llama-3.3-70b-versatile",
    api_key=GROQ_API_KEY
)

# ==========================================
# 2. HELPER: Bulletproof JSON Extractor
# ==========================================
def extract_json_from_text(text):
    """Pull the first JSON value out of an LLM reply.

    Markdown code fences are stripped, then the text is scanned for the first
    position at which a complete JSON object can be decoded; anything after that
    object is ignored. If the text contains no decodable object, the whole text
    is tried as JSON (so a bare list still parses).

    Fixes over the previous version:
      * the greedy ``{.*}`` regex spanned from the first ``{`` to the last ``}``,
        so a reply with braces after the JSON object failed to parse and the
        data was discarded;
      * the bare ``except`` also swallowed ``KeyboardInterrupt``/``SystemExit``.

    Args:
        text: The raw model output.

    Returns:
        The decoded JSON value, or ``None`` when ``text`` is not a string or
        contains no valid JSON.
    """
    if not isinstance(text, str):
        return None

    cleaned = text.replace("```json", "").replace("```", "")
    decoder = json.JSONDecoder()

    # Try every "{" as a potential start of the object; raw_decode stops at the
    # end of the first complete value and tolerates trailing text.
    start = cleaned.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(cleaned, start)
            return parsed
        except json.JSONDecodeError:
            start = cleaned.find("{", start + 1)

    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return None

# ==========================================
# 3. MASTER PROMPT (Now requests Description)
# ==========================================
# Template consumed by ChatPromptTemplate.from_template(): `{question}` and
# `{context}` are filled per query; doubled braces `{{ }}` are literal braces
# in the rendered prompt.
# NOTE(review): the JSON skeleton asks for "spec_type", but the numbered
# instructions do not describe that field — confirm whether guidance is needed.
prompt_template = """
You are a highly accurate technical data extractor.
Analyze the provided text context and extract structured data for: '{question}'.

CRITICAL INSTRUCTIONS:
1. **COMPONENT**: The main part (e.g., "Bolt") or Step Number (e.g., "Step 1").
2. **VALUE**: The numeric value (e.g., "50") or the Action (e.g., "Turn Switch OFF").
3. **UNIT**: If applicable (Nm, PSI). Leave null for procedures.
4. **DESCRIPTION**: Capture conditions (e.g., "Initial pass", "If engine is hot") or notes.

Output JSON: 
{{ "specs": [ 
    {{ "component": "...", "spec_type": "...", "value": "...", "unit": "...", "description": "..." }} 
] }}

If no relevant data is found, return: {{ "specs": [] }}

Context:
{context}
"""

# --- UPDATED FLEET QUERIES ---
# Two queries per manual; each is used both for retrieval and as the
# `{question}` of the extraction prompt.
queries = [
    # CAR
    "Torque specifications for front suspension",
    "Fluid capacities",
    
    # JET (Procedures)
    "Emergency procedure for engine fire on ground",
    "Landing gear extension speed limits",
    
    # BIKE (Diagnostics)
    "Troubleshooting engine starting failure",
    "Chain tension adjustment"
]

# Accumulates the spec dicts returned by the model across all queries.
all_extracted_data = []

print("üöÄ Starting Batch Fleet Extraction...")

# The prompt -> model pipeline is the same for every query, so build it once
# instead of on every loop iteration.
chain = ChatPromptTemplate.from_template(prompt_template) | llm

for query_idx, query in enumerate(queries):
    print(f"   Processing: {query}...")
    start_ts = time.time()
    
    # Retrieve broadly
    docs = vector_store.similarity_search(query, k=6) 
    context = "\n\n".join([d.page_content for d in docs])
    
    try:
        response = chain.invoke({"context": context, "question": query})
        
        data = extract_json_from_text(response.content)
        
        if isinstance(data, dict) and "specs" in data:
            items = data["specs"]
            # A model occasionally returns one object instead of a list; wrap it
            # so list.extend() does not add the object's keys as "specs".
            if isinstance(items, dict):
                items = [items]
            if isinstance(items, list) and items:
                all_extracted_data.extend(items)
                print(f"   ‚úÖ Found {len(items)} items in {time.time()-start_ts:.2f}s.")
            else:
                print("   ‚ö†Ô∏è Valid JSON, but no specific data found.")
        else:
            print("   ‚ö†Ô∏è No JSON found.")
            
    except Exception as e:
        # Keep going: one failed query must not lose the results of the others.
        print(f"   ‚ùå Error: {e}")
    
    # Pause between requests only — nothing follows the last query.
    # NOTE(review): presumably a rate-limit courtesy delay; confirm the value.
    if query_idx < len(queries) - 1:
        time.sleep(3)

# Save
with open("vehicle_specs.json", "w") as f:
    json.dump(all_extracted_data, f, indent=4)

print(f"\nüéâ DONE! Saved {len(all_extracted_data)} total specs to 'vehicle_specs.json'.")

üöÄ Starting Batch Fleet Extraction...
   Processing: Torque specifications for front suspension...
   ‚úÖ Found 21 items in 1.75s.
   Processing: Fluid capacities...
   ‚ö†Ô∏è Valid JSON, but no specific data found.
   Processing: Emergency procedure for engine fire on ground...
   ‚úÖ Found 5 items in 1.37s.
   Processing: Landing gear extension speed limits...
   ‚úÖ Found 3 items in 5.50s.
   Processing: Troubleshooting engine starting failure...
   ‚úÖ Found 28 items in 10.70s.
   Processing: Chain tension adjustment...
   ‚úÖ Found 7 items in 10.40s.

üéâ DONE! Saved 64 total specs to 'vehicle_specs.json'.


In [45]:
# Cell 9: Save and View Results
# Re-writes the same "vehicle_specs.json" file the extraction cell writes, then
# previews the first entries.
# NOTE(review): the recorded output of this cell is "[]" while the extraction
# cell reports 64 specs — this looks like out-of-order execution; re-run after
# the extraction cell to confirm.

import json

# Save to JSON file
output_file = "vehicle_specs.json"
with open(output_file, "w") as f:
    json.dump(all_extracted_data, f, indent=4)

print(f"Saved data to {output_file}")

# Display first 5 results
print(json.dumps(all_extracted_data[:5], indent=2))

Saved data to vehicle_specs.json
[]


In [46]:
# Run this in your notebook to save the index to disk
# The folder name must match the one the "SAFETY CHECK" cell loads from.
vector_store.save_local("faiss_db_index_test")
print("‚úÖ Index saved to folder 'faiss_db_index_test'")

‚úÖ Index saved to folder 'faiss_db_index_test'


In [60]:
# --- SAFETY CHECK: Ensure Vector Store Exists ---
# Guarantees `embeddings` and `vector_store` exist before the UI cells run:
# reuse what is in the kernel, otherwise reload the index saved on disk.
import os
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# 1. Define Embeddings (Required to load the DB)
if 'embeddings' not in globals():
    print("üîÑ Initializing embeddings model...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 2. Check if vector_store is active. If not, try to load it.
if 'vector_store' in globals():
    print("‚úÖ Vector Store is active and ready.")
else:
    print("‚ö†Ô∏è 'vector_store' variable not found in memory.")

    # Must match the folder name passed to vector_store.save_local(...).
    index_folder = "faiss_db_index_test"

    # isdir (not exists): a regular file with this name is not a usable index.
    if not os.path.isdir(index_folder):
        print("‚ùå CRITICAL ERROR: No 'vector_store' found in memory OR on disk.")
        print("üëâ PLEASE RUN CELL 5 & 6 to create the database first!")
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError("Database missing. Run the 'Create Vector Store' cell first.")

    print(f"üìÇ Found saved index in '{index_folder}'. Loading...")
    # SECURITY: allow_dangerous_deserialization=True makes load_local unpickle
    # files from the folder, which can execute arbitrary code. Only load an
    # index this notebook created itself; never one from an untrusted source.
    vector_store = FAISS.load_local(index_folder, embeddings, allow_dangerous_deserialization=True)
    print("‚úÖ Vector Store loaded successfully!")

‚úÖ Vector Store is active and ready.


In [48]:
%%capture
%pip install langchain-groq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [63]:
# Check if "jet" data exists in the vector store
# One filtered lookup is enough: any hit proves jet-tagged chunks were indexed.
print("üîç Inspecting Database Content...")
try:
    test_docs = vector_store.similarity_search("engine", k=1, filter={"vehicle_type": "jet"})
    if not test_docs:
        print("‚ùå FAILURE: No Jet data found. Did you run Cell 5?")
    else:
        print("‚úÖ SUCCESS: Found Jet data in the database!")
        # Show only the first 100 characters of the matching chunk.
        print(f"   Sample: {test_docs[0].page_content[:100]}...")
except Exception as e:
    print(f"‚ùå Error: {e}")

üîç Inspecting Database Content...
‚úÖ SUCCESS: Found Jet data in the database!
   Sample: T.O. GR1F(cid:6)16CJ(cid:6)1
Operation GE129 . . . . . . ....


In [3]:
# --- MASTER CELL: REBUILD FLEET DATABASE (Text + Images) ---
import os
import fitz  # PyMuPDF
import pdfplumber
import time
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# ==========================================
# 1. CONFIGURATION (Define the Fleet)
# ==========================================
# Vehicle type -> manual PDF file name.
FLEET_CONFIG = dict(
    car="sample-service-manual.pdf",
    jet="HAF-F16.pdf",
    bike="motercycles.pdf",
)

print("‚öôÔ∏è STARTING SYSTEM REBUILD...")

# ==========================================
# 2. TEXT & TABLE EXTRACTION (The "Reader")
# ==========================================
# Pool that receives every text and image Document before embedding.
all_chunks = list()

def extract_text_smart(pdf_path, v_type, max_table_rows=10):
    """Turn each page of a PDF into one ``Document`` tagged with its vehicle type.

    A page on which pdfplumber finds a table is stored as that table flattened
    row by row (``cell | cell | ...``), limited to the first ``max_table_rows``
    rows; the page's plain text is not stored in that case. Any other page is
    stored as its plain text. Pages with neither are skipped.

    Fix over the previous version: a table whose cells were all empty produced a
    Document containing only newlines, which was then embedded as noise. Such a
    page now falls back to its plain text (or is skipped when it has none).

    Args:
        pdf_path: Path of the PDF to read.
        v_type: Value stored under the ``vehicle_type`` metadata key.
        max_table_rows: Number of leading table rows to keep (default 10, the
            previously hard-coded limit).

    Returns:
        A list of ``Document`` objects with metadata keys ``source``, ``page``
        (1-based) and ``vehicle_type``; empty when ``pdf_path`` does not exist.
    """
    local_chunks = []
    if not os.path.exists(pdf_path):
        return local_chunks

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            content = ""

            table = page.extract_table()
            if table:
                # Flatten the table row by row; empty cells are dropped.
                for row in table[:max_table_rows]:
                    clean_row = [str(cell).replace('\n', ' ') for cell in row if cell]
                    content += " | ".join(clean_row) + "\n"

            if not content.strip():
                # No table, or a table with nothing in it: use the page text.
                content = page.extract_text() or ""

            if content.strip():
                local_chunks.append(Document(
                    page_content=content,
                    metadata={"source": pdf_path, "page": page_number, "vehicle_type": v_type}
                ))

    return local_chunks

# Read every manual and pool its text chunks.
for v_type, path in FLEET_CONFIG.items():
    print(f"   üìñ Reading {v_type.upper()} manual...")
    # NOTE(review): `chunks` is rebound on every pass, so after the loop it
    # holds only the last manual's text chunks; the complete set lives in
    # `all_chunks`.
    chunks = extract_text_smart(path, v_type)
    all_chunks += chunks
    print(f"      -> Extracted {len(chunks)} text chunks.")

# ==========================================
# 3. IMAGE EXTRACTION (The "Photographer")
# ==========================================
# Directory that receives the rendered diagram PNGs.
output_folder = "extracted_images"
if os.path.exists(output_folder) is False:
    os.makedirs(output_folder)

def extract_images_smart(pdf_path, v_type):
    """Render the large embedded images of a PDF to PNG and index them.

    Every image placement at least 150 wide and 150 high (in page coordinate
    units) is rendered at 3x zoom, padded by 20 units on each side, and saved
    into the module-level ``output_folder``. For each saved PNG a "shadow"
    ``Document`` is returned whose text is a keyword-rich caption and whose
    metadata points at the file.

    Fixes over the previous version:
      * the PDF is opened with a context manager, so the handle is closed even
        when rendering or saving raises;
      * when the same image is placed more than once on a page, later
        placements get an ``_r<n>`` suffix instead of overwriting the first PNG
        (the first placement keeps the original file name).

    Args:
        pdf_path: Path of the PDF to scan.
        v_type: Vehicle type; used in the file name, the caption and the
            ``vehicle_type`` metadata key.

    Returns:
        A list of ``Document`` objects with metadata keys ``source``, ``page``
        (1-based), ``vehicle_type``, ``image_path`` and ``type`` (``"image"``).
    """
    img_docs = []

    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            page_number = i + 1

            for img_idx, img in enumerate(page.get_images()):
                xref = img[0]
                saved_for_image = 0  # placements of this image already written for this page

                for rect in page.get_image_rects(xref):
                    if rect.width < 150 or rect.height < 150:
                        continue  # Skip small icons

                    # Context Expansion (Padding)
                    clip = rect + (-20, -20, 20, 20)
                    pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), clip=clip)

                    # First placement keeps the historical name; later ones are disambiguated.
                    suffix = f"_r{saved_for_image}" if saved_for_image else ""
                    saved_for_image += 1
                    filename = f"{v_type}_p{page_number}_{img_idx}{suffix}.png"
                    filepath = os.path.join(output_folder, filename)
                    pix.save(filepath)

                    # Shadow document: the caption carries keywords such as
                    # "Diagram", "Figure", "Schematic" so text search can reach the image.
                    desc = f"Reference Diagram Figure Schematic for {v_type} on page {page_number}."
                    img_docs.append(Document(
                        page_content=desc,
                        metadata={"source": pdf_path, "page": page_number, "vehicle_type": v_type, "image_path": filepath, "type": "image"}
                    ))

    return img_docs

print("   üì∑ Scanning for Diagrams...")
for v_type, path in FLEET_CONFIG.items():
    if os.path.exists(path):
        imgs = extract_images_smart(path, v_type)
        all_chunks.extend(imgs)
        print(f"      -> Saved {len(imgs)} diagrams from {v_type}.")

# ==========================================
# 4. EMBEDDING (Creating the Brain)
# ==========================================
print("   üß† Building Vector Brain (This may take a moment)...")

# Define Embedding Model (HuggingFace)
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create Vector Store
vector_store = FAISS.from_documents(all_chunks, embedding_model)

print(f"\n‚úÖ SYSTEM READY! Total Knowledge Base: {len(all_chunks)} records.")
print("üëâ You can now run Cell 13 (The UI).")

‚öôÔ∏è STARTING SYSTEM REBUILD...
   üìñ Reading CAR manual...
      -> Extracted 852 text chunks.
   üìñ Reading JET manual...
      -> Extracted 513 text chunks.
   üìñ Reading BIKE manual...
      -> Extracted 659 text chunks.
   üì∑ Scanning for Diagrams...
      -> Saved 171 diagrams from car.
      -> Saved 2 diagrams from jet.
      -> Saved 103 diagrams from bike.
   üß† Building Vector Brain (This may take a moment)...

‚úÖ SYSTEM READY! Total Knowledge Base: 2300 records.
üëâ You can now run Cell 13 (The UI).


In [4]:
# --- CELL 13: Mechanic AI "Pro" Interface (Final + Images) ---
import ipywidgets as widgets
from IPython.display import display, HTML, Image, clear_output
import json
import re
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# ============================================
# 1. CONFIGURATION
# ============================================
import os
from getpass import getpass

# SECURITY: never paste the API key into the notebook. It is read from the
# GROQ_API_KEY environment variable, with an interactive prompt as fallback.
# Any key that was previously committed in this cell should be treated as
# leaked and revoked in the Groq console.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Enter Groq API Key: ")

# Brain: the chat model behind the UI.
gui_llm = ChatGroq(temperature=0.1, model_name="llama-3.1-8b-instant", api_key=GROQ_API_KEY)

# ============================================
# 2. PROMPTS & LOGIC
# ============================================
# Both templates are rendered with ChatPromptTemplate.from_template():
# `{question}` / `{context}` are substituted at call time and doubled braces
# `{{ }}` become literal braces.

# Extraction prompt: asks for a {"specs": [...]} JSON object built from the
# retrieved context.
rag_prompt = """
You are a technical data extractor. Analyze Context for: '{question}'.
RULES:
1. Return ONLY valid JSON.
2. Structure: {{ "specs": [ {{ "component": "...", "value": "...", "unit": "...", "description": "..." }} ] }}
3. If NOT found, return {{ "specs": [] }}
Context:
{context}
"""

# Fallback prompt: used by handle_search() when no specs were extracted; the
# model answers from general knowledge without any manual context.
general_prompt = """
The user asked: '{question}'.
We searched the manuals but found NO specific match.
Answer based on general mechanical knowledge. Be concise.
"""

def clean_json(text):
    """Extract the first JSON object from a model reply.

    Markdown code fences are removed, then the text is scanned for the first
    ``{`` from which a complete JSON object can be decoded; anything after that
    object is ignored.

    Fixes over the previous version:
      * the greedy ``{.*}`` regex spanned from the first ``{`` to the last ``}``,
        so a reply with braces after the JSON object failed to parse and valid
        data was discarded;
      * the bare ``except`` also swallowed ``KeyboardInterrupt``/``SystemExit``.

    Args:
        text: The raw model output (converted with ``str()``).

    Returns:
        The decoded object, or ``{"specs": []}`` when no JSON object is found.
    """
    cleaned = str(text).replace("```json", "").replace("```", "")
    decoder = json.JSONDecoder()

    start = cleaned.find("{")
    while start != -1:
        try:
            # Decoding starts at "{", so a successful result is always a dict.
            parsed, _end = decoder.raw_decode(cleaned, start)
            return parsed
        except json.JSONDecodeError:
            start = cleaned.find("{", start + 1)

    return {"specs": []}

# Vehicle type -> keywords that select it, checked in this order (first match wins).
_VEHICLE_KEYWORDS = (
    ("jet", ("f-16", "jet", "aircraft")),
    ("bike", ("bike", "motorcycle", "ducati")),
    ("car", ("car", "ford", "f-150")),
)


def detect_filter(query):
    """Infer a vehicle-type metadata filter from the wording of a query.

    Keywords are matched case-insensitively as whole terms (an optional plural
    "s" is accepted), not as raw substrings. The previous substring test
    misrouted queries: "carburetor" selected the car manual and "jetting"
    selected the jet manual.

    Args:
        query: The user's question.

    Returns:
        ``{"vehicle_type": <type>}`` for the first vehicle whose keyword occurs
        in the query, or ``None`` when no keyword matches.
    """
    q = query.lower()
    for vehicle_type, keywords in _VEHICLE_KEYWORDS:
        alternatives = "|".join(re.escape(word) for word in keywords)
        # Lookarounds instead of \b so terms containing "-" (f-16, f-150)
        # are delimited by non-alphanumerics on both sides.
        pattern = rf"(?<![a-z0-9])(?:{alternatives})s?(?![a-z0-9])"
        if re.search(pattern, q):
            return {"vehicle_type": vehicle_type}
    return None

def handle_search(query):
    """Answer one UI query and describe the result as a dict.

    Flow:
      1. Greetings ("hi", "hello", "help") get a canned chat response.
      2. The vector store is searched (k=4), restricted to one vehicle type
         when detect_filter() recognises one in the query.
      3. If documents were found, the model is asked to extract specs from
         them; any ``image_path`` values in the documents' metadata are
         collected (deduplicated, in retrieval order) for display.
      4. If nothing was retrieved or no specs were extracted, the model
         answers from general knowledge.

    Fix over the previous version: the bare ``except`` around the search
    silently hid every failure — including a missing ``vector_store`` — behind
    the "no match in the manuals" fallback. The failure is now reported before
    falling back, and ``KeyboardInterrupt``/``SystemExit`` are no longer
    swallowed.

    Args:
        query: The user's question.

    Returns:
        One of:
          ``{"type": "chat", "content": <html>}``
          ``{"type": "manual", "specs": [...], "images": [...], "source": <str>}``
          ``{"type": "general", "content": <text>}``
    """
    # 1. CHIT-CHAT
    if query.lower().strip() in ("hi", "hello", "help"):
        return {"type": "chat", "content": "<b>System Ready.</b><br>I have access to F-16, Ducati, and Ford F-150 manuals.<br>Select a query below or type your own."}

    # 2. SEARCH
    active_filter = detect_filter(query)
    try:
        if active_filter:
            docs = vector_store.similarity_search(query, k=4, filter=active_filter)
        else:
            docs = vector_store.similarity_search(query, k=4)
    except Exception as exc:
        # Best effort: keep the UI responsive, but do not hide why retrieval failed.
        print(f"[handle_search] retrieval failed, using general-knowledge fallback: {exc!r}")
        docs = []

    # 3. EXTRACT TEXT & IMAGES
    if docs:
        # A. Text context for the extraction prompt.
        context = "\n\n".join([d.page_content for d in docs])

        # B. Image paths, deduplicated while preserving retrieval order.
        found_images = []
        seen_paths = set()
        for d in docs:
            path = d.metadata.get("image_path")
            if path and path not in seen_paths:
                found_images.append(path)
                seen_paths.add(path)

        # C. Ask the model for structured specs.
        chain = ChatPromptTemplate.from_template(rag_prompt) | gui_llm
        response = chain.invoke({"context": context, "question": query})
        data = clean_json(response.content)

        if data.get("specs"):
            return {
                "type": "manual",
                "specs": data["specs"],
                "images": found_images,
                "source": active_filter['vehicle_type'].upper() if active_filter else "DOCS"
            }

    # 4. FALLBACK: nothing retrieved, or nothing extractable from what was retrieved.
    gen_chain = ChatPromptTemplate.from_template(general_prompt) | gui_llm
    gen_response = gen_chain.invoke({"question": query})
    return {"type": "general", "content": gen_response.content}

# ============================================
# 3. UI STYLING
# ============================================
# CSS shared by the whole UI. It is injected once, as part of the header HTML
# widget; the class names defined here are the ones used by the HTML the UI
# builds elsewhere in this cell.
style = """
<style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
    .app-container { font-family: 'Inter', sans-serif; max-width: 900px; margin: 0 auto; color: #333; }
    .header { padding: 20px 0; border-bottom: 1px solid #eee; margin-bottom: 20px; }
    .header h1 { font-size: 22px; font-weight: 600; margin: 0; color: #111; display: flex; align-items: center; gap: 10px; }
    .header-sub { color: #666; font-size: 14px; margin-top: 5px; }
    .response-card { background: #fff; border: 1px solid #e0e0e0; border-radius: 12px; padding: 24px; box-shadow: 0 4px 12px rgba(0,0,0,0.05); animation: fadeIn 0.4s ease; }
    .data-table { width: 100%; border-collapse: collapse; margin-top: 15px; font-size: 14px; }
    .data-table th { text-align: left; color: #888; font-weight: 500; padding: 8px 0; border-bottom: 1px solid #eee; }
    .data-table td { padding: 12px 0; border-bottom: 1px solid #f5f5f5; vertical-align: top; }
    .val-text { font-weight: 600; color: #222; }
    .desc-text { color: #666; font-style: italic; }
    .tag { display: inline-block; padding: 4px 8px; border-radius: 6px; font-size: 11px; font-weight: 600; letter-spacing: 0.5px; }
    .tag-manual { background: #d3f9d8; color: #2b8a3e; }
    .tag-ai { background: #fff3bf; color: #f08c00; }
    @keyframes fadeIn { from { opacity: 0; transform: translateY(10px); } to { opacity: 1; transform: translateY(0); } }
</style>
"""

# ============================================
# 4. WIDGET CONSTRUCTION
# ============================================
# Banner: the shared CSS is emitted first, followed by the title markup.
header_html = widgets.HTML(
    f"{style}<div class='app-container'><div class='header'><h1>‚ö° Mechanic AI <span style='font-size:12px; background:#eee; padding:2px 6px; border-radius:4px; color:#555;'>FLEET COMMAND</span></h1><div class='header-sub'>Multi-Modal Retrieval System (v3.0 Final)</div></div>"
)

# Chips
btn_layout = widgets.Layout(width='98%', margin='2px')


def _make_chip(label, icon, full_query):
    """Build one shortcut button; the full query text is stored in its tooltip."""
    chip = widgets.Button(description=label, icon=icon, layout=btn_layout)
    chip.tooltip = full_query
    return chip


def _make_column(title_html, *chips):
    """Stack a title and its chips into a column one third of the row wide."""
    return widgets.VBox([widgets.HTML(title_html), *chips], layout=widgets.Layout(width='33%'))


btn_car1 = _make_chip("Suspension Torque (Car)", 'car', "Torque specifications for front suspension (Car)")
btn_car2 = _make_chip("Fluid Capacities (Car)", 'tint', "Fluid capacities (Car)")
btn_jet1 = _make_chip("Engine Fire Proc. (Jet)", 'plane', "Emergency procedure for engine fire on ground (F-16)")
btn_jet2 = _make_chip("Gear Speed Limits (Jet)", 'tachometer', "Landing gear extension speed limits (F-16)")
btn_bike1 = _make_chip("Start Failure (Bike)", 'motorcycle', "Troubleshooting engine starting failure (Bike)")
btn_bike2 = _make_chip("Chain Tension (Bike)", 'cogs', "Chain tension adjustment (Bike)")

col_car = _make_column("<b>üöó Ford F-150</b>", btn_car1, btn_car2)
col_jet = _make_column("<b>‚úàÔ∏è F-16 Jet</b>", btn_jet1, btn_jet2)
col_bike = _make_column("<b>üèçÔ∏è Ducati</b>", btn_bike1, btn_bike2)
chip_container = widgets.HBox(
    [col_car, col_jet, col_bike],
    layout=widgets.Layout(width='100%', margin='0 0 20px 0'),
)

# Input row (free-text box + send button) and the area results are rendered into.
txt_input = widgets.Text(placeholder="Ask a question...", layout=widgets.Layout(width='85%'))
btn_send = widgets.Button(icon='paper-plane', layout=widgets.Layout(width='10%'))
input_area = widgets.HBox([txt_input, btn_send])
out_display = widgets.Output()

def on_submit(b):
    """Run a search for a chip's preset query or the typed text and render the result.

    Used as the click handler for the send button and the chip buttons, and as
    the submit handler of ``txt_input``.

    Args:
        b: The widget that fired the event. A ``widgets.Button`` with a
            non-empty ``tooltip`` is treated as a preset chip: the tooltip is
            used as the query and copied into ``txt_input``. For anything else
            the current (whitespace-stripped) ``txt_input`` value is used.

    Returns:
        None. Everything is rendered into ``out_display``. Exceptions raised by
        ``handle_search`` or during rendering are caught and shown as a red
        error line rather than propagated.

    The result of ``handle_search`` is read as a dict with a ``'type'`` key:
        * ``'manual'``: ``'source'``, ``'specs'`` (dicts with ``'component'``,
          ``'value'`` and optional ``'unit'`` / ``'description'``) and an
          optional ``'images'`` list of file paths.
        * ``'general'`` / ``'chat'``: ``'content'``.
    """
    # Imported locally so the block is self-contained; only `escape` is pulled
    # in so the stdlib module name does not collide with IPython's `HTML`.
    from html import escape

    def esc(value):
        # Text-ify and HTML-escape a value before it is placed inside markup.
        return escape(str(value))

    if isinstance(b, widgets.Button) and getattr(b, 'tooltip', None):
        query = b.tooltip
        txt_input.value = query
    else:
        # Ignore submissions that contain nothing but whitespace.
        query = (txt_input.value or "").strip()

    if not query:
        return

    out_display.clear_output()
    with out_display:
        # The query is user-typed text, so it must be escaped before display.
        display(HTML(f"<div style='color:#666; margin-top:20px;'>‚å¨ Processing <b>'{esc(query)}'</b>...</div>"))
        try:
            res = handle_search(query)
            # Drop the "Processing" placeholder before showing the answer.
            out_display.clear_output()

            res_type = res['type']
            if res_type == 'manual':
                card_html = f"<div class='response-card'><div><span class='tag tag-manual'>‚úì MANUAL SOURCE: {esc(res['source'])}</span></div>"
                rows = "".join(
                    f"<tr><td>{esc(x['component'])}</td>"
                    f"<td class='val-text'>{esc(x['value'])} {esc(x.get('unit') or '')}</td>"
                    # `or '-'` also covers a description that is present but None/empty.
                    f"<td class='desc-text'>{esc(x.get('description') or '-')}</td></tr>"
                    for x in res['specs']
                )
                card_html += f"<table class='data-table'><thead><tr><th width='30%'>Component / Step</th><th width='25%'>Value / Action</th><th>Notes</th></tr></thead><tbody>{rows}</tbody></table>"
                display(HTML(card_html + "</div>"))

                # --- IMAGE DISPLAY LOGIC ---
                if res.get('images'):
                    display(HTML("<div style='margin-top:20px; font-weight:600; color:#444; border-top:1px solid #eee; padding-top:15px;'>üì∑ Visual Reference:</div>"))
                    for img_path in res['images']:
                        try:
                            display(Image(filename=img_path, width=600))
                            display(HTML(f"<div style='color:#888; font-size:11px; margin-bottom:15px;'>Source: {esc(img_path)}</div>"))
                        except Exception:
                            # Best effort: one unreadable image must not hide the
                            # others, but KeyboardInterrupt/SystemExit still propagate.
                            display(HTML(f"<div style='color:red;'>‚ö†Ô∏è Image missing on disk: {esc(img_path)}</div>"))

            elif res_type == 'general':
                # NOTE(review): res['content'] is inserted as raw markup; left
                # unescaped in case handle_search returns HTML — confirm, and
                # escape it if it is plain text.
                display(HTML(f"<div class='response-card'><span class='tag tag-ai'>‚ö† GENERAL KNOWLEDGE</span><div style='margin-top:10px; color:#444; line-height:1.6;'>{res['content']}</div><div style='margin-top:15px; font-size:12px; color:#888; border-top:1px solid #eee; padding-top:10px;'>*Data not found in official fleet documents. Response generated by AI logic.</div></div>"))
            elif res_type == 'chat':
                # NOTE(review): same raw-markup caveat as the 'general' branch.
                display(HTML(f"<div class='response-card' style='background:#f8f9fa;'>{res['content']}</div>"))
            else:
                # Previously an unknown type left the panel silently empty.
                display(HTML(f"<div style='color:red;'>Unsupported response type: {esc(res_type)}</div>"))
        except Exception as e:
            # Exception text can contain '<'/'&' (e.g. reprs), so escape it too.
            display(HTML(f"<div style='color:red;'>‚ùå Error: {esc(e)}</div>"))

# Route every trigger to the single shared handler: the send button and each
# preset chip on click, and the text box on submit.
for btn in (
    btn_send,
    btn_car1, btn_car2,
    btn_jet1, btn_jet2,
    btn_bike1, btn_bike2,
):
    btn.on_click(on_submit)

# NOTE(review): the recorded cell output echoes this line, which looks like a
# deprecation warning for Text.on_submit in newer ipywidgets — confirm. It is
# kept because observing 'value' instead would also fire when on_submit itself
# assigns txt_input.value for a chip click.
txt_input.on_submit(on_submit)

# Assemble the page top to bottom and render it, capped at 900px wide.
display(
    widgets.VBox(
        children=[header_html, chip_container, input_area, out_display],
        layout=widgets.Layout(max_width='900px'),
    )
)

  txt_input.on_submit(on_submit)


VBox(children=(HTML(value="\n<style>\n    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400…