In [1]:

%pip install langchain-community langchain-huggingface langchain-chroma easyocr PyMuPDF wikipedia langchain langchain-core




In [None]:

# --- IMPORTS ---
import os
import json
import shutil
import time
import numpy as np
import easyocr
import fitz  # PyMuPDF
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import torch
import warnings
from google.colab import drive

# --- LANGCHAIN IMPORTS ---
from langchain_community.document_loaders import WikipediaLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# --- CONFIGURATION ---
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

BASE_DRIVE_PATH = "/content/drive/MyDrive"
DB_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_knowledge_db_rag")
DATA_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_rag_data")
CLASS_MAP_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_class_names_ddg_enet.json") # check drive for map
EMBEDDING_MODEL = "all-MiniLM-L6-v2"


# default focus list (fallback only)
FOCUS_CARS = [
    "E24", "E28", "E30", "E31", "E32", "E34",
    "E36-7", "E36-8", "E36",
    "E38", "E39", "E46",
    "E52", "E53", "E83",
    "Z1", "Z3", "Z8", "X5"
]

# --- LOAD ALL MODELS ---
all_models = []

# check local colab path first, then drive path
if os.path.exists('bmw_class_names.json'):
    with open('bmw_class_names.json', 'r') as f:
        class_map = json.load(f)
        all_models = list(class_map.values())
        print(f"loaded {len(all_models)} models from local json")
elif os.path.exists(CLASS_MAP_PATH):
    with open(CLASS_MAP_PATH, 'r') as f:
        class_map = json.load(f)
        all_models = list(class_map.values())
        print(f"loaded {len(all_models)} models from drive json")
else:
    print("map not found. using fallback list")
    all_models = FOCUS_CARS

# --- GPU CHECK ---
def get_device():
    if torch.cuda.is_available():
        print(f"gpu: {torch.cuda.get_device_name(0)}")
        return True
    print("Using cpu")
    return False

USE_GPU = get_device()

def get_matching_chassis(text):
    if not text: return None
    text = text.lower()
    for code in FOCUS_CARS:
        if code.lower() in text: return code
    return None

def build_smart_database():
    global DB_PATH
    documents = []

    # --- PHASE 1: WIKI (FULL LIST) ---
    print(f"\nphase 1: wiki scan ({len(all_models)} models)")

    # ignore non_bmw_cars, non_cars

    for model in tqdm(all_models, desc="Wiki Ingest"):
        # skip generic/non-car labels
        if "non_bmw" in model.lower() or "non_cars" in model.lower():
            continue

        try:
            # query wikipedia
            #print(model.replace('_', ' '))
            loader = WikipediaLoader(query=model.replace('_', ' '), load_max_docs=1)
            docs = loader.load()
            for d in docs:
                d.metadata["car_model"] = model
                d.metadata["source_type"] = "General History"
            documents.extend(docs)
        except:
            print("wikipedia failed noooo")
            continue

    # --- PHASE 2: LOCAL HTML FILES ---
    print("\nphase 2: local html files (drive)")
    if os.path.exists(DATA_PATH):
        manual_count = 0
        for root, dirs, files in os.walk(DATA_PATH):
            for file in files:
                if file.lower().endswith(('.html', '.htm')):
                    file_path = os.path.join(root, file)
                    try:
                        # encoding check
                        try:
                            with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
                        except:
                            with open(file_path, 'r', encoding='latin-1') as f: content = f.read()

                        soup = BeautifulSoup(content, 'html.parser')
                        for junk in soup(["script", "style", "nav", "footer", "header", "aside", "iframe"]):
                            junk.extract()

                        text = soup.get_text(separator=' ', strip=True)
                        code = get_matching_chassis(file) or get_matching_chassis(root) or "General"

                        documents.append(Document(
                            page_content=text,
                            metadata={"source": file_path, "car_model": code, "source_type": "Manual HTML Save"}
                        ))
                        manual_count += 1
                    except Exception as e:
                        print(f"    failed {file}: {e}")
        print(f"  ingested {manual_count} local html files")
    else:
        print(f"  DATA_PATH not found at: {DATA_PATH}")

    # --- phase 3: pdf ocr (hardened) ---
    print("\n phase 3: pdf ocr")
    if os.path.exists(DATA_PATH):
        pdf_files = []
        for root, _, files in os.walk(DATA_PATH):
             pdf_files.extend([os.path.join(root, f) for f in files if f.endswith('.pdf')])

        if pdf_files:
             print(f"  found {len(pdf_files)} pdfs. initializing ocr...")
             try:
                reader = easyocr.Reader(['en'], gpu=USE_GPU)

                for pdf_path in tqdm(pdf_files, desc="ocr processing"):
                    # print current file to help debug crashes
                    # print(f"processing: {os.path.basename(pdf_path)}...")

                    try:
                        doc = fitz.open(pdf_path)
                        text = ""
                        for i, page in enumerate(doc):
                            try:
                                # this is where "overflow in 2d faxd" happens
                                pix = page.get_pixmap()

                                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
                                if pix.n == 4: img = img[:, :, :3]

                                res = reader.readtext(img, detail=0)
                                text += f" [page {i+1}] " + " ".join(res)
                            except Exception as e:
                                print(f"    skipping corrupt page {i+1} in {os.path.basename(pdf_path)}: {e}")
                                continue

                        if text:
                            model_name = get_matching_chassis(pdf_path) or "general"
                            documents.append(Document(
                                page_content=text,
                                metadata={"car_model": model_name, "source_type": "manual", "filename": os.path.basename(pdf_path)}
                            ))
                        doc.close()
                    except Exception as e:
                        print(f"  critical error reading {os.path.basename(pdf_path)}: {e}")
                        continue
             except Exception as e:
                 print(f"  ocr failed: {e}")
    # --- BUILD DB ---
    if not documents:
        print("\nNo documents found! Please upload files to 'bmw_rag_data' in Drive.")
        return

    print(f"\nbuilding vector db with {len(documents)} docs...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = splitter.split_documents(documents)

    # handle db persistence
    if os.path.exists(DB_PATH):
        try:
            shutil.rmtree(DB_PATH)
            print("  replaced old database")
        except:
            print("  could not delete old db. saving to new folder")
            DB_PATH = f"{DB_PATH}_new_{int(time.time())}"

    device = "cuda" if USE_GPU else "cpu"
    emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': device})
    db = Chroma(persist_directory=DB_PATH, embedding_function=emb)

    batch_size = 100
    for i in tqdm(range(0, len(splits), batch_size), desc="indexing"):
        db.add_documents(splits[i:i+batch_size])

    print(f"\nDONE! Database saved to Drive at: {DB_PATH}")

if __name__ == "__main__":
    build_smart_database()

loaded 151 models from drive json
gpu: NVIDIA A100-SXM4-80GB

phase 1: wiki scan (151 models)


Wiki Ingest:   0%|          | 0/151 [00:00<?, ?it/s]


phase 2: local html files (drive)
  ingested 1117 local html files

 phase 3: pdf ocr
  found 82 pdfs. initializing ocr...


ocr processing:   0%|          | 0/82 [00:00<?, ?it/s]

MuPDF error: format error: overflow in 2d faxd



In [1]:

#!apt-get update -qq
!apt-get install -y tesseract-ocr -qq
!pip install -q pytesseract langchain-community langchain-huggingface langchain-chroma PyMuPDF opencv-python-headless

In [2]:

import os
import json
import shutil
import time
import numpy as np
import pytesseract # The new OCR engine
import fitz  # PyMuPDF
import cv2   # For image resizing
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import torch
import warnings
import gc
from google.colab import drive

# --- LANGCHAIN IMPORTS ---
from langchain_community.document_loaders import WikipediaLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# --- CONFIGURATION ---
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

BASE_DRIVE_PATH = "/content/drive/MyDrive"
DB_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_knowledge_db_rag")
DATA_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_rag_data")
CLASS_MAP_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_class_names_ddg_enet.json")
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

warnings.filterwarnings("ignore")

# Default focus list
FOCUS_CARS = [
    "E24", "E28", "E30", "E31", "E32", "E34",
    "E36-7", "E36-8", "E36",
    "E38", "E39", "E46",
    "E52", "E53", "E83",
    "Z1", "Z3", "Z8", "X5"
]

# --- LOAD MODELS ---
all_models = []
if os.path.exists('bmw_class_names.json'):
    with open('bmw_class_names.json', 'r') as f:
        class_map = json.load(f)
        all_models = list(class_map.values())
elif os.path.exists(CLASS_MAP_PATH):
    with open(CLASS_MAP_PATH, 'r') as f:
        class_map = json.load(f)
        all_models = list(class_map.values())
else:
    all_models = FOCUS_CARS

# --- UTILS ---
def get_device():
    if torch.cuda.is_available():
        print(f"gpu: {torch.cuda.get_device_name(0)}")
        return True
    return False

USE_GPU = get_device()

def get_matching_chassis(text):
    if not text: return None
    text = text.lower()
    for code in FOCUS_CARS:
        if code.lower() in text: return code
    return None

def resize_for_ocr(img_array):
    """
    Resizes huge images to max 2500px width.
    This speeds up OCR by 3x with no loss in accuracy.
    """
    height, width = img_array.shape[:2]
    max_width = 2500

    if width > max_width:
        scale = max_width / width
        new_width = int(width * scale)
        new_height = int(height * scale)
        return cv2.resize(img_array, (new_width, new_height), interpolation=cv2.INTER_AREA)
    return img_array

def build_smart_database():
    global DB_PATH
    documents = []

    # --- PHASE 1: WIKI ---
    print(f"\n wiki Scan ({len(all_models)} models)")
    for model in tqdm(all_models, desc="wikipedia dl"):
        if "non_bmw" in model.lower() or "non_cars" in model.lower(): continue
        try:
            loader = WikipediaLoader(query=f"BMW {model.replace('_', ' ')}", load_max_docs=1)
            docs = loader.load()
            for d in docs:
                d.metadata["car_model"] = model
                d.metadata["source_type"] = "General History"
            documents.extend(docs)
        except: continue

    # --- PHASE 2: LOCAL HTML ---
    print("\n local html files")
    if os.path.exists(DATA_PATH):
        manual_count = 0
        for root, dirs, files in os.walk(DATA_PATH):
            for file in files:
                if file.lower().endswith(('.html', '.htm')):
                    file_path = os.path.join(root, file)
                    try:
                        try:
                            with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
                        except:
                            with open(file_path, 'r', encoding='latin-1') as f: content = f.read()

                        soup = BeautifulSoup(content, 'html.parser')
                        for junk in soup(["script", "style", "nav", "footer", "header", "aside", "iframe"]):
                            junk.extract()

                        text = soup.get_text(separator=' ', strip=True)
                        code = get_matching_chassis(file) or get_matching_chassis(root) or "General"
                        documents.append(Document(
                            page_content=text,
                            metadata={"source": file_path, "car_model": code, "source_type": "Manual HTML Save"}
                        ))
                        manual_count += 1
                    except: pass
        print(f"  used {manual_count} local HTML files.")

    # --- PHASE 3: PDF OCR (TESSERACT) ---
    print("\n ocr by tesseract")
    if os.path.exists(DATA_PATH):
        pdf_files = []
        for root, _, files in os.walk(DATA_PATH):
             pdf_files.extend([os.path.join(root, f) for f in files if f.endswith('.pdf')])

        if pdf_files:
             print(f"  Found {len(pdf_files)} PDFs. Starting high-speed scan...")

             for pdf_path in tqdm(pdf_files, desc="Processing PDFs"):
                try:
                    doc = fitz.open(pdf_path)
                    text = ""

                    for i, page in enumerate(doc):
                        try:
                            # 1. Get image (simplified)
                            pix = page.get_pixmap(alpha=False)
                            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

                            # 2. Resize if huge (The Speed Hack)
                            img_optimized = resize_for_ocr(img)

                            # 3. Tesseract OCR
                            # config='--psm 6' assumes a single uniform block of text (good for manuals)
                            # config='--psm 3' is default (auto page segmentation) - safer for varied layouts
                            page_text = pytesseract.image_to_string(img_optimized, config='--psm 3')

                            if len(page_text.strip()) > 20: # Filter empty noise pages
                                text += f" [Page {i+1}] {page_text} "

                            del pix, img, img_optimized
                        except Exception as e:
                            # print(f"    Skipping bad page {i+1} in {os.path.basename(pdf_path)}")
                            continue

                    if text:
                        model_name = get_matching_chassis(pdf_path) or "General"
                        documents.append(Document(
                            page_content=text,
                            metadata={"car_model": model_name, "source_type": "Manual", "filename": os.path.basename(pdf_path)}
                        ))

                    doc.close()
                    gc.collect()

                except Exception as e:
                    print(f"  failed to read {os.path.basename(pdf_path)}: {e}")
                    continue

    # --- BUILD DB ---
    if not documents:
        print("\n no documents found!")
        return

    print(f"\n building db with {len(documents)} docs...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = splitter.split_documents(documents)

    if os.path.exists(DB_PATH):
        try:
            shutil.rmtree(DB_PATH)
            print("  replaced old database.")
        except:
            DB_PATH = f"{DB_PATH}_new_{int(time.time())}"

    device = "cuda" if USE_GPU else "cpu"
    emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': device})
    db = Chroma(persist_directory=DB_PATH, embedding_function=emb)

    batch_size = 100
    for i in tqdm(range(0, len(splits), batch_size), desc="indexing"):
        db.add_documents(splits[i:i+batch_size])

    print(f"\n DONE! database saved: {DB_PATH}")

if __name__ == "__main__":
    build_smart_database()

gpu: NVIDIA A100-SXM4-80GB

 wiki Scan (151 models)


wikipedia dl:   0%|          | 0/151 [00:00<?, ?it/s]


 local html files
  used 1117 local HTML files.

 ocr by tesseract
  Found 82 PDFs. Starting high-speed scan...


Processing PDFs:   0%|          | 0/82 [00:00<?, ?it/s]

MuPDF error: format error: overflow in 2d faxd

MuPDF error: format error: overflow in 2d faxd

MuPDF error: format error: overflow in 2d faxd


 building db with 1348 docs...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

indexing:   0%|          | 0/409 [00:00<?, ?it/s]


 DONE! database saved: /content/drive/MyDrive/bmw_knowledge_db_rag


In [6]:
# Run this in a separate cell or shell command
!pip install "numpy<2"



In [1]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# 1. Setup the Embedding Function (Must match what you used to build it)
# We can use CPU here to save credits, checking the DB is cheap.
emb = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})

# 2. Load the Database
# Point this to the FOLDER path, even if you see a file inside it.
DB_PATH = "/content/drive/MyDrive/bmw_knowledge_db_rag"

try:
    db = Chroma(persist_directory=DB_PATH, embedding_function=emb)

    # 3. Test a Query
    query = "How do I reset the oil light on an E46?"
    docs = db.similarity_search(query, k=3)

    print("✅ Database loaded successfully!")
    print(f"Found {len(docs)} relevant documents.")
    print("-" * 30)
    print(docs[0].page_content[:300]) # Preview the first result

except Exception as e:
    print("❌ Failed to load database.")
    print(e)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


✅ Database loaded successfully!
Found 3 relevant documents.
------------------------------
Min Read By: Alex Fiehl How to Reset BMW CBS Condition Based Service/Oil Reminder (E90/E82/E84) Extended oil change intervals are defying age-old myths about how long oil will last. Not only do they save money by reducing the amount of oil changes your vehicle needs, but they reduce the amount of wa


In [3]:
# @title 🚀 BMW Database Updater (New Files Only + GPU OCR)
import os
import sys
import fitz  # PyMuPDF
import cv2
import numpy as np
import easyocr
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from tqdm.notebook import tqdm
import torch
import gc

# --- CONFIG ---
DB_PATH = "/content/drive/MyDrive/bmw_knowledge_db_rag"
DATA_PATH = "/content/drive/MyDrive/bmw_rag_data"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# --- 1. SETUP & CHECKS ---
print("⚙️ Initializing GPU OCR engine...")
if torch.cuda.is_available():
    print(f"   Using GPU: {torch.cuda.get_device_name(0)}")
    reader = easyocr.Reader(['en'], gpu=True) # Load model once
else:
    print("⚠️ WARNING: No GPU detected. This will be slow!")
    reader = easyocr.Reader(['en'], gpu=False)

# --- 2. LOAD EXISTING DATABASE ---
print(f"\n📂 Loading existing database from {DB_PATH}...")
embedding_func = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
)
db = Chroma(persist_directory=DB_PATH, embedding_function=embedding_func)

# Fetch all metadata to see what we already have
print("   Scanning index for existing files...")
existing_records = db.get(include=["metadatas"])
existing_filenames = set()

for meta in existing_records['metadatas']:
    # We check both 'filename' (PDFs) and 'source' (HTMLs) keys to be safe
    if 'filename' in meta:
        existing_filenames.add(meta['filename'])
    elif 'source' in meta:
        existing_filenames.add(os.path.basename(meta['source']))

print(f"   ✅ Found {len(existing_filenames)} unique files already in the database.")

# --- 3. IDENTIFY NEW FILES ---
pdf_files_to_process = []
for root, _, files in os.walk(DATA_PATH):
    for f in files:
        if f.lower().endswith('.pdf'):
            if f not in existing_filenames:
                pdf_files_to_process.append(os.path.join(root, f))

print(f"\n🔍 Scan Complete. Found {len(pdf_files_to_process)} NEW PDF files to process.")

if len(pdf_files_to_process) == 0:
    print("🎉 Database is up to date! Stopping execution.")
    sys.exit()

# --- 4. PROCESSING FUNCTIONS ---
def resize_for_ocr(img_array):
    """Resize huge images to prevent memory errors, same as before."""
    height, width = img_array.shape[:2]
    max_width = 2500
    if width > max_width:
        scale = max_width / width
        new_width = int(width * scale)
        new_height = int(height * scale)
        return cv2.resize(img_array, (new_width, new_height), interpolation=cv2.INTER_AREA)
    return img_array

def get_matching_chassis(text):
    # Quick simple check for model name in filename
    focus_cars = ["E30", "E36", "E46", "E39", "E38", "E90", "X5", "Z3", "Z4"]
    text = text.upper()
    for code in focus_cars:
        if code in text: return code
    return "General"

# --- 5. EXECUTE BATCH PROCESSING ---
new_documents = []

print("\n🚀 Starting GPU OCR on new files...")
for pdf_path in tqdm(pdf_files_to_process, desc="Processing New PDFs"):
    try:
        doc = fitz.open(pdf_path)
        full_text = ""

        for i, page in enumerate(doc):
            try:
                # Render page to image
                pix = page.get_pixmap(alpha=False)
                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

                # Resize
                img = resize_for_ocr(img)

                # EASYOCR MAGIC HERE
                # detail=0 returns simple list of strings
                result = reader.readtext(img, detail=0)
                page_text = " ".join(result)

                if len(page_text) > 20:
                    full_text += f" [Page {i+1}] {page_text} "
            except Exception as e:
                continue

        if full_text:
            model_name = get_matching_chassis(os.path.basename(pdf_path))
            new_documents.append(Document(
                page_content=full_text,
                metadata={
                    "car_model": model_name,
                    "source_type": "Manual",
                    "filename": os.path.basename(pdf_path) # Critical for future checks
                }
            ))

        doc.close()
        gc.collect() # Keep RAM clean

    except Exception as e:
        print(f"❌ Error reading {os.path.basename(pdf_path)}: {e}")

# --- 6. UPDATE DATABASE ---
if new_documents:
    print(f"\n💾 Adding {len(new_documents)} new documents to vector store...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = splitter.split_documents(new_documents)

    # Batch add to avoid timeouts
    batch_size = 100
    for i in tqdm(range(0, len(splits), batch_size), desc="Indexing chunks"):
        db.add_documents(splits[i:i+batch_size])

    print(f"\n✅ SUCCESS! Added {len(new_documents)} files. Database updated.")
else:
    print("\n⚠️ No valid text extracted from the new files.")

⚙️ Initializing GPU OCR engine...
   Using GPU: NVIDIA A100-SXM4-80GB

📂 Loading existing database from /content/drive/MyDrive/bmw_knowledge_db_rag...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


   Scanning index for existing files...
   ✅ Found 0 unique files already in the database.

🔍 Scan Complete. Found 82 NEW PDF files to process.

🚀 Starting GPU OCR on new files...


Processing New PDFs:   0%|          | 0/82 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [2]:
import shutil
# Zip the folder SAFELY on the linux machine first
shutil.make_archive("/content/drive/MyDrive/bmw_db_safe_latest", 'zip', "/content/drive/MyDrive/bmw_knowledge_db_rag")
print("✅ Created bmw_db_safe.zip in your Drive. Download THIS file.")

✅ Created bmw_db_safe.zip in your Drive. Download THIS file.


In [3]:
# Run this once in a cell before the script
!pip install paddlepaddle-gpu
!pip install "paddleocr>=2.0.1"
!pip install pymupdf opencv-python-headless langchain-chroma langchain-huggingface

Collecting paddlepaddle-gpu
  Downloading paddlepaddle_gpu-2.6.2-cp312-cp312-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting astor (from paddlepaddle-gpu)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle-gpu)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Downloading paddlepaddle_gpu-2.6.2-cp312-cp312-manylinux1_x86_64.whl (758.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m758.9/758.9 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: opt-einsum, astor, paddlepaddle-gpu
  Attempting uninstall: opt-einsum
    Found existing installation: opt_einsum 3.4.0
    Uninstalling opt_einsum-3.4.0:
      Successfully uninstalled opt

Collecting numpy<2.3.0,>=2 (from opencv-python-headless)
  Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of langchain-chroma to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-chroma
  Using cached langchain_chroma-1.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting chromadb<2.0.0,>=1.0.20 (from langchain-chroma)
  Using cached chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-core<2.0.0,>=1.0.0 (from langchain-chroma)
  Downloading langchain_core-1.1.2-py3-none-any.whl.metadata (3.7 kB)
INFO: pip is looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
Collecting 

In [2]:
# 1. Uninstall the conflicting modular versions
#!pip uninstall -y langchain langchain-core langchain-community

# 2. Install the known stable version that aligns with PaddleOCR/paddlex dependencies
!pip install langchain==0.1.13
!pip install langchain-community==0.0.29
!pip install langchain-chroma==0.1.0
!pip install langchain-huggingface==0.0.2

Collecting langchain==0.1.13
  Downloading langchain-0.1.13-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.1.13)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain-community<0.1,>=0.0.29 (from langchain==0.1.13)
  Downloading langchain_community-0.0.38-py3-none-any.whl.metadata (8.7 kB)
Collecting langchain-core<0.2.0,>=0.1.33 (from langchain==0.1.13)
  Downloading langchain_core-0.1.53-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain==0.1.13)
  Downloading langchain_text_splitters-0.0.2-py3-none-any.whl.metadata (2.2 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain==0.1.13)
  Downloading langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting numpy<2,>=1 (from langchain==0.1.13)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

Collecting langchain-community==0.0.29
  Downloading langchain_community-0.0.29-py3-none-any.whl.metadata (8.3 kB)
Downloading langchain_community-0.0.29-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain-community
  Attempting uninstall: langchain-community
    Found existing installation: langchain-community 0.0.38
    Uninstalling langchain-community-0.0.38:
      Successfully uninstalled langchain-community-0.0.38
Successfully installed langchain-community-0.0.29
Collecting langchain-chroma==0.1.0
  Downloading langchain_chroma-0.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting chromadb<0.5.0,>=0.4.0 (from langchain-chroma==0.1.0)
  Downloading chromadb-0.4.24-py3-none-any.whl.metadata (7.3 kB)
Collecting build>=1.0.3 (from chromadb<0.5.0,>=0.4.0->langchain-chroma==0.1.0)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting c

In [2]:
# 1. Uninstall the potentially conflicting package
!pip uninstall -y paddlepaddle paddlepaddle-gpu

# 2. Install the known stable version (2.6.1) for GPU
# NOTE: Using the official Baidu mirror for reliable install
!pip install paddlepaddle-gpu==2.6.1 -f https://paddlepaddle.org.cn/whl/linux/gpu/develop.html

# 3. Reinstall PaddleOCR to ensure dependencies align (it should now use the 2.6.1 core)
!pip install --upgrade paddleocr

[0mFound existing installation: paddlepaddle-gpu 2.6.2
Uninstalling paddlepaddle-gpu-2.6.2:
  Successfully uninstalled paddlepaddle-gpu-2.6.2
Looking in links: https://paddlepaddle.org.cn/whl/linux/gpu/develop.html
Collecting paddlepaddle-gpu==2.6.1
  Downloading paddlepaddle_gpu-2.6.1-cp312-cp312-manylinux1_x86_64.whl.metadata (8.6 kB)
Downloading paddlepaddle_gpu-2.6.1-cp312-cp312-manylinux1_x86_64.whl (758.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m758.8/758.8 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: paddlepaddle-gpu
Successfully installed paddlepaddle-gpu-2.6.1




In [1]:
import os
import json
import shutil
import time
import numpy as np
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import torch
import warnings
from google.colab import drive
import fitz  # PyMuPDF
import sys # <-- IMPORT SYS HERE

# --- LANGCHAIN IMPORTS ---
from langchain_community.document_loaders import WikipediaLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import cv2 # Needed for image processing/resizing
from paddleocr import PaddleOCR # THE FAST OCR ENGINE

# --- CONFIGURATION (REST OF CONFIG REMAINS UNCHANGED) ---
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

BASE_DRIVE_PATH = "/content/drive/MyDrive"
DB_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_knowledge_db_rag")
DATA_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_rag_data")
CLASS_MAP_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_class_names_ddg_enet.json")
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# default focus list (fallback only)
FOCUS_CARS = [
    "E24", "E28", "E30", "E31", "E32", "E34",
    "E36-7", "E36-8", "E36",
    "E38", "E39", "E46",
    "E52", "E53", "E83",
    "Z1", "Z3", "Z8", "X5"
]

# --- LOAD ALL MODELS (REST OF LOAD LOGIC REMAINS UNCHANGED) ---
all_models = []

if os.path.exists('bmw_class_names.json'):
    with open('bmw_class_names.json', 'r') as f:
        class_map = json.load(f)
        all_models = list(class_map.values())
        print(f"loaded {len(all_models)} models from local json")
elif os.path.exists(CLASS_MAP_PATH):
    with open(CLASS_MAP_PATH, 'r') as f:
        class_map = json.load(f)
        all_models = list(class_map.values())
else:
    print("map not found. using fallback list")
    all_models = FOCUS_CARS

# --- GPU CHECK ---
def get_device():
    if torch.cuda.is_available():
        print(f"gpu: {torch.cuda.get_device_name(0)}")
        return True
    print("Using cpu")
    return False

USE_GPU = get_device()

# --- OCR ENGINE INITIALIZATION (Global) ---
print("\n⚙️ Initializing PaddleOCR (GPU Powered)...")
try:
    # FIX: Removed deprecated 'use_gpu' parameter. PaddleOCR uses GPU automatically if available.
    OCR_READER = PaddleOCR(use_textline_orientation=True, lang='en')
except Exception as e:
    print(f"❌ Critical OCR Initialization Error: {e}")
    sys.exit() # sys is now defined here

def get_matching_chassis(text):
    if not text: return None
    text = text.lower()
    for code in FOCUS_CARS:
        if code.lower() in text: return code
    return None

# --- PDF HELPERS ---
def resize_for_ocr(img_array):
    """Resize huge images to prevent memory errors."""
    height, width = img_array.shape[:2]
    max_width = 3000
    if width > max_width:
        scale = max_width / width
        new_width = int(width * scale)
        new_height = int(height * scale)
        return cv2.resize(img_array, (new_width, new_height), interpolation=cv2.INTER_AREA)
    return img_array


def build_smart_database():
    global DB_PATH
    documents = []

    # --- PHASE 1: WIKI (FULL LIST) ---
    print(f"\nphase 1: wiki scan ({len(all_models)} models)")
    for model in tqdm(all_models, desc="Wiki Ingest"):
        if "non_bmw" in model.lower() or "non_cars" in model.lower():
            continue
        try:
            # print(model.replace('_', ' '))
            loader = WikipediaLoader(query=model.replace('_', ' '), load_max_docs=1)
            docs = loader.load()
            for d in docs:
                d.metadata["car_model"] = model
                d.metadata["source_type"] = "General History"
            documents.extend(docs)
        except:
            continue

    # --- PHASE 2: LOCAL HTML FILES ---
    print("\nphase 2: local html files (drive)")
    if os.path.exists(DATA_PATH):
        manual_count = 0
        for root, dirs, files in os.walk(DATA_PATH):
            for file in files:
                if file.lower().endswith(('.html', '.htm')):
                    file_path = os.path.join(root, file)
                    try:
                        # Use Latin-1 as fallback encoding
                        try:
                            with open(file_path, 'r', encoding='utf-8') as f: content = f.read()
                        except:
                            with open(file_path, 'r', encoding='latin-1') as f: content = f.read()

                        soup = BeautifulSoup(content, 'html.parser')
                        for junk in soup(["script", "style", "nav", "footer", "header", "aside", "iframe"]):
                            junk.extract()

                        text = soup.get_text(separator=' ', strip=True)
                        code = get_matching_chassis(file) or get_matching_chassis(root) or "General"

                        documents.append(Document(
                            page_content=text,
                            metadata={"source": file_path, "car_model": code, "source_type": "Manual HTML Save"}
                        ))
                        manual_count += 1
                    except Exception as e:
                        print(f"    failed {file}: {e}")
        print(f"  ingested {manual_count} local html files")
    else:
        print(f"  DATA_PATH not found at: {DATA_PATH}")

    # --- PHASE 3: PDF OCR (FAST PADDLEOCR) ---
    print("\nphase 3: pdf ocr (PaddleOCR)")
    if os.path.exists(DATA_PATH):
        pdf_files = []
        for root, _, files in os.walk(DATA_PATH):
             pdf_files.extend([os.path.join(root, f) for f in files if f.endswith('.pdf')])

        if pdf_files:
             print(f"  found {len(pdf_files)} pdfs. starting OCR...")

             for pdf_path in tqdm(pdf_files, desc="OCR Processing"):
                 try:
                     doc = fitz.open(pdf_path)
                     full_text = ""

                     for i, page in enumerate(doc):
                         try:
                             # Render page to image (Matrix(2, 2) provides 2x zoom for better quality)
                             pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
                             img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

                             # Resize and convert color channels if necessary
                             img = resize_for_ocr(img)
                             if pix.n == 4: img = img[:, :, :3] # Remove alpha channel if present

                             # PADDLEOCR INFERENCE
                             result = OCR_READER.ocr(img, cls=True)

                             # Extract text from result structure: result[0] is for the image, then list of lines
                             if result and result[0]:
                                 page_text = " ".join([line[1][0] for line in result[0]])

                                 if len(page_text) > 20: # Ensure we didn't get empty page
                                     full_text += f" [Page {i+1}] {page_text} "
                         except Exception as e:
                             print(f"    skipping corrupt page {i+1} in {os.path.basename(pdf_path)}: {e}")
                             continue

                     if full_text:
                         model_name = get_matching_chassis(pdf_path) or "general"
                         documents.append(Document(
                             page_content=full_text,
                             metadata={"car_model": model_name, "source_type": "manual", "filename": os.path.basename(pdf_path)}
                         ))
                     doc.close()
                 except Exception as e:
                     print(f"  critical error reading {os.path.basename(pdf_path)}: {e}")
                     continue
    # --- BUILD DB ---
    if not documents:
        print("\nNo documents found! Please check data source and connection.")
        return

    print(f"\nbuilding vector db with {len(documents)} docs...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = splitter.split_documents(documents)

    # handle db persistence (Delete old)
    if os.path.exists(DB_PATH):
        try:
            shutil.rmtree(DB_PATH)
            print("  replaced old database")
        except:
            print("  could not delete old db. saving to new folder")
            DB_PATH = f"{DB_PATH}_new_{int(time.time())}"

    device = "cuda" if USE_GPU else "cpu"
    emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': device})

    # Initialize Chroma DB
    db = Chroma(persist_directory=DB_PATH, embedding_function=emb)

    batch_size = 100
    for i in tqdm(range(0, len(splits), batch_size), desc="indexing"):
        db.add_documents(splits[i:i+batch_size])

    print(f"\nDONE! Database saved to Drive at: {DB_PATH}")

if __name__ == "__main__":
    build_smart_database()

gpu: NVIDIA A100-SXM4-80GB

⚙️ Initializing PaddleOCR (GPU Powered)...


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/root/.paddlex/official_models/PP-LCNet_x1_0_doc_ori`.[0m
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



❌ Critical OCR Initialization Error: 'paddle.base.libpaddle.AnalysisConfig' object has no attribute 'set_optimization_level'
Traceback (most recent call last):
  File "/tmp/ipython-input-1827953054.py", line 72, in <cell line: 0>
    OCR_READER = PaddleOCR(use_textline_orientation=True, lang='en')
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/paddleocr/_pipelines/ocr.py", line 163, in __init__
    super().__init__(**base_params)
  File "/usr/local/lib/python3.12/dist-packages/paddleocr/_pipelines/base.py", line 67, in __init__
    self.paddlex_pipeline = self._create_paddlex_pipeline()
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/paddleocr/_pipelines/base.py", line 105, in _create_paddlex_pipeline
    return create_pipeline(config=self._merged_paddlex_config, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/

TypeError: object of type 'NoneType' has no len()

In [2]:
!pip uninstall -y paddlepaddle paddlepaddle-gpu paddleocr
!pip install paddlepaddle-gpu==2.6.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install paddleocr==2.7.3

[0mFound existing installation: paddlepaddle-gpu 2.6.1
Uninstalling paddlepaddle-gpu-2.6.1:
  Successfully uninstalled paddlepaddle-gpu-2.6.1
Found existing installation: paddleocr 3.3.2
Uninstalling paddleocr-3.3.2:
  Successfully uninstalled paddleocr-3.3.2
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting paddlepaddle-gpu==2.6.1
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1e/33/a8fca91674380eb7f8b04e9aed8faef920c48779c097f4ec2907ba6c7f09/paddlepaddle_gpu-2.6.1-cp312-cp312-manylinux1_x86_64.whl (758.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m758.8/758.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: paddlepaddle-gpu
Successfully installed paddlepaddle-gpu-2.6.1


Collecting paddleocr==2.7.3
  Downloading paddleocr-2.7.3-py3-none-any.whl.metadata (26 kB)
Collecting imgaug (from paddleocr==2.7.3)
  Downloading imgaug-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting lmdb (from paddleocr==2.7.3)
  Downloading lmdb-1.7.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (1.4 kB)
Collecting visualdl (from paddleocr==2.7.3)
  Downloading visualdl-2.5.3-py3-none-any.whl.metadata (25 kB)
Collecting rapidfuzz (from paddleocr==2.7.3)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Collecting opencv-python<=4.6.0.66 (from paddleocr==2.7.3)
  Downloading opencv_python-4.6.0.66-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting opencv-contrib-python<=4.6.0.66 (from paddleocr==2.7.3)
  Downloading opencv_contrib_python-4.6.0.66-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting premailer

In [1]:
import os
import json
import shutil
import time
import numpy as np
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import torch
import warnings
from google.colab import drive
import fitz  # PyMuPDF
import sys

# --- LANGCHAIN IMPORTS ---
from langchain_community.document_loaders import WikipediaLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import cv2
from paddleocr import PaddleOCR

# --- CONFIGURATION ---
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

BASE_DRIVE_PATH = "/content/drive/MyDrive"
DB_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_knowledge_db_rag")
DATA_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_rag_data")
CLASS_MAP_PATH = os.path.join(BASE_DRIVE_PATH, "bmw_class_names_ddg_enet.json")
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

FOCUS_CARS = [
    "E24", "E28", "E30", "E31", "E32", "E34",
    "E36-7", "E36-8", "E36",
    "E38", "E39", "E46",
    "E52", "E53", "E83",
    "Z1", "Z3", "Z8", "X5"
]

# --- LOAD ALL MODELS ---
all_models = []

if os.path.exists('bmw_class_names.json'):
    with open('bmw_class_names.json', 'r') as f:
        class_map = json.load(f)
        all_models = list(class_map.values())
        print(f"Loaded {len(all_models)} models from local json")
elif os.path.exists(CLASS_MAP_PATH):
    with open(CLASS_MAP_PATH, 'r') as f:
        class_map = json.load(f)
        all_models = list(class_map.values())
else:
    print("Map not found. Using fallback list")
    all_models = FOCUS_CARS

# --- GPU CHECK ---
def get_device():
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        return True
    print("Using CPU")
    return False

USE_GPU = get_device()

# --- OCR ENGINE INITIALIZATION ---
print("\n⚙️ Initializing PaddleOCR (GPU Powered)...")
try:
    # Compatible initialization for PaddleOCR 2.7.3 + PaddlePaddle 2.6.1
    OCR_READER = PaddleOCR(
        use_angle_cls=True,  # Use angle classification
        lang='en',
        use_gpu=True if USE_GPU else False,
        show_log=False  # Reduce console spam
    )
    print("✅ PaddleOCR initialized successfully!")
except Exception as e:
    print(f"❌ Critical OCR Initialization Error: {e}")
    print("\n🔧 Fix: Run this in a new cell:")
    print("!pip uninstall -y paddlepaddle paddlepaddle-gpu paddleocr")
    print("!pip install paddlepaddle-gpu==2.6.1")
    print("!pip install paddleocr==2.7.3")
    sys.exit()

def get_matching_chassis(text):
    if not text: return None
    text = text.lower()
    for code in FOCUS_CARS:
        if code.lower() in text: return code
    return None

# --- PDF HELPERS ---
def resize_for_ocr(img_array, max_width=2000):
    """Resize huge images to prevent memory errors."""
    height, width = img_array.shape[:2]
    if width > max_width:
        scale = max_width / width
        new_width = int(width * scale)
        new_height = int(height * scale)
        return cv2.resize(img_array, (new_width, new_height), interpolation=cv2.INTER_AREA)
    return img_array


def build_smart_database():
    global DB_PATH
    documents = []

    # --- PHASE 1: WIKI ---
    print(f"\nPhase 1: Wikipedia scan ({len(all_models)} models)")
    for model in tqdm(all_models, desc="Wiki Ingest"):
        if "non_bmw" in model.lower() or "non_cars" in model.lower():
            continue
        try:
            loader = WikipediaLoader(query=model.replace('_', ' '), load_max_docs=1)
            docs = loader.load()
            for d in docs:
                d.metadata["car_model"] = model
                d.metadata["source_type"] = "General History"
            documents.extend(docs)
        except:
            continue

    # --- PHASE 2: LOCAL HTML FILES ---
    print("\nPhase 2: Local HTML files (Drive)")
    if os.path.exists(DATA_PATH):
        manual_count = 0
        for root, dirs, files in os.walk(DATA_PATH):
            for file in files:
                if file.lower().endswith(('.html', '.htm')):
                    file_path = os.path.join(root, file)
                    try:
                        try:
                            with open(file_path, 'r', encoding='utf-8') as f:
                                content = f.read()
                        except:
                            with open(file_path, 'r', encoding='latin-1') as f:
                                content = f.read()

                        soup = BeautifulSoup(content, 'html.parser')
                        for junk in soup(["script", "style", "nav", "footer", "header", "aside", "iframe"]):
                            junk.extract()

                        text = soup.get_text(separator=' ', strip=True)
                        code = get_matching_chassis(file) or get_matching_chassis(root) or "General"

                        documents.append(Document(
                            page_content=text,
                            metadata={"source": file_path, "car_model": code, "source_type": "Manual HTML Save"}
                        ))
                        manual_count += 1
                    except Exception as e:
                        print(f"    Failed {file}: {e}")
        print(f"  Ingested {manual_count} local HTML files")
    else:
        print(f"  DATA_PATH not found at: {DATA_PATH}")

    # --- PHASE 3: PDF OCR ---
    print("\nPhase 3: PDF OCR (PaddleOCR)")
    if os.path.exists(DATA_PATH):
        pdf_files = []
        for root, _, files in os.walk(DATA_PATH):
             pdf_files.extend([os.path.join(root, f) for f in files if f.endswith('.pdf')])

        if pdf_files:
             print(f"  Found {len(pdf_files)} PDFs. Starting OCR...")

             for pdf_path in tqdm(pdf_files, desc="OCR Processing"):
                 try:
                     doc = fitz.open(pdf_path)
                     full_text = ""

                     for i, page in enumerate(doc):
                         try:
                             # Render at 2x for quality
                             pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
                             img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

                             # Resize and convert
                             img = resize_for_ocr(img)
                             if pix.n == 4:
                                 img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
                             elif pix.n == 1:
                                 img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)

                             # PADDLEOCR INFERENCE
                             result = OCR_READER.ocr(img, cls=True)

                             # Extract text
                             if result and result[0]:
                                 page_text = " ".join([line[1][0] for line in result[0] if line[1][1] > 0.5])  # Confidence threshold
                                 if len(page_text) > 20:
                                     full_text += f" [Page {i+1}] {page_text} "
                         except Exception as e:
                             print(f"    Skipping page {i+1} in {os.path.basename(pdf_path)}: {str(e)[:50]}")
                             continue

                     if full_text:
                         model_name = get_matching_chassis(pdf_path) or "general"
                         documents.append(Document(
                             page_content=full_text,
                             metadata={"car_model": model_name, "source_type": "manual", "filename": os.path.basename(pdf_path)}
                         ))
                     doc.close()
                 except Exception as e:
                     print(f"  Critical error reading {os.path.basename(pdf_path)}: {e}")
                     continue

    # --- BUILD DB ---
    if not documents:
        print("\n❌ No documents found! Please check data source and connection.")
        return

    print(f"\n✅ Building vector DB with {len(documents)} documents...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = splitter.split_documents(documents)

    # Handle DB persistence
    if os.path.exists(DB_PATH):
        try:
            shutil.rmtree(DB_PATH)
            print("  Replaced old database")
        except:
            print("  Could not delete old DB. Saving to new folder")
            DB_PATH = f"{DB_PATH}_new_{int(time.time())}"

    device = "cuda" if USE_GPU else "cpu"
    emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': device})

    # Initialize Chroma DB
    db = Chroma(persist_directory=DB_PATH, embedding_function=emb)

    batch_size = 100
    for i in tqdm(range(0, len(splits), batch_size), desc="Indexing"):
        db.add_documents(splits[i:i+batch_size])

    print(f"\n✅ DONE! Database saved to: {DB_PATH}")
    print(f"   Total chunks indexed: {len(splits)}")

if __name__ == "__main__":
    build_smart_database()

GPU: NVIDIA A100-SXM4-80GB

⚙️ Initializing PaddleOCR (GPU Powered)...
download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 4.00M/4.00M [00:21<00:00, 184kiB/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar to /root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar


100%|██████████| 10.2M/10.2M [00:26<00:00, 385kiB/s] 


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2.19M/2.19M [00:18<00:00, 117kiB/s]


✅ PaddleOCR initialized successfully!

Phase 1: Wikipedia scan (151 models)


Wiki Ingest:   0%|          | 0/151 [00:00<?, ?it/s]


Phase 2: Local HTML files (Drive)
  Ingested 1117 local HTML files

Phase 3: PDF OCR (PaddleOCR)
  Found 82 PDFs. Starting OCR...


OCR Processing:   0%|          | 0/82 [00:00<?, ?it/s]

MuPDF error: format error: overflow in 2d faxd

MuPDF error: format error: overflow in 2d faxd

MuPDF error: format error: overflow in 2d faxd


✅ Building vector DB with 1348 documents...
  Replaced old database


Indexing:   0%|          | 0/531 [00:00<?, ?it/s]


✅ DONE! Database saved to: /content/drive/MyDrive/bmw_knowledge_db_rag
   Total chunks indexed: 53008


In [2]:
import shutil
# Zip the folder SAFELY on the linux machine first
shutil.make_archive("/content/drive/MyDrive/bmw_db_rag_easyocr", 'zip', "/content/drive/MyDrive/bmw_knowledge_db_rag_easyocr")
print("✅ Created zip in Drive.")

✅ Created zip in Drive.


In [2]:
!pip install numpy==1.26.4

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
xarray 2025.11.0 requires packaging>=24.1, but you have packaging 23.2 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
db-dtypes 1.4.4 requires packaging>=24.2.0, but you have packaging 2

In [1]:
!pip in
import chromadb
print(chromadb.__version__)

ModuleNotFoundError: No module named 'chromadb'