In [2]:
import os
import json
import shutil
import requests
import numpy as np
import easyocr 
from pdf2image import convert_from_path 
from bs4 import BeautifulSoup
from urllib.parse import urljoin 
from langchain_community.document_loaders import WikipediaLoader, WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from tqdm import tqdm
import torch
import warnings

# --- CONFIGURATION ---
# DB_PATH:         directory the Chroma store is persisted to.
# DATA_PATH:       root folder holding one sub-folder of PDF manuals per model.
# EMBEDDING_MODEL: model name handed to HuggingFaceEmbeddings.
DB_PATH = "bmw_knowledge_db_rag"
DATA_PATH = "rag_data"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# Mute warnings (every category, for the whole session)
warnings.filterwarnings("ignore")

# Chassis codes this build focuses on. The matchers in this file return the
# FIRST code found as a substring, so the more specific "E36-7" / "E36-8"
# entries have to stay ahead of the plain "E36" entry.
FOCUS_CARS = [
    "E24",
    "E28",
    "E30",
    "E31",
    "E32",
    "E34",
    "E36-7",
    "E36-8",
    "E36",
    "E38",
    "E39",
    "E46",
    "E52",
    "E53",
    "E83",
    "Z1",
    "Z3",
    "Z8",
]

# --- BROWSER HEADERS (To bypass blocking) ---
# Sent with every HTTP request so the crawlers look like a desktop browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

# --- GPU SAFETY CHECK ---
# Decides whether CUDA is actually usable (e.g. on the P100s) or whether we must fall back to CPU.
def get_device():
    """Report whether CUDA can really be used on this machine.

    ``torch.cuda.is_available()`` only says a device is visible, so a tiny
    tensor is moved to the GPU to prove the installed build can drive it.

    Returns:
        bool: ``True`` when the probe allocation succeeded, ``False`` when
        CUDA is unavailable or the probe raised.
    """
    if not torch.cuda.is_available():
        return False

    try:
        # Try to actually use the GPU. If arch is wrong, this throws an error.
        torch.zeros(1).cuda()
    except Exception:
        # Deliberately not a bare ``except``: Ctrl-C and SystemExit must still
        # propagate instead of being reported as "incompatible GPU".
        print("  ‚ö†Ô∏è GPU detected but incompatible (P100 vs PyTorch Version).")
        print("  ‚ö†Ô∏è Switching to CPU mode. It will be slower, but it will work.")
        return False

    print("  ‚úÖ Compatible GPU detected (CUDA).")
    return True  # use_gpu = True

# Probe the GPU once; the OCR reader and the embedding model both read this flag.
USE_GPU = get_device()

# Load class map
# The JSON values are the model names every build phase iterates over; without
# the file nothing can be built, so its absence is a hard error.
if os.path.exists('bmw_class_names.json'):
    with open('bmw_class_names.json', 'r') as f:
        class_map = json.load(f)
    all_models = list(class_map.values())
else:
    raise FileNotFoundError("Critical error! bmw_class_names.json not found")

# --- HELPER FUNCTIONS (chassis-code matching and site crawlers) ---
def get_matching_chassis(text):
    """Return the first ``FOCUS_CARS`` code contained in *text*, ignoring case.

    Matching is plain substring containment, tried in ``FOCUS_CARS`` order.
    Returns ``None`` when no code occurs in *text*.
    """
    haystack = text.lower()
    return next((code for code in FOCUS_CARS if code.lower() in haystack), None)

def is_focus_car(name):
    """Return the first ``FOCUS_CARS`` code found in the upper-cased *name*.

    Despite the ``is_`` prefix the result is the matching code itself (truthy)
    or ``None`` when *name* contains none of the codes.
    """
    return next(filter(lambda code: code in name.upper(), FOCUS_CARS), None)

def scrape_fcp_index():
    """Collect FCP Euro blog links that mention a focus chassis code.

    Requests pages 1-5 of the BMW tag listing and keeps every link whose href
    contains ``/blog/`` and whose absolute URL contains one of the
    ``FOCUS_CARS`` codes (checked with ``get_matching_chassis``).

    Returns:
        dict[str, str]: absolute article URL -> matched chassis code. Listing
        pages that answer with a non-200 status or raise while being fetched
        or parsed are skipped, so the result may be partial or empty.
    """
    found = {}
    base = "https://www.fcpeuro.com/blog/tag/bmw?page="
    print("\n crawling fcp euro blog...")

    # Reduced to 5 pages for speed testing
    for page in tqdm(range(1, 6), desc="scanning fcp"):
        page_url = f"{base}{page}"
        try:
            r = requests.get(page_url, headers=HEADERS, timeout=5)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, 'html.parser')
            for a in soup.find_all('a', href=True):
                href = a['href']
                if '/blog/' not in href:
                    continue
                # urljoin keeps absolute links untouched and resolves relative
                # or protocol-relative ones correctly, which plain string
                # concatenation with the host name did not.
                url = urljoin(page_url, href)
                code = get_matching_chassis(url)
                if code:
                    found[url] = code
        except Exception as exc:
            # Best-effort crawl: report the page and move on. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            tqdm.write(f"  skipped fcp page {page}: {exc}")
    return found

def scrape_pelican_index():
    """Collect Pelican Parts tech-article links for the focus chassis codes.

    Step 1 reads the master index and keeps links whose text or href contains
    a ``FOCUS_CARS`` code and whose href contains ``tech_main``. Step 2 visits
    each of those sub-pages and records every ``.htm`` link whose href
    contains ``techarticles``, tagged with the sub-page's chassis code.

    Returns:
        dict[str, str]: absolute article URL -> chassis code. Empty when the
        master index cannot be fetched; sub-pages that fail are skipped.
    """
    found = {}
    master = "https://www.pelicanparts.com/bmw/techarticles/tech_main.htm"
    print("\n crawling pelican parts...")

    # 1. Sub-pages
    subs = set()
    try:
        r = requests.get(master, headers=HEADERS, timeout=10)
        if r.status_code != 200:
            return found
        soup = BeautifulSoup(r.text, 'html.parser')
        for a in soup.find_all('a', href=True):
            href = a['href']
            code = get_matching_chassis(a.get_text()) or get_matching_chassis(href)
            if code and "tech_main" in href:
                subs.add((urljoin(master, href), code))
    except Exception as exc:
        # Best-effort crawl: without the master index there is nothing to
        # scan. Exception (not a bare except) keeps Ctrl-C working.
        print(f"  pelican index unavailable: {exc}")
        return found

    # 2. Articles
    for sub_url, code in tqdm(subs, desc="scanning sub-pages"):
        try:
            sr = requests.get(sub_url, headers=HEADERS, timeout=5)
            # Same status check as the master page, so error pages are not
            # mined for links.
            if sr.status_code != 200:
                continue
            ss = BeautifulSoup(sr.text, 'html.parser')
            for sa in ss.find_all('a', href=True):
                link = sa['href']
                if "techarticles" in link and link.endswith(".htm"):
                    found[urljoin(sub_url, link)] = code
        except Exception as exc:
            tqdm.write(f"  skipped pelican sub-page {sub_url}: {exc}")
    return found

def _load_wiki_documents():
    """Load at most one Wikipedia document per model in ``all_models`` and tag it."""
    documents = []
    for model in tqdm(all_models, desc="Wiki"):
        if "non_bmw" in model:
            continue
        try:
            # Loading just 1 doc to keep it fast
            docs = WikipediaLoader(query=model.replace("_", " "), load_max_docs=1).load()
        except Exception as exc:
            # Best-effort: one failed lookup must not abort the whole build,
            # but it should be visible instead of silently dropped.
            tqdm.write(f"  wiki lookup failed for {model}: {exc}")
            continue
        for d in docs:
            d.metadata["car_model"] = model
            d.metadata["source_type"] = "General History"
        documents.extend(docs)
    return documents


def _load_web_documents():
    """Crawl both article indexes and download the pages in batches of 10."""
    fcp = scrape_fcp_index()
    pel = scrape_pelican_index()
    urls = list({**fcp, **pel})

    documents = []
    if not urls:
        return documents

    print(f"  downloading {len(urls)} guides...")
    for start in tqdm(range(0, len(urls), 10), desc="downloading"):
        batch = urls[start:start + 10]
        try:
            loader = WebBaseLoader(batch, header_template=HEADERS)
            loader.requests_per_second = 2
            docs = loader.load()
        except Exception as exc:
            # Best-effort: skip the batch, but say so.
            tqdm.write(f"  download failed for batch starting at {start}: {exc}")
            continue
        for d in docs:
            url = d.metadata.get('source', '')
            d.metadata["car_model"] = fcp.get(url) or pel.get(url) or "General"
            d.metadata["source_type"] = "Expert Guide"
            d.page_content = d.page_content.replace("\n", " ")
        documents.extend(docs)
    return documents


def _ocr_pdf(reader, pdf_path):
    """Convert *pdf_path* to page images and return the OCR text of all pages."""
    images = convert_from_path(pdf_path)
    pages = []
    for i, img in enumerate(images):
        res = reader.readtext(np.array(img), detail=0)
        pages.append(f" [Page {i+1}] " + " ".join(res))
    # Joining once avoids rebuilding the growing string on every page.
    return "".join(pages)


def _load_pdf_documents():
    """OCR every PDF found under ``DATA_PATH/<model_name>`` into one Document each."""
    documents = []
    reader = None
    for model_name in all_models:
        if "non_bmw" in model_name:
            continue
        folder = os.path.join(DATA_PATH, model_name)
        if not os.path.isdir(folder):
            continue
        # Case-insensitive so manuals saved as ".PDF" are picked up too.
        pdfs = [f for f in os.listdir(folder) if f.lower().endswith('.pdf')]
        if not pdfs:
            continue

        # Built on first use: there is no point initialising the OCR reader
        # when no manual exists at all.
        if reader is None:
            reader = easyocr.Reader(['en'], gpu=USE_GPU)

        code = is_focus_car(model_name) or model_name
        print(f"  processing {len(pdfs)} manuals for {code}...")

        for pdf in pdfs:
            try:
                text = _ocr_pdf(reader, os.path.join(folder, pdf))
            except Exception as e:
                print(f"    ‚ùå failed {pdf}: {e}")
                continue
            documents.append(Document(
                page_content=text,
                metadata={"car_model": code, "source_type": "Manual", "filename": pdf}
            ))
            print(f"    ‚úÖ read {pdf}")
    return documents


def _index_documents(documents):
    """Split *documents* into chunks and write them to a fresh Chroma store at ``DB_PATH``."""
    print(f"\n Building DB with {len(documents)} docs...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = splitter.split_documents(documents)

    # Embeddings also need to know if GPU is safe to use
    device = "cuda" if USE_GPU else "cpu"
    emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': device})

    # The old store is removed only after the embedding model loaded, so a
    # failure up to this point leaves the previous database intact.
    if os.path.exists(DB_PATH):
        shutil.rmtree(DB_PATH)

    db = Chroma(persist_directory=DB_PATH, embedding_function=emb)

    for i in tqdm(range(0, len(splits), 100), desc="indexing"):
        db.add_documents(splits[i:i+100])

    print(f"\n üöÄ DONE! Database at {DB_PATH}")


def build_smart_database():
    """Build the knowledge base from Wikipedia, crawled guides and local PDFs.

    Runs three collection phases (Wikipedia articles per model, FCP Euro and
    Pelican Parts guides, OCR of local PDF manuals), then chunks everything
    and stores it in a Chroma database at ``DB_PATH``, replacing any existing
    one. Failures of individual lookups, batches or PDFs are reported and
    skipped. Returns ``None``; when no document was collected nothing is
    written.
    """
    documents = []

    # PHASE 1: WIKI
    print("\n wiki time!")
    documents.extend(_load_wiki_documents())

    # PHASE 2: WEB
    print("\n web crawlers!")
    documents.extend(_load_web_documents())

    # PHASE 3: PDF (With GPU Check)
    print("\n local pdfs (OCR)...")
    print(f"  OCR Device Mode: {'GPU (Fast)' if USE_GPU else 'CPU (Slow)'}")
    documents.extend(_load_pdf_documents())

    # BUILD DB
    if not documents:
        print("‚ùå No documents found.")
        return

    _index_documents(documents)


if __name__ == "__main__":
    build_smart_database()

ModuleNotFoundError: No module named 'requests'

In [1]:
import os
import json
import shutil
import requests
import numpy as np
import easyocr
import fitz  # <--- REPLACES pdf2image
# from pdf2image import convert_from_path  <--- REMOVED
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from langchain_community.document_loaders import WikipediaLoader, WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from tqdm import tqdm
import torch
import warnings

# --- CONFIGURATION ---
# DB_PATH:         directory the Chroma store is persisted to.
# DATA_PATH:       root folder holding one sub-folder of PDF manuals per model.
# EMBEDDING_MODEL: model name handed to HuggingFaceEmbeddings.
DB_PATH = "bmw_knowledge_db_rag"
DATA_PATH = "rag_data"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"

# Mute warnings (every category, for the whole session)
warnings.filterwarnings("ignore")

# Chassis codes this build focuses on. The matchers in this file return the
# FIRST code found as a substring, so the more specific "E36-7" / "E36-8"
# entries have to stay ahead of the plain "E36" entry.
FOCUS_CARS = [
    "E24",
    "E28",
    "E30",
    "E31",
    "E32",
    "E34",
    "E36-7",
    "E36-8",
    "E36",
    "E38",
    "E39",
    "E46",
    "E52",
    "E53",
    "E83",
    "Z1",
    "Z3",
    "Z8",
]

# --- BROWSER HEADERS (To bypass blocking) ---
# Sent with every HTTP request so the crawlers look like a desktop browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

# --- GPU SAFETY CHECK ---
def get_device():
    """Report whether CUDA can really be used on this machine.

    ``torch.cuda.is_available()`` only says a device is visible, so a tiny
    tensor is moved to the GPU to prove the installed build can drive it.

    Returns:
        bool: ``True`` when the probe allocation succeeded, ``False`` when
        CUDA is unavailable or the probe raised.
    """
    if not torch.cuda.is_available():
        return False

    try:
        # Try to actually use the GPU. If arch is wrong, this throws an error.
        torch.zeros(1).cuda()
    except Exception:
        # Deliberately not a bare ``except``: Ctrl-C and SystemExit must still
        # propagate instead of being reported as "incompatible GPU".
        print("  ‚ö†Ô∏è GPU detected but incompatible (P100 vs PyTorch Version).")
        print("  ‚ö†Ô∏è Switching to CPU mode. It will be slower, but it will work.")
        return False

    print("  ‚úÖ Compatible GPU detected (CUDA).")
    return True

# Probe the GPU once; the OCR reader and the embedding model both read this flag.
USE_GPU = get_device()

# Load class map
# The JSON values are the model names every build phase iterates over.
if os.path.exists('bmw_class_names.json'):
    with open('bmw_class_names.json', 'r') as f:
        class_map = json.load(f)
    all_models = list(class_map.values())
else:
    # Safety fallback for testing: with no class map the build runs over an
    # empty model list instead of crashing.
    print("Warning: bmw_class_names.json not found, using empty list.")
    all_models = []

def get_matching_chassis(text):
    """Return the first ``FOCUS_CARS`` code contained in *text*, ignoring case.

    Matching is plain substring containment, tried in ``FOCUS_CARS`` order.
    Returns ``None`` when no code occurs in *text*.
    """
    lowered = text.lower()
    hits = (code for code in FOCUS_CARS if code.lower() in lowered)
    return next(hits, None)

def is_focus_car(name):
    """Return the first ``FOCUS_CARS`` code found in the upper-cased *name*.

    Despite the ``is_`` prefix the result is the matching code itself (truthy)
    or ``None`` when *name* contains none of the codes.
    """
    return next((code for code in FOCUS_CARS if code in name.upper()), None)

def scrape_fcp_index():
    """Collect FCP Euro blog links that mention a focus chassis code.

    Requests pages 1-5 of the BMW tag listing and keeps every link whose href
    contains ``/blog/`` and whose absolute URL contains one of the
    ``FOCUS_CARS`` codes (checked with ``get_matching_chassis``).

    Returns:
        dict[str, str]: absolute article URL -> matched chassis code. Listing
        pages that answer with a non-200 status or raise while being fetched
        or parsed are skipped, so the result may be partial or empty.
    """
    found = {}
    base = "https://www.fcpeuro.com/blog/tag/bmw?page="
    print("\n crawling fcp euro blog...")

    for page in tqdm(range(1, 6), desc="scanning fcp"):
        page_url = f"{base}{page}"
        try:
            r = requests.get(page_url, headers=HEADERS, timeout=5)
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, 'html.parser')
            for a in soup.find_all('a', href=True):
                href = a['href']
                if '/blog/' not in href:
                    continue
                # urljoin keeps absolute links untouched and resolves relative
                # or protocol-relative ones correctly, which plain string
                # concatenation with the host name did not.
                url = urljoin(page_url, href)
                code = get_matching_chassis(url)
                if code:
                    found[url] = code
        except Exception as exc:
            # Best-effort crawl: report the page and move on. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            tqdm.write(f"  skipped fcp page {page}: {exc}")
    return found

def scrape_pelican_index():
    """Collect Pelican Parts tech-article links for the focus chassis codes.

    First reads the master index and keeps links whose text or href contains
    a ``FOCUS_CARS`` code and whose href contains ``tech_main``. Then visits
    each of those sub-pages and records every ``.htm`` link whose href
    contains ``techarticles``, tagged with the sub-page's chassis code.

    Returns:
        dict[str, str]: absolute article URL -> chassis code. Empty when the
        master index cannot be fetched; sub-pages that fail are skipped.
    """
    found = {}
    master = "https://www.pelicanparts.com/bmw/techarticles/tech_main.htm"
    print("\n crawling pelican parts...")

    subs = set()
    try:
        r = requests.get(master, headers=HEADERS, timeout=10)
        if r.status_code != 200:
            return found
        soup = BeautifulSoup(r.text, 'html.parser')
        for a in soup.find_all('a', href=True):
            href = a['href']
            code = get_matching_chassis(a.get_text()) or get_matching_chassis(href)
            if code and "tech_main" in href:
                subs.add((urljoin(master, href), code))
    except Exception as exc:
        # Best-effort crawl: without the master index there is nothing to
        # scan. Exception (not a bare except) keeps Ctrl-C working.
        print(f"  pelican index unavailable: {exc}")
        return found

    for sub_url, code in tqdm(subs, desc="scanning sub-pages"):
        try:
            sr = requests.get(sub_url, headers=HEADERS, timeout=5)
            # Same status check as the master page, so error pages are not
            # mined for links.
            if sr.status_code != 200:
                continue
            ss = BeautifulSoup(sr.text, 'html.parser')
            for sa in ss.find_all('a', href=True):
                link = sa['href']
                if "techarticles" in link and link.endswith(".htm"):
                    found[urljoin(sub_url, link)] = code
        except Exception as exc:
            tqdm.write(f"  skipped pelican sub-page {sub_url}: {exc}")
    return found

def _load_wiki_documents():
    """Load at most one Wikipedia document per model in ``all_models`` and tag it."""
    documents = []
    for model in tqdm(all_models, desc="Wiki"):
        if "non_bmw" in model:
            continue
        try:
            docs = WikipediaLoader(query=model.replace("_", " "), load_max_docs=1).load()
        except Exception as exc:
            # Best-effort: one failed lookup must not abort the whole build,
            # but it should be visible instead of silently dropped.
            tqdm.write(f"  wiki lookup failed for {model}: {exc}")
            continue
        for d in docs:
            d.metadata["car_model"] = model
            d.metadata["source_type"] = "General History"
        documents.extend(docs)
    return documents


def _load_web_documents():
    """Crawl both article indexes and download the pages in batches of 10."""
    fcp = scrape_fcp_index()
    pel = scrape_pelican_index()
    urls = list({**fcp, **pel})

    documents = []
    if not urls:
        return documents

    print(f"  downloading {len(urls)} guides...")
    for start in tqdm(range(0, len(urls), 10), desc="downloading"):
        batch = urls[start:start + 10]
        try:
            loader = WebBaseLoader(batch, header_template=HEADERS)
            loader.requests_per_second = 2
            docs = loader.load()
        except Exception as exc:
            # Best-effort: skip the batch, but say so.
            tqdm.write(f"  download failed for batch starting at {start}: {exc}")
            continue
        for d in docs:
            url = d.metadata.get('source', '')
            d.metadata["car_model"] = fcp.get(url) or pel.get(url) or "General"
            d.metadata["source_type"] = "Expert Guide"
            d.page_content = d.page_content.replace("\n", " ")
        documents.extend(docs)
    return documents


def _ocr_pdf(reader, pdf_path):
    """Render every page of *pdf_path* with fitz and return the OCR text of all pages."""
    pages = []
    # The context manager closes the file handle even when rendering or OCR
    # raises; a trailing doc.close() was skipped on any exception.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            # Render page to image (pixmap)
            # NOTE(review): get_pixmap() is called with its default
            # resolution; confirm it is high enough for reliable OCR.
            pix = page.get_pixmap()
            # Convert to numpy array (H, W, Channels)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

            # Handle transparency (4 channels -> 3 channels)
            # presumably the 4th channel is alpha - TODO confirm
            if pix.n == 4:
                img = img[:, :, :3]

            # Pass to EasyOCR
            res = reader.readtext(img, detail=0)
            pages.append(f" [Page {i+1}] " + " ".join(res))
    # Joining once avoids rebuilding the growing string on every page.
    return "".join(pages)


def _load_pdf_documents():
    """OCR every PDF found under ``DATA_PATH/<model_name>`` into one Document each."""
    documents = []
    reader = None
    for model_name in all_models:
        if "non_bmw" in model_name:
            continue
        folder = os.path.join(DATA_PATH, model_name)
        if not os.path.isdir(folder):
            continue
        # Case-insensitive so manuals saved as ".PDF" are picked up too.
        pdfs = [f for f in os.listdir(folder) if f.lower().endswith('.pdf')]
        if not pdfs:
            continue

        # Built on first use: there is no point initialising the OCR reader
        # when no manual exists at all.
        if reader is None:
            reader = easyocr.Reader(['en'], gpu=USE_GPU)

        code = is_focus_car(model_name) or model_name
        print(f"  processing {len(pdfs)} manuals for {code}...")

        for pdf in pdfs:
            pdf_path = os.path.join(folder, pdf)
            try:
                text = _ocr_pdf(reader, pdf_path)
            except Exception as e:
                print(f"    ‚ùå failed {pdf}: {e}")
                continue
            documents.append(Document(
                page_content=text,
                metadata={"car_model": code, "source_type": "Manual", "filename": pdf}
            ))
            print(f"    ‚úÖ read {pdf}")
    return documents


def _index_documents(documents):
    """Split *documents* into chunks and write them to a fresh Chroma store at ``DB_PATH``."""
    print(f"\n Building DB with {len(documents)} docs...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = splitter.split_documents(documents)

    # Embeddings setup
    device = "cuda" if USE_GPU else "cpu"
    emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs={'device': device})

    # The old store is removed only after the embedding model loaded, so a
    # failure up to this point leaves the previous database intact.
    if os.path.exists(DB_PATH):
        shutil.rmtree(DB_PATH)

    db = Chroma(persist_directory=DB_PATH, embedding_function=emb)

    for i in tqdm(range(0, len(splits), 100), desc="indexing"):
        db.add_documents(splits[i:i+100])

    print(f"\n üöÄ DONE! Database at {DB_PATH}")


def build_smart_database():
    """Build the knowledge base from Wikipedia, crawled guides and local PDFs.

    Runs three collection phases (Wikipedia articles per model, FCP Euro and
    Pelican Parts guides, OCR of local PDF manuals rendered with fitz), then
    chunks everything and stores it in a Chroma database at ``DB_PATH``,
    replacing any existing one. Failures of individual lookups, batches or
    PDFs are reported and skipped. Returns ``None``; when no document was
    collected nothing is written.
    """
    documents = []

    # PHASE 1: WIKI
    print("\n wiki time!")
    documents.extend(_load_wiki_documents())

    # PHASE 2: WEB
    print("\n web crawlers!")
    documents.extend(_load_web_documents())

    # PHASE 3: PDF (With Fitz & GPU Check)
    print("\n local pdfs (OCR)...")
    print(f"  OCR Device Mode: {'GPU (Fast)' if USE_GPU else 'CPU (Slow)'}")
    documents.extend(_load_pdf_documents())

    # BUILD DB
    if not documents:
        print("‚ùå No documents found.")
        return

    _index_documents(documents)


if __name__ == "__main__":
    build_smart_database()

ModuleNotFoundError: No module named 'requests'