# EmbeddingGemma Embedding futtatás RunPod GPU-n

In [None]:
%pip install faiss-cpu tqdm transformers accelerate huggingface_hub

/Users/zelenyianszkimate/Documents/CourtRankRL/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


In [None]:
import json
from pathlib import Path
from typing import List
import math

import faiss
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# Hugging Face login
from huggingface_hub import login
import os

# Load the HF token from a .env file when python-dotenv is installed.
# The import is optional so that a token already exported in the process
# environment still works on machines without python-dotenv.
try:
    from dotenv import load_dotenv
except ImportError:
    print("python-dotenv is not installed - reading HUGGINGFACE_TOKEN from the environment only")
else:
    load_dotenv()  # reads .env into os.environ

hf_token = os.getenv('HUGGINGFACE_TOKEN')
if hf_token:
    login(token=hf_token)
    print("‚úÖ HuggingFace bejelentkez√©s sikeres")
else:
    print("‚ö†Ô∏è  HF token nem tal√°lhat√≥ - egyes modellek korl√°tozottak lehetnek")


## Paraméterek
Állítsd be a bemeneti/kimeneti elérési utakat.

In [None]:
# Parameter setup for a standalone RunPod run.
# Every path and setting is hard-coded so the notebook needs no external config.

# Model configuration
model_name = "google/embeddinggemma-300m"

# Memory-conscious settings
batch_size = 32   # kept small to avoid GPU memory problems
max_length = 512  # reduced maximum token length

# Paths, laid out for the RunPod working directory
chunks_path = Path("/workspace/data") / "processed" / "chunks.jsonl"
faiss_path = Path("/workspace/data") / "index" / "faiss_index.bin"
chunk_map_path = Path("/workspace/data") / "index" / "chunk_id_map.json"

# Echo the effective configuration, one setting per line.
print(
    "\n".join(
        [
            "RunPod √∂n√°ll√≥ notebook konfigur√°ci√≥:",
            f"Chunks path: {chunks_path}",
            f"FAISS path: {faiss_path}",
            f"Chunk map path: {chunk_map_path}",
            f"Model: {model_name}",
            f"Batch size: {batch_size}",
            f"Max length: {max_length}",
            "Minden konfigur√°ci√≥ be√°ll√≠tva - notebook √∂n√°ll√≥an futtathat√≥!",
        ]
    )
)


NameError: name '__file__' is not defined

## Modell betöltése
Feltételezzük, hogy GPU elérhető (`torch.cuda.is_available()`).

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Fallback embedding width, used only if the model config exposes no hidden_size.
embedding_dim = 768

# The EmbeddingGemma model card states that its activations do not support
# float16, so use bfloat16 where the GPU supports it and float32 otherwise.
if device == 'cuda' and torch.cuda.is_bf16_supported():
    model_dtype = torch.bfloat16
else:
    model_dtype = torch.float32

if device == 'cuda':
    torch.cuda.empty_cache()
    print(f"GPU mem√≥ria el≈ëtte: {torch.cuda.memory_allocated()/1024**3:.1f} GB")

# Loading happens for every device: the later cells need `tokenizer` and
# `model` regardless of whether a GPU was found.
print(f"Modell bet√∂lt√©s: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

try:
    # low_cpu_mem_usage relies on accelerate. No device_map is passed: the
    # model is moved explicitly with .to(device), and mixing an
    # accelerate-dispatched model with .to() is not supported.
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        dtype=model_dtype,
        low_cpu_mem_usage=True,
    ).to(device)
    print("Accelerate haszn√°lata sikeres")
except (ImportError, ValueError) as e:
    # Fallback when accelerate is unavailable: plain load, then move.
    print(f"Accelerate hiba, fallback megold√°s: {e}")
    if device == 'cuda':
        print("Modell bet√∂lt√©s CPU-ra, majd √°thelyez√©s GPU-ra...")
    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        dtype=model_dtype,
    ).to(device)
    if device == 'cuda':
        print("Modell sikeresen bet√∂ltve GPU-ra")

model.eval()
# Keep the default above when the config has no hidden_size attribute.
embedding_dim = getattr(model.config, 'hidden_size', embedding_dim)

if device == 'cuda':
    torch.cuda.empty_cache()
    print(f"GPU mem√≥ria ut√°na: {torch.cuda.memory_allocated()/1024**3:.1f} GB")

print(f"Model device: {device}")
print(f"Embedding dimension: {embedding_dim}")
print("Model loaded successfully!")
print(f"Batch size: {batch_size}, Max length: {max_length}")

## Embedding függvény
Egyszerű batch feldolgozás GPU-n.

In [None]:
def embed_batch(texts: List[str]) -> np.ndarray:
    """Embed ``texts`` into L2-normalised float32 vectors.

    Uses the module-level ``tokenizer``, ``model``, ``device``, ``batch_size``
    and ``max_length``.

    Args:
        texts: Strings to embed. Lists longer than ``batch_size`` are
            processed in consecutive slices of at most ``batch_size`` items.

    Returns:
        A 2-D float32 array with one row per input text. Rows are scaled to
        unit L2 norm; an all-zero row is left as zeros. For empty input the
        result has zero rows.
    """
    if not texts:
        # embedding_dim may have been reset to None by a later cell; fall
        # back to the model config so the empty result keeps a valid shape.
        width = embedding_dim or model.config.hidden_size
        return np.zeros((0, width), dtype=np.float32)

    # Memory guard: split oversized inputs into batch_size slices.
    if len(texts) > batch_size:
        parts = [
            embed_batch(texts[start:start + batch_size])
            for start in range(0, len(texts), batch_size)
        ]
        return np.vstack(parts)

    inputs = tokenizer(
        texts,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
        padding=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        if hasattr(outputs, 'last_hidden_state'):
            # Attention-mask-weighted mean over real tokens. Taking position 0
            # instead would pick a padding token whenever the tokenizer pads
            # on the left, and mean pooling is the pooling mode published for
            # EmbeddingGemma.
            # NOTE(review): the published sentence-transformers pipeline also
            # applies dense projection layers after pooling, which AutoModel
            # does not load - confirm whether those are needed here.
            hidden = outputs.last_hidden_state.float()
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1.0)  # avoid 0/0
            embeddings = (hidden * mask).sum(dim=1) / token_counts
        else:
            embeddings = outputs.pooler_output.float()

    # Release the GPU-side inputs and outputs before the next batch.
    del inputs, outputs
    if device == 'cuda':
        torch.cuda.empty_cache()

    embeddings = embeddings.cpu().numpy()
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # keep zero vectors as zeros instead of NaN
    return embeddings / norms


## FAISS index létrehozása
Az index paramétereit egyszerűre vesszük: nlist = sqrt(N), PQ m=64 (igazítva a dimenzióhoz).

In [None]:
def _iter_chunk_batches(lines, size, skipped):
    """Yield ``(ids, texts)`` batches of at most ``size`` usable chunks.

    Args:
        lines: Iterable of JSONL lines, one JSON object per line.
        size: Maximum number of chunks per yielded batch.
        skipped: Dict with the counters ``'invalid_json'`` and
            ``'missing_fields'``; incremented in place for every dropped line.

    Yields:
        Two parallel lists: chunk ids and chunk texts.
    """
    ids, texts = [], []
    for raw in lines:
        raw = raw.strip()
        if not raw:
            continue
        try:
            chunk = json.loads(raw)
        except json.JSONDecodeError:
            skipped['invalid_json'] += 1
            continue
        if not isinstance(chunk, dict):
            # Valid JSON that is not an object has no .get(); count it as bad.
            skipped['invalid_json'] += 1
            continue
        chunk_id = str(chunk.get('chunk_id', '')).strip()
        text = chunk.get('text', '')
        if not isinstance(text, str):
            text = str(text)
        if not chunk_id or not text.strip():
            skipped['missing_fields'] += 1
            continue
        ids.append(chunk_id)
        texts.append(text)
        if len(texts) >= size:
            yield ids, texts
            ids, texts = [], []
    # Remaining partial batch.
    if texts:
        yield ids, texts


chunk_ids = []
index = None
vector_count = 0
skipped_lines = {'invalid_json': 0, 'missing_fields': 0}

# Stream the chunk file and add each embedded batch to an exact
# inner-product index. embedding_dim is assigned from the first batch and is
# deliberately not reset to None beforehand, because embed_batch reads it.
with chunks_path.open('r', encoding='utf-8') as handle:
    iterator = tqdm(handle, desc='Streaming embedding gener√°l√°s')
    for batch_ids, batch_texts in _iter_chunk_batches(iterator, batch_size, skipped_lines):
        vectors = embed_batch(batch_texts)
        if vectors.size == 0:
            continue
        if index is None:
            embedding_dim = vectors.shape[1]
            index = faiss.IndexFlatIP(embedding_dim)
        index.add(vectors.astype(np.float32, copy=False))
        chunk_ids.extend(batch_ids)
        vector_count += vectors.shape[0]

if any(skipped_lines.values()):
    print(
        f"Skipped lines - invalid JSON: {skipped_lines['invalid_json']}, "
        f"missing chunk_id/text: {skipped_lines['missing_fields']}"
    )

# Sanity check
if index is None or index.ntotal == 0:
    raise ValueError("Hiba: Nem siker√ºlt embedding indexet l√©trehozni!")
print(f"Feldolgozott chunk-ok sz√°ma: {len(chunk_ids)}")
print(f"Vektorok sz√°ma az indexben: {index.ntotal}")
vector_count

## FAISS paraméterek és index tréning

In [None]:
# FAISS parameters and construction of the compressed (IVF-PQ) index.
if index is None:
    raise ValueError("Hiba: Az index nem lett inicializ√°lva!")

pq_bits = 8
nlist = max(1, int(math.sqrt(vector_count)))
# Largest sub-quantizer count <= 64 that divides the embedding dimension.
pq_m = next(m for m in range(64, 0, -1) if embedding_dim % m == 0)

# FAISS k-means refuses to train with fewer points than centroids: the coarse
# quantizer needs nlist points and each PQ codebook needs 2**pq_bits points.
min_training_vectors = max(nlist, 2 ** pq_bits)

if index.ntotal < min_training_vectors:
    # Too few vectors to train IVF-PQ; the exact flat index is kept as is.
    print(
        f"Only {index.ntotal} vectors (< {min_training_vectors}); "
        "keeping the exact IndexFlatIP instead of training IVF-PQ."
    )
else:
    # Pull the stored vectors back out of the flat index.
    print("üìä Vektorok kinyer√©se az IndexFlatIP-b≈ël...")
    try:
        all_vectors = index.reconstruct_n(0, index.ntotal)
    except AttributeError as err:
        raise AttributeError("Ez a FAISS build nem t√°mogatja az 'xb' attrib√∫tumot; a reconstruct_n API-t kell haszn√°lni a vektorok kinyer√©s√©hez.") from err
    if not isinstance(all_vectors, np.ndarray):
        # Some builds may hand back a FAISS vector wrapper; convert it.
        vector_to_array = getattr(faiss, "vector_float_to_array", getattr(faiss, "vector_to_array", None))
        if vector_to_array is None:
            raise RuntimeError("Nem tal√°lhat√≥ FAISS seg√©df√ºggv√©ny a vektorok numpy t√∂mbb√© alak√≠t√°s√°hoz.")
        all_vectors = vector_to_array(all_vectors)
        all_vectors = all_vectors.reshape(index.ntotal, embedding_dim)
    all_vectors = np.ascontiguousarray(all_vectors, dtype=np.float32)
    print(f"‚úÖ Vektorok kinyerve: {all_vectors.shape}")

    # Build the quantized index.
    quantizer = faiss.IndexFlatIP(embedding_dim)
    new_index = faiss.IndexIVFPQ(quantizer, embedding_dim, nlist, pq_m, pq_bits, faiss.METRIC_INNER_PRODUCT)

    # Train, then add in the original order so row i still maps to chunk_ids[i].
    new_index.train(all_vectors)
    new_index.add(all_vectors)

    # Swap the flat index for the compressed one.
    index = new_index

# Summary
print("Index created with", len(chunk_ids), "vectors")



## Eredmények mentése

In [None]:
# Persist the index and the row-position -> chunk_id mapping.
# A row/id count mismatch would silently corrupt lookups, so refuse to save.
if index.ntotal != len(chunk_ids):
    raise ValueError(
        f"Index holds {index.ntotal} vectors but {len(chunk_ids)} chunk ids were collected"
    )

# Both output files may live in different directories; create each parent.
faiss_path.parent.mkdir(parents=True, exist_ok=True)
chunk_map_path.parent.mkdir(parents=True, exist_ok=True)

faiss.write_index(index, str(faiss_path))
chunk_id_map = {str(i): cid for i, cid in enumerate(chunk_ids)}
with chunk_map_path.open('w', encoding='utf-8') as f:
    json.dump(chunk_id_map, f)
faiss_path, chunk_map_path