In [None]:
# === 1. ENVIRONMENT SETUP ===
# Install and import the required libraries
%pip install -U torch sentence-transformers accelerate pyarrow pandas tqdm transformers runpod python-dotenv

import pandas as pd
import numpy as np
import gc
import json
import pyarrow.parquet as pq
import torch
import time
from tqdm.auto import tqdm
from typing import List
from pathlib import Path
import os
import runpod
from dotenv import load_dotenv

# Load the .env file so that RUNPOD_API_KEY can be read from the environment.
load_dotenv()

# GPU-specific optimizations are no longer needed because the computation
# happens in the cloud. The previous settings are kept here for reference:
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# torch.backends.cudnn.benchmark = True
# torch.backends.cuda.matmul.allow_tf32 = True

# Report whether CUDA is available on the local machine.
print(f"CUDA el√©rhet≈ë lok√°lisan: {torch.cuda.is_available()}")

In [None]:
# === 2. CONFIGURATION ===
# Configuration tailored to the RunPod Serverless API.

from pathlib import Path
import os

# Read the API key from the environment; the endpoint ID is hard-coded below.
RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY")
RUNPOD_ENDPOINT_ID = "5hjxb1eht972gw" # Endpoint ID copied from the RunPod UI

# Stop immediately when no API key is configured (unset or empty).
if not RUNPOD_API_KEY:
    raise ValueError("A RUNPOD_API_KEY k√∂rnyezeti v√°ltoz√≥ nincs be√°ll√≠tva! √Åll√≠tsd be egy .env f√°jlban.")

# Input and output files
INPUT_CSV_PATH = Path("../processed_data/cleaned_data_for_embedding.csv")
OUTPUT_PARQUET_PATH = Path("../processed_data/documents_with_embeddings_api.parquet")

# Embedding and batch size settings
EMBEDDING_DIMENSION = 1024
# Client-side batch size: the number of texts sent to the API in one request
BATCH_SIZE = 256

# Echo the effective configuration.
print(f"RunPod Endpoint ID: {RUNPOD_ENDPOINT_ID}")
print(f"Input: {INPUT_CSV_PATH}")
print(f"Output: {OUTPUT_PARQUET_PATH}")
print(f"Batch m√©ret: {BATCH_SIZE}")

In [None]:
# === 3. EMBEDDING GENERATOR CLASS (RUNPOD API) ===
class EmbeddingGeneratorAPI:
    """Generate text embeddings through a RunPod serverless endpoint.

    Texts are sent to the endpoint in batches of ``batch_size`` and the
    returned vectors are collected into a single ``float32`` array. When a
    batch request fails, or its response does not contain one embedding per
    input text, that batch is filled with all-zero vectors so the output
    rows stay aligned with the input texts.
    """

    def __init__(self, api_key: str, endpoint_id: str, batch_size: int, dimension: int):
        """Store the settings and create the RunPod endpoint handle.

        Args:
            api_key: RunPod API key; assigned to ``runpod.api_key``.
            endpoint_id: ID of the serverless endpoint to call.
            batch_size: Number of texts sent per request.
            dimension: Expected length of every embedding vector.
        """
        self.api_key = api_key
        self.endpoint_id = endpoint_id
        self.batch_size = batch_size
        self.dimension = dimension
        # Set the key on the runpod module before creating the endpoint handle.
        runpod.api_key = self.api_key
        self.endpoint = runpod.Endpoint(self.endpoint_id)
        print(f"RunPod API gener√°tor inicializ√°lva a(z) '{self.endpoint_id}' endpoint-ra.")

    def load_model(self):
        """Do nothing except print a notice; kept for interface compatibility."""
        # No model needs to be loaded locally; the endpoint handles it.
        print("A modell a RunPod szerveren fut, nincs sz√ºks√©g lok√°lis bet√∂lt√©sre.")

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Return one embedding per input text as a 2-D ``float32`` array.

        Args:
            texts: Texts to embed; they are processed in order, in slices of
                ``self.batch_size``.

        Returns:
            Array of shape ``(len(texts), self.dimension)``. Rows that belong
            to a failed batch are all zeros. An empty input yields an array
            of shape ``(0, self.dimension)``.

        Raises:
            ValueError: If the collected embeddings do not form a 2-D array
                whose second axis equals ``self.dimension``.
        """
        # Without this guard an empty input produces a 1-D array and the
        # shape check at the end fails with an IndexError.
        if len(texts) == 0:
            return np.zeros((0, self.dimension), dtype=np.float32)

        all_embeddings = []

        # Process the texts batch by batch.
        for i in tqdm(range(0, len(texts), self.batch_size), desc="Embedding gener√°l√°s (API)"):
            batch_texts = texts[i:i + self.batch_size]

            request = {
                "input": {
                    "texts": batch_texts,
                    "normalize_embeddings": True
                }
            }

            try:
                # Synchronous request that waits for the response (10 minute timeout).
                result = self.endpoint.run_sync(request, timeout=600)
                # NOTE(review): assumes the response nests the vectors under
                # ['output']['embeddings'] -- confirm against the endpoint handler.
                batch_embeddings = result['output']['embeddings']
                # A short or long response would shift every later embedding
                # onto the wrong text, so treat it like a failed request.
                if len(batch_embeddings) != len(batch_texts):
                    raise ValueError(
                        f"expected {len(batch_texts)} embeddings, got {len(batch_embeddings)}"
                    )
            except Exception as e:
                print(f"Hiba a RunPod API h√≠v√°s sor√°n a(z) {i}-edik elemn√©l: {e}")
                # Best-effort fallback: fill the failed batch with zero vectors
                # so that row alignment with the input is preserved.
                batch_embeddings = np.zeros(
                    (len(batch_texts), self.dimension), dtype=np.float32
                ).tolist()

            all_embeddings.extend(batch_embeddings)

        embeddings_array = np.array(all_embeddings, dtype=np.float32)

        # Guard against results that are not a matrix at all, so the
        # dimension check below cannot fail with an opaque IndexError.
        if embeddings_array.ndim != 2:
            raise ValueError(
                f"Expected a 2-D embedding array, got shape {embeddings_array.shape}"
            )

        # Safety check on the embedding dimension; a mismatch is treated as
        # a critical error.
        if embeddings_array.shape[1] != self.dimension:
            print(f"Figyelmeztet√©s: V√°ratlan embedding dimenzi√≥: {embeddings_array.shape[1]}. Korrekci√≥ {self.dimension}-ra.")
            raise ValueError(f"V√°ratlan embedding dimenzi√≥: {embeddings_array.shape[1]}")

        return embeddings_array

    def _cleanup_memory(self):
        """Do nothing; no GPU memory cleanup is needed on the client side."""
        pass

In [None]:
# === 4. MAIN PROCESSING PIPELINE ===
def create_metadata_json(row: pd.Series) -> str:
    """Serialize the metadata fields of one row into a JSON string.

    Every field except 'text' and 'embedding' counts as metadata. Missing
    values are dropped and the remaining values are converted with ``str``.
    Non-ASCII characters are written as-is rather than escaped.
    """
    excluded = ('text', 'embedding')
    kept_labels = [label for label in row.index if label not in excluded]
    present_values = row[kept_labels].dropna()
    payload = {}
    for key, value in present_values.to_dict().items():
        payload[key] = str(value)
    return json.dumps(payload, ensure_ascii=False)

def main():
    """Run the pipeline: read the CSV, embed the texts via RunPod, write Parquet.

    Reads ``INPUT_CSV_PATH``, generates one embedding per row of its 'text'
    column with ``EmbeddingGeneratorAPI``, builds a JSON metadata string per
    row and writes 'doc_id', 'text', 'embedding' and 'metadata_json' to
    ``OUTPUT_PARQUET_PATH``. Returns early (writing nothing) when the input
    has no rows or when the number of embeddings does not match the number
    of rows.

    Raises:
        FileNotFoundError: If the input CSV does not exist.
        KeyError: If the CSV lacks the 'doc_id' or 'text' column.
    """
    print("Feldolgoz√°s ind√≠t√°sa a RunPod API-val...")
    start_time = time.time()

    # Read the input data.
    if not INPUT_CSV_PATH.exists():
        raise FileNotFoundError(f"Hiba: A bemeneti f√°jl nem tal√°lhat√≥: {INPUT_CSV_PATH}")

    print(f"Bemeneti CSV beolvas√°sa: {INPUT_CSV_PATH}")
    df = pd.read_csv(INPUT_CSV_PATH)
    print(f"{len(df):,} sor sikeresen beolvasva.")

    # Check the required columns before any remote embedding request is
    # made; otherwise a missing 'doc_id' would only surface as a KeyError
    # when the output frame is assembled, after the whole embedding run.
    missing_columns = [col for col in ('doc_id', 'text') if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required column(s) in {INPUT_CSV_PATH}: {missing_columns}")

    # Extract the texts; missing values become empty strings.
    df['text'] = df['text'].fillna('')
    texts_to_process = df['text'].astype(str).tolist()

    if not texts_to_process:
        print("Nincs feldolgozand√≥ sz√∂veg a bemeneti f√°jlban.")
        return

    # Create the embedding generator; load_model() is a compatibility call.
    generator = EmbeddingGeneratorAPI(RUNPOD_API_KEY, RUNPOD_ENDPOINT_ID, BATCH_SIZE, EMBEDDING_DIMENSION)
    generator.load_model()

    # Generate embeddings for the whole dataset.
    print("Embedding gener√°l√°s megkezd√©se a RunPod API-n kereszt√ºl...")
    embeddings = generator.generate_embeddings(texts_to_process)

    # Cleanup hook after the large operation.
    generator._cleanup_memory()

    # Attach the results to the DataFrame; abort on a count mismatch.
    if len(embeddings) != len(df):
        print(f"KRITIKUS HIBA: Az embeddingek sz√°ma ({len(embeddings)}) nem egyezik a DataFrame sorainak sz√°m√°val ({len(df)}). A program le√°ll.")
        return
    df['embedding'] = list(embeddings)

    # Build the metadata JSON for every row.
    print("Metaadat JSON gener√°l√°sa...")
    df['metadata_json'] = [create_metadata_json(row) for _, row in tqdm(df.iterrows(), total=len(df), desc="Metaadat JSON")]

    # Assemble the output frame and save it in Parquet format.
    final_df = df[['doc_id', 'text', 'embedding', 'metadata_json']]
    OUTPUT_PARQUET_PATH.parent.mkdir(parents=True, exist_ok=True)
    print(f"Eredm√©nyek ment√©se a Parquet f√°jlba: {OUTPUT_PARQUET_PATH}")
    final_df.to_parquet(OUTPUT_PARQUET_PATH, index=False, compression='snappy')

    # Summary.
    total_rows_processed = len(final_df)
    total_time_seconds = time.time() - start_time
    rows_per_second = total_rows_processed / total_time_seconds if total_time_seconds > 0 else 0

    print("\n" + "="*50)
    print("‚úÖ FELDOLGOZ√ÅS BEFEJEZVE")
    print(f"üìÑ Kimeneti f√°jl: {OUTPUT_PARQUET_PATH}")
    print(f"‚è±Ô∏è Teljes id≈ë: {total_time_seconds:.2f} m√°sodperc ({total_time_seconds / 60:.2f} perc)")
    print(f"üìä Feldolgozott sorok: {total_rows_processed:,}")
    print(f"‚ö° √Åtlagos sebess√©g: {rows_per_second:.2f} sor/mp")
    print("="*50)
# Run the main pipeline
main()

In [None]:
# === 5. VALIDATION ===
# Inspect the Parquet file written by the pipeline: print its size, row
# count, column names and the length of the first embedding, then display
# a small sample of rows.
print("Kimeneti Parquet f√°jl valid√°l√°sa...")

if OUTPUT_PARQUET_PATH.exists():
    try:
        parquet_file = pq.ParquetFile(OUTPUT_PARQUET_PATH)
        file_num_rows = parquet_file.metadata.num_rows
        file_size_mb = OUTPUT_PARQUET_PATH.stat().st_size / (1024 * 1024)

        # Read only the first record batch (at most 5 rows) instead of
        # loading the whole file, embeddings included, just to show a sample.
        first_batch = next(parquet_file.iter_batches(batch_size=5), None)
        if first_batch is None:
            raise ValueError("the Parquet file contains no rows")
        df_sample = first_batch.to_pandas()
        sample_embedding = df_sample['embedding'].iloc[0]

        print("\n‚úÖ VALID√ÅCI√ì SIKERES!")
        print(f"  F√°jl m√©ret: {file_size_mb:.2f} MB")
        print(f"  Sorok sz√°ma: {file_num_rows:,}")
        print(f"  Oszlopok: {df_sample.columns.tolist()}")
        print(f"  Els≈ë embedding dimenzi√≥ja: {len(sample_embedding)}")
        print("\n--- Minta Adatsor ---")
        display(df_sample)

    except Exception as e:
        # Any read or inspection failure is reported rather than raised.
        print(f"Hiba a Parquet f√°jl valid√°l√°sa k√∂zben: {e}")
        print(f"\n‚ùå HIBA a valid√°ci√≥ sor√°n: {e}")
else:
    print("A kimeneti Parquet f√°jl nem j√∂tt l√©tre.")
    print("\n‚ùå HIBA: A kimeneti f√°jl nem tal√°lhat√≥!")