In [None]:
# === 1. KÖRNYEZET BEÁLLÍTÁSA ===
# Könyvtárak telepítése és importálása
%pip install -U torch sentence-transformers accelerate pyarrow pandas tqdm transformers

import pandas as pd
import numpy as np
import gc
import json
import pyarrow.parquet as pq
from sentence_transformers import SentenceTransformer
import torch
import time
from tqdm.auto import tqdm
from typing import List
from pathlib import Path
import os

# Let PyTorch manage GPU memory more flexibly (expandable allocator segments).
# NOTE(review): this is assigned after `import torch` but before any CUDA call
# in this cell — confirm the allocator still picks the setting up in the
# installed PyTorch version.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# GPU tuning intended for the A100: turn on the cuDNN benchmark flag and
# allow TF32 for CUDA matmuls.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True

# Report whether CUDA is usable and, if so, the name of device 0.
print(f"CUDA el√©rhet≈ë: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# === 2. KONFIGURÁCIÓ ===
# RunPod A100 felhő környezethez igazított konfiguráció.

from pathlib import Path

# Model and batching parameters.
MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"
EMBEDDING_DIMENSION = 1024
# On an A100 card a larger batch size can be efficient as well.
BATCH_SIZE = 256

# Input and output files.
# Make sure the files are in the right place in the cloud environment!
INPUT_CSV_PATH = Path("/workspace/cleaned_data_for_embedding.csv")
OUTPUT_PARQUET_PATH = Path("/workspace/documents_with_embeddings.parquet")

# Echo the effective configuration, one setting per line.
print(
    f"Input: {INPUT_CSV_PATH}",
    f"Output: {OUTPUT_PARQUET_PATH}",
    f"Modell: {MODEL_NAME}",
    f"Batch m√©ret: {BATCH_SIZE}",
    sep="\n",
)

In [None]:
# === 3. EMBEDDING GENERÁTOR OSZTÁLY ===
class EmbeddingGenerator:
    """Produce fixed-size, L2-normalised float32 embeddings for texts.

    The wrapped SentenceTransformer model is loaded lazily by load_model();
    generate_embeddings() refuses to run before that.
    """

    def __init__(self, model_name: str, batch_size: int, dimension: int, device: str = 'cuda'):
        """Store the configuration without loading the model.

        Args:
            model_name: Identifier handed to SentenceTransformer.
            batch_size: Batch size handed to the model's encode() call.
            dimension: Number of components every returned embedding must have.
            device: Requested device; replaced by 'cpu' when CUDA is unavailable.
        """
        self.model_name = model_name
        self.batch_size = batch_size
        self.dimension = dimension
        self.device = device if torch.cuda.is_available() else 'cpu'
        self.model = None
        print(f"Gener√°tor inicializ√°lva a(z) '{self.device}' eszk√∂z√∂n.")

    def load_model(self):
        """Load and warm up the model; a no-op when it is already loaded.

        Raises:
            Exception: whatever loading or warm-up raised, re-raised after
                the error has been printed.
        """
        if self.model is not None:
            print("Modell m√°r be van t√∂ltve.")
            return
        try:
            print(f"'{self.model_name}' modell bet√∂lt√©se...")
            # NOTE(review): trust_remote_code=True executes code shipped with
            # the model repository — confirm this model actually requires it.
            self.model = SentenceTransformer(self.model_name, device=self.device, trust_remote_code=True)
            self._warmup_model()
            print("Modell sikeresen bet√∂ltve √©s bemeleg√≠tve.")
        except Exception as e:
            # Do not keep a model whose warm-up failed: otherwise a retry
            # would report "already loaded" and skip loading altogether.
            self.model = None
            print(f"Modell bet√∂lt√©si hiba: {e}")
            raise

    def _warmup_model(self):
        """Encode one short text so the first real batch does not pay start-up cost."""
        print("Modell bemeleg√≠t√©se...")
        # "Warm up" the model with a single short text.
        self.generate_embeddings(["meleg√≠t√©s"])
        self._cleanup_memory()
        print("Bemeleg√≠t√©s k√©sz.")

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode texts into a (len(texts), self.dimension) float32 array.

        Args:
            texts: Texts to embed.

        Returns:
            One normalised embedding row per input text; an empty
            (0, self.dimension) array when texts is empty.

        Raises:
            RuntimeError: if load_model() has not been called successfully.
            ValueError: if the model yields fewer components than
                self.dimension, which truncation cannot repair.
        """
        if self.model is None:
            raise RuntimeError("A modell nincs bet√∂ltve. H√≠vd meg a load_model() met√≥dust.")

        # Nothing to encode: return a correctly shaped empty result instead of
        # failing on the shape check below.
        if len(texts) == 0:
            return np.empty((0, self.dimension), dtype=np.float32)

        embeddings = self.model.encode(
            texts,
            batch_size=self.batch_size,
            normalize_embeddings=True,
            show_progress_bar=True,
            convert_to_numpy=True
        )

        # Safety check on the dimension.
        produced_dimension = embeddings.shape[1]
        if produced_dimension < self.dimension:
            # Slicing cannot add components; fail loudly rather than return
            # vectors of the wrong size.
            raise ValueError(
                f"Model produced {produced_dimension}-dimensional embeddings, "
                f"fewer than the requested {self.dimension}."
            )
        if produced_dimension > self.dimension:
            print(f"Figyelmeztet√©s: V√°ratlan embedding dimenzi√≥: {embeddings.shape[1]}. Korrekci√≥ {self.dimension}-ra.")
            embeddings = embeddings[:, :self.dimension]
            # Truncation shortens the vectors, so restore unit length; the
            # clip guards against division by zero for all-zero rows.
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            embeddings = embeddings / np.clip(norms, 1e-12, None)

        return embeddings.astype(np.float32)

    def _cleanup_memory(self):
        """Run the garbage collector and, when CUDA is available, empty its cache."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [None]:
# === 4. FŐ FELDOLGOZÁSI FOLYAMAT ===
def create_metadata_json(row: pd.Series) -> str:
    """Serialise a row's metadata fields to a JSON object string.

    Every field except 'text' and 'embedding' counts as metadata. Missing
    values are left out and the remaining values are converted with str().
    Non-ASCII characters are written as-is rather than escaped.
    """
    is_metadata = ~row.index.isin(['text', 'embedding'])
    present_fields = row[is_metadata].dropna().to_dict()

    serialisable = {}
    for field_name, field_value in present_fields.items():
        serialisable[field_name] = str(field_value)

    return json.dumps(serialisable, ensure_ascii=False)

def main():
    """Run the pipeline: read the CSV, embed the texts, write the Parquet file.

    Reads INPUT_CSV_PATH, embeds the 'text' column with an EmbeddingGenerator,
    builds a JSON metadata string for every row and writes the columns
    'doc_id', 'text', 'embedding' and 'metadata_json' to OUTPUT_PARQUET_PATH.
    Returns early, after printing a message, when the CSV has no rows or
    when the number of embeddings does not match the number of rows.

    Raises:
        FileNotFoundError: if INPUT_CSV_PATH does not exist.
        KeyError: if the CSV lacks the 'doc_id' or 'text' column.
    """
    print("Feldolgoz√°s ind√≠t√°sa...")
    start_time = time.time()

    # Read the input data.
    if not INPUT_CSV_PATH.exists():
        raise FileNotFoundError(f"Hiba: A bemeneti f√°jl nem tal√°lhat√≥: {INPUT_CSV_PATH}")

    print(f"Bemeneti CSV beolvas√°sa: {INPUT_CSV_PATH}")
    # Simplified read using the default C engine.
    df = pd.read_csv(INPUT_CSV_PATH)
    print(f"{len(df):,} sor sikeresen beolvasva.")

    # Check the required columns before the expensive embedding step, so a
    # missing 'doc_id' does not surface only when the output is assembled.
    missing_columns = [col for col in ('doc_id', 'text') if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")

    # Extract the texts; missing values are embedded as empty strings.
    df['text'] = df['text'].fillna('')
    texts_to_process = df['text'].astype(str).tolist()

    if not texts_to_process:
        print("Nincs feldolgozand√≥ sz√∂veg a bemeneti f√°jlban.")
        return

    # Initialise the embedding generator and load the model.
    generator = EmbeddingGenerator(MODEL_NAME, BATCH_SIZE, EMBEDDING_DIMENSION)
    generator.load_model()

    # Generate embeddings for the whole dataset.
    print("Embedding gener√°l√°s megkezd√©se...")
    embeddings = generator.generate_embeddings(texts_to_process)

    # Free memory after the large operation.
    generator._cleanup_memory()

    # Attach the results to the DataFrame, one embedding array per row.
    if len(embeddings) == len(df):
        df['embedding'] = list(embeddings)
    else:
        print(f"KRITIKUS HIBA: Az embeddingek sz√°ma ({len(embeddings)}) nem egyezik a DataFrame sorainak sz√°m√°val ({len(df)}). A program le√°ll.")
        return

    # Build the metadata JSON for every row.
    print("Metaadat JSON gener√°l√°sa...")
    df['metadata_json'] = [create_metadata_json(row) for _, row in tqdm(df.iterrows(), total=len(df), desc="Metaadat JSON")]

    # Assemble the output frame and save it in Parquet format.
    final_df = df[['doc_id', 'text', 'embedding', 'metadata_json']]
    OUTPUT_PARQUET_PATH.parent.mkdir(parents=True, exist_ok=True)
    print(f"Eredm√©nyek ment√©se a Parquet f√°jlba: {OUTPUT_PARQUET_PATH}")
    final_df.to_parquet(OUTPUT_PARQUET_PATH, index=False, compression='snappy')

    # Summary.
    total_rows_processed = len(final_df)
    total_time_seconds = time.time() - start_time
    rows_per_second = total_rows_processed / total_time_seconds if total_time_seconds > 0 else 0

    print("\n" + "="*50)
    print("‚úÖ FELDOLGOZ√ÅS BEFEJEZVE")
    print(f"üìÑ Kimeneti f√°jl: {OUTPUT_PARQUET_PATH}")
    print(f"‚è±Ô∏è Teljes id≈ë: {total_time_seconds:.2f} m√°sodperc ({total_time_seconds / 60:.2f} perc)")
    print(f"üìä Feldolgozott sorok: {total_rows_processed:,}")
    print(f"‚ö° √Åtlagos sebess√©g: {rows_per_second:.2f} sor/mp")
    print("="*50)

# Run the main pipeline.
main()

In [None]:
# === 5. VALIDÁCIÓ ===
# Sanity-check the written Parquet file: report its size, row count, columns
# and the length of the first embedding, then show a few sample rows.
print("Kimeneti Parquet f√°jl valid√°l√°sa...")

if OUTPUT_PARQUET_PATH.exists():
    try:
        parquet_file = pq.ParquetFile(OUTPUT_PARQUET_PATH)
        # The row count comes from the file metadata; no data pages are read.
        file_num_rows = parquet_file.metadata.num_rows
        file_size_mb = OUTPUT_PARQUET_PATH.stat().st_size / (1024 * 1024)

        # Stream only the first five rows instead of loading every embedding
        # into memory just to display a sample.
        first_batch = next(parquet_file.iter_batches(batch_size=5), None)
        if first_batch is None:
            raise ValueError("the Parquet file contains no rows to sample")
        df_sample = first_batch.to_pandas()
        sample_embedding = df_sample['embedding'].iloc[0]

        print("\n‚úÖ VALID√ÅCI√ì SIKERES!")
        print(f"  F√°jl m√©ret: {file_size_mb:.2f} MB")
        print(f"  Sorok sz√°ma: {file_num_rows:,}")
        print(f"  Oszlopok: {df_sample.columns.tolist()}")
        print(f"  Els≈ë embedding dimenzi√≥ja: {len(sample_embedding)}")
        print("\n--- Minta Adatsor ---")
        # display() is supplied by the notebook environment.
        display(df_sample)

    except Exception as e:
        # Best-effort check: report the problem instead of aborting the notebook.
        print(f"Hiba a Parquet f√°jl valid√°l√°sa k√∂zben: {e}")
        print(f"\n‚ùå HIBA a valid√°ci√≥ sor√°n: {e}")
else:
    print("A kimeneti Parquet f√°jl nem j√∂tt l√©tre.")
    print("\n‚ùå HIBA: A kimeneti f√°jl nem tal√°lhat√≥!")