In [None]:
# RunPod A100 GPU - install/upgrade the required libraries (imports follow below).
# NOTE(review): package versions are not pinned, so every run installs whatever
# is newest at that moment; consider pinning versions for reproducible runs.
%pip install --upgrade pip
%pip install -U torch sentence-transformers accelerate pyarrow pandas tqdm transformers

import pandas as pd
import numpy as np
import gc
import json
import pyarrow as pa
import pyarrow.parquet as pq
from sentence_transformers import SentenceTransformer
import torch
import psutil
import time
import logging
from tqdm import tqdm
from typing import List, Dict, Any
import os
import warnings
warnings.filterwarnings('ignore')

# A100 GPU tuning: enable TF32 for matmul and cuDNN, let cuDNN benchmark its
# kernels, and do not force deterministic algorithms.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

# Report the environment; GPU details are only printed when CUDA is usable.
print("RunPod A100 k√∂rnyezet inicializ√°lva!")
cuda_available = torch.cuda.is_available()
print(f"CUDA el√©rhet≈ë: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU mem√≥ria: {gpu_total_bytes / (1024**3):.1f}GB")

In [None]:
# RunPod A100 configuration
print("RunPod A100 konfigur√°ci√≥ be√°ll√≠t√°sa...")

# Input / output file locations on the RunPod volume
INPUT_CSV = "/workspace/cleaned_data_for_embedding.csv"
OUTPUT_PARQUET = "/workspace/processed_documents_with_embeddings.parquet"

# CRISIS MODE: if throughput is too low, switch to this much faster model instead.
# MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# EMBEDDING_DIMENSION = 384
# BATCH_SIZE = 512

# Cost-optimised configuration (target: within 10 hours, $25 limit)
MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"   # smaller model, chosen for speed
EMBEDDING_DIMENSION = 1024                 # embedding width expected from the model
BATCH_SIZE = 256                           # large batch (512 may also work for the 0.6B model)
CHUNK_SIZE = 5000                          # rows read per CSV chunk
USE_MIXED_PRECISION = False                # kept off for stability
MEMORY_LIMIT_GB = 70                       # threshold base for the high-memory warning

# Echo the effective configuration in one go (one line per setting).
print(
    f"Bemeneti CSV: {INPUT_CSV}",
    f"Kimeneti Parquet: {OUTPUT_PARQUET}",
    f"Modell: {MODEL_NAME}",
    f"Dimenzi√≥: {EMBEDDING_DIMENSION}",
    f"Batch m√©ret: {BATCH_SIZE}",
    f"Chunk m√©ret: {CHUNK_SIZE:,}",
    f"Mixed Precision: {USE_MIXED_PRECISION}",
    f"Mem√≥ria limit: {MEMORY_LIMIT_GB}GB",
    "Alap√©rtelmezett konfigur√°ci√≥ - tesztel√©si f√°zis",
    sep="\n",
)

# Logging configuration: INFO-level records go to the notebook output and to a
# log file under /workspace.
log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler('/workspace/embedding_generation.log'),
]
logging.basicConfig(
    handlers=log_handlers,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [None]:
# CHUNKED INPUT SUPPORT - locate the input data and decide the input mode
logger.info("Chunked input-kompatibilis adatvalid√°l√°s...")

# ===== 1. LOOK FOR CHUNKED CLEANED INPUT (PREFERRED) =====
CHUNKED_CLEANED_DIR = "/workspace/processed_data/chunked_cleaned"
cleaned_chunk_files = []

if os.path.exists(CHUNKED_CLEANED_DIR):
    # Keep only files named cleaned_chunk_*.csv, in lexicographic order.
    cleaned_chunk_files = sorted(
        os.path.join(CHUNKED_CLEANED_DIR, entry)
        for entry in os.listdir(CHUNKED_CLEANED_DIR)
        if entry.startswith("cleaned_chunk_") and entry.endswith(".csv")
    )

# Chunked mode is active exactly when at least one chunk file was found.
CHUNKED_INPUT_MODE = bool(cleaned_chunk_files)

# ===== 2. FALL BACK TO THE UNIFIED CSV =====
if CHUNKED_INPUT_MODE:
    logger.info(f"üéØ CHUNKED INPUT M√ìD: {len(cleaned_chunk_files)} cleaned chunk tal√°lhat√≥")
elif os.path.exists(INPUT_CSV):
    logger.info("üìÑ UNIFIED CSV M√ìD: Fallback unified CSV-re")
else:
    raise FileNotFoundError(f"Nincs el√©rhet≈ë input! Sem chunked ({CHUNKED_CLEANED_DIR}), sem unified ({INPUT_CSV})")

# ===== 3. LOAD SAMPLE DATA FOR VALIDATION =====
def _count_csv_data_lines(csv_path):
    """Return the number of lines in ``csv_path`` excluding the header line.

    This is a plain line count, so it is only an estimate of the row count
    (a quoted field containing a newline is counted as several lines). The
    file is opened in a ``with`` block so the handle is always closed, and the
    result is never negative (an empty file yields 0).
    """
    with open(csv_path, 'r', encoding='utf-8') as csv_file:
        return max(sum(1 for _ in csv_file) - 1, 0)


if CHUNKED_INPUT_MODE:
    # Sample taken from the first chunk file
    df_sample = pd.read_csv(cleaned_chunk_files[0], nrows=1000)
    logger.info(f"Minta bet√∂ltve els≈ë chunk-b√≥l: {len(df_sample)} sor")

    # Estimated total row count summed over all chunk files
    total_rows = sum(_count_csv_data_lines(chunk_file) for chunk_file in cleaned_chunk_files)
    logger.info(f"Becs√ºlt teljes sorok (chunked): {total_rows:,}")
else:
    # Sample taken from the unified CSV
    df_sample = pd.read_csv(INPUT_CSV, nrows=1000)
    logger.info(f"Minta bet√∂ltve unified CSV-b≈ël: {len(df_sample)} sor")

    # Estimated total row count of the whole file
    total_rows = _count_csv_data_lines(INPUT_CSV)
    logger.info(f"Becs√ºlt teljes sorok (unified): {total_rows:,}")

# ===== 4. COLUMN VALIDATION (SHARED BY BOTH INPUT MODES) =====
# Columns actually present in the sample
available_columns = list(df_sample.columns)

# Mandatory columns: abort when any of them is absent
required_columns = ['text', 'doc_id']
missing_columns = [name for name in required_columns if name not in available_columns]
if missing_columns:
    raise ValueError(f"Hi√°nyz√≥ k√∂telez≈ë oszlopok: {missing_columns}")

# Full list of metadata columns the pipeline knows about
expected_metadata_columns = [
    'doc_id', 'text', 'birosag', 'JogTerulet', 'Azonosito', 'MeghozoBirosag',
    'EgyediAzonosito', 'HatarozatEve', 'AllKapcsolodoUgyszam', 'AllKapcsolodoBirosag',
    'KapcsolodoHatarozatok', 'Jogszabalyhelyek'
]

# Split the expected columns into those present and those missing (order preserved)
metadata_columns_present = list(filter(available_columns.__contains__, expected_metadata_columns))
metadata_columns_missing = [
    name for name in expected_metadata_columns if name not in metadata_columns_present
]

# ===== 5. RESULTS =====
input_mode = ("UNIFIED", "CHUNKED")[bool(CHUNKED_INPUT_MODE)]
print(f"\n‚úÖ {input_mode} INPUT VALID√ÅCI√ì SIKERES!")
print(f"üìä Teljes sorok: {total_rows:,}")
if CHUNKED_INPUT_MODE:
    print(f"üìÅ Chunk f√°jlok: {len(cleaned_chunk_files)}")
print(f"üìã √ñsszes oszlop: {len(available_columns)}")
print(f"‚úÖ Jelenlev≈ë metadata oszlopok ({len(metadata_columns_present)}): {metadata_columns_present}")
if metadata_columns_missing:
    print(f"‚ö†Ô∏è  Hi√°nyz√≥ metadata oszlopok ({len(metadata_columns_missing)}): {metadata_columns_missing}")

# ===== 6. TEXT STATISTICS (computed on the sample only) =====
text_lengths = df_sample['text'].str.len()
print("\nüìù Sz√∂veg hossz statisztik√°k (minta):")
print(f"  √Åtlag: {text_lengths.mean():.0f} karakter")
print(f"  Medi√°n: {text_lengths.median():.0f} karakter")
print(f"  Min: {text_lengths.min():.0f} karakter")
print(f"  Max: {text_lengths.max():.0f} karakter")

# ===== 7. PROCESSING ESTIMATE =====
# Ceiling division: number of batches / chunks needed to cover all rows
estimated_batches = -(-total_rows // BATCH_SIZE)
estimated_chunks = -(-total_rows // CHUNK_SIZE)
print("\n‚ö° Becs√ºlt feldolgoz√°s:")
print(f"  Chunk-ok sz√°ma: {estimated_chunks:,}")
print(f"  Batch-ek sz√°ma: {estimated_batches:,}")
print(f"  Input m√≥d: {input_mode}")
if not CHUNKED_INPUT_MODE:
    print("  ‚ö†Ô∏è  Memory-intenz√≠v unified feldolgoz√°s")
else:
    print("  üöÄ Memory-optimaliz√°lt chunked feldolgoz√°s!")

In [None]:
# Optimised Qwen3-Embedding-0.6B model class (STABLE VERSION)
# NOTE(review): a later cell defines a class with the same name, which replaces
# this definition when the notebook is run top to bottom. This version also
# calls self._warmup_model() without defining it, so instantiating it as-is
# would raise AttributeError. Consider removing this cell.
logger.info("Optimaliz√°lt Qwen3-Embedding-0.6B modell oszt√°ly l√©trehoz√°sa...")

class OptimizedQwen3EmbeddingGenerator:
    """Holds a SentenceTransformer model plus simple run counters."""

    def __init__(self):
        # Configuration copied from the module-level constants
        self.model_name = MODEL_NAME
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.dimension = EMBEDDING_DIMENSION
        self.batch_size = BATCH_SIZE
        
        # Performance tracking
        self.processed_count = 0
        self.failed_count = 0
        self.batch_times = []
        self.peak_memory_usage = 0
        
        logger.info(f"Device: {self.device}")
        
        try:
            # Default model loading - STABLE configuration
            logger.info("Qwen3-0.6B modell bet√∂lt√©se (STABIL)...")
            self.model = SentenceTransformer(
                self.model_name,
                device=self.device,
                trust_remote_code=True
            )
            
            # GPU memory handling
            if self.device == 'cuda':
                torch.cuda.empty_cache()
                
            # Model warmup (method is not defined in this version of the class)
            self._warmup_model()
            logger.info("Modell sikeresen inicializ√°lva!")
            
        except Exception as e:
            # Log the failure and re-raise so the caller sees the original error
            logger.error(f"Modell bet√∂lt√©s hiba: {e}")
            raise


In [None]:
# Qwen3-Embedding-0.6B model class with the default configuration
logger.info("Qwen3-Embedding-0.6B modell oszt√°ly l√©trehoz√°sa...")

class OptimizedQwen3EmbeddingGenerator:
    """SentenceTransformer wrapper that also keeps simple run statistics.

    Creating an instance loads the model named by ``MODEL_NAME`` onto CUDA when
    it is available (CPU otherwise), empties the CUDA cache and runs a short
    warmup encode. Any error during loading is logged and re-raised.
    """

    def __init__(self):
        # Configuration copied from the module-level constants
        self.model_name = MODEL_NAME
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'
        self.dimension = EMBEDDING_DIMENSION
        self.batch_size = BATCH_SIZE

        # Performance tracking
        self.processed_count = 0
        self.failed_count = 0
        self.batch_times = []
        self.peak_memory_usage = 0

        logger.info(f"Device: {self.device}")

        try:
            # Default model loading
            logger.info("Qwen3-0.6B modell bet√∂lt√©se...")
            self.model = SentenceTransformer(
                self.model_name, device=self.device, trust_remote_code=True
            )

            # Basic GPU memory handling
            if self.device == 'cuda':
                torch.cuda.empty_cache()
                logger.info("GPU mem√≥ria tiszt√≠tva")

            # Model warmup
            self._warmup_model()
            logger.info("Modell sikeresen bet√∂ltve √©s inicializ√°lva")
        except Exception as load_error:
            logger.error(f"Modell bet√∂lt√©si hiba: {load_error}")
            raise

    def _warmup_model(self):
        """Encode a few dummy sentences once; a warmup failure is only logged."""
        logger.info("Modell warmup...")
        warmup_sentence = "Ez egy teszt sz√∂veg a modell bemeleg√≠t√©s√©hez."

        try:
            self.model.encode([warmup_sentence] * 8, show_progress_bar=False)
        except Exception as warmup_error:
            logger.warning(f"Warmup hiba: {warmup_error}")
        else:
            logger.info("Warmup sikeresen befejezve")

        self._cleanup_memory()

    def _cleanup_memory(self):
        """Run the garbage collector and, when CUDA is available, empty its cache."""
        gc.collect()
        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()

    def _monitor_memory(self):
        """Return current GPU memory figures in GiB, or ``{}`` without CUDA.

        Also raises ``self.peak_memory_usage`` to the allocated value whenever
        the current allocation exceeds the recorded peak.
        """
        if not torch.cuda.is_available():
            return {}

        bytes_per_gib = 1024**3
        allocated_gb = torch.cuda.memory_allocated() / bytes_per_gib
        reserved_gb = torch.cuda.memory_reserved() / bytes_per_gib

        if allocated_gb > self.peak_memory_usage:
            self.peak_memory_usage = allocated_gb

        return {
            'allocated_gb': allocated_gb,
            'reserved_gb': reserved_gb,
            'peak_usage_gb': self.peak_memory_usage
        }

# Create the embedding generator (this loads the model) and report its settings
embedding_generator = OptimizedQwen3EmbeddingGenerator()
print(
    "Qwen3-0.6B modell sikeresen inicializ√°lva!",
    f"Dimenzi√≥: {embedding_generator.dimension}",
    f"Device: {embedding_generator.device}",
    "Teljes√≠tm√©ny tesztel√©s - baseline m√©r√©s",
    sep="\n",
)

In [None]:
# DIAGNOSTICS - run this after the model has been initialised!
print("=== KRITIKUS DIAGNOSZTIKA ===")
print(f"Device: {embedding_generator.device}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Model on device: {next(embedding_generator.model.parameters()).device}")

# SPEED TEST - 50 synthetic texts
test_texts = [f"Ez egy teszt sz√∂veg a b√≠r√≥s√°gi hat√°rozat feldolgoz√°s√°hoz. Sz√°m: {i}. Lorem ipsum dolor sit amet, consectetur adipiscing elit." for i in range(50)]

print(f"Test sz√∂vegek hossza: {len(test_texts[0])} karakter")

start_time = time.time()
test_embeddings = embedding_generator.model.encode(test_texts, batch_size=BATCH_SIZE, show_progress_bar=False)
test_time = time.time() - start_time

# Compute the throughput once; guard against a zero elapsed time.
rows_per_second = len(test_texts) / test_time if test_time > 0 else float('inf')

print(f"50 sz√∂veg: {test_time:.2f} sec")
print(f"‚ö° SEBESS√âG: {rows_per_second:.1f} sor/sec")
print(f"Test embedding shape: {test_embeddings.shape}")
# Report the width the model really produced next to the configured value;
# the configured value alone says nothing about the model output.
print(f"Actual embedding dim: {test_embeddings.shape[1]} (configured: {embedding_generator.dimension})")

# ESTIMATE - use the row count measured by the validation cell when it exists,
# otherwise fall back to 213,000. The global `total_rows` is deliberately NOT
# overwritten here.
estimate_rows = globals().get('total_rows', 213000)
hourly_rate_usd = 2.10  # $2.10/hour RunPod A100
estimated_hours = (estimate_rows / rows_per_second) / 3600
estimated_cost = estimated_hours * hourly_rate_usd

print(f"\nüìä BECSL√âS:")
print(f"{estimate_rows:,} sor: {estimated_hours:.1f} √≥ra")
print(f"Becs√ºlt k√∂lts√©g: ${estimated_cost:.1f}")

# CRITICAL DECISION based on rows per second
if rows_per_second < 3:
    print("üö® T√öLLASS√ö! Modellv√°lt√°s sz√ºks√©ges!")
elif rows_per_second < 6:
    print("‚ö†Ô∏è Lass√∫, de elfogadhat√≥")
else:
    print("‚úÖ J√≥ sebess√©g!")

# GPU memory info
if torch.cuda.is_available():
    print(f"GPU memory: {torch.cuda.memory_allocated()/1024**3:.1f}GB allocated")
    print(f"GPU memory: {torch.cuda.memory_reserved()/1024**3:.1f}GB reserved")

print("=== DIAGNOSZTIKA V√âGE ===")


In [None]:
# Embedding generation method (defined as a function, attached to the class afterwards)
def generate_embeddings_batch(self, texts):
    """Encode one batch of texts into float32 embeddings.

    Args:
        self: generator instance; reads ``model`` and ``dimension`` and updates
            ``batch_times``, ``processed_count`` and ``failed_count``.
        texts: the texts of ONE batch; every item is converted with ``str()``.

    Returns:
        ``np.ndarray`` of dtype float32 and shape ``(len(texts), self.dimension)``.
        When encoding fails the error is logged, ``failed_count`` is increased
        and every row of the result is NaN. Empty input returns an empty
        ``(0, self.dimension)`` array without touching the model.
    """
    # The texts were already cleaned upstream; only force them to str.
    processed_texts = [str(text) for text in texts]
    batch_len = len(processed_texts)

    if batch_len == 0:
        # Nothing to encode: avoid a model call and a misleading error log.
        return np.empty((0, self.dimension), dtype=np.float32)

    batch_start_time = time.time()
    failed = False

    try:
        # Pass the batch size explicitly: without it encode() falls back to its
        # own default and the caller's (adaptive) batch size has no effect.
        embeddings = self.model.encode(
            processed_texts,
            batch_size=batch_len,
            normalize_embeddings=True,
            show_progress_bar=False,
            convert_to_numpy=True
        )

        # Quick dimension check: truncate or zero-pad to the configured width.
        # NOTE(review): truncated/padded vectors are not re-normalised here.
        if embeddings.shape[1] != self.dimension:
            logger.warning(f"Dimenzi√≥ hiba: {embeddings.shape[1]} != {self.dimension}")
            if embeddings.shape[1] > self.dimension:
                embeddings = embeddings[:, :self.dimension]
            else:
                padding = np.zeros(
                    (embeddings.shape[0], self.dimension - embeddings.shape[1]),
                    dtype=embeddings.dtype,
                )
                embeddings = np.hstack([embeddings, padding])

        # Performance tracking
        batch_time = time.time() - batch_start_time
        self.batch_times.append(batch_time)
        self.processed_count += batch_len

        # Throughput; a zero elapsed time must not turn a good batch into a failure.
        speed = batch_len / batch_time if batch_time > 0 else float('inf')
        if speed < 5.0:  # below 5 rows/sec
            logger.warning(f"Lass√∫ batch: {speed:.1f} sor/sec")

        return embeddings.astype(np.float32)

    except Exception as e:
        failed = True
        logger.error(f"Batch feldolgoz√°si hiba: {e}")
        self.failed_count += batch_len
        # Fallback: NaN vectors so the row count stays aligned with the input
        return np.full((batch_len, self.dimension), np.nan, dtype=np.float32)

    finally:
        # Basic memory cleanup: on the usual 500-row marks, and always after a
        # failed batch so cached memory is released before the next attempt.
        if failed or self.processed_count % 500 == 0:
            self._cleanup_memory()

# Attach the function to the class as a method, but only when the generator
# from the model-initialisation cell already exists.
if 'embedding_generator' not in globals():
    print("HIBA: El≈ësz√∂r futtasd a modell inicializ√°l√≥ cell√°t!")
else:
    setattr(OptimizedQwen3EmbeddingGenerator, 'generate_embeddings_batch', generate_embeddings_batch)
    print("Embedding gener√°l√°s met√≥dus hozz√°adva!")

In [None]:
# Helper functions
def create_metadata_json(row):
    """Serialise the metadata of one row to a JSON string.

    Args:
        row: mapping-like object with a ``get(key, default)`` method
            (e.g. a ``dict`` or a pandas ``Series``).

    Returns:
        JSON text (non-ASCII characters kept as-is) holding every known
        metadata field as a string, plus ``text_length`` (character count of
        the text) and ``processed_timestamp`` (``time.time()`` at call time).
        Absent keys and missing values (``None`` / float NaN) become ``""``
        instead of the literal strings ``"None"`` / ``"nan"``.
    """
    def _as_text(value):
        # `value != value` is true only for NaN; restricted to floats so that
        # other objects are never compared against themselves.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    metadata_fields = (
        'doc_id', 'birosag', 'JogTerulet', 'Azonosito', 'MeghozoBirosag',
        'EgyediAzonosito', 'HatarozatEve', 'AllKapcsolodoUgyszam',
        'AllKapcsolodoBirosag', 'KapcsolodoHatarozatok', 'Jogszabalyhelyek',
    )

    metadata = {field: _as_text(row.get(field, '')) for field in metadata_fields}
    metadata['text_length'] = len(_as_text(row.get('text', '')))
    metadata['processed_timestamp'] = time.time()
    return json.dumps(metadata, ensure_ascii=False)

def adaptive_batch_size(text_lengths, base_batch_size=None):
    """Pick a batch size from the average text length.

    Args:
        text_lengths: sequence of text lengths (characters).
        base_batch_size: batch size used for medium-length texts. ``None``
            (the default) means "use the current ``BATCH_SIZE``", resolved at
            call time so a changed configuration is picked up without
            redefining this function.

    Returns:
        int batch size: a quarter of the base (at least 8) above 6000
        characters on average, half (at least 16) above 4000, the base above
        2000, and for shorter texts up to double the base capped at 64 but
        never less than the base itself. Empty input returns the base.
    """
    if base_batch_size is None:
        base_batch_size = BATCH_SIZE

    if len(text_lengths) == 0:
        # The mean of nothing is NaN; fall back to the base size.
        return base_batch_size

    avg_length = np.mean(text_lengths)

    if avg_length > 6000:
        return max(8, base_batch_size // 4)
    if avg_length > 4000:
        return max(16, base_batch_size // 2)
    if avg_length > 2000:
        return base_batch_size
    # Short texts: the old `min(64, base * 2)` shrank the batch below the base
    # whenever the base exceeded 64; short texts must never get a smaller batch
    # than medium-length ones.
    return max(base_batch_size, min(64, base_batch_size * 2))

def prepare_final_columns(chunk_df):
    """Return the output column names that exist in ``chunk_df``.

    The result keeps a fixed order: the core columns (doc_id, text, embedding,
    metadata_json) first, then the known metadata columns. Names that are not
    columns of ``chunk_df`` are left out.
    """
    output_order = (
        # Core columns
        'doc_id', 'text', 'embedding', 'metadata_json',
        # Metadata columns, kept whenever they are present
        'birosag', 'JogTerulet', 'Azonosito', 'MeghozoBirosag',
        'EgyediAzonosito', 'HatarozatEve', 'AllKapcsolodoUgyszam',
        'AllKapcsolodoBirosag', 'KapcsolodoHatarozatok', 'Jogszabalyhelyek',
    )
    existing = chunk_df.columns
    return [name for name in output_order if name in existing]

print("Seg√©df√ºggv√©nyek bet√∂ltve!")

In [None]:
# A100 main pipeline - robust embedding generation
def _drop_invalid_rows(chunk_df):
    """Return a copy of ``chunk_df`` without rows lacking 'text' or 'doc_id', with 'text' cast to str."""
    # .copy() makes the result an independent frame, so the assignment below
    # is not a chained assignment on a dropna() result.
    cleaned_df = chunk_df.dropna(subset=['text', 'doc_id']).copy()
    cleaned_df['text'] = cleaned_df['text'].astype(str)
    return cleaned_df


def process_embeddings_a100():
    """
    Generate embeddings for the whole input and return the combined result.

    Reads the cleaned chunk files when CHUNKED_INPUT_MODE is set, otherwise
    streams INPUT_CSV in pieces of CHUNK_SIZE rows. Each piece is cleaned,
    embedded by process_single_chunk_embeddings and collected in memory.

    Returns:
        (final_df, processed_rows, elapsed_seconds): the concatenated result
        frame, the number of rows that were embedded and the wall-clock time.

    Raises:
        ValueError: when no piece produced any result.
    """

    start_time = time.time()
    logger.info("A100 chunked-kompatibilis embedding feldolgoz√°s kezd√©se...")

    processed_rows = 0
    all_results = []

    # ===== CHUNKED INPUT MODE =====
    if CHUNKED_INPUT_MODE:
        logger.info(f"üéØ CHUNKED INPUT feldolgoz√°s: {len(cleaned_chunk_files)} chunk f√°jl")

        with tqdm(total=len(cleaned_chunk_files), desc="Cleaned chunk feldolgoz√°s", unit="file") as file_pbar:

            for file_idx, chunk_file in enumerate(cleaned_chunk_files):
                chunk_start_time = time.time()
                file_name = os.path.basename(chunk_file)

                try:
                    # Load the cleaned chunk
                    chunk_df = pd.read_csv(chunk_file, encoding='utf-8')
                    logger.info(f"Chunk f√°jl bet√∂ltve: {file_name} ({len(chunk_df):,} sor)")

                    # Basic data check
                    chunk_df = _drop_invalid_rows(chunk_df)

                    if len(chunk_df) == 0:
                        logger.warning(f"Chunk f√°jl √ºres: {file_name}")
                        continue

                    logger.info(f"Chunk feldolgoz√°s: {file_name} - {len(chunk_df):,} √©rv√©nyes sor")

                    # Generate embeddings for this chunk
                    chunk_with_embeddings = process_single_chunk_embeddings(
                        chunk_df, f"File-{file_idx+1}/{len(cleaned_chunk_files)}"
                    )

                    all_results.append(chunk_with_embeddings)
                    processed_rows += len(chunk_df)

                    # Progress details (elapsed time guarded against zero)
                    chunk_time = time.time() - chunk_start_time
                    rows_per_sec = len(chunk_df) / max(chunk_time, 1e-9)

                    file_pbar.set_postfix({
                        'F√°jl': file_name[:20],
                        'Sorok/sec': f'{rows_per_sec:.1f}',
                        'Mem√≥ria': f'{embedding_generator._monitor_memory().get("allocated_gb", 0):.1f}GB',
                        '√ñsszes': f'{processed_rows:,}'
                    })

                    # Regular cleanup
                    if file_idx % 3 == 0:
                        embedding_generator._cleanup_memory()

                except Exception as e:
                    # Best effort: a broken chunk file is logged and skipped
                    logger.error(f"Hiba a chunk f√°jl feldolgoz√°s√°ban ({file_name}): {e}")
                finally:
                    # Exactly one progress step per file, whatever happened above
                    file_pbar.update(1)

    # ===== UNIFIED CSV FALLBACK MODE =====
    else:
        logger.info("üìÑ UNIFIED CSV feldolgoz√°s (fallback mode)")

        # Estimate the row count from the line count; the file is closed afterwards
        with open(INPUT_CSV, 'r', encoding='utf-8') as csv_file:
            total_rows = max(sum(1 for _ in csv_file) - 1, 0)
        total_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE

        logger.info(f"Feldolgozand√≥ sorok: {total_rows:,}")
        logger.info(f"Chunk m√©ret: {CHUNK_SIZE:,}")
        logger.info(f"Batch m√©ret: {BATCH_SIZE}")

        chunk_count = 0

        with tqdm(total=total_chunks, desc="Unified CSV chunk feldolgoz√°s", unit="chunk") as chunk_pbar:

            for chunk_df in pd.read_csv(INPUT_CSV, chunksize=CHUNK_SIZE, encoding='utf-8'):
                chunk_count += 1
                chunk_start_time = time.time()

                # Basic data check
                chunk_df = _drop_invalid_rows(chunk_df)

                if len(chunk_df) == 0:
                    logger.warning(f"Chunk {chunk_count}: nincs √©rv√©nyes adat")
                    chunk_pbar.update(1)
                    continue

                logger.info(f"Chunk {chunk_count}/{total_chunks}: {len(chunk_df):,} √©rv√©nyes sor")

                # Generate embeddings for this chunk
                chunk_with_embeddings = process_single_chunk_embeddings(
                    chunk_df, f"Chunk-{chunk_count}/{total_chunks}"
                )

                all_results.append(chunk_with_embeddings)
                processed_rows += len(chunk_df)

                # Progress details (elapsed time guarded against zero)
                chunk_time = time.time() - chunk_start_time
                rows_per_sec = len(chunk_df) / max(chunk_time, 1e-9)

                chunk_pbar.set_postfix({
                    'Sorok/sec': f'{rows_per_sec:.1f}',
                    'Mem√≥ria': f'{embedding_generator._monitor_memory().get("allocated_gb", 0):.1f}GB',
                    'Sikeres': embedding_generator.processed_count,
                    'Hib√°s': embedding_generator.failed_count
                })
                chunk_pbar.update(1)

                # Regular cleanup after every 5th chunk
                if chunk_count % 5 == 0:
                    embedding_generator._cleanup_memory()

    # ===== COMBINE THE RESULTS =====
    logger.info("DataFrame-ek egyes√≠t√©se...")
    if not all_results:
        raise ValueError("Nincs feldolgozott adat!")

    final_df = pd.concat(all_results, ignore_index=True)
    logger.info(f"Egyes√≠tett DataFrame: {len(final_df):,} sor")

    return final_df, processed_rows, time.time() - start_time

def process_single_chunk_embeddings(chunk_df, chunk_label):
    """
    Embed every row of one chunk (logic shared by chunked and unified mode).

    Adds 'embedding' and 'metadata_json' columns to ``chunk_df`` itself and
    returns a copy restricted to the columns chosen by prepare_final_columns.
    Batches that raise are filled with NaN vectors.
    """
    # Texts and the batch size adapted to their lengths
    texts = chunk_df['text'].tolist()
    dynamic_batch_size = adaptive_batch_size(list(map(len, texts)), BATCH_SIZE)

    # Batched embedding generation
    all_embeddings = []
    batch_starts = range(0, len(texts), dynamic_batch_size)

    with tqdm(total=len(batch_starts), desc=f"{chunk_label} batch-ek",
              unit="batch", leave=False) as batch_pbar:

        for start in batch_starts:
            batch_texts = texts[start:start + dynamic_batch_size]

            # Embedding generation with error handling
            try:
                vectors = embedding_generator.generate_embeddings_batch(batch_texts)
                all_embeddings.extend(vectors.tolist())

                # Basic memory monitoring
                allocated_gb = embedding_generator._monitor_memory().get('allocated_gb', 0)
                if allocated_gb > MEMORY_LIMIT_GB * 0.85:
                    logger.warning(f"Magas mem√≥ria: {allocated_gb:.1f}GB")
                    embedding_generator._cleanup_memory()

            except Exception as e:
                logger.error(f"Batch hiba: {e}")
                # Fallback NaN vectors
                fallback = np.full((len(batch_texts), EMBEDDING_DIMENSION), np.nan)
                all_embeddings.extend(fallback.tolist())

            batch_pbar.update(1)

    # Cardinality check: pad with NaN vectors when embeddings are missing
    if len(all_embeddings) != len(chunk_df):
        logger.error(f"Embedding sz√°moss√°gi hiba: {len(all_embeddings)} != {len(chunk_df)}")
        shortfall = len(chunk_df) - len(all_embeddings)
        all_embeddings.extend(
            np.full(EMBEDDING_DIMENSION, np.nan).tolist() for _ in range(shortfall)
        )

    # Attach the results to the chunk
    chunk_df['embedding'] = all_embeddings
    chunk_df['metadata_json'] = chunk_df.apply(create_metadata_json, axis=1)

    # Final columns - keep all available metadata
    return chunk_df[prepare_final_columns(chunk_df)].copy()

# Start the A100 main pipeline
logger.info("A100 embedding feldolgoz√°s ind√≠t√°sa...")
final_df, processed_rows, total_time = process_embeddings_a100()

In [None]:
# Parquet export and final validation
logger.info("Parquet ment√©s √©s valid√°ci√≥...")

# Embedding validation: classify every stored embedding exactly once
valid_embeddings = 0
nan_embeddings = 0
dimension_errors = 0

for emb in final_df['embedding']:
    if not isinstance(emb, list) or len(emb) != EMBEDDING_DIMENSION:
        # Not a list, or a list of the wrong length
        dimension_errors += 1
    elif np.any(np.isnan(emb)):
        nan_embeddings += 1
    else:
        valid_embeddings += 1

logger.info(f"Embedding valid√°ci√≥:")
logger.info(f"  √ârv√©nyes: {valid_embeddings:,}")
logger.info(f"  NaN: {nan_embeddings:,}")
logger.info(f"  Dimenzi√≥ hiba: {dimension_errors:,}")

# Write the result to Parquet
logger.info(f"V√©gs≈ë Parquet ment√©s: {OUTPUT_PARQUET}")

final_df.to_parquet(
    OUTPUT_PARQUET,
    engine='pyarrow',
    index=False,
    compression='snappy',
    row_group_size=50000
)

# File check: size on disk in GiB
file_size = os.path.getsize(OUTPUT_PARQUET) / (1024**3)

# Quick read-back test. pd.read_parquet() has no `nrows` option (the pyarrow
# engine rejects the unknown keyword), so only the first record batch of at
# most 100 rows is read through pyarrow instead of loading the whole file.
with pq.ParquetFile(OUTPUT_PARQUET) as parquet_file:
    first_batch = next(parquet_file.iter_batches(batch_size=100), None)
test_df = first_batch.to_pandas() if first_batch is not None else pd.DataFrame()
logger.info(f"Visszaolvas√°si teszt sikeres: {len(test_df)} sor")

# Final statistics
# The model label in the first message used to say "4b"; the model used by this
# notebook is the 0.6B one, as every other message here states.
logger.info("A100 QWEN3-0.6B EMBEDDING GENER√ÅL√ÅS BEFEJEZVE!")
logger.info(f"Feldolgozott sorok: {processed_rows:,}")
logger.info(f"V√©gs≈ë sorok: {len(final_df):,}")
logger.info(f"V√©gs≈ë oszlopok ({len(final_df.columns)}): {list(final_df.columns)}")
logger.info(f"√ârv√©nyes embeddings: {valid_embeddings:,}")
logger.info(f"F√°jl m√©ret: {file_size:.2f}GB")
logger.info(f"Teljes fut√°si id≈ë: {total_time/3600:.2f} √≥ra")

print("\n" + "="*80)
print("QWEN3-0.6B EMBEDDING FELDOLGOZAS BEFEJEZVE!")
print("="*80)
print(f"Feldolgozott dokumentumok: {processed_rows:,}")
print(f"Vegso Parquet fajl: {OUTPUT_PARQUET}")
print(f"Oszlopok szama: {len(final_df.columns)}")
print(f"Ervenyes embeddings: {valid_embeddings:,}")
print(f"Fajl meret: {file_size:.2f}GB")
print(f"Futasi ido: {total_time/3600:.2f} ora")
print("="*80)
# Run time and file size were already logged above; only the figures not yet
# reported are logged here.
logger.info(f"√Åtlag sebess√©g: {processed_rows/total_time:.1f} sor/sec")
logger.info(f"Cs√∫cs mem√≥ria: {embedding_generator.peak_memory_usage:.1f}GB")

print("\nA100 QWEN3-0.6B EMBEDDING GENERALAS SIKERESEN BEFEJEZVE!")
print(f"Feldolgozott sorok: {processed_rows:,}")
print(f"√ârv√©nyes embeddings: {valid_embeddings:,}")
print(f"F√°jl m√©ret: {file_size:.2f} GB")
print(f"Teljes id≈ë: {total_time/3600:.2f} √≥ra")
print(f"Sebess√©g: {processed_rows/total_time:.1f} sor/sec")
print(f"Cs√∫cs mem√≥ria: {embedding_generator.peak_memory_usage:.1f}GB")
print(f"Sikeress√©gi ar√°ny: {(valid_embeddings/len(final_df)*100):.1f}%")