In [None]:
# RunPod A100 GPU - K√∂nyvt√°rak telep√≠t√©se √©s import√°l√°sa
!pip install -U torch sentence-transformers accelerate pyarrow pandas tqdm transformers

import pandas as pd
import numpy as np
import gc
import json
import pyarrow as pa
import pyarrow.parquet as pq
from sentence_transformers import SentenceTransformer
import torch
import psutil
import time
import logging
from tqdm import tqdm
from typing import List, Dict, Any
import os
import warnings
warnings.filterwarnings('ignore')

# A100 GPU tuning: allow TF32 for matmul and cuDNN kernels, and let cuDNN
# benchmark algorithms (non-deterministic mode).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

# Report the environment: CUDA availability and, if present, GPU 0 name and memory.
_has_cuda = torch.cuda.is_available()
print("RunPod A100 k√∂rnyezet inicializ√°lva!")
print(f"CUDA el√©rhet≈ë: {_has_cuda}")
if _has_cuda:
    _gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU mem√≥ria: {_gpu_total_bytes / (1024**3):.1f}GB")

In [None]:
# RunPod A100 configuration
print("RunPod A100 konfigur√°ci√≥ be√°ll√≠t√°sa...")

# File locations on the RunPod volume
INPUT_CSV = "/workspace/cleaned_data_for_embedding.csv"
OUTPUT_PARQUET = "/workspace/processed_documents_with_embeddings.parquet"

# Qwen3-Embedding-8B parameters tuned for an A100
MODEL_NAME = "Qwen/Qwen3-Embedding-8B"
# Qwen3-Embedding-8B produces at most 4096-dimensional vectors. A larger value
# here only causes the downstream dimension check to append zero padding,
# inflating memory use and the Parquet output without adding information.
EMBEDDING_DIMENSION = 4096
BATCH_SIZE = 32            # base batch size for the A100
CHUNK_SIZE = 5000          # CSV rows read per chunk
MAX_TOKEN_LENGTH = 8192
USE_MIXED_PRECISION = True
MEMORY_LIMIT_GB = 75       # use 75GB of the A100's 80GB

print(f"Bemeneti CSV: {INPUT_CSV}")
print(f"Kimeneti Parquet: {OUTPUT_PARQUET}")
print(f"Modell: {MODEL_NAME}")
print(f"Dimenzi√≥: {EMBEDDING_DIMENSION}")
print(f"Batch m√©ret: {BATCH_SIZE}")
print(f"Chunk m√©ret: {CHUNK_SIZE:,}")
print(f"Mixed Precision: {USE_MIXED_PRECISION}")
print(f"Mem√≥ria limit: {MEMORY_LIMIT_GB}GB")

# Logging configuration: console + log file.
# force=True replaces handlers that are already installed on the root logger;
# without it basicConfig silently does nothing in that case (e.g. when this
# cell is re-run or a library configured logging first).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('/workspace/embedding_generation.log')
    ],
    force=True
)
logger = logging.getLogger(__name__)

In [None]:
# Load and validate the input CSV
logger.info("CSV adatok valid√°l√°sa...")

# Make sure the file exists
if not os.path.exists(INPUT_CSV):
    raise FileNotFoundError(f"CSV f√°jl nem tal√°lhat√≥: {INPUT_CSV}")

# Load a sample to inspect the structure
df_sample = pd.read_csv(INPUT_CSV, nrows=1000)
logger.info(f"Minta bet√∂ltve: {len(df_sample)} sor")

# Check the mandatory columns first, so a malformed file fails before the
# full-file scan below.
required_columns = ['text', 'doc_id']
missing_columns = [col for col in required_columns if col not in df_sample.columns]
if missing_columns:
    raise ValueError(f"Hi√°nyz√≥ k√∂telez≈ë oszlopok: {missing_columns}")

# Estimate the total row count as (physical lines - header). This is only an
# estimate: quoted fields containing newlines span several physical lines.
# The context manager closes the handle (the bare open() call leaked it).
with open(INPUT_CSV, 'r', encoding='utf-8') as csv_file:
    total_rows = sum(1 for _ in csv_file) - 1
logger.info(f"Becs√ºlt teljes sorok: {total_rows:,}")

# Full metadata column list, based on preprocess_documents.py
expected_metadata_columns = [
    'doc_id', 'text', 'birosag', 'JogTerulet', 'Azonosito', 'MeghozoBirosag',
    'EgyediAzonosito', 'HatarozatEve', 'AllKapcsolodoUgyszam', 'AllKapcsolodoBirosag',
    'KapcsolodoHatarozatok', 'Jogszabalyhelyek'
]

# Compare the expected columns with the ones actually present
available_columns = list(df_sample.columns)
metadata_columns_present = [col for col in expected_metadata_columns if col in available_columns]
metadata_columns_missing = [col for col in expected_metadata_columns if col not in available_columns]

print("CSV valid√°ci√≥ sikeres!")
print(f"Teljes sorok: {total_rows:,}")
print(f"√ñsszes oszlop: {len(available_columns)}")
print(f"Jelenlev≈ë metadata oszlopok ({len(metadata_columns_present)}): {metadata_columns_present}")
if metadata_columns_missing:
    print(f"Hi√°nyz√≥ metadata oszlopok ({len(metadata_columns_missing)}): {metadata_columns_missing}")

# Text length statistics (characters), computed on the sample only
text_lengths = df_sample['text'].str.len()
print(f"\nSz√∂veg hossz statisztik√°k (minta):")
print(f"  √Åtlag: {text_lengths.mean():.0f} karakter")
print(f"  Medi√°n: {text_lengths.median():.0f} karakter")
print(f"  Min: {text_lengths.min():.0f} karakter")
print(f"  Max: {text_lengths.max():.0f} karakter")

# Estimated amount of work (ceil division by batch / chunk size)
estimated_batches = (total_rows + BATCH_SIZE - 1) // BATCH_SIZE
estimated_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE
print(f"\nBecs√ºlt feldolgoz√°s:")
print(f"  Chunk-ok sz√°ma: {estimated_chunks:,}")
print(f"  Batch-ek sz√°ma: {estimated_batches:,}")
print(f"  Becs√ºlt id≈ë: 2-3 √≥ra A100 GPU-n")

In [None]:
# Qwen3-Embedding-8B modell oszt√°ly A100-ra optimaliz√°lva
logger.info("Qwen3-Embedding-8B modell oszt√°ly l√©trehoz√°sa...")

class OptimizedQwen3EmbeddingGenerator:
    """Wrapper around a SentenceTransformer model with simple bookkeeping.

    Reads its settings from the module-level configuration constants
    (MODEL_NAME, EMBEDDING_DIMENSION, MAX_TOKEN_LENGTH, BATCH_SIZE,
    USE_MIXED_PRECISION), loads the model onto CUDA when available (CPU
    otherwise), optionally converts it to half precision and runs a short
    warmup. It also tracks processed/failed counts, per-batch timings and the
    peak allocated GPU memory observed by `_monitor_memory`.
    """

    def __init__(self):
        """Load the model, apply half precision on CUDA and warm it up.

        Raises:
            Exception: any error raised while loading or warming up the model
                is logged and re-raised unchanged.
        """
        self.model_name = MODEL_NAME
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.dimension = EMBEDDING_DIMENSION
        self.max_tokens = MAX_TOKEN_LENGTH
        self.batch_size = BATCH_SIZE

        # Performance tracking
        self.processed_count = 0
        self.failed_count = 0
        self.batch_times = []
        self.peak_memory_usage = 0

        logger.info(f"Device: {self.device}")

        try:
            # Load the model
            logger.info("Qwen3-8B modell bet√∂lt√©se...")
            # NOTE(review): trust_remote_code=True allows code shipped with the
            # model repository to run locally - confirm it is actually required.
            self.model = SentenceTransformer(
                self.model_name,
                device=self.device,
                trust_remote_code=True
            )

            # Half-precision weights, only on CUDA
            if self.device == 'cuda' and USE_MIXED_PRECISION:
                self.model.half()
                logger.info("Mixed precision bekapcsolva")

            # Warm the model up
            self._warmup_model()
            logger.info("Modell sikeresen bet√∂ltve √©s optimaliz√°lva")

        except Exception as e:
            logger.error(f"Modell bet√∂lt√©si hiba: {e}")
            raise

    def _warmup_model(self):
        """Encode a few dummy sentences, then free cached memory."""
        logger.info("Modell warmup...")
        dummy_texts = ["Ez egy teszt sz√∂veg a modell bemeleg√≠t√©s√©hez."] * 4

        # torch.autocast is the device-agnostic replacement for the deprecated
        # torch.cuda.amp.autocast; autocast is only enabled on CUDA so a CPU
        # fallback run does not request CUDA autocast.
        use_amp = USE_MIXED_PRECISION and self.device == 'cuda'
        with torch.autocast(device_type=self.device, enabled=use_amp):
            _ = self.model.encode(dummy_texts, show_progress_bar=False)

        self._cleanup_memory()
        logger.info("Warmup befejezve")

    def _cleanup_memory(self):
        """Run the Python GC and, when CUDA is available, empty the CUDA cache."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    def _monitor_memory(self):
        """Return current GPU memory figures in GiB and update the peak.

        Returns:
            dict: empty when CUDA is unavailable; otherwise the keys
            'allocated_gb', 'reserved_gb' and 'peak_usage_gb' (the largest
            allocated value seen by this method so far).
        """
        if not torch.cuda.is_available():
            return {}

        allocated = torch.cuda.memory_allocated() / (1024**3)
        reserved = torch.cuda.memory_reserved() / (1024**3)

        self.peak_memory_usage = max(self.peak_memory_usage, allocated)

        return {
            'allocated_gb': allocated,
            'reserved_gb': reserved,
            'peak_usage_gb': self.peak_memory_usage
        }

# Instantiate the embedding generator; the constructor loads the model and runs the warmup.
embedding_generator = OptimizedQwen3EmbeddingGenerator()
print("Qwen3-8B modell sikeresen inicializ√°lva!")
print(f"Dimenzi√≥: {embedding_generator.dimension}")
print(f"Device: {embedding_generator.device}")
print(f"Mixed Precision: {USE_MIXED_PRECISION}")

In [None]:
# Embedding gener√°l√°s met√≥dus hozz√°ad√°sa
def generate_embeddings_batch(self, texts, cleanup_interval=1000):
    """Embed one batch of texts and return a float32 matrix.

    Each text is cut to `self.max_tokens * 3` characters (a rough ~3
    characters per token heuristic) and encoded with L2 normalisation. The
    result is then forced to `self.dimension` columns: wider model output is
    truncated and re-normalised, narrower output is zero padded.

    Args:
        self: generator instance providing `model`, `device`, `dimension`,
            `max_tokens`, `batch_times`, `processed_count`, `failed_count`
            and `_cleanup_memory()`.
        texts: list of strings to embed.
        cleanup_interval: run `_cleanup_memory()` once at least this many
            texts have been processed since the previous periodic cleanup;
            a falsy value disables the periodic cleanup.

    Returns:
        np.ndarray of shape (len(texts), self.dimension), dtype float32.
        If encoding fails, the error is logged, `failed_count` is increased
        and an all-NaN matrix of the same shape is returned instead.
    """
    # Nothing to encode: return an empty, correctly shaped matrix.
    if len(texts) == 0:
        return np.empty((0, self.dimension), dtype=np.float32)

    batch_start_time = time.time()

    try:
        # Text pre-processing: character-level truncation
        max_chars = self.max_tokens * 3  # ~3 chars/token estimate
        processed_texts = [text[:max_chars] for text in texts]

        # torch.autocast replaces the deprecated, CUDA-only
        # torch.cuda.amp.autocast; autocast is only enabled on CUDA.
        use_amp = USE_MIXED_PRECISION and self.device == 'cuda'
        with torch.autocast(device_type=self.device, enabled=use_amp):
            embeddings = self.model.encode(
                processed_texts,
                normalize_embeddings=True,
                show_progress_bar=False,
                convert_to_numpy=True,
                batch_size=len(processed_texts)
            )

        embeddings = np.asarray(embeddings, dtype=np.float32)

        # Dimension check and correction
        model_dim = embeddings.shape[1]
        if model_dim > self.dimension:
            embeddings = embeddings[:, :self.dimension]
            # Truncation breaks the unit norm requested above, so restore it
            # (all-zero rows are left untouched).
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            embeddings = embeddings / np.where(norms > 0, norms, 1.0)
        elif model_dim < self.dimension:
            padding = np.zeros(
                (embeddings.shape[0], self.dimension - model_dim), dtype=np.float32
            )
            embeddings = np.hstack([embeddings, padding])

    except Exception as e:
        logger.error(f"Batch feldolgoz√°si hiba: {e}")
        self.failed_count += len(texts)
        # Release cached memory after a failure; a cleanup problem must not
        # mask the original error handling.
        try:
            self._cleanup_memory()
        except Exception as cleanup_error:
            logger.warning(f"Cleanup after failed batch raised: {cleanup_error}")
        # Fallback: NaN vectors
        return np.full((len(texts), self.dimension), np.nan, dtype=np.float32)

    # Performance tracking
    self.batch_times.append(time.time() - batch_start_time)
    self.processed_count += len(texts)

    # Periodic cleanup. A `processed_count % interval == 0` test only fires
    # when the running total lands exactly on a multiple of the interval,
    # which depends on the batch sizes; track the count at the last cleanup
    # instead.
    last_cleanup = getattr(self, '_last_cleanup_count', 0)
    if cleanup_interval and self.processed_count - last_cleanup >= cleanup_interval:
        self._last_cleanup_count = self.processed_count
        self._cleanup_memory()

    return embeddings.astype(np.float32, copy=False)

# Attach the function to the class as a method. Because it is set on the class,
# the already created embedding_generator instance can call it as well.
OptimizedQwen3EmbeddingGenerator.generate_embeddings_batch = generate_embeddings_batch

print("Embedding gener√°l√°s met√≥dus hozz√°adva!")

In [None]:
# Seg√©df√ºggv√©nyek
def create_metadata_json(row):
    """Serialise the metadata of one document row to a JSON string.

    Args:
        row: mapping-like object with a `.get(key, default)` method
            (e.g. a pandas Series or a dict).

    Returns:
        str: JSON object (non-ASCII characters kept as-is) holding the
        string form of every metadata field, plus 'text_length' (characters
        in 'text') and 'processed_timestamp' (time.time() at call time).
        Missing values (absent key, None or float NaN) become '' rather than
        the literal strings 'None' / 'nan'.
    """
    def _as_text(value):
        # NaN is the only float that differs from itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    metadata_fields = (
        'doc_id', 'birosag', 'JogTerulet', 'Azonosito', 'MeghozoBirosag',
        'EgyediAzonosito', 'HatarozatEve', 'AllKapcsolodoUgyszam',
        'AllKapcsolodoBirosag', 'KapcsolodoHatarozatok', 'Jogszabalyhelyek'
    )

    metadata = {field: _as_text(row.get(field, '')) for field in metadata_fields}
    metadata['text_length'] = len(_as_text(row.get('text', '')))
    metadata['processed_timestamp'] = time.time()
    return json.dumps(metadata, ensure_ascii=False)

def adaptive_batch_size(text_lengths, base_batch_size=BATCH_SIZE):
    """Pick a batch size from the mean text length (in characters).

    Longer texts get smaller batches: above 6000 characters a quarter of the
    base size (at least 8), above 4000 half of it (at least 16), above 2000
    the base size itself; otherwise twice the base size, capped at 64.
    """
    mean_length = np.mean(text_lengths)

    # (exclusive lower bound on the mean length, resulting batch size),
    # checked from the longest tier down.
    tiers = (
        (6000, max(8, base_batch_size // 4)),
        (4000, max(16, base_batch_size // 2)),
        (2000, base_batch_size),
    )
    for lower_bound, size in tiers:
        if mean_length > lower_bound:
            return size

    return min(64, base_batch_size * 2)

def prepare_final_columns(chunk_df):
    """List the output columns that actually exist in `chunk_df`.

    The core columns come first, followed by the metadata columns, each group
    in its fixed order; names missing from the frame are left out.
    """
    core_columns = ('doc_id', 'text', 'embedding', 'metadata_json')
    metadata_columns = (
        'birosag', 'JogTerulet', 'Azonosito', 'MeghozoBirosag',
        'EgyediAzonosito', 'HatarozatEve', 'AllKapcsolodoUgyszam',
        'AllKapcsolodoBirosag', 'KapcsolodoHatarozatok', 'Jogszabalyhelyek',
    )

    present = chunk_df.columns
    return [name for name in core_columns + metadata_columns if name in present]

print("Seg√©df√ºggv√©nyek bet√∂ltve!")

In [None]:
# A100 f≈ëfolyamat - Robosztus embedding gener√°l√°s
def process_embeddings_a100():
    """Embed every valid row of INPUT_CSV and return the combined result.

    The CSV is read in chunks of CHUNK_SIZE rows. Rows with a missing 'text'
    or 'doc_id', or with a text of 10 characters or fewer, are dropped. Each
    chunk is embedded in batches whose size comes from `adaptive_batch_size`;
    a batch that raises is replaced by NaN vectors. Every row also gets a
    'metadata_json' column.

    Returns:
        tuple: (final_df, processed_rows, elapsed_seconds) where `final_df`
        is the concatenation of all processed chunks restricted to the
        columns chosen by `prepare_final_columns`, and each 'embedding' cell
        is a Python list of floats.

    Raises:
        ValueError: if no chunk contained any valid row.
    """
    start_time = time.time()
    logger.info("A100 embedding feldolgoz√°s kezd√©se...")

    # Physical line count minus the header: only an estimate used for the
    # progress bar (quoted multi-line fields inflate it). The context manager
    # closes the file handle.
    with open(INPUT_CSV, 'r', encoding='utf-8') as csv_file:
        total_rows = sum(1 for _ in csv_file) - 1
    processed_rows = 0
    all_results = []

    logger.info(f"Feldolgozand√≥ sorok: {total_rows:,}")
    logger.info(f"Chunk m√©ret: {CHUNK_SIZE:,}")
    logger.info(f"Batch m√©ret: {BATCH_SIZE}")

    # Chunk-based processing
    chunk_count = 0
    total_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE

    with tqdm(total=total_chunks, desc="Chunk feldolgoz√°s", unit="chunk") as chunk_pbar:

        for chunk_df in pd.read_csv(INPUT_CSV, chunksize=CHUNK_SIZE, encoding='utf-8'):
            chunk_count += 1
            chunk_start_time = time.time()

            # Data cleaning. Explicit copies make the column assignments below
            # operate on independent frames rather than on filtered views.
            chunk_df = chunk_df.dropna(subset=['text', 'doc_id']).copy()
            chunk_df['text'] = chunk_df['text'].astype(str)
            chunk_df = chunk_df[chunk_df['text'].str.len() > 10].copy()  # minimum text length

            if len(chunk_df) == 0:
                logger.warning(f"Chunk {chunk_count}: nincs √©rv√©nyes adat")
                chunk_pbar.update(1)
                continue

            logger.info(f"Chunk {chunk_count}/{total_chunks}: {len(chunk_df):,} √©rv√©nyes sor")

            # Texts and the adaptive batch size for this chunk
            texts = chunk_df['text'].tolist()
            text_lengths = [len(text) for text in texts]
            dynamic_batch_size = adaptive_batch_size(text_lengths, BATCH_SIZE)

            # Batched embedding generation
            all_embeddings = []
            total_batches_in_chunk = (len(texts) + dynamic_batch_size - 1) // dynamic_batch_size

            with tqdm(total=total_batches_in_chunk, desc=f"Chunk {chunk_count} batch-ek",
                      unit="batch", leave=False) as batch_pbar:

                for batch_idx in range(0, len(texts), dynamic_batch_size):
                    batch_texts = texts[batch_idx:batch_idx + dynamic_batch_size]

                    # Embedding generation with error handling
                    try:
                        batch_embeddings = embedding_generator.generate_embeddings_batch(batch_texts)
                        all_embeddings.extend(batch_embeddings.tolist())

                        # Memory monitoring
                        memory_info = embedding_generator._monitor_memory()
                        if memory_info.get('allocated_gb', 0) > MEMORY_LIMIT_GB * 0.9:
                            logger.warning(f"Magas mem√≥ria: {memory_info.get('allocated_gb', 0):.1f}GB")
                            embedding_generator._cleanup_memory()

                    except Exception as e:
                        logger.error(f"Batch hiba: {e}")
                        # Fallback NaN vectors keep rows and embeddings aligned
                        nan_embeddings = np.full((len(batch_texts), EMBEDDING_DIMENSION), np.nan)
                        all_embeddings.extend(nan_embeddings.tolist())

                    batch_pbar.update(1)

            # Embedding count check: the column assignment below needs exactly
            # one embedding per row.
            if len(all_embeddings) != len(chunk_df):
                logger.error(f"Embedding sz√°moss√°gi hiba: {len(all_embeddings)} != {len(chunk_df)}")
                missing = len(chunk_df) - len(all_embeddings)
                if missing > 0:
                    # Pad with NaN vectors
                    nan_row = np.full(EMBEDDING_DIMENSION, np.nan).tolist()
                    all_embeddings.extend(list(nan_row) for _ in range(missing))
                else:
                    # Drop surplus vectors instead of failing on assignment
                    del all_embeddings[len(chunk_df):]

            # Attach the results
            chunk_df['embedding'] = all_embeddings
            chunk_df['metadata_json'] = chunk_df.apply(create_metadata_json, axis=1)

            # Final columns - keep all available metadata
            available_columns = prepare_final_columns(chunk_df)
            chunk_result = chunk_df[available_columns].copy()

            all_results.append(chunk_result)
            processed_rows += len(chunk_df)

            # Progress update (guard against a zero-length timing interval)
            chunk_time = time.time() - chunk_start_time
            rows_per_sec = len(chunk_df) / chunk_time if chunk_time > 0 else 0.0

            chunk_pbar.set_postfix({
                'Sorok/sec': f'{rows_per_sec:.1f}',
                'Mem√≥ria': f'{embedding_generator._monitor_memory().get("allocated_gb", 0):.1f}GB',
                'Sikeres': embedding_generator.processed_count,
                'Hib√°s': embedding_generator.failed_count
            })
            chunk_pbar.update(1)

            # Periodic cleanup every third chunk
            if chunk_count % 3 == 0:
                embedding_generator._cleanup_memory()

    # Combine the per-chunk results
    logger.info("DataFrame-ek egyes√≠t√©se...")
    if not all_results:
        raise ValueError("Nincs feldolgozott adat!")

    final_df = pd.concat(all_results, ignore_index=True)
    logger.info(f"Egyes√≠tett DataFrame: {len(final_df):,} sor")

    return final_df, processed_rows, time.time() - start_time

# Launch the A100 main pipeline and keep its three results: the combined
# DataFrame, the number of processed rows and the elapsed time in seconds.
logger.info("A100 embedding feldolgoz√°s ind√≠t√°sa...")
final_df, processed_rows, total_time = process_embeddings_a100()

In [None]:
# Save to Parquet and run the final validation
logger.info("Parquet ment√©s √©s valid√°ci√≥...")

# Embedding validation: classify every row as valid, containing NaN, or
# having the wrong type/length.
valid_embeddings = 0
nan_embeddings = 0
dimension_errors = 0

for emb in final_df['embedding']:
    # Accept both Python lists and NumPy arrays as embedding containers.
    if not isinstance(emb, (list, np.ndarray)) or len(emb) != EMBEDDING_DIMENSION:
        dimension_errors += 1
    elif np.any(np.isnan(emb)):
        nan_embeddings += 1
    else:
        valid_embeddings += 1

logger.info(f"Embedding valid√°ci√≥:")
logger.info(f"  √ârv√©nyes: {valid_embeddings:,}")
logger.info(f"  NaN: {nan_embeddings:,}")
logger.info(f"  Dimenzi√≥ hiba: {dimension_errors:,}")

# Write the Parquet file
logger.info(f"V√©gs≈ë Parquet ment√©s: {OUTPUT_PARQUET}")

final_df.to_parquet(
    OUTPUT_PARQUET,
    engine='pyarrow',
    index=False,
    compression='snappy',
    row_group_size=50000
)

# File validation
file_size = os.path.getsize(OUTPUT_PARQUET) / (1024**3)

# Quick read-back test. pd.read_parquet has no `nrows` argument (passing it
# raises), so read just the first record batch of up to 100 rows via pyarrow.
_first_batch = next(pq.ParquetFile(OUTPUT_PARQUET).iter_batches(batch_size=100), None)
test_df = _first_batch.to_pandas() if _first_batch is not None else pd.DataFrame()
logger.info(f"Visszaolvas√°si teszt sikeres: {len(test_df)} sor")

# Derived figures, guarded against division by zero
rows_per_sec = processed_rows / total_time if total_time > 0 else 0.0
success_pct = valid_embeddings / len(final_df) * 100 if len(final_df) else 0.0

# Final statistics
logger.info("A100 QWEN3-8B EMBEDDING GENER√ÅL√ÅS BEFEJEZVE!")
logger.info(f"Feldolgozott sorok: {processed_rows:,}")
logger.info(f"V√©gs≈ë sorok: {len(final_df):,}")
logger.info(f"V√©gs≈ë oszlopok ({len(final_df.columns)}): {list(final_df.columns)}")
logger.info(f"√ârv√©nyes embeddings: {valid_embeddings:,}")
logger.info(f"F√°jl m√©ret: {file_size:.2f}GB")
logger.info(f"Teljes fut√°si id≈ë: {total_time/3600:.2f} √≥ra")

print("\n" + "="*80)
print("QWEN3-8B EMBEDDING FELDOLGOZ√ÅS BEFEJEZVE!")
print("="*80)
print(f"üìä Feldolgozott dokumentumok: {processed_rows:,}")
print(f"üìÅ V√©gs≈ë Parquet f√°jl: {OUTPUT_PARQUET}")
print(f"üìà Oszlopok sz√°ma: {len(final_df.columns)}")
print(f"üéØ √ârv√©nyes embeddings: {valid_embeddings:,}")
print(f"üíæ F√°jl m√©ret: {file_size:.2f}GB")
print(f"‚è±Ô∏è  Fut√°si id≈ë: {total_time/3600:.2f} √≥ra")
print("="*80)
logger.info(f"Teljes fut√°si id≈ë: {total_time/3600:.2f} √≥ra")
logger.info(f"√Åtlag sebess√©g: {rows_per_sec:.1f} sor/sec")
logger.info(f"F√°jl m√©ret: {file_size:.2f} GB")
logger.info(f"Cs√∫cs mem√≥ria: {embedding_generator.peak_memory_usage:.1f}GB")

print("\nA100 QWEN3-8B EMBEDDING GENER√ÅL√ÅS SIKERESEN BEFEJEZVE!")
print(f"Feldolgozott sorok: {processed_rows:,}")
print(f"√ârv√©nyes embeddings: {valid_embeddings:,}")
print(f"F√°jl m√©ret: {file_size:.2f} GB")
print(f"Teljes id≈ë: {total_time/3600:.2f} √≥ra")
print(f"Sebess√©g: {rows_per_sec:.1f} sor/sec")
print(f"Cs√∫cs mem√≥ria: {embedding_generator.peak_memory_usage:.1f}GB")
print(f"Sikeress√©gi ar√°ny: {success_pct:.1f}%")