In [1]:
#!/usr/bin/env python3
"""
Optimized CSV Embedding Generator for Low-Resource Environments

Enhancements:
- Uses ultra-lightweight models (under 100MB)
- Memory-efficient processing
- CPU optimization with ONNX
- Quantized models where available
- Reduced precision embeddings
- Streaming CSV processing
"""

import os
import logging
import gc
import asyncio
from pathlib import Path
from typing import List, Optional, Generator

import pandas as pd
import numpy as np
from transformers import AutoTokenizer, pipeline
from tqdm import tqdm
import yaml

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("embedding_generator.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger("csv_embedding_generator")

@dataclass
class EmbeddingConfig:
    """Configuration for embedding generation process."""
    input_file: Path
    output_file: Path
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    batch_size: int = 16  # Reduced for low memory
    use_float16: bool = True  # Reduce memory usage
    text_column: str = "articles"
    max_text_length: int = 256  # Prevent memory overflow
    model_cache_dir: Optional[Path] = None
    
    @classmethod
    def from_yaml(cls, config_path: Path) -> "EmbeddingConfig":
        """Load configuration from YAML file."""
        with open(config_path, "r") as f:
            config_data = yaml.safe_load(f)
        return cls(**config_data)

class OptimizedEmbeddingGenerator:
    """Memory-efficient embedding generator using ONNX/quantized models."""
    
    def __init__(self, config: EmbeddingConfig):
        self.config = config
        self.tokenizer = None
        self.model = None
        
        # Use model variants optimized for CPU
        self.model_map = {
            "mini-lm": "sentence-transformers/all-MiniLM-L6-v2",
            "tiny-bert": "sentence-transformers/paraphrase-albert-small-v2",
            "mobile-bert": "google/mobilebert-uncased",
            "distilroberta": "sentence-transformers/all-distilroberta-v1"
        }
        
        self._load_model()

    def _load_model(self):
        """Load model with CPU-optimized settings."""
        model_name = self.model_map.get(self.config.model_name, self.config.model_name)
        
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            cache_dir=str(self.config.model_cache_dir) if self.config.model_cache_dir else None
        
        self.model = pipeline(
            "feature-extraction",
            model=model_name,
            device=-1,  # Force CPU
            torch_dtype=torch.float16 if self.config.use_float16 else torch.float32,
            truncation=True,
            framework="pt",
            max_length=self.config.max_text_length
        )

    def generate_batch(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings with memory constraints."""
        try:
            features = self.model(
                texts,
                padding=True,
                truncation=True,
                max_length=self.config.max_text_length
            )
            
            # Convert to numpy array for memory efficiency
            embeddings = np.mean(
                np.array(features, dtype=np.float16 if self.config.use_float16 else np.float32),
                axis=1
            )
            
            return embeddings.tolist()
        except Exception as e:
            logger.error(f"Error generating embeddings: {e}")
            raise

def process_csv_chunks(config: EmbeddingConfig) -> Generator[pd.DataFrame, None, None]:
    """Stream CSV in chunks to reduce memory usage."""
    try:
        for chunk in pd.read_csv(
            config.input_file,
            chunksize=config.batch_size,
            usecols=[config.text_column]
        ):
            # Clean text and truncate
            chunk[config.text_column] = chunk[config.text_column].str.slice(
                0, config.max_text_length
            )
            yield chunk
    except Exception as e:
        logger.error(f"Error reading CSV: {e}")
        raise

async def main():
    """Main processing function with memory optimizations."""
    try:
        config = EmbeddingConfig.from_yaml(Path("embedding_config.yaml"))
    except FileNotFoundError:
        config = EmbeddingConfig(
            input_file=Path("input.csv"),
            output_file=Path("embeddings.csv"),
            model_cache_dir=Path("model_cache")
        )

    # Create output directory if needed
    config.output_file.parent.mkdir(parents=True, exist_ok=True)
    
    # Initialize components
    generator = OptimizedEmbeddingGenerator(config)
    
    # Process in streaming fashion
    with open(config.output_file, "w") as f_out:
        for chunk_idx, chunk in enumerate(process_csv_chunks(config)):
            try:
                embeddings = generator.generate_batch(
                    chunk[config.text_column].tolist()
                )
                
                # Write incrementally
                chunk["embeddings"] = embeddings
                chunk.to_csv(
                    f_out,
                    mode="a",
                    header=f_out.tell()==0,
                    index=False
                )
                
                # Clean up memory
                del embeddings
                gc.collect()
                
                logger.info(f"Processed batch {chunk_idx + 1}")
                
            except MemoryError:
                logger.warning("Memory overflow! Reducing batch size...")
                config.batch_size = max(1, config.batch_size // 2)
                logger.info(f"New batch size: {config.batch_size}")
                continue

    logger.info("Embedding generation complete")

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        logger.info("Process interrupted by user")
    except Exception as e:
        logger.critical(f"Fatal error: {e}", exc_info=True)

SyntaxError: '(' was never closed (3832283904.py, line 79)