In [4]:
#!/usr/bin/env python3
"""
Ultra-Efficient CSV Embedder for CPU
"""

import pandas as pd
import numpy as np
from transformers import pipeline
import gc

# Configuration
MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 80MB, CPU-optimized
BATCH_SIZE = 4  # Even smaller batches for 8GB RAM
TEXT_COL = "Ideas"  # Match your CSV column name
MAX_LENGTH = 128  # Characters

def embed_chunk(chunk):
    """Process text chunks with memory constraints"""
    model = pipeline(
        'feature-extraction',
        model=MODEL,
        device=-1,  # Force CPU
        truncation=True
    )
    
    texts = chunk[TEXT_COL].str[:MAX_LENGTH].tolist()
    embeddings = np.mean(model(texts), axis=1).astype(np.float16)
    return pd.DataFrame({'text': texts, 'embedding': embeddings.tolist()})

if __name__ == "__main__":
    # Stream CSV with minimal memory
    reader = pd.read_csv("151_ideas_updated2.csv", 
                       chunksize=BATCH_SIZE, 
                       usecols=[TEXT_COL])
    
    for idx, chunk in enumerate(reader):
        result = embed_chunk(chunk)
        mode = "a" if idx > 0 else "w"
        result.to_csv("embeddings.csv", mode=mode, 
                     header=(mode=="w"), index=False)
        
        # Force clean-up
        del result
        gc.collect()

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (4, 1) + inhomogeneous part.