In [None]:
# Working version: cleans the idea texts and generates RoBERTa embeddings

import pandas as pd
import re
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Step 1: Read the source CSV of ideas into a DataFrame
input_csv = '151_ideas_updated2.csv'
df = pd.read_csv(input_csv)

# Step 2: Text preprocessing
def preprocess_text(text):
    """Return a normalized copy of *text* for embedding.

    Strings are lower-cased, stripped of every character that is neither a
    word character (letters, digits, underscore) nor whitespace, have runs
    of whitespace collapsed to a single space, and are trimmed at both ends.
    Any non-string value (e.g. ``None`` or a NaN float from a missing CSV
    cell) yields the empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        without_punctuation = re.sub(r'[^\w\s]', '', lowered)
        single_spaced = re.sub(r'\s+', ' ', without_punctuation)
        return single_spaced.strip()
    return ""

# Store a normalized copy of every idea alongside the raw text
df['Cleaned_Ideas'] = df['Ideas'].map(preprocess_text)

# Step 3: Load the pretrained RoBERTa encoder and its matching tokenizer
model_name = 'roberta-base'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 4: Embedding generation function
def get_roberta_embeddings(texts, batch_size=32):
    """Embed texts with the module-level ``model`` and ``tokenizer``.

    The texts are tokenized and run through the model in batches, and each
    text is reduced to one vector by averaging the final-layer hidden states
    of its real tokens. Padding positions are excluded from the average via
    the tokenizer's attention mask, so a text's embedding does not depend on
    how much padding its batch happened to need.

    Args:
        texts: Sliceable sequence of strings (e.g. a list) to embed.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        A NumPy array with one row per input text, in input order. An empty
        ``texts`` yields an empty array.

    Side effects:
        Moves the module-level ``model`` to the GPU when CUDA is available,
        otherwise to the CPU.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Inputs longer than 512 tokens are truncated; shorter ones are
        # padded to the longest sequence in this batch.
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt',
        ).to(device)

        # Inference only: skip gradient tracking to save memory.
        with torch.no_grad():
            outputs = model(**inputs)

        # Masked mean pooling: zero out padding positions, then divide by
        # the number of real tokens instead of the padded sequence length.
        hidden = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against /0
        pooled = (hidden * mask).sum(dim=1) / token_counts

        batch_embeddings = pooled.cpu().numpy()
        embeddings.extend(batch_embeddings)

    return np.array(embeddings)

# Step 5: Embed every cleaned idea (batching happens inside the helper)
text_list = df['Cleaned_Ideas'].tolist()
embeddings = get_roberta_embeddings(text_list)

# Step 6: Attach each embedding to its row as a plain Python list
df['Embeddings'] = [vector.tolist() for vector in embeddings]

# Step 7: Write the augmented table to a new CSV without the index column
output_csv = 'ideas_with_embeddings.csv'
df.to_csv(output_csv, index=False)

print("Embeddings generated and saved successfully!")