In [10]:
import json
import torch
import open_clip
import pandas as pd

# Compute L2-normalized CLIP text embeddings for product titles and save them
# to a CSV file.
#
# Input : a JSON Lines file (one JSON object per line); every object must have
#         a "product_title" key (a missing key raises KeyError).
# Output: a CSV with columns "product_title" and "embedding". Each embedding
#         is written as the string form of a Python list of floats, so it has
#         to be parsed back (e.g. with json.loads) when the CSV is read.

# Load CLIP model (OpenCLIP naming uses dashes: "ViT-B-32")
device = "cpu"  # Change to "cuda" if using GPU
model_name = "ViT-B-32"
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name, pretrained="openai", device=device
)
# OpenCLIP returns the model in train mode; switch to inference mode so that
# train-only layers (dropout, stochastic depth, ...) cannot affect embeddings.
model.eval()
tokenizer = open_clip.get_tokenizer(model_name)

# Location of the JSON Lines input
json_file_path = "images/test_data.json"  # Change this to your JSON file path

# Parse one JSON object per line. Blank / whitespace-only lines (e.g. a
# trailing newline at the end of the file) are skipped instead of crashing
# json.loads. Iterating the file handle avoids materializing every line first.
with open(json_file_path, "r", encoding="utf-8") as file:
    data = [json.loads(line) for line in file if line.strip()]

# Extract product titles
product_titles = [item["product_title"] for item in data]

# Tokenize all titles at once; token ids are small integer tensors, so this
# stays cheap even for large inputs.
text_inputs = tokenizer(product_titles).to(device)

# Encode in fixed-size batches so peak memory does not grow with the number
# of titles. Tensor.split yields a single empty chunk for empty input, so an
# empty file still flows through the same path as before.
batch_size = 256
with torch.no_grad():
    text_features = torch.cat(
        [model.encode_text(token_batch) for token_batch in text_inputs.split(batch_size)]
    )

# Normalize each embedding to unit length (in place)
text_features /= text_features.norm(dim=-1, keepdim=True)

# Convert to nested Python lists for storage
embedding_list = text_features.cpu().numpy().tolist()

# Create DataFrame: one row per title with its embedding vector
df = pd.DataFrame({"product_title": product_titles, "embedding": embedding_list})

# Save to CSV file
output_csv_path = "clip_text_embeddings.csv"
df.to_csv(output_csv_path, index=False)

print(f"Embeddings saved to {output_csv_path}")


Embeddings saved to clip_text_embeddings.csv
