In [None]:
# Get the top 5 most similar items for each item in a dataset using OpenAI embeddings.
# This script is for demonstration purposes only; please refer to OpenAI's API documentation for more details.
# It has been tested with over 100K items and works well.
# For datasets much larger than 100K items, I would consider batching the similarity calculation as well.
import pandas as pd
import numpy as np
from openai import OpenAI
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import time

# Number of texts sent to the embeddings endpoint in a single request.
# The embedding calls are batched to stay within API limits.
EMBEDDING_BATCH_SIZE = 100

# Create the OpenAI client. The API key is taken from the shell
# environment, so make sure it is set before running this script.
client = OpenAI()

# Load the input data. Change the path to point at your own file.
# The file needs a "description" column (the text that is embedded and
# compared) and a "title" column (used to label the similar items).
df = pd.read_csv('your-file.csv')

def get_embeddings_batch(texts, max_retries=5):
    """Request embeddings for a batch of texts, retrying on failure.

    Args:
        texts: List of strings sent as the ``input`` of a single
            ``client.embeddings.create`` call.
        max_retries: Number of additional attempts made after the first
            failed request before giving up.

    Returns:
        A list with the ``embedding`` of every item in the response data.

    Raises:
        Exception: Whatever the last failed request raised, once
            ``max_retries`` retries have been used up. Retrying forever
            would hang on permanent errors (bad API key, invalid input)
            and hide the real cause.
    """
    attempt = 0
    while True:
        try:
            response = client.embeddings.create(
                input=texts,
                model="text-embedding-3-small",  # the small OpenAI embedding model
            )
        except Exception as e:
            # Broad catch is deliberate: any request failure is retried a
            # bounded number of times, then re-raised unchanged.
            print(f"Error in batch: {e}")
            if attempt >= max_retries:
                raise
            # Exponential backoff starting at 1 second, capped at 30 seconds.
            time.sleep(min(2 ** attempt, 30))
            attempt += 1
        else:
            return [data.embedding for data in response.data]

# Embed the "description" column slice by slice. Each slice holds at most
# EMBEDDING_BATCH_SIZE rows, which keeps every request within API limits.
print("Generating embeddings in batches...")
all_embeddings = []
for start in tqdm(range(0, len(df), EMBEDDING_BATCH_SIZE)):
    stop = start + EMBEDDING_BATCH_SIZE
    chunk = df.iloc[start:stop]['description'].tolist()
    all_embeddings += get_embeddings_batch(chunk)
    # Short pause between consecutive requests.
    time.sleep(0.1)

# Stack the embeddings into one 2-D array and compute the cosine
# similarity of every row against every other row in a single call.
# NOTE: the result has one row and one column per item, so memory use
# grows quadratically with the number of items.
print("Calculating similarities...")
embeddings_array = np.asarray(all_embeddings)
similarities = cosine_similarity(embeddings_array)

# Find the top 5 most similar items for each entry and store them in
# result_df as "title:score" strings in the columns similar_title_1 ..
# similar_title_5 (best match first). Requires a "title" column in df.
#
# The column names are built in one place so the name that is created is
# the name that is filled (the names previously disagreed, which raised a
# KeyError on the first row).
print("Finding top 5 similar items...")
TOP_K = 5
n_items = len(df)
# With fewer than TOP_K + 1 rows there are not enough other items to fill
# every column; the remaining cells stay as empty strings.
n_neighbors = min(TOP_K, max(n_items - 1, 0))
# Pull the titles out once instead of doing a DataFrame lookup per match.
titles = df['title'].to_numpy()
similar_columns = [[''] * n_items for _ in range(TOP_K)]

for i in tqdm(range(n_items)):
    # Mask the item's own score explicitly. Relying on it being the largest
    # value breaks when another row has an identical embedding or when
    # rounding pushes a neighbour's score above the self-similarity.
    row_scores = similarities[i].copy()
    row_scores[i] = -np.inf
    similar_indices = np.argsort(row_scores)[::-1][:n_neighbors]
    for k, j in enumerate(similar_indices):
        similar_columns[k][i] = f"{titles[j]}:{similarities[i][j]:.3f}"

# Assign whole columns at once rather than writing cell by cell.
result_df = df.copy()
for k, column_values in enumerate(similar_columns, start=1):
    result_df[f'similar_title_{k}'] = column_values


# Persist the embedding matrix so it can be reused without calling the
# API again (optional, but recommended for large datasets).
np.save(file='embeddings.npy', arr=embeddings_array)

# Write the input rows together with their similar-item columns,
# leaving out the DataFrame index.
result_df.to_csv(path_or_buf='results_with_similarity.csv', index=False)