In [None]:
import pandas as pd
import json
from dotenv import load_dotenv
import os
from openai import OpenAI
import tiktoken
from pyprojroot import here

# Load variables from a local .env file (python-dotenv) so later cells can read them from the environment
load_dotenv()

True

In [26]:
# load_dotenv() in the import cell has already had the chance to populate the
# environment, so copying the variable onto itself is unnecessary. What matters
# is failing early with a readable message when the key is missing: assigning
# the None returned by os.getenv() to os.environ would raise an opaque
# "TypeError: str expected, not NoneType".
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [10]:
# No api_key argument is passed: the client reads OPENAI_API_KEY from the environment
client = OpenAI()

In [11]:
# Load the IMDb movie dataset; here() (pyprojroot) builds the path to the CSV
df = pd.read_csv(here('data/imdb_movies_100k.csv'))

In [12]:
def format_movie_data(movie):
    """Render one movie record as the multi-line text that gets embedded.

    Args:
        movie: Mapping-like record (e.g. a DataFrame row) providing the keys
            'primaryTitle', 'startYear', 'titleType', 'runtimeMinutes',
            'genres', 'averageRating', 'numVotes' and 'cast_info'.

    Returns:
        A string that starts with a newline, lists the movie's fields one per
        line, and ends with a newline followed by four spaces.
    """
    title_line = f"Title: {movie['primaryTitle']} ({movie['startYear']})"
    type_line = f"Type: {movie['titleType']}"
    runtime_line = f"Runtime: {movie['runtimeMinutes']} minutes"
    genres_line = f"Genres: {movie['genres']}"
    rating_line = f"IMDb Rating: {movie['averageRating']} (Votes: {movie['numVotes']})"
    cast_block = f"{movie['cast_info']}"

    # The empty first element yields the leading newline; the final element is
    # the four-space tail that follows the last newline.
    pieces = [
        "",
        title_line,
        type_line,
        runtime_line,
        genres_line,
        rating_line,
        "",
        "Cast & Crew:",
        cast_block,
        "    ",
    ]
    return "\n".join(pieces)

#### Transforming the data into the correct format for the OpenAI batch embedding endpoint

In [13]:
# Build one request entry per DataFrame row: each entry is a POST to the
# embeddings endpoint whose input is the formatted movie text, tagged with a
# custom_id derived from the row's index label.
batch_data = [
    {
        "custom_id": f"movie-{row_label}",
        "method": "POST",
        "url": "/v1/embeddings",
        "body": {
            "model": "text-embedding-3-small",
            "input": format_movie_data(movie_row),
        },
    }
    for row_label, movie_row in df.iterrows()
]

In [14]:
# Sanity check: number of embedding requests that were built
len(batch_data)

99935

In [15]:
def count_tokens(text, model="text-embedding-3-small"):
    """Return how many tokens `text` encodes to.

    Args:
        text: The string to tokenize.
        model: Model name used to look up the tiktoken encoding.

    Returns:
        The length of the token sequence produced by encoding `text`.
    """
    tokenizer = tiktoken.encoding_for_model(model)
    token_ids = tokenizer.encode(text)
    return len(token_ids)

### Batch embedding limits
The batch embedding endpoint only allows 3 million tokens to be queued for embedding at any one time. Therefore, we cannot upload all the batches at once.

We write each batch to its own file so that they are all ready for upload.

In [13]:
n_batches = 15  # ~41 million tokens / 15 batches is about 2.7 million tokens per batch on average, under the 3 million limit

In [None]:
# Split batch_data into n_batches consecutive slices and write each slice to
# its own JSONL file (one JSON request object per line), ready for upload.
n_rows = len(batch_data)

# Ceiling division, done in integer arithmetic: slice bounds must be ints
# (true division with "/" yields a float and makes the slice below raise
# TypeError), and rounding up guarantees n_batches slices cover every row
# without over-sizing the batches.
# NOTE(review): equal row counts only keep each batch under the token limit
# if rows are of similar length - confirm with count_tokens if in doubt.
batch_size = -(-n_rows // n_batches)

# Make sure the target directory exists so a fresh checkout can run this cell.
os.makedirs(here("data/movie_batches"), exist_ok=True)

for i in range(n_batches):
    start_index = i * batch_size
    end_index = min(start_index + batch_size, n_rows)

    with open(here(f"data/movie_batches/movie_batch_{i+1}.jsonl"), "w") as f:
        for entry in batch_data[start_index:end_index]:
            f.write(json.dumps(entry) + "\n")