In [101]:
import pandas as pd
import numpy as np
import math
import os
from tqdm.notebook import tqdm

In [2]:
# Load the training set from the working directory; the first CSV column
# (index_col=0) becomes the DataFrame index.
df = pd.read_csv('train.csv', index_col=0)

In [98]:
from utils import entry_to_desc
from openai import OpenAI
# Shared OpenAI API client. The key is read from the OPENAI_KEY environment
# variable; if it is unset, the lookup raises KeyError right here.
client = OpenAI(api_key=os.environ['OPENAI_KEY'])

def df_to_desc(df_):
    """Build a description for every row of ``df_``.

    Each row is looked up by its index label and passed to ``entry_to_desc``
    (imported from ``utils``; presumably it renders the row as text -- verify
    against that module).

    Args:
        df_: DataFrame whose rows should be described.

    Returns:
        A list of ``(index_label, description)`` tuples, in index order.
    """
    # `tqdm` is the name imported via `from tqdm.notebook import tqdm`;
    # there is no `notebook` name in scope, so `notebook.tqdm` was a NameError.
    return [(i, entry_to_desc(df_.loc[i])) for i in tqdm(df_.index)]

In [108]:
# Model used for every embedding request, and the length of the vectors the
# code expects back from it (one extra column is reserved for the row label).
embedding_model = "text-embedding-ada-002"
embedding_length = 1536

def df_to_embeddings(df_, output):
    """Embed every row of ``df_`` and cache the result as a ``.npy`` file.

    Each row is converted to a description with ``entry_to_desc`` and sent to
    the embeddings endpoint one request at a time. The saved array has shape
    ``(len(df_), embedding_length + 1)``: column 0 holds the row's index label
    and the remaining columns hold the embedding. Because the array is a float
    array, index labels must be numeric.

    Args:
        df_: DataFrame chunk to embed.
        output: Path of the ``.npy`` file to write. As with ``np.save``, a
            ``.npy`` suffix is appended when the path does not already end
            with it.

    Returns:
        The embeddings array that was just computed, or an empty list when
        the output file already exists (the cached file is *not* loaded).
    """
    # np.save appends ".npy" to names lacking it, so test for the file that
    # will really be written; otherwise the cache check can never succeed.
    target = os.fspath(output)
    if not target.endswith('.npy'):
        target += '.npy'
    if os.path.isfile(target):
        return []

    # Create the destination directory up front so a missing folder fails (or
    # is fixed) before any API calls are spent.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    n_rows = len(df_)
    embeddings = np.empty((n_rows, embedding_length + 1))
    for row, i in enumerate(df_.index):
        entry = df_.loc[i]
        desc = entry_to_desc(entry)
        response = client.embeddings.create(input=desc, model=embedding_model)
        embeddings[row, 0] = i
        embeddings[row, 1:] = np.array(response.data[0].embedding)

    # Write to a temporary file and rename it into place: an interrupted run
    # must not leave a truncated file that later passes the cache check.
    tmp_path = f'{target}.{os.getpid()}.tmp'
    try:
        with open(tmp_path, 'wb') as fh:
            np.save(fh, embeddings)
        os.replace(tmp_path, target)
    finally:
        # Only present if the save or rename failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f'Processed {output}')
    return embeddings

In [109]:
import multiprocess as mp

# Split `df` into fixed-size row chunks and embed each chunk in a worker
# process; every chunk is cached in its own .npy file by df_to_embeddings.
per_task = 100
# Derive the chunk count from the data instead of hard-coding it, so no rows
# are silently dropped and no empty chunk files are produced.
total_tasks = math.ceil(len(df) / per_task)
# np.save does not create parent directories.
os.makedirs('training_embeddings', exist_ok=True)
args_list = [
    (df.iloc[j*per_task:(j+1)*per_task], f'training_embeddings/training_embeddings_{j}.npy')
    for j in range(total_tasks)
]

with mp.Pool(20) as pool:
    result = pool.starmap_async(df_to_embeddings, args_list)
    # Block until every chunk has finished; `emb` holds one entry per chunk
    # (an array for freshly computed chunks, [] for chunks already on disk).
    emb = result.get()