In [1]:
import os
import json
import time
import httpx
import openai

import pandas as pd

from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI 
from glob import glob

In [4]:
# Working directory for every input/output file used by this notebook.
base_dir = os.path.join(os.path.expanduser('~'), 'data', 'ofij')
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(base_dir, exist_ok=True)

stmeta_file = os.path.join(base_dir, 'stock_meta.feather')
# glob() returns matches in arbitrary order; sort so that news_files[0] is the
# same file on every run and on every machine.
news_files = sorted(glob(os.path.join(base_dir, 'news*.feather')))
output_filepath = os.path.join(base_dir, "batch_embedding_output.jsonl")


In [None]:
# Load the first news file matched by the glob in the paths cell.
# Fail with a clear message instead of a bare IndexError when nothing matched.
if not news_files:
    raise FileNotFoundError(f"No news*.feather files found in {base_dir}")
dfnews = pd.read_feather(news_files[0])

In [None]:
# Preview the first rows of the loaded news frame.
dfnews.head()

In [None]:
# (rows, columns) of the frame as loaded, before de-duplication.
dfnews.shape

In [None]:
# Keep only the first occurrence of each fully identical row, then show the
# resulting (rows, columns).
is_repeat = dfnews.duplicated()
dfnews = dfnews.loc[~is_repeat]
dfnews.shape

In [None]:
# Build one Batch-API request record per news row.
# custom_id carries the DataFrame index label so each returned embedding can be
# matched back to its source row.
text_column = 'hts_pbnt_titl_cntt'
embedding_model = 'text-embedding-3-small'

texts_jsonl = []
skipped_ids = []
# Iterate the single text column rather than iterrows(): iterrows() builds a
# full Series for every row just to read one field.
for i, title in dfnews[text_column].items():
    # Missing values (None/NaN) would be serialised as the non-JSON token NaN,
    # and the embeddings endpoint rejects empty input, so skip such rows and
    # report them instead of submitting requests that are bound to fail.
    if not isinstance(title, str) or not title.strip():
        skipped_ids.append(str(i))
        continue
    texts_jsonl.append({
        'custom_id': str(i),
        'method': 'POST',
        'url': '/v1/embeddings',
        'body': {
            'input': title,
            'model': embedding_model,
            'encoding_format': 'float'
        }
    })

if skipped_ids:
    print(f"Skipped {len(skipped_ids)} rows with missing/blank text (first ids: {skipped_ids[:10]})")

In [None]:
# Inspect the first five request records.
texts_jsonl[:5]

In [None]:
# Persist the request records as JSON Lines: one JSON object per line,
# written as UTF-8 with non-ASCII characters kept unescaped.
text_jsonfile = os.path.join(base_dir, 'news_texts.jsonl')
with open(text_jsonfile, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + '\n'
        for record in texts_jsonl
    )

In [None]:
# Load variables from a .env file (if present) into the process environment;
# a later cell reads OPENAI_API_KEY from os.environ.
load_dotenv()

In [None]:
# TLS certificate verification stays ON by default: the client sends the API key
# as a bearer token, so an unverified connection would expose it to anyone able
# to intercept traffic. Set OFIJ_INSECURE_SSL=1 only when running behind a
# TLS-intercepting proxy you trust and cannot configure a CA bundle for.
insecure_ssl = os.environ.get('OFIJ_INSECURE_SSL', '').strip().lower() in {'1', 'true', 'yes'}
if insecure_ssl:
    print("WARNING: TLS certificate verification is disabled (OFIJ_INSECURE_SSL is set).")
httpx_client = httpx.Client(verify=not insecure_ssl)  # , timeout=60)

# Raises KeyError early if the key is not configured.
api_key = os.environ['OPENAI_API_KEY']
openai.api_key = api_key  # kept for any code relying on the module-level default
# Pass the key explicitly so the client does not depend on implicit env lookup.
client = OpenAI(api_key=api_key, http_client=httpx_client)

In [None]:
# Upload the JSONL request file for use with the Batch API.
# The with-block guarantees the file handle is closed even if the upload raises;
# the original left the handle open for the lifetime of the kernel.
with open(text_jsonfile, "rb") as request_file:
    batch_input_file = client.files.create(
        file=request_file,
        purpose="batch"
    )
batch_input_file_id = batch_input_file.id

In [None]:
# Show the id of the uploaded input file.
batch_input_file_id

In [None]:
# Submit the uploaded request file as an embeddings batch job
# with a 24-hour completion window.
batch_metadata = {"description": "Embedding batch run"}
batch_embedding_obj = client.batches.create(
    metadata=batch_metadata,
    completion_window="24h",
    endpoint="/v1/embeddings",
    input_file_id=batch_input_file_id,
)

# Report the job id so the batch can be looked up later.
print(f"Embedding Batch ID: {batch_embedding_obj.id}")

In [None]:
# Poll the batch job until it reaches a terminal state; on success download the
# output file to output_filepath.
batch_id = batch_embedding_obj.id

POLL_INTERVAL = 60  # seconds to wait between status checks

def timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

while True:
    batch_status = client.batches.retrieve(batch_id)
    current_status = batch_status.status
    print(f"[{timestamp()}] Batch status: {current_status}")

    if current_status == "completed":
        output_file_id = batch_status.output_file_id
        error_file_id = batch_status.error_file_id

        # Individual requests can fail inside a "completed" batch; point at the
        # error file so failures are not silently ignored.
        if error_file_id:
            print(f"[{timestamp()}] ⚠️ Some requests failed; see error file: {error_file_id}")

        # output_file_id is optional on the batch object; without this guard the
        # download below would be called with None and crash.
        if not output_file_id:
            print(f"[{timestamp()}] ❌ Batch completed but produced no output file.")
            break

        print(f"[{timestamp()}] ✅ Batch completed successfully!")

        # Download the result payload directly; the extra files.retrieve()
        # metadata call in the original was never used.
        result = client.files.content(output_file_id)

        with open(output_filepath, "wb") as f:
            f.write(result.content)
        print(f"[{timestamp()}] ✅ Embedding results saved at: {output_filepath}")
        break

    elif current_status in {"failed", "cancelled", "expired"}:
        print(f"[{timestamp()}] ❌ Batch terminated with status: {current_status}. Please check logs on OpenAI's dashboard.")
        break

    time.sleep(POLL_INTERVAL)

print(f"[{timestamp()}] 🎉 Monitoring script finished.")

In [7]:
# Parse the downloaded batch output: each line of the file is one JSON record.
resobj = []
with open(output_filepath, 'r', encoding='utf-8') as f:
    resobj.extend(json.loads(raw_line) for raw_line in f)

In [15]:
# custom_id of the first result record.
resobj[0]['custom_id']

'0'

In [13]:
# Embedding vector stored in the first result record.
resobj[0]['response']['body']['data'][0]['embedding']

[0.02361799,
 0.007665091,
 -0.009329446,
 -0.016892638,
 0.006566843,
 -0.056112543,
 0.045809392,
 0.007432987,
 -0.032902148,
 0.015703812,
 -0.030524498,
 0.030411277,
 0.019406153,
 -0.01156557,
 0.04184664,
 -0.020515723,
 -0.077262305,
 -0.003560814,
 0.028554445,
 0.041914575,
 -0.024002943,
 -0.03016219,
 0.0033938123,
 0.025497466,
 0.031181183,
 0.014752753,
 -0.010959835,
 0.030773586,
 0.06671007,
 0.040080387,
 -0.02794305,
 -0.03650259,
 0.057924084,
 -0.05937332,
 0.024818137,
 0.050723203,
 0.022961307,
 -0.021455461,
 -0.043024145,
 -0.0077783125,
 -0.013439384,
 0.009448329,
 0.030184833,
 0.012963854,
 0.060414955,
 0.049319256,
 -0.034894846,
 0.030705653,
 0.04633021,
 0.053395227,
 -0.021885702,
 0.05008916,
 0.02272354,
 0.02361799,
 0.00074443093,
 -0.037883893,
 -0.037544228,
 -0.043409098,
 -0.03571004,
 -0.025021937,
 0.056067254,
 0.007353732,
 -0.0072178664,
 0.02171587,
 -0.028146848,
 -0.016711483,
 -0.022689575,
 0.032494552,
 -0.008446319,
 0.051538397

In [None]:
# Close the OpenAI client created earlier in the notebook.
client.close()