In [1]:
import os
import json
import time
import httpx
import openai

import pandas as pd

from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI 
from glob import glob

In [None]:
# Local working directory that holds every input and output of this notebook.
base_dir = os.path.join(os.path.expanduser('~'), 'data','ofij')
# exist_ok=True removes the check-then-create race and keeps the cell idempotent.
os.makedirs(base_dir, exist_ok=True)

stmeta_file = os.path.join(base_dir,'stock_meta.feather')
# glob() returns matches in arbitrary order; sort them so that news_files[0]
# refers to the same file on every run and on every platform.
news_files = sorted(glob(os.path.join(base_dir, 'news*.feather')))



In [3]:
# Fail with an actionable message instead of a bare IndexError when no
# news file is present in the data directory.
if not news_files:
    raise FileNotFoundError(f"No 'news*.feather' files found in {base_dir}")
# Only the first matching file is loaded here.
dfnews = pd.read_feather(news_files[0])

In [4]:
# Preview the first rows of the loaded news frame.
dfnews.head()

Unnamed: 0,cntt_usiq_srno,news_ofer_entp_code,data_dt,data_tm,hts_pbnt_titl_cntt,news_lrdv_code,dorg,iscd1,kor_isnm1
0,2024032100100098155,U,20240321,1000,[기자의 눈] 벚꽃없는 벚꽃축제,39,서울경제,,
1,2024032100090896253,2,20240321,908,"외신들,""엔비디아 젠슨황이 AI계 스티브 잡스"" 평가",4,한국경제신문,,
2,2024032100053623852,2,20240321,536,베트남 GDP 3% 횡령한 부동산 재벌…사형 구형,9,한국경제신문,,
3,2024032100050072449,U,20240321,500,[사설] “정규직 과보호에 중장년 고용 불안”···노동 유연화 서둘러야,39,서울경제,,
4,2024032100050059151,U,20240321,500,"[사설] 의대별 정원 확정, 특위에서 필수?지역 의료 정상화에 머리 맞대라",39,서울경제,,


In [5]:
# (rows, columns) of the news frame at this point.
dfnews.shape

(342960, 9)

In [6]:
# Remove rows that are exact duplicates across all columns, then re-check the size.
# Note: this rebinds `dfnews`; surviving rows keep their original index labels
# (the index is not reset).
dfnews = dfnews.drop_duplicates()
dfnews.shape

(10401, 9)

In [8]:
# Load the stock metadata table and list its columns.
dfmeta = pd.read_feather(stmeta_file)
dfmeta.columns

Index(['한글명', '표준코드', '단축코드', '상장일자', '시장구분', '업종대분류', '업종중분류', '표준산업분류',
       '액면가', '시가총액(억 원)', '매출액(억 원)', '영업이익(억 원)', '당기순이익(억 원)', 'ROE(%)',
       '전일종가(원)', '신용가능', '증거금비율(%)', 'KRX바이오', '관리종목', '거래정지', '불성실공시',
       '이상급등'],
      dtype='object')

In [35]:
# One "column: value; column: value; ..." description string per row of dfmeta,
# built from the name and the three classification columns.
metarecs = [
    '; '.join(
        f'{field}: {stock[field]}'
        for field in ('한글명', '업종대분류', '업종중분류', '표준산업분류')
    )
    for _, stock in dfmeta.iterrows()
]


In [36]:
# Spot-check one generated description string.
metarecs[2]

'한글명: 경방; 업종대분류: 시가총액규모중; 업종중분류: 섬유,의복; 표준산업분류: 종합 소매업'

In [21]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# TLS certificate verification stays ON unless it is explicitly disabled.
# The previous unconditional verify=False sent the API key over connections
# whose server identity was never checked. If a TLS-intercepting proxy makes
# verification impossible, opt out deliberately with OFIJ_INSECURE_TLS=1
# (it may be placed in the .env file loaded above).
insecure_tls = os.environ.get('OFIJ_INSECURE_TLS', '').strip().lower() in {'1', 'true', 'yes'}
httpx_client = httpx.Client(verify=not insecure_tls)

# Raises KeyError right here if the key is missing from the environment.
openai.api_key  = os.environ['OPENAI_API_KEY']
# Pass the key to the client instance explicitly rather than relying on
# implicit environment lookup.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'], http_client=httpx_client)

### Stock meta sentence generation

In [37]:
# System prompt sent with every chat-completion request built below:
# asks for a single sentence describing the stock from its metadata.
system_message = """
Using the provided stock metadata, generate a concise and informative sentence describing the stock.
Your response must be a single sentence and must accurately reflect the given metadata.
"""

In [38]:
# One chat-completion request per metadata string; custom_id is the string's
# position in metarecs so responses can be matched back to their input.
texts_jsonl = [
    {
        'custom_id': str(pos),
        'method': 'POST',
        'url': '/v1/chat/completions',
        'body': {
            'model': 'gpt-4o-mini',
            'messages': [
                {'role': 'system', 'content': system_message},
                {'role': 'user', 'content': description},
            ],
            'max_tokens': 1000,
        },
    }
    for pos, description in enumerate(metarecs)
]

In [39]:
# Number of batch requests prepared.
len(texts_jsonl)

932

In [40]:
# Persist the prepared requests as JSON Lines: one request object per line,
# with non-ASCII characters written as-is.
text_jsonfile = os.path.join(base_dir,'stock_meta.jsonl')
with open(text_jsonfile, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(request, ensure_ascii=False) + '\n'
        for request in texts_jsonl
    )

In [41]:
# Upload the request file for batch processing. The context manager closes
# the file handle once the upload call returns; the original left it open.
with open(text_jsonfile, "rb") as request_fh:
    batch_input_file = client.files.create(
        file=request_fh,
        purpose="batch"
    )
batch_input_file_id = batch_input_file.id

In [42]:
# Destination path the download cell uses when it writes the batch output.
output_filepath = os.path.join(base_dir, 'stock_meta_response.jsonl')

In [43]:
# Inspect one prepared request payload.
texts_jsonl[2]

{'custom_id': '2',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-4o-mini',
  'messages': [{'role': 'system',
    'content': '\nUsing the provided stock metadata, generate a concise and informative sentence describing the stock.\nYour response must be a single sentence and must accurately reflect the given metadata.\n'},
   {'role': 'user',
    'content': '한글명: 경방; 업종대분류: 시가총액규모중; 업종중분류: 섬유,의복; 표준산업분류: 종합 소매업'}],
  'max_tokens': 1000}}

In [44]:
# Create the chat-completions batch job for the uploaded stock-meta requests.
# The description and the printed label now name this job correctly; they
# previously carried text copied from an unrelated classification/embedding run.
batch_embedding_obj = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "stock meta sentence generation"
    }
)

# batch_id is what the polling/download cell uses to track this job.
batch_id = batch_embedding_obj.id
print(f"Stock meta Batch ID: {batch_embedding_obj.id}")

classification Batch ID: batch_67fa486e837081908513665ade19efec


### News embedding generation

In [56]:
# One embedding request per news row; custom_id is the row's index label and
# the text to embed is the headline column.
texts_jsonl = [
    {
        'custom_id': str(idx),
        'method': 'POST',
        'url': '/v1/embeddings',
        'body': {
            'input': news_row['hts_pbnt_titl_cntt'],
            'model': 'text-embedding-3-small',
            'encoding_format': 'float',
        },
    }
    for idx, news_row in dfnews.iterrows()
]

In [57]:
# Inspect the first few prepared embedding requests.
texts_jsonl[:5]

[{'custom_id': '0',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'input': '[기자의 눈] 벚꽃없는 벚꽃축제',
   'model': 'text-embedding-3-small',
   'encoding_format': 'float'}},
 {'custom_id': '1',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'input': '외신들,"엔비디아 젠슨황이 AI계 스티브 잡스" 평가',
   'model': 'text-embedding-3-small',
   'encoding_format': 'float'}},
 {'custom_id': '2',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'input': '베트남 GDP 3% 횡령한 부동산 재벌…사형 구형',
   'model': 'text-embedding-3-small',
   'encoding_format': 'float'}},
 {'custom_id': '3',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'input': '[사설] “정규직 과보호에 중장년 고용 불안”···노동 유연화 서둘러야',
   'model': 'text-embedding-3-small',
   'encoding_format': 'float'}},
 {'custom_id': '4',
  'method': 'POST',
  'url': '/v1/embeddings',
  'body': {'input': '[사설] 의대별 정원 확정, 특위에서 필수?지역 의료 정상화에 머리 맞대라',
   'model': 'text-embedding-3-small',
   'encoding_format': 'float'}}]

In [None]:
# Write the embedding requests to disk as JSON Lines (one request per line,
# non-ASCII characters kept verbatim).
text_jsonfile = os.path.join(base_dir,'news_texts.jsonl')
with open(text_jsonfile, 'w', encoding='utf-8') as f:
    f.writelines(
        f'{json.dumps(request, ensure_ascii=False)}\n'
        for request in texts_jsonl
    )

In [None]:
# Upload the embedding request file for batch processing. The context manager
# closes the file handle once the upload call returns; the original left it open.
with open(text_jsonfile, "rb") as request_fh:
    batch_input_file = client.files.create(
        file=request_fh,
        purpose="batch"
    )
batch_input_file_id = batch_input_file.id

In [None]:
# Destination for the news-embedding batch output. This rebinds
# `output_filepath`, which the stock-meta section also assigns; the download
# cell writes to whichever value is current when it runs.
output_filepath = os.path.join(base_dir, 'news_embedding_response.jsonl')

In [None]:
# Submit the embeddings batch job for the uploaded request file and keep its
# ID so the polling/download cell can track it.
batch_embedding_obj = client.batches.create(
    endpoint="/v1/embeddings",
    input_file_id=batch_input_file_id,
    completion_window="24h",
    metadata={"description": "Embedding batch run"},
)
batch_id = batch_embedding_obj.id
print(f"Embedding Batch ID: {batch_id}")

### Download the completed batch output file

In [45]:
# Confirm which file the download below will write to.
output_filepath

'C:\\Users\\by003457\\data\\ofij\\stock_meta_response.jsonl'

In [47]:
POLL_INTERVAL = 60  # seconds to wait between status checks

def timestamp():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Poll the batch identified by `batch_id` until it reaches a terminal status.
# On success the output file is downloaded to `output_filepath`.
while True:
    batch_status = client.batches.retrieve(batch_id)
    current_status = batch_status.status
    print(f"[{timestamp()}] Batch status: {current_status}")

    if current_status == "completed":
        print(f"[{timestamp()}] ✅ Batch completed successfully!")

        output_file_id = batch_status.output_file_id
        if output_file_id is None:
            # The output file ID is optional on the batch object; without it
            # there is nothing to download, so report and stop instead of
            # passing None to the Files API.
            print(
                f"[{timestamp()}] ❌ Batch completed but produced no output file "
                f"(error file id: {batch_status.error_file_id})."
            )
            break

        output_file = client.files.retrieve(output_file_id)

        result = client.files.content(output_file_id)

        with open(output_filepath, "wb") as f:
            f.write(result.content)
        # Generic wording: this cell serves both the chat-completion and the
        # embedding batches, depending on which batch_id is current.
        print(f"[{timestamp()}] ✅ Batch results saved at: {output_filepath}")
        break

    elif current_status in {"failed", "cancelled", "expired"}:
        print(f"[{timestamp()}] ❌ Batch terminated with status: {current_status}. Please check logs on OpenAI's dashboard.")
        break

    time.sleep(POLL_INTERVAL)

print(f"[{timestamp()}] 🎉 Monitoring script finished.")

[2025-04-12 13:03:26] Batch status: in_progress
[2025-04-12 13:04:27] Batch status: in_progress
[2025-04-12 13:05:27] Batch status: in_progress
[2025-04-12 13:06:27] Batch status: in_progress
[2025-04-12 13:07:27] Batch status: in_progress
[2025-04-12 13:08:28] Batch status: in_progress
[2025-04-12 13:09:28] Batch status: in_progress
[2025-04-12 13:10:28] Batch status: in_progress
[2025-04-12 13:11:28] Batch status: in_progress
[2025-04-12 13:12:28] Batch status: in_progress
[2025-04-12 13:13:29] Batch status: in_progress
[2025-04-12 13:14:29] Batch status: in_progress
[2025-04-12 13:15:29] Batch status: in_progress
[2025-04-12 13:16:29] Batch status: in_progress
[2025-04-12 13:17:30] Batch status: in_progress
[2025-04-12 13:18:30] Batch status: in_progress
[2025-04-12 13:19:30] Batch status: in_progress
[2025-04-12 13:20:30] Batch status: in_progress
[2025-04-12 13:21:31] Batch status: in_progress
[2025-04-12 13:22:31] Batch status: in_progress
[2025-04-12 13:23:31] Batch status: in_p

In [48]:
# Output file ID recorded by the polling cell (only assigned when the batch completed).
output_file_id

'file-D5iNmWiRWZmzUhkGFrSxs3'

In [49]:
# Parse the downloaded JSON Lines file: each line becomes one response object.
resobj = []
with open(output_filepath, 'r', encoding='utf-8') as f:
    resobj.extend(map(json.loads, f))

In [50]:
# custom_id of the first parsed response, for matching it back to its request.
resobj[0]['custom_id']

'0'

In [52]:
# Inspect the response body of the first parsed result.
resobj[0]['response']['body'] #['data'][0]

{'id': 'chatcmpl-BLSwhtgUoIsOn1qyxRzHt887DgpxD',
 'object': 'chat.completion',
 'created': 1744455855,
 'model': 'gpt-4o-mini-2024-07-18',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': '동화약품은 중규모 시가총액을 가진 의약품 제조업체로, 의약품 분야에 속하는 기업입니다.',
    'refusal': None,
    'annotations': []},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 88,
  'completion_tokens': 33,
  'total_tokens': 121,
  'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0},
  'completion_tokens_details': {'reasoning_tokens': 0,
   'audio_tokens': 0,
   'accepted_prediction_tokens': 0,
   'rejected_prediction_tokens': 0}},
 'service_tier': 'default',
 'system_fingerprint': 'fp_44added55e'}

In [None]:
# Release the API client's resources once the notebook is done with it.
client.close()