In [9]:
import os
import json
import pandas as pd

from glob import glob

In [2]:
# Root directory for every file this notebook reads: <home>/data/ofij
base_dir = os.path.join(os.path.expanduser("~"), "data", "ofij")
# exist_ok=True creates the directory (and missing parents) when absent and is
# a no-op when it already exists, so the cell is safe to re-run and has no
# check-then-create race.
os.makedirs(base_dir, exist_ok=True)


In [8]:
# Input locations, all resolved relative to base_dir.
stmeta_file = os.path.join(base_dir, 'stock_meta.feather')
# glob() yields matches in arbitrary, filesystem-dependent order; sorting makes
# news_files[0] refer to the same file on every run and every machine.
news_files = sorted(glob(os.path.join(base_dir, 'news*.feather')))
newsembed_file = os.path.join(base_dir, 'batch_embedding_output.jsonl')

In [10]:
# Load the JSON Lines file: one JSON document per line, parsed into a list.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default, and blank lines (e.g. a trailing newline at end of file) are
# skipped instead of crashing json.loads.
with open(newsembed_file, 'r', encoding='utf-8') as f:
    newsembed = [json.loads(line) for line in f if line.strip()]
# Number of records loaded
len(newsembed)

10401

In [4]:
# Read the Feather file at stmeta_file ('stock_meta.feather' under base_dir)
# into a DataFrame.
dfmeta = pd.read_feather(stmeta_file)

In [26]:
# Read only the first path in news_files; any further matches of the
# 'news*.feather' pattern are not loaded here.
# NOTE(review): raises IndexError when the glob matched no file.
dfnews = pd.read_feather(news_files[0])

In [27]:
# Remove fully duplicated rows, then reset the index with drop=False so the
# previous row labels are kept as a regular column (read as 'index' below).
# NOTE(review): dfnews is rebound from itself, so re-running this cell without
# reloading dfnews first presumably does not reproduce the same frame — confirm.
dfnews = dfnews.drop_duplicates().reset_index(drop=False)


In [29]:
# Cast the 'index' column to str so it can be matched against string keys
# (the embedding records carry 'custom_id' as a string such as '0').
dfnews['index'] = dfnews['index'].astype(str)
# Preview the first rows
dfnews.head()

Unnamed: 0,index,cntt_usiq_srno,news_ofer_entp_code,data_dt,data_tm,hts_pbnt_titl_cntt,news_lrdv_code,dorg,iscd1,kor_isnm1
0,0,2024032100100098155,U,20240321,1000,[기자의 눈] 벚꽃없는 벚꽃축제,39,서울경제,,
1,1,2024032100090896253,2,20240321,908,"외신들,""엔비디아 젠슨황이 AI계 스티브 잡스"" 평가",4,한국경제신문,,
2,2,2024032100053623852,2,20240321,536,베트남 GDP 3% 횡령한 부동산 재벌…사형 구형,9,한국경제신문,,
3,3,2024032100050072449,U,20240321,500,[사설] “정규직 과보호에 중장년 고용 불안”···노동 유연화 서둘러야,39,서울경제,,
4,4,2024032100050059151,U,20240321,500,"[사설] 의대별 정원 확정, 특위에서 필수?지역 의료 정상화에 머리 맞대라",39,서울경제,,


In [7]:
# Preview the first rows of the stock metadata frame.
dfmeta.head()

Unnamed: 0,한글명,표준코드,단축코드,상장일자,시장구분,업종대분류,업종중분류,표준산업분류,액면가,시가총액(억 원),...,당기순이익(억 원),ROE(%),전일종가(원),신용가능,증거금비율(%),KRX바이오,관리종목,거래정지,불성실공시,이상급등
0,동화약품,KR7000020008,20,1976-03-24,KOSPI,시가총액규모중,의약품,의약품 제조업,1000,1678,...,21,1.49,6010,True,60,True,False,False,False,False
1,KR모터스,KR7000040006,40,1976-05-25,KOSPI,시가총액규모소,운수장비,그외 기타 운송장비 제조업,500,217,...,-142,-42.76,362,False,100,False,False,False,False,False
2,경방,KR7000050005,50,1956-03-03,KOSPI,시가총액규모중,"섬유,의복",종합 소매업,500,1727,...,236,3.15,6300,True,60,False,False,False,False,False
3,삼양홀딩스,KR7000070003,70,1968-12-27,KOSPI,시가총액규모중,음식료품,기타 금융업,5000,4830,...,895,1.38,56400,True,60,False,False,False,False,False
4,삼양홀딩스우,KR7000071001,75,1992-02-21,Other,시가총액규모중,음식료품,기타 금융업,5000,171,...,0,0.0,56300,False,100,False,False,False,False,False


In [14]:
# Inspect the first parsed record to see how a batch response is nested.
# NOTE(review): this displays the record's entire embedding list, which makes
# for a very long output.
newsembed[0]

{'id': 'batch_req_67f5155c7e4c81908329a224ed4c2752',
 'custom_id': '0',
 'response': {'status_code': 200,
  'request_id': '46b72c4d93cc9309f9231099adb72908',
  'body': {'object': 'list',
   'data': [{'object': 'embedding',
     'index': 0,
     'embedding': [0.02361799,
      0.007665091,
      -0.009329446,
      -0.016892638,
      0.006566843,
      -0.056112543,
      0.045809392,
      0.007432987,
      -0.032902148,
      0.015703812,
      -0.030524498,
      0.030411277,
      0.019406153,
      -0.01156557,
      0.04184664,
      -0.020515723,
      -0.077262305,
      -0.003560814,
      0.028554445,
      0.041914575,
      -0.024002943,
      -0.03016219,
      0.0033938123,
      0.025497466,
      0.031181183,
      0.014752753,
      -0.010959835,
      0.030773586,
      0.06671007,
      0.040080387,
      -0.02794305,
      -0.03650259,
      0.057924084,
      -0.05937332,
      0.024818137,
      0.050723203,
      0.022961307,
      -0.021455461,
      -0.0430241

In [21]:
# Collect one (custom_id, embedding) tuple per batch record; the vector sits at
# response -> body -> data[0] -> embedding inside each record.
dfemb = [
    (rec['custom_id'], rec['response']['body']['data'][0]['embedding'])
    for rec in newsembed
]

In [32]:
# Turn the list of (custom_id, embedding) tuples into a two-column DataFrame.
# NOTE(review): the name dfemb is rebound from a list to a DataFrame here, so
# the cell that builds the list must be run again before re-running this one.
dfemb = pd.DataFrame(dfemb, columns=['custom_id', 'embedding'])

In [33]:
# Attach an embedding to each news row: inner join where dfnews['index'] equals
# the embedding frame's custom_id, keeping only rows present on both sides.
dfemb = dfnews.merge(
    dfemb,
    how='inner',
    left_on='index',
    right_on='custom_id',
)
# Preview the joined frame
dfemb.head()

Unnamed: 0,index,cntt_usiq_srno,news_ofer_entp_code,data_dt,data_tm,hts_pbnt_titl_cntt,news_lrdv_code,dorg,iscd1,kor_isnm1,custom_id,embedding
0,0,2024032100100098155,U,20240321,1000,[기자의 눈] 벚꽃없는 벚꽃축제,39,서울경제,,,0,"[0.02361799, 0.007665091, -0.009329446, -0.016..."
1,1,2024032100090896253,2,20240321,908,"외신들,""엔비디아 젠슨황이 AI계 스티브 잡스"" 평가",4,한국경제신문,,,1,"[0.01607047, -0.0054703546, 0.0006299249, 0.01..."
2,2,2024032100053623852,2,20240321,536,베트남 GDP 3% 횡령한 부동산 재벌…사형 구형,9,한국경제신문,,,2,"[0.029173603, 0.020256389, 0.060813203, 0.0384..."
3,3,2024032100050072449,U,20240321,500,[사설] “정규직 과보호에 중장년 고용 불안”···노동 유연화 서둘러야,39,서울경제,,,3,"[0.033023026, 0.077666745, 0.0033049677, 0.037..."
4,4,2024032100050059151,U,20240321,500,"[사설] 의대별 정원 확정, 특위에서 필수?지역 의료 정상화에 머리 맞대라",39,서울경제,,,4,"[-0.022171568, 0.0067736073, 0.014170361, 0.04..."


In [34]:
# Remove the join key copy (custom_id) and the iscd1 / kor_isnm1 columns.
# The result is rebound instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell no longer
# raises KeyError.
dfemb = dfemb.drop(columns=['custom_id', 'iscd1', 'kor_isnm1'], errors='ignore')

In [35]:
# Preview the frame after the column drop.
dfemb.head()

Unnamed: 0,index,cntt_usiq_srno,news_ofer_entp_code,data_dt,data_tm,hts_pbnt_titl_cntt,news_lrdv_code,dorg,embedding
0,0,2024032100100098155,U,20240321,1000,[기자의 눈] 벚꽃없는 벚꽃축제,39,서울경제,"[0.02361799, 0.007665091, -0.009329446, -0.016..."
1,1,2024032100090896253,2,20240321,908,"외신들,""엔비디아 젠슨황이 AI계 스티브 잡스"" 평가",4,한국경제신문,"[0.01607047, -0.0054703546, 0.0006299249, 0.01..."
2,2,2024032100053623852,2,20240321,536,베트남 GDP 3% 횡령한 부동산 재벌…사형 구형,9,한국경제신문,"[0.029173603, 0.020256389, 0.060813203, 0.0384..."
3,3,2024032100050072449,U,20240321,500,[사설] “정규직 과보호에 중장년 고용 불안”···노동 유연화 서둘러야,39,서울경제,"[0.033023026, 0.077666745, 0.0033049677, 0.037..."
4,4,2024032100050059151,U,20240321,500,"[사설] 의대별 정원 확정, 특위에서 필수?지역 의료 정상화에 머리 맞대라",39,서울경제,"[-0.022171568, 0.0067736073, 0.014170361, 0.04..."


In [37]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
# Parallel sequences taken from dfemb in row order: texts[i] is the headline
# string and embeddings[i] the vector of the same row.
texts = dfemb['hts_pbnt_titl_cntt'].tolist()
embeddings = np.array(dfemb['embedding'].tolist())

In [40]:
# Pairwise cosine similarity between all rows of embeddings; entry [i, j]
# compares row i with row j, giving a square matrix.
similarity_mtx = cosine_similarity(embeddings)

In [None]:
# Show only the matrix dimensions (rows, columns) rather than the array itself.
similarity_mtx.shape

(10401, 10401)

In [62]:
# Rank all rows by similarity to row 0: argsort orders ascending, and the
# [::-1] reversal lists row positions from most to least similar.
similarity_mtx[0].argsort()[::-1]

array([   0, 3416, 7422, ..., 1878, 1920, 2245], dtype=int64)

In [63]:
# Headline at position 3416, the first entry after row 0 itself in the
# similarity ranking of row 0.
texts[3416]

"'3월 벚꽃축제' 안양천서 열린다"

In [64]:
# Headline at position 7422, the next entry in the similarity ranking of row 0.
texts[7422]

'벚꽃·유채꽃·철쭉 흐드러진 경남, 봄꽃 축제 보러 오세요'