In [3]:
import joblib
from sentence_transformers import SentenceTransformer
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
chunks = joblib.load('./data/merged_plus_id.pkl')

In [None]:
# faiss 클라이언트 설정
session = requests.Session()
url = "http://localhost:8070"

In [6]:
# 1. 텍스트 임베딩 모델 로드 (Hugging Face SentenceTransformer)
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')  # 예

In [7]:
# 2. 텍스트 데이터를 벡터로 변환
text = "Faiss is a distributed, RESTful search engine."
vector = model.encode(text).tolist()  # 벡터를 리스트로 변환
print("vector", vector)
dimension = len(vector)
print("dimension:", dimension)

vector [-0.2365940362215042, -0.27777886390686035, -0.2769816219806671, -0.148578479886055, -0.044665779918432236, 0.18730643391609192, -0.2007625550031662, -0.10773374140262604, 0.017776891589164734, -0.2910635471343994, 0.04522012174129486, 0.24614793062210083, 0.22713202238082886, 0.3715372681617737, -0.23451651632785797, -0.08502563834190369, 0.2329074889421463, -0.13126130402088165, 0.28576672077178955, -0.1717987060546875, -0.10931780189275742, 0.0002679969184100628, 0.07361256331205368, -0.10275539755821228, -0.49946337938308716, -0.15136031806468964, -0.17468689382076263, 0.15553078055381775, 0.16283665597438812, -0.2169361561536789, 0.2636154890060425, -0.007321728393435478, -0.3107620179653168, 0.20946000516414642, -0.0660012885928154, -0.2510432004928589, -0.05435565114021301, 0.12379426509141922, -0.1986735761165619, 0.13712990283966064, -0.08966284990310669, 0.2245103418827057, -0.13555888831615448, -0.08458245545625687, -0.08591359108686447, -0.13387733697891235, 0.007383

In [8]:
# 3. 인덱스 생성
index_name = "test_script"
response = session.post(f"{url}/api/index", json={"index" : index_name, "algorithm" : 1, "dimension": dimension}, timeout=15)

In [13]:
# 4. 인덱스가 존재하는 지 확인
response = session.get(f"{url}/api/index", params={"index" : index_name}, timeout=15)
print(response.json())

{'status': 'ok', 'message': '해당 인덱스가 존재합니다.', 'index': 'test_script'}


In [15]:
# 5. 인덱스 삭제 (인덱스가 존재하는 경우에만)
if input("Delete index? (y/n): ") == "y":
  delete_index = 'test_script'
  response = session.delete(f"{url}/api/index", params={"index" : index_name}, timeout=15)
  print(response.json())


{'status': 'ok', 'message': '인덱스를 삭제하였습니다.', 'index': 'test_script'}


In [19]:
index_name = "sdsdd"

In [20]:
# 6. 데이터 삽입
text = "Faiss is a distributed, RESTful search engine."
query_vector = model.encode(text).tolist()

response = session.post(f"{url}/api/context", json={"index" : index_name, "input_vector" : query_vector}, timeout=15)
print(response.json())

{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}


In [36]:
# 7. 데이터 조회
text = "재무상태"
query_vector = model.encode(text).tolist()

response = session.post(f"{url}/api/context/search-by-vector", json={"index" : index_name, "query_vector" : query_vector, "size" : 5}, timeout=15)
print(response.json())



{'status': 'ok', 'results': [{'document_id': 3532, 'distance': 1.763096570968628}, {'document_id': 63, 'distance': 1.7630972862243652}, {'document_id': 245, 'distance': 1.7630972862243652}, {'document_id': 358, 'distance': 1.7630972862243652}, {'document_id': 455, 'distance': 1.7630972862243652}]}


In [38]:
# 8. 반환된 데이터를 적절히 변환

for item in response.json()['results']:
  print(item)
  print(chunks[item['document_id']])
  print("")

{'document_id': 3532, 'distance': 1.763096570968628}
page_content='재무상태표' metadata={'category': 'paragraph', 'coordinates': [{'x': 0.0825, 'y': 0.1074}, {'x': 0.1815, 'y': 0.1074}, {'x': 0.1815, 'y': 0.1229}, {'x': 0.0825, 'y': 0.1229}], 'page': 7, 'id': 98, 'company_name': '네이버', 'report_date': '20241111', 'securities_firm': '신한투자증권', 'source_file': '네이버_20241111_신한투자증권.pdf', 'content_types': {'markdown': '재무상태표', 'text': '재무상태표', 'html': "<p id='98' data-category='paragraph' style='font-size:20px'>재무상태표</p>"}, 'uid': '3d2fb2fc-b6bb-411a-9d9c-41b037efc02b', 'nid': 3533}

{'document_id': 63, 'distance': 1.7630972862243652}
page_content='재무상태표' metadata={'category': 'paragraph', 'coordinates': [{'x': 0.5202, 'y': 0.1592}, {'x': 0.5898, 'y': 0.1592}, {'x': 0.5898, 'y': 0.1716}, {'x': 0.5202, 'y': 0.1716}], 'page': 5, 'id': 63, 'company_name': 'CJ제일제당', 'report_date': '20241021', 'securities_firm': 'IBK투자증권', 'source_file': 'CJ제일제당_20241021_IBK투자증권.pdf', 'content_types': {'markdown': '재무상