# chunk to faiss with search

Faiss DB와 연결하여 인덱스 생성·확인, 데이터 삽입 및 벡터 검색을 수행해 보는 예제입니다.

엘라스틱서치와의 결합을 통해 하이브리드 검색도 진행할 수 있습니다.

In [2]:
import os
from dotenv import load_dotenv

import joblib
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

import requests

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load environment variables (python-dotenv) and read the key that is sent as
# the "x-api-key" header on every Faiss API request in the cells below.
load_dotenv()
FAISS_API_KEY = os.getenv("API_KEY")
# Never print the secret itself: notebook outputs are saved and shared together
# with the code. Only report whether the key was found.
print("FAISS_API_KEY loaded:", FAISS_API_KEY is not None)

[REDACTED — API key removed from saved output]


In [4]:
# Load the pre-built document chunks from disk. Later cells read `.page_content`
# and `.metadata` from each element and index this collection by Faiss `document_id`.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
chunks = joblib.load('./data/merged_plus_id.pkl')

In [5]:
# Faiss client setup: one shared HTTP session reused by every Faiss API call.
session = requests.Session()
# Base URL of the Faiss REST service. Kept WITHOUT a trailing slash because every
# request is built as f"{url}/api/...", which would otherwise produce "//api/...".
url = "http://3.34.62.202:8060"

In [6]:
# 1. Load the text embedding model (Hugging Face SentenceTransformer).
# The same model encodes both the document chunks and the search queries below.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')  # example model

In [7]:
# 2. Embed a sample sentence to inspect the vector and derive its length.
text = "Faiss is a distributed, RESTful search engine."
vector = model.encode(text).tolist()  # convert the embedding to a list
dimension = len(vector)  # passed as "dimension" when the Faiss index is created

print("vector", vector)
print("dimension:", dimension)

vector [-0.2365940362215042, -0.27777886390686035, -0.2769816219806671, -0.148578479886055, -0.044665779918432236, 0.18730643391609192, -0.2007625550031662, -0.10773374140262604, 0.017776891589164734, -0.2910635471343994, 0.04522012174129486, 0.24614793062210083, 0.22713202238082886, 0.3715372681617737, -0.23451651632785797, -0.08502563834190369, 0.2329074889421463, -0.13126130402088165, 0.28576672077178955, -0.1717987060546875, -0.10931780189275742, 0.0002679969184100628, 0.07361256331205368, -0.10275539755821228, -0.49946337938308716, -0.15136031806468964, -0.17468689382076263, 0.15553078055381775, 0.16283665597438812, -0.2169361561536789, 0.2636154890060425, -0.007321728393435478, -0.3107620179653168, 0.20946000516414642, -0.0660012885928154, -0.2510432004928589, -0.05435565114021301, 0.12379426509141922, -0.1986735761165619, 0.13712990283966064, -0.08966284990310669, 0.2245103418827057, -0.13555888831615448, -0.08458245545625687, -0.08591359108686447, -0.13387733697891235, 0.007383

In [8]:
# 3. Create the Faiss index, sized to the embedding dimension measured above.
index_name = "prod-labq-documents-multi-minilm-l12-v2"
response = session.post(
    f"{url}/api/index",
    headers={"x-api-key": FAISS_API_KEY},
    # NOTE(review): the meaning of algorithm=1 is defined by the server — confirm.
    json={"index": index_name, "algorithm": 1, "dimension": dimension},
    timeout=15,
)
print(response.json())

{'detail': '서버 내부 오류: 400: 현재 같은 인덱스가 존재합니다.'}


In [9]:
# 4. Check whether the index exists.
response = session.get(
    f"{url}/api/index",
    headers={"x-api-key": FAISS_API_KEY},
    params={"index": index_name},
    timeout=15,
)
print(response.json())

{'status': 'ok', 'message': '해당 인덱스가 존재합니다.', 'index': 'prod-labq-documents-multi-minilm-l12-v2'}


In [10]:
# 5. Bulk-insert the data: embed each chunk and send it to the Faiss index,
# one request per chunk, in the order of `chunks`.
for chunk in chunks:
    vectors = model.encode(chunk.page_content).tolist()
    response = session.post(
        f"{url}/api/context",
        headers={"x-api-key": FAISS_API_KEY},
        json={"index": index_name, "input_vector": vectors},
        timeout=15,
    )
    # Stop at the first failed insert instead of carrying on silently: later
    # cells look chunks up by position (chunks[document_id]), so a skipped
    # chunk would presumably shift every following id — TODO confirm how the
    # server assigns document_id.
    response.raise_for_status()
    print(response.json())
    
    


{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하였습니다.'}
{'status': 'ok', 'message': '데이터 삽입에 성공하

In [28]:
# 6. Query the data: embed the question and ask Faiss for the 5 nearest vectors.
query_text = "sk 하이닉스의 내년 주가는 오를 전망이야?"
query_vector = model.encode(query_text).tolist()

response = session.post(
    f"{url}/api/context/search-by-vector",
    headers={"x-api-key": FAISS_API_KEY},
    json={"index": index_name, "query_vector": query_vector, "size": 5},
    timeout=15,
)
print(response.json())

{'status': 'ok', 'results': [{'document_id': 1647, 'distance': 4.154514312744141}, {'document_id': 1719, 'distance': 4.154514312744141}, {'document_id': 1726, 'distance': 4.154514312744141}, {'document_id': 1992, 'distance': 4.154514312744141}, {'document_id': 2263, 'distance': 4.154514312744141}]}


In [29]:
# 7. Decode the returned results: each Faiss document_id is used as a
# position in `chunks` to print the original chunk next to its distance.
faiss_results = response.json()["results"]
for item in faiss_results:
    print(item)
    print(chunks[item["document_id"]])
    print()

{'document_id': 1647, 'distance': 4.154514312744141}
page_content='# SK케미칼' metadata={'category': 'heading1', 'coordinates': [{'x': 0.3862, 'y': 0.1347}, {'x': 0.6105, 'y': 0.1347}, {'x': 0.6105, 'y': 0.1735}, {'x': 0.3862, 'y': 0.1735}], 'page': 1, 'id': 1, 'company_name': 'SK케미칼', 'report_date': '20231120', 'securities_firm': 'DB증권', 'source_file': 'SK케미칼_20231120_DB증권.pdf', 'content_types': {'markdown': '# SK케미칼', 'text': 'SK케미칼', 'html': "<br><h1 id='1' style='font-size:22px'>SK케미칼</h1>"}, 'uid': '6682eccd-29cf-48ef-ac4f-1a9f61bfd1ea', 'nid': 1648}

{'document_id': 1719, 'distance': 4.154514312744141}
page_content='# SK케미칼' metadata={'category': 'heading1', 'coordinates': [{'x': 0.2574, 'y': 0.1228}, {'x': 0.4807, 'y': 0.1228}, {'x': 0.4807, 'y': 0.1598}, {'x': 0.2574, 'y': 0.1598}], 'page': 1, 'id': 0, 'company_name': 'SK케미칼', 'report_date': '20240208', 'securities_firm': 'DB금융투자', 'source_file': 'SK케미칼_20240208_DB금융투자.pdf', 'content_types': {'markdown': '# SK케미칼', 'text': 'SK케미칼'

## Search With Elasticsearch

In [13]:
# Set up the Elasticsearch client (plain HTTP endpoint; no credentials are passed here).
es = Elasticsearch("http://3.34.62.202:9200")

In [14]:
# Name of the Elasticsearch index that the cells below delete, create, fill and search.
es_index_name = "prod-labq-documents-multi-minilm-l12-v2-2"

In [24]:
# Drop the Elasticsearch index so the cells below can rebuild it from scratch.
# WARNING: destructive — every document stored in `es_index_name` is removed.
# ignore_unavailable=True keeps this cell re-runnable: without it the call
# fails with a not-found error when the index does not exist yet.
response = es.indices.delete(index=es_index_name, ignore_unavailable=True)
print(response)

{'acknowledged': True}


In [25]:
# Make sure the Elasticsearch index exists (create and refresh it when it is
# missing), then print the index statistics either way.
if es.indices.exists(index=es_index_name):
    print("Elasticsearch 인덱스 이미 존재")
else:
    response = es.indices.create(index=es_index_name)
    print(response)
    print("Elasticsearch 인덱스 생성 완료")
    response = es.indices.refresh(index=es_index_name)
    print(response)

response = es.indices.stats(index=es_index_name)
print(response)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'prod-labq-documents-multi-minilm-l12-v2-2'}
Elasticsearch 인덱스 생성 완료
{'_shards': {'total': 2, 'successful': 1, 'failed': 0}}
{'_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_all': {'primaries': {'docs': {'count': 0, 'deleted': 0, 'total_size_in_bytes': 0}, 'shard_stats': {'total_count': 1}, 'store': {'size_in_bytes': 227, 'total_data_set_size_in_bytes': 227, 'reserved_in_bytes': 0}, 'indexing': {'index_total': 0, 'index_time_in_millis': 0, 'index_current': 0, 'index_failed': 0, 'delete_total': 0, 'delete_time_in_millis': 0, 'delete_current': 0, 'noop_update_total': 0, 'is_throttled': False, 'throttle_time_in_millis': 0, 'write_load': 0.0}, 'get': {'total': 0, 'time_in_millis': 0, 'exists_total': 0, 'exists_time_in_millis': 0, 'missing_total': 0, 'missing_time_in_millis': 0, 'current': 0}, 'search': {'open_contexts': 0, 'query_total': 0, 'query_time_in_millis': 0, 'query_current': 0, 'query_failure': 0, 'fetch_total':

In [26]:
# Bulk-insert the data: index every chunk into Elasticsearch, one request per
# chunk, storing its text, its embedding and its metadata.
for chunk in chunks:
    text, metadata = chunk.page_content, chunk.metadata
    vector = model.encode(text).tolist()  # convert the embedding to a list

    # Document sent to Elasticsearch; "uid" and "nid" are copied out of the
    # metadata to the top level, and "nid" is what later cells query on.
    doc = {
        "uid": metadata["uid"],
        "nid": metadata["nid"],
        "text": text,
        "embedding": vector,
        "metadata": metadata,
    }
    response = es.index(index=es_index_name, body=doc)
    print("Document indexed:", response)
    

Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'Qz0appQBLfL4KSeUcIAd', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'RD0appQBLfL4KSeUcIDV', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'RT0appQBLfL4KSeUcYAA', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'Rj0appQBLfL4KSeUcYAj', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}
Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id':

In [31]:

# Full-text search on the "text" field using the same question as the Faiss query.
# `size` lives inside the request body: passing it as a separate keyword next
# to `body=` makes the Elasticsearch Python client emit a DeprecationWarning.
query = {
    "query": {
        "match": {
            "text": query_text
        }
    },
    "size": 5,  # return the top 5 hits
}

# Run the search
response = es.search(index=es_index_name, body=query)

  response = es.search(index=es_index_name, body=query, size=5)


In [32]:


# Dump the raw response, then each hit's id, score and stored document.
print(response)
for hit in response["hits"]["hits"]:
    hit_id, score, source = hit["_id"], hit["_score"], hit["_source"]
    print(f"ID: {hit_id}, Score: {score}")
    print(f"Content: {source}")

{'took': 387, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 183, 'relation': 'eq'}, 'max_score': 10.353854, 'hits': [{'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': '8T0cppQBLfL4KSeU1Ytc', '_score': 10.353854, '_ignored': ['metadata.content_types.html.keyword'], '_source': {'uid': '44276765-794d-4708-9855-a7cf34a2a669', 'nid': 2991, 'text': 'SK 증권은 4Q24 SK 하이닉스의 영업이익을 8.2 조원 (+17% QoQ)으로 예상한다.\n전통 B2C세트 수요 부진에 따른 Commodity 가격 하락에도 불구하고, HBM 등 고부\n가 중심의 확판과 레거시 출하 지양으로 우려보다 양호한 가격 (DRAM +10%,\nNAND -1%)을 예상하기 때문이다. DRAM 내 HBM 매출액 비중은 40% (+10%p\nQoQ)로 상승하며, 업황 우려가 본격화되는 시점에 높은 방어력을 확인할 것이다.', 'embedding': [-0.031807634979486465, 0.07579916715621948, -0.19842644035816193, -0.06987877190113068, -0.10593464970588684, -0.050324853509664536, -0.04604092240333557, 0.16656620800495148, -0.008283997885882854, 0.16237640380859375, -0.029495039954781532, 0.05522806942462921, 0.16236358880996704, -0.0737163424491

In [35]:



# Resolve the Faiss search results through Elasticsearch: each Faiss
# document_id is translated to an Elasticsearch "nid" (document_id + 1) and
# the matching documents are fetched with a single terms query.
nid_values = [result["document_id"] + 1 for result in faiss_results]

query = {"query": {"terms": {"nid": nid_values}}}

response = es.search(index=es_index_name, body=query)
  


In [36]:
# Print every hit: id and score, the full stored document, then its metadata.
for hit in response["hits"]["hits"]:
    source = hit["_source"]
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {source}")
    print(f"Metadata: {source['metadata']}")
    print()

ID: sj0bppQBLfL4KSeUwYY4, Score: 1.0
Content: {'uid': '6682eccd-29cf-48ef-ac4f-1a9f61bfd1ea', 'nid': 1648, 'text': '# SK케미칼', 'embedding': [-0.0370810404419899, 0.13330866396427155, -0.12111193686723709, -0.020460253581404686, 0.004756390582770109, -0.04190143570303917, 0.2754575312137604, 0.09036822617053986, 0.05623212456703186, 0.0061295670457184315, 0.12813904881477356, -0.10817084461450577, 0.12239859253168106, 0.025538666173815727, 0.016301119700074196, -0.08820291608572006, 0.02635965682566166, -0.04454326257109642, -0.09604798257350922, 0.02802097611129284, -0.06685570627450943, -0.022615717723965645, 0.14265702664852142, -0.009598059579730034, 0.061597276479005814, -0.0958332046866417, -0.06230057030916214, -0.031762655824422836, -0.1005207896232605, -0.07158495485782623, 0.02520197443664074, -0.04220166429877281, 0.047902025282382965, 0.06969311088323593, 0.02518806979060173, 0.05243467539548874, -0.002617109566926956, 0.1440429836511612, 0.039955027401447296, 0.0268464591354

In [47]:
# Hybrid search — combine Faiss and Elasticsearch: the Faiss results are
# later re-searched with BM25 in Elasticsearch.
# This cell fetches the 100 nearest vectors from Faiss as the candidate set.
query_text = "sk 하이닉스의 내년 주가는 오를 전망이야?"
query_vector = model.encode(query_text).tolist()

response = session.post(
    f"{url}/api/context/search-by-vector",
    headers={"x-api-key": FAISS_API_KEY},
    json={"index": index_name, "query_vector": query_vector, "size": 100},
    timeout=15,
)

faiss_results = response.json()["results"]

In [49]:

# For each Faiss hit, fetch the matching Elasticsearch document (one request
# per hit). A Faiss document_id is mapped to the Elasticsearch "nid" as
# document_id + 1.
for item in faiss_results:
    print(item)
    doc = {
        "query": {
            "match": {
                "nid": item["document_id"] + 1
            }
        }
    }
    response = es.search(index=es_index_name, body=doc)
    hits = response["hits"]["hits"]
    if hits:
        print(hits[0]["_source"])
    else:
        # An id with no matching document used to raise IndexError on
        # hits[0] and abort the whole loop; report it and keep going.
        print(f"No Elasticsearch document found for nid={item['document_id'] + 1}")
    print("")

{'document_id': 1647, 'distance': 4.154514312744141}
{'uid': '6682eccd-29cf-48ef-ac4f-1a9f61bfd1ea', 'nid': 1648, 'text': '# SK케미칼', 'embedding': [-0.0370810404419899, 0.13330866396427155, -0.12111193686723709, -0.020460253581404686, 0.004756390582770109, -0.04190143570303917, 0.2754575312137604, 0.09036822617053986, 0.05623212456703186, 0.0061295670457184315, 0.12813904881477356, -0.10817084461450577, 0.12239859253168106, 0.025538666173815727, 0.016301119700074196, -0.08820291608572006, 0.02635965682566166, -0.04454326257109642, -0.09604798257350922, 0.02802097611129284, -0.06685570627450943, -0.022615717723965645, 0.14265702664852142, -0.009598059579730034, 0.061597276479005814, -0.0958332046866417, -0.06230057030916214, -0.031762655824422836, -0.1005207896232605, -0.07158495485782623, 0.02520197443664074, -0.04220166429877281, 0.047902025282382965, 0.06969311088323593, 0.02518806979060173, 0.05243467539548874, -0.002617109566926956, 0.1440429836511612, 0.039955027401447296, 0.026846

In [52]:

# Hybrid search: restrict Elasticsearch to the documents returned by Faiss
# (terms filter on "nid" = document_id + 1) and rank them by the text match
# against the question.
nid_values = [result["document_id"] + 1 for result in faiss_results]

query = {
    "query": {
        "bool": {
            "filter": {"terms": {"nid": nid_values}},
            "must": {"match": {"text": query_text}},
        }
    },
    "size": 10,  # top 10 documents
    "sort": [{"_score": {"order": "desc"}}],
}

response = es.search(index=es_index_name, body=query)


In [53]:
# Print the hybrid search hits: id and score, the stored document, then its metadata.
for hit in response["hits"]["hits"]:
    source = hit["_source"]
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {source}")
    print(f"Metadata: {source['metadata']}")
    print()

ID: lz0hppQBLfL4KSeUQ6B1, Score: 8.329401
Content: {'uid': 'cf24d3bd-2127-4742-b6f9-4011c9eaec3c', 'nid': 8277, 'text': '- - 내년 영업현금 활용 계획\n', 'embedding': [0.03073793649673462, 0.15221022069454193, -0.06348896771669388, -0.06330962479114532, -0.06879971921443939, -0.028168505057692528, 0.18609756231307983, 0.017175639048218727, -0.052078764885663986, 0.07632709294557571, 0.15546293556690216, -0.05770432949066162, 0.06667841970920563, -0.019824707880616188, 0.16403374075889587, -0.03396424651145935, 0.06971000134944916, -0.0687241330742836, 0.08839192986488342, -0.09666100889444351, 0.008988841436803341, -0.0882948487997055, 0.025869999080896378, -0.09404242783784866, 0.1485063135623932, -0.11223360151052475, 0.019833911210298538, 0.07842333614826202, 0.0292989369481802, -0.04993797466158867, 0.0807996317744255, -0.02636762335896492, 0.14887075126171112, 0.052137769758701324, 0.07461860030889511, 0.11478850990533829, 0.05204196646809578, -0.012230915948748589, -0.00693689938634634, 0.1