In [1]:
import joblib
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from langchain.schema import Document

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
chunks = joblib.load('./data/merged.pkl')

In [4]:
# Elasticsearch 클라이언트 설정
es = Elasticsearch("http://3.34.62.202:9200")

In [8]:
# 1. 텍스트 임베딩 모델 로드 (Hugging Face SentenceTransformer)
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')  # 예제 모델

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [10]:
# 2. 텍스트 데이터를 벡터로 변환
text = "Elasticsearch is a distributed, RESTful search engine."
vector = model.encode(text).tolist()  # 벡터를 리스트로 변환
print(vector)

[-0.3862643539905548, -0.23479163646697998, -0.04144388437271118, -0.22436021268367767, -0.25369277596473694, -0.05062592774629593, -0.28605586290359497, -0.02322654239833355, -0.10330481082201004, -0.21860302984714508, 0.14308786392211914, 0.3895414471626282, 0.3408954441547394, 0.24099014699459076, -0.20703557133674622, -0.03763832524418831, 0.15360815823078156, 0.15570849180221558, -0.04695827141404152, -0.11894454061985016, -0.12752550840377808, -0.2257540374994278, 0.17144452035427094, -0.3120081126689911, -0.45358091592788696, 0.1145983338356018, -0.43252190947532654, 0.03380260989069939, 0.36177802085876465, -0.2619110643863678, 0.3974013924598694, -0.2588578760623932, 0.08834639191627502, -0.019985893741250038, -0.016092797741293907, 0.021758558228611946, -0.21366935968399048, -0.04472183808684349, -0.27179378271102905, 0.3061847686767578, -0.138699010014534, 0.06934622675180435, -0.4939483106136322, 0.050153281539678574, -0.0012281984090805054, -0.12441738694906235, 0.29996502

In [31]:
# 3. Elasticsearch 인덱스 생성 (dense_vector 필드 포함)
index_name = "prod-labq-documents-multi-minilm-l12-v2-2"
if not es.indices.exists(index=index_name):
    es.indices.create(
        index=index_name,
        body={
            "mappings": {
                "properties": {
                    "id": {"type": "integer"},
                    "text": {"type": "text"},
                    "embedding": {
                        "type": "dense_vector",
                        "dims": len(vector)  # 벡터 차원 수
                    },
                    "metadata": {"type": "object"}
                }
            }
        }
    )

In [32]:
print(chunks[0])
print(chunks[0].metadata['id'])

page_content='# 음식료 | 2024. 10. 21' metadata={'category': 'heading1', 'coordinates': [{'x': 0.0643, 'y': 0.0616}, {'x': 0.2377, 'y': 0.0616}, {'x': 0.2377, 'y': 0.0778}, {'x': 0.0643, 'y': 0.0778}], 'page': 1, 'id': 0, 'company_name': 'CJ제일제당', 'report_date': '20241021', 'securities_firm': 'IBK투자증권', 'source_file': 'CJ제일제당_20241021_IBK투자증권.pdf', 'content_types': {'markdown': '# 음식료 | 2024. 10. 21', 'text': '음식료 | 2024. 10. 21', 'html': "<h1 id='0' style='font-size:18px'>음식료 | 2024. 10. 21</h1>"}}
0


In [33]:

for chunk in chunks:
    text = chunk.page_content
    metadata = chunk.metadata
    vector = model.encode(text).tolist()  # 벡터를 리스트로 변환

    # 4. Elasticsearch에 데이터 삽입
    doc = {
        "id": metadata['id'],
        "text": text,
        "embedding": vector,
        "metadata": metadata
        
    }
    response = es.index(index=index_name, body=doc)
    print("Document indexed:", response)

Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'iD27lZQBLfL4KSeUgl8Y', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'iT27lZQBLfL4KSeUgl9_', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'ij27lZQBLfL4KSeUgl-0', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'iz27lZQBLfL4KSeUgl_8', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}
Document indexed: {'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id':

In [35]:

### 인덱싱된 문서 수 확인
response = es.count(index=index_name)
print("Documents indexed:", response["count"])

Documents indexed: 8379


In [45]:
## id 기반 3개 document 검색

response = es.search(
    index=index_name,
    body={
        "query": {
            "match_all": {}
        },
        "sort": [
            {"id": "asc"}  # id 필드 기준 오름차순 정렬
        ],
        "from": 0,
        "size": 3
    }
)

for hit in response["hits"]["hits"]:
    print(hit["_source"])
    doc = hit["_source"]["text"], hit["_source"]["metadata"]
    print(doc)


{'id': 0, 'text': '# | 음식료 | 2024. 11. 13', 'embedding': [-0.26708024740219116, 0.0823521688580513, 0.024815471842885017, 0.06229376420378685, 0.2406345158815384, 0.03759411349892616, -0.031037338078022003, -0.21020494401454926, -0.16508308053016663, -0.08668366819620132, 0.2906043827533722, -0.5234279036521912, -0.2911156415939331, -0.20865851640701294, 0.024360625073313713, -0.3257460296154022, 0.11370376497507095, -0.19123680889606476, -0.09447875618934631, -0.19627246260643005, 0.06429991871118546, 0.05710705742239952, -0.021909473463892937, -0.04569835960865021, 0.21656295657157898, -0.030843107029795647, -0.1380767673254013, 0.09986615180969238, -0.060922905802726746, -0.336270809173584, 0.2891808748245239, 0.298706978559494, 0.35384219884872437, -0.023434612900018692, 0.322532594203949, 0.022357042878866196, 0.06311772763729095, -0.2117757946252823, 0.0036750000435858965, 0.2754843533039093, 0.024245610460639, -0.1001066267490387, 0.07297275960445404, 0.00868928525596857, 0.2607

In [48]:
## 키워드 기반(bm25) 검색 예제


search_text = "sk 하이닉스의 내년 주가는 어떻게 변동될까?"


query = {
    "query": {
        "match": {
            "text": search_text
        }
    }
}

# 검색 실행
response = es.search(index=index_name, body=query, size=5)


  response = es.search(index=index_name, body=query, size=5)


In [49]:




print(response)
for hit in response["hits"]["hits"]:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {hit['_source']}")


{'took': 4, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 184, 'relation': 'eq'}, 'max_score': 15.89027, 'hits': [{'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': '7T29lZQBLfL4KSeUVGiZ', '_score': 15.89027, '_source': {'id': 60, 'text': '3. Blackwell 플랫폼 공급 지연 가능성? 내년 HBM 공급 과잉 우려에 대해서 회사\n는 어떻게 보고 있는지?', 'embedding': [-0.1527525633573532, 0.11171586066484451, -0.2381572425365448, -0.2677249610424042, 0.01063876785337925, 0.13782955706119537, -0.24537697434425354, 0.07981013506650925, -0.3049568831920624, -0.09522341936826706, -0.07043930143117905, 0.15877579152584076, 0.02561449259519577, -0.15291942656040192, 0.14984042942523956, 0.23093417286872864, -0.032400354743003845, -0.31444981694221497, 0.3188423812389374, -0.03163227066397667, 0.060451604425907135, -0.24929741024971008, -0.07601596415042877, -0.0733405128121376, 0.19841109216213226, -0.02397686429321766, 0.058923397213220596, 0.0165620967745

In [51]:
## 벡터 기반 검색 예제
search_text = "sk 하이닉스의 내년 주가는 어떻게 변동될까?
query_vector = model.encode(search_text).tolist()

query = {
    "query": {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    }
}

# 검색 실행
response = es.search(index=index_name, body=query, size=5)

for hit in response["hits"]["hits"]:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {hit['_source']}")

Object `변동될까` not found.
ID: Qj29lZQBLfL4KSeU2WuY, Score: 1.646971
Content: {'id': 33, 'text': 'SK 하이닉스 연간 실적 추이 및 전망', 'embedding': [-0.10640256106853485, 0.17867973446846008, -0.19496867060661316, -0.08797327429056168, -0.19843871891498566, -0.20501916110515594, 0.03127412497997284, 0.02151019684970379, -0.07902669906616211, -0.0855758860707283, 0.22887322306632996, -0.18477585911750793, 0.29995909333229065, -0.22918753325939178, 0.08456826210021973, -0.21834424138069153, 0.1941652148962021, 0.03486054390668869, 0.26304855942726135, -0.18860343098640442, -0.03810490295290947, -0.1170087531208992, -0.10461335629224777, -0.005077655427157879, 0.028137193992733955, -0.09629642963409424, 0.14776639640331268, 0.07268761098384857, -0.07016243785619736, -0.08409211784601212, -0.03264220431447029, 0.13219241797924042, 0.029436713084578514, 0.16712792217731476, -0.015430202707648277, 0.10682350397109985, -0.04539518803358078, 0.00027253106236457825, -0.21707014739513397, 0.04971960186958313, 

  response = es.search(index=index_name, body=query, size=5)


In [54]:
## bm25 필터링 기반 하이브리드 검색 예제

search_text = "sk 하이닉스의 내년 주가는 어떻게 변동될까?"
query_vector = model.encode(search_text).tolist()

query = {
    "query": {
        "bool": {
            "should": [
                {
                    "match": {
                        "text": search_text
                    }
                },
                {
                    "script_score": {
                        "query": {
                            "match_all": {}
                        },
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                            "params": {
                                "query_vector": query_vector
                            }
                        }
                    }
                }
            ]
        }
    }
}

# 검색 실행
response = es.search(index=index_name, body=query, size=5)

for hit in response["hits"]["hits"]:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {hit['_source']}")

ID: 7T29lZQBLfL4KSeUVGiZ, Score: 17.163126
Content: {'id': 60, 'text': '3. Blackwell 플랫폼 공급 지연 가능성? 내년 HBM 공급 과잉 우려에 대해서 회사\n는 어떻게 보고 있는지?', 'embedding': [-0.1527525633573532, 0.11171586066484451, -0.2381572425365448, -0.2677249610424042, 0.01063876785337925, 0.13782955706119537, -0.24537697434425354, 0.07981013506650925, -0.3049568831920624, -0.09522341936826706, -0.07043930143117905, 0.15877579152584076, 0.02561449259519577, -0.15291942656040192, 0.14984042942523956, 0.23093417286872864, -0.032400354743003845, -0.31444981694221497, 0.3188423812389374, -0.03163227066397667, 0.060451604425907135, -0.24929741024971008, -0.07601596415042877, -0.0733405128121376, 0.19841109216213226, -0.02397686429321766, 0.058923397213220596, 0.016562096774578094, -0.20615160465240479, -0.23162700235843658, 0.052845392376184464, -0.09220164269208908, 0.027956601232290268, -0.021238723769783974, 0.2941484749317169, 0.037811245769262314, 0.2679117023944855, -0.08441942185163498, -0.19406993687152863, 0.075

  response = es.search(index=index_name, body=query, size=5)


In [55]:
## bm25 + cosine similarity 하이브리드 검색색


search_text = "sk 하이닉스의 내년 주가는 어떻게 변동될까?"
query_vector = model.encode(search_text).tolist()

search_query = {
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
        # "match": {
        #   "text": query_text,
        # }
      },
      "script": {
        # BM25 점수는 _score 로 접근 가능
        # cosineSimilarity(params.query_vector, 'content_vector')는 벡터 유사도
        # alpha는 BM25와 벡터 유사도의 가중치
        "source": """
          double bm25Score = _score;
          double vectorSim = cosineSimilarity(params.query_vector, 'embedding') + 1.0;
          double alpha = 0.8; 
          
          return alpha * bm25Score + (1 - alpha) * vectorSim;
        """,
        "params": {
          "query_vector": query_vector
        }
      }
    }
  }
}

response = es.search(index=index_name, body=search_query, size=5)
print(response)
for hit in response["hits"]["hits"]:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {hit['_source']}")

{'took': 5, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 8379, 'relation': 'eq'}, 'max_score': 1.1293942, 'hits': [{'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'Qj29lZQBLfL4KSeU2WuY', '_score': 1.1293942, '_source': {'id': 33, 'text': 'SK 하이닉스 연간 실적 추이 및 전망', 'embedding': [-0.10640256106853485, 0.17867973446846008, -0.19496867060661316, -0.08797327429056168, -0.19843871891498566, -0.20501916110515594, 0.03127412497997284, 0.02151019684970379, -0.07902669906616211, -0.0855758860707283, 0.22887322306632996, -0.18477585911750793, 0.29995909333229065, -0.22918753325939178, 0.08456826210021973, -0.21834424138069153, 0.1941652148962021, 0.03486054390668869, 0.26304855942726135, -0.18860343098640442, -0.03810490295290947, -0.1170087531208992, -0.10461335629224777, -0.005077655427157879, 0.028137193992733955, -0.09629642963409424, 0.14776639640331268, 0.07268761098384857, -0.07016243785619736, -0.08409211

  response = es.search(index=index_name, body=search_query, size=5)


In [17]:


#### 주의 ! 인덱스 삭제 명령

if es.indices.exists(index=index_name):  # 인덱스가 존재하는지 확인
  confirm = input("인덱스를 삭제하시겠어요? (y/n): ")
  if confirm.lower() == "y":
    es.indices.delete(index=index_name)  # 인덱스 삭제
    print("Index deleted.")
  else:
    print("Index not deleted.")

Index deleted.
