In [2]:
import joblib
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from langchain.schema import Document
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the pre-chunked documents from disk; later cells read `.page_content` and
# `.metadata` from each element.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load a pickle that comes from a trusted source.
chunks = joblib.load('./data/merged.pkl')

In [4]:
# Configure the Elasticsearch client
# NOTE(review): the endpoint is a hard-coded IP address over plain HTTP — consider
# moving it to configuration.
es = Elasticsearch("http://3.34.62.202:9200")

In [5]:
# 1. Load the text-embedding model (Hugging Face SentenceTransformer)
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')  # example model

In [10]:
# 2. Turn a sample sentence into an embedding vector
text = "Elasticsearch is a distributed, RESTful search engine."
encoded = model.encode(text)
vector = encoded.tolist()  # convert the encoder output to a plain Python list
print(vector)

[-0.3862643539905548, -0.23479163646697998, -0.04144388437271118, -0.22436021268367767, -0.25369277596473694, -0.05062592774629593, -0.28605586290359497, -0.02322654239833355, -0.10330481082201004, -0.21860302984714508, 0.14308786392211914, 0.3895414471626282, 0.3408954441547394, 0.24099014699459076, -0.20703557133674622, -0.03763832524418831, 0.15360815823078156, 0.15570849180221558, -0.04695827141404152, -0.11894454061985016, -0.12752550840377808, -0.2257540374994278, 0.17144452035427094, -0.3120081126689911, -0.45358091592788696, 0.1145983338356018, -0.43252190947532654, 0.03380260989069939, 0.36177802085876465, -0.2619110643863678, 0.3974013924598694, -0.2588578760623932, 0.08834639191627502, -0.019985893741250038, -0.016092797741293907, 0.021758558228611946, -0.21366935968399048, -0.04472183808684349, -0.27179378271102905, 0.3061847686767578, -0.138699010014534, 0.06934622675180435, -0.4939483106136322, 0.050153281539678574, -0.0012281984090805054, -0.12441738694906235, 0.29996502

In [31]:
# 3. Create the Elasticsearch index (including a dense_vector field).
# The vector dimension is read from the model itself rather than from the `vector`
# variable created in the example cell, so this cell only needs `es` and `model`.
index_name = "prod-labq-documents-multi-minilm-l12-v2-2"
embedding_dims = model.get_sentence_embedding_dimension()
if embedding_dims is None:
    raise ValueError("Could not determine the embedding dimension from the model.")

# Only create the index when it does not exist yet, so re-running the cell is harmless.
if not es.indices.exists(index=index_name):
    es.indices.create(
        index=index_name,
        body={
            "mappings": {
                "properties": {
                    "id": {"type": "integer"},
                    "text": {"type": "text"},
                    "embedding": {
                        "type": "dense_vector",
                        "dims": embedding_dims,  # number of vector dimensions
                    },
                    "metadata": {"type": "object"},
                }
            }
        },
    )

In [32]:
# Inspect the first chunk and the id stored in its metadata
first_chunk = chunks[0]
print(first_chunk)
print(first_chunk.metadata['id'])

page_content='# 음식료 | 2024. 10. 21' metadata={'category': 'heading1', 'coordinates': [{'x': 0.0643, 'y': 0.0616}, {'x': 0.2377, 'y': 0.0616}, {'x': 0.2377, 'y': 0.0778}, {'x': 0.0643, 'y': 0.0778}], 'page': 1, 'id': 0, 'company_name': 'CJ제일제당', 'report_date': '20241021', 'securities_firm': 'IBK투자증권', 'source_file': 'CJ제일제당_20241021_IBK투자증권.pdf', 'content_types': {'markdown': '# 음식료 | 2024. 10. 21', 'text': '음식료 | 2024. 10. 21', 'html': "<h1 id='0' style='font-size:18px'>음식료 | 2024. 10. 21</h1>"}}
0


In [None]:

# 4. Embed every chunk and insert it into Elasticsearch.
# All texts are encoded in one batched call and the documents are sent through the bulk
# helper, instead of one encode call, one HTTP request and one printed response per chunk.
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_vectors = model.encode(chunk_texts).tolist()  # one embedding (list of floats) per chunk

actions = (
    {
        "_index": index_name,
        "_source": {
            "id": chunk.metadata['id'],
            "text": chunk_text,
            "embedding": chunk_vector,
            "metadata": chunk.metadata,
        },
    }
    for chunk, chunk_text, chunk_vector in zip(chunks, chunk_texts, chunk_vectors)
)

# With its default settings the helper raises if any document is rejected.
indexed_count, bulk_errors = bulk(es, actions)

# Refresh so the new documents are immediately visible to searches and counts.
es.indices.refresh(index=index_name)
print("Documents indexed:", indexed_count)

In [35]:

### Check how many documents the index holds
response = es.count(index=index_name)
indexed_total = response["count"]
print("Documents indexed:", indexed_total)

Documents indexed: 8379


In [45]:
## Fetch 3 documents, ordered by id

first_docs_query = {
    "query": {"match_all": {}},
    "sort": [{"id": "asc"}],  # ascending order on the id field
    "from": 0,
    "size": 3,
}
response = es.search(index=index_name, body=first_docs_query)

for hit in response["hits"]["hits"]:
    source = hit["_source"]
    print(source)
    # (text, metadata) pair taken from the stored document
    doc = source["text"], source["metadata"]
    print(doc)


{'id': 0, 'text': '# | 음식료 | 2024. 11. 13', 'embedding': [-0.26708024740219116, 0.0823521688580513, 0.024815471842885017, 0.06229376420378685, 0.2406345158815384, 0.03759411349892616, -0.031037338078022003, -0.21020494401454926, -0.16508308053016663, -0.08668366819620132, 0.2906043827533722, -0.5234279036521912, -0.2911156415939331, -0.20865851640701294, 0.024360625073313713, -0.3257460296154022, 0.11370376497507095, -0.19123680889606476, -0.09447875618934631, -0.19627246260643005, 0.06429991871118546, 0.05710705742239952, -0.021909473463892937, -0.04569835960865021, 0.21656295657157898, -0.030843107029795647, -0.1380767673254013, 0.09986615180969238, -0.060922905802726746, -0.336270809173584, 0.2891808748245239, 0.298706978559494, 0.35384219884872437, -0.023434612900018692, 0.322532594203949, 0.022357042878866196, 0.06311772763729095, -0.2117757946252823, 0.0036750000435858965, 0.2754843533039093, 0.024245610460639, -0.1001066267490387, 0.07297275960445404, 0.00868928525596857, 0.2607

In [7]:
## Keyword-based (BM25) search example
index_name = "prod-labq-documents-multi-minilm-l12-v2-2"

search_text = "카카오뱅크의 주가는?"

# `size` lives inside the request body: mixing `body=` with per-field keyword arguments
# such as `size=` is deprecated in the Elasticsearch Python client.
query = {
    "query": {
        "match": {
            "text": search_text
        }
    },
    "size": 5,
}

# Run the search
response = es.search(index=index_name, body=query)


  response = es.search(index=index_name, body=query, size=5)


In [8]:




# Dump the raw response, then list each hit's id, score and stored source
print(response)
hits = response["hits"]["hits"]
for hit in hits:
    hit_id, hit_score = hit['_id'], hit['_score']
    print(f"ID: {hit_id}, Score: {hit_score}")
    print(f"Content: {hit['_source']}")


{'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 58, 'relation': 'eq'}, 'max_score': 8.812679, 'hits': [{'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'YD3AlZQBLfL4KSeUPnYN', '_score': 8.812679, '_source': {'id': 32, 'text': '[ 그림2] 카카오뱅크의 영업이익', 'embedding': [-0.021122781559824944, 0.14784224331378937, -0.23337534070014954, -0.0033882397692650557, -0.20197685062885284, -0.20013228058815002, 0.28191202878952026, 0.04919629171490669, 0.11420353502035141, -0.0011584110325202346, 0.18423771858215332, -0.13389752805233002, 0.14464494585990906, -0.037112098187208176, 0.11347246170043945, -0.053487420082092285, 0.005101318005472422, -0.029836511239409447, 0.04060068726539612, -0.05866074934601784, 0.07718365639448166, -0.20544268190860748, -0.03030593879520893, 0.019033413380384445, 0.07762787491083145, -0.10930454730987549, 0.1239856407046318, 0.12335337698459625, 0.08254851400852203, -0.07834995

In [12]:
## Vector-based search example
search_text = "카카오뱅크의 주가는?"
query_vector = model.encode(search_text).tolist()

# `size` lives inside the request body: mixing `body=` with per-field keyword arguments
# such as `size=` is deprecated in the Elasticsearch Python client.
query = {
    "query": {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                # + 1.0 keeps the score non-negative (cosine similarity can be as low as -1)
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    },
    "size": 5,
}

# Run the search
response = es.search(index=index_name, body=query)

for hit in response["hits"]["hits"]:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {hit['_source']}")

ID: oT3AlZQBLfL4KSeUFXU8, Score: 1.8687369
Content: {'id': 19, 'text': '카카오뱅크', 'embedding': [-0.09215157479047775, 0.22198867797851562, -0.29698196053504944, 0.0407625213265419, -0.2600383758544922, -0.23592792451381683, 0.27428463101387024, 0.15262329578399658, 0.06998288631439209, -0.05645885691046715, 0.0286589115858078, -0.3167012333869934, 0.09955122321844101, -0.06578187644481659, 0.09181440621614456, -0.13840824365615845, 0.0020606964826583862, 0.10995099693536758, 0.12435172498226166, -0.11617795377969742, 0.04212060943245888, -0.1660432666540146, 0.0003460001607891172, 0.024936454370617867, 0.08077777922153473, -0.1137470155954361, 0.21488377451896667, 0.16818654537200928, 0.09096509218215942, -0.14973041415214539, -0.013797672465443611, 0.0752599686384201, 0.2004365622997284, 0.11203860491514206, -0.016449477523565292, 0.15232054889202118, -0.01164060365408659, 0.18068131804466248, 0.1741311103105545, 0.09160669893026352, -0.06716128438711166, 0.061141252517700195, 0.1813074

  response = es.search(index=index_name, body=query, size=5)


In [13]:
## Hybrid search example: BM25 match combined with vector similarity
# NOTE(review): both clauses sit under `should` and the script_score clause wraps
# match_all, so it looks like no document is filtered out and matching documents get the
# two clause scores added together — confirm this is the intended "filtering" behavior.

search_text = "카카오뱅크의 주가는?"
query_vector = model.encode(search_text).tolist()

# `size` lives inside the request body: mixing `body=` with per-field keyword arguments
# such as `size=` is deprecated in the Elasticsearch Python client.
query = {
    "query": {
        "bool": {
            "should": [
                {
                    "match": {
                        "text": search_text
                    }
                },
                {
                    "script_score": {
                        "query": {
                            "match_all": {}
                        },
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                            "params": {
                                "query_vector": query_vector
                            }
                        }
                    }
                }
            ]
        }
    },
    "size": 5,
}

# Run the search
response = es.search(index=index_name, body=query)

for hit in response["hits"]["hits"]:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {hit['_source']}")

ID: YD3AlZQBLfL4KSeUPnYN, Score: 10.626695
Content: {'id': 32, 'text': '[ 그림2] 카카오뱅크의 영업이익', 'embedding': [-0.021122781559824944, 0.14784224331378937, -0.23337534070014954, -0.0033882397692650557, -0.20197685062885284, -0.20013228058815002, 0.28191202878952026, 0.04919629171490669, 0.11420353502035141, -0.0011584110325202346, 0.18423771858215332, -0.13389752805233002, 0.14464494585990906, -0.037112098187208176, 0.11347246170043945, -0.053487420082092285, 0.005101318005472422, -0.029836511239409447, 0.04060068726539612, -0.05866074934601784, 0.07718365639448166, -0.20544268190860748, -0.03030593879520893, 0.019033413380384445, 0.07762787491083145, -0.10930454730987549, 0.1239856407046318, 0.12335337698459625, 0.08254851400852203, -0.07834995537996292, 0.02989034913480282, 0.036283500492572784, 0.11310619860887527, 0.09961705654859543, 0.0014788018306717277, 0.1603192389011383, -0.01250825822353363, 0.015381361357867718, 0.13233426213264465, 0.11712245643138885, -0.01861531473696232, 0.0

  response = es.search(index=index_name, body=query, size=5)


In [29]:
## BM25 + cosine-similarity hybrid search

search_text = "카카오뱅크의 주가는?"
query_vector = model.encode(search_text).tolist()

# Weight of the BM25 score in the blend. 0.0 (the value previously hard-coded inside the
# script) ranks the BM25 matches by vector similarity only; 1.0 ranks them by BM25 only.
alpha = 0.0
# Factor applied to the shifted cosine similarity (cosine + 1.0) before blending.
vector_scale = 125.0

# `size` lives inside the request body: mixing `body=` with per-field keyword arguments
# such as `size=` is deprecated in the Elasticsearch Python client.
search_query = {
  "query": {
    "script_score": {
      "query": {
        # Only documents matching the BM25 query are scored by the script
        "match": {
          "text": search_text,
        }
      },
      "script": {
        # _score exposes the BM25 score of the inner query.
        # cosineSimilarity(params.query_vector, 'embedding') is the vector similarity.
        # params.alpha weights BM25 against the vector similarity.
        "source": """
          double bm25Score = _score;
          double vectorSim = (cosineSimilarity(params.query_vector, 'embedding') + 1.0) * params.vector_scale;
          double alpha = params.alpha;

          return alpha * bm25Score + (1 - alpha) * vectorSim;
        """,
        "params": {
          "query_vector": query_vector,
          "alpha": float(alpha),
          "vector_scale": float(vector_scale),
        }
      }
    }
  },
  "size": 5,
}

response = es.search(index=index_name, body=search_query)
print(response)
for hit in response["hits"]["hits"]:
    print(f"ID: {hit['_id']}, Score: {hit['_score']}")
    print(f"Content: {hit['_source']}")

{'took': 7, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 58, 'relation': 'eq'}, 'max_score': 229.2092, 'hits': [{'_index': 'prod-labq-documents-multi-minilm-l12-v2-2', '_id': 'XT3AlZQBLfL4KSeUPXZu', '_score': 229.2092, '_source': {'id': 29, 'text': '[그림1] 카카오뱅크의 분기별 당기순이익', 'embedding': [-0.09340398758649826, 0.1924143135547638, -0.2759292721748352, 0.0307113416492939, -0.1303911954164505, -0.1480407416820526, 0.20327170193195343, 0.04328877478837967, 0.11732373386621475, -0.08044759184122086, 0.1374007761478424, -0.1312876045703888, 0.1383548080921173, -0.021799704059958458, -0.027867045253515244, -0.11744268983602524, -0.030355583876371384, 0.023373108357191086, 0.09882509708404541, 0.015093140304088593, 0.04013228788971901, -0.1816829890012741, -0.08525823056697845, 0.07801120728254318, 0.11384650319814682, -0.06271716207265854, 0.10807154327630997, 0.06790485978126526, 0.07458855211734772, -0.20648276805877686,

  response = es.search(index=index_name, body=search_query, size=5)


In [17]:


#### WARNING! This cell deletes the index

if es.indices.exists(index=index_name):  # only prompt when the index exists
  answer = input("인덱스를 삭제하시겠어요? (y/n): ")
  confirmed = answer.lower() == "y"
  if not confirmed:
    print("Index not deleted.")
  else:
    es.indices.delete(index=index_name)  # drop the index
    print("Index deleted.")

Index deleted.
