In [None]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_openai import OpenAIEmbeddings


In [None]:
# Helper for managing API keys through environment variables
from dotenv import load_dotenv

# Load the API key settings
load_dotenv()

In [None]:
# Set up LangSmith tracing. https://smith.langchain.com
# !pip install langchain-teddynote
# Note: from here on the name `logging` refers to the langchain_teddynote
# helper imported below, not to the standard-library logging module.
from langchain_teddynote import logging

# Enter the project name.
logging.langsmith("CH09-VectorStores")

In [None]:
# Splitter reused by the following cells (600-character chunks, no overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=0)

# Read the whole sample file into a single string
with open("./data/text.txt", encoding="utf-8") as source_file:
    file = source_file.read()

In [None]:
# Show the type of the raw text before splitting
print(type(file))

# Split the raw string into chunks, then report the container type,
# the second chunk, and the number of chunks (one item per line)
texts = text_splitter.split_text(file)
print(type(texts), texts[1], len(texts), sep="\n")

In [None]:
# Load the same file through a document loader, then cut the loaded
# documents with the shared splitter
loader1 = TextLoader("./data/text.txt", encoding="utf-8")
split_doc1 = text_splitter.split_documents(loader1.load())

# Report the container type, the second split document, and the total count
print(type(split_doc1), split_doc1[1], len(split_doc1), sep="\n")

In [None]:
import torch
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Embedding model used for every vector store in this notebook.
model_name = "intfloat/multilingual-e5-large-instruct"
# Alternative: model_name = "intfloat/multilingual-e5-large"

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": device},
    # Ask the encoder to normalize the embedding vectors it returns.
    encode_kwargs={"normalize_embeddings": True},
)

In [None]:
# Embed a sample query once and use the length of the returned vector
# as the embedding dimension.
dimension_size = len(hf_embeddings.embed_query("hello world"))
print(dimension_size)

In [None]:
# Create a FAISS vector store by hand: an L2 flat index sized to the
# embedding dimension, a fresh in-memory docstore, and an empty
# index-to-docstore-ID mapping.
# Note: the next cell rebinds `db` to a store built with
# FAISS.from_documents, so this empty store is replaced there.
db = FAISS(
    embedding_function=hf_embeddings,
    index=faiss.IndexFlatL2(dimension_size),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [None]:
# Build a FAISS store from the split documents, using hf_embeddings as the
# embedding model. This rebinds `db`.
db = FAISS.from_documents(documents=split_doc1, embedding=hf_embeddings)

In [None]:
# Check the docstore IDs
db.index_to_docstore_id

In [None]:
# Inspect the docstore's private `_dict` attribute
# (presumably maps each document ID to its stored Document — confirm).
db.docstore._dict

In [None]:
# Similarity search for a query; no `k` is passed, so the library default applies.
db.similarity_search("삼성전자 주가")

In [None]:
# Search restricted by a metadata filter, returning the top 2 matches
db.similarity_search(
    "TF IDF 에 대하여 알려줘",
    k=2,
    filter={"source": "./data/text.txt"},
)

In [None]:
from langchain_core.documents import Document

# Build the new document first: page_content plus metadata
greeting_doc = Document(
    page_content="안녕하세요! 이번엔 도큐먼트를 새로 추가해 볼께요",
    metadata={"source": "mydata.txt"},
)

# Add it to the store under an explicit ID
db.add_documents([greeting_doc], ids=["new_doc1"])

In [None]:
# Check the data that was just added
db.similarity_search("안녕하세요", k=1)


In [None]:
# Check the docstore IDs again, now that a document was added with id "new_doc1"
db.index_to_docstore_id

In [None]:
# Delete the document stored under the id "new_doc1"
db.delete(ids=["new_doc1"])

In [None]:
# Check the docstore IDs once more after the delete call
db.index_to_docstore_id

## 저장하는 방법


In [None]:
# Save the store to the local folder "faiss_db" under the index name "faiss_index"
db.save_local(folder_path="faiss_db", index_name="faiss_index")

## 불러오는 방법

In [None]:
# Load the saved data
# SECURITY NOTE: allow_dangerous_deserialization=True opts in to what the API
# itself labels dangerous deserialization. Only load index folders you created
# yourself (here: the "faiss_db" folder written by save_local in this
# notebook); never point this at files from an untrusted source.
loaded_db = FAISS.load_local(
    folder_path="faiss_db",
    index_name="faiss_index",
    embeddings=hf_embeddings,
    allow_dangerous_deserialization=True,
)

## 병합하는 방법

In [None]:
# Merge the reloaded store into `db`, but only when the two stores share no
# document IDs. `loaded_db` was restored from the copy of `db` saved earlier
# in this notebook, so its IDs normally match the ones already in `db`.
# Merging such a copy fails with "Tried to add ids that already exist", and
# by then the vectors have already been appended to the FAISS index, which
# leaves `db` in an inconsistent state. Checking first avoids both problems.
shared_ids = set(db.index_to_docstore_id.values()) & set(
    loaded_db.index_to_docstore_id.values()
)
if shared_ids:
    print(f"Skipping merge: {len(shared_ids)} document ID(s) already exist in db.")
else:
    db.merge_from(loaded_db)

In [None]:
import gc

import torch

# Drop this notebook's name for the embedding model. The guard lets the cell
# be re-run (or run when the name is already gone) without raising NameError.
try:
    del hf_embeddings
except NameError:
    pass

# NOTE(review): `db` and `loaded_db` were built with this same embedding
# object and are still queried by the retriever cells that follow, so they
# presumably keep the model alive; GPU memory is likely released only once
# those stores are deleted as well — confirm.
gc.collect()  # Garbage collect
torch.cuda.empty_cache()  # Empty CUDA cache

## 검색기 활용

In [None]:
# Wrap the vector store as a retriever (no search options given) and run a query
retriever = db.as_retriever()
retriever.invoke("삼성전자")

## 다양성이 높은 문서 검색

- k : 최종 반환 문서 수
- fetch_k : MMR 알고리즘에 전달할 문서 수 (사전 추출)
- lambda_mult : MMR 결과의 다양성 조절 (0에 가까울수록 다양성이 높고, 1에 가까울수록 유사도 위주)

In [None]:
# Run an MMR search: return 6 documents chosen from 10 pre-fetched candidates
retriever = db.as_retriever(
    search_type="mmr",
    search_kwargs=dict(k=6, lambda_mult=0.25, fetch_k=10),
)
retriever.invoke("삼성전자")