# **사전 준비**

In [None]:
!pip install langchain langchain_openai langchain_community pypdf kiwipiepy rank_bm25 faiss-cpu

In [None]:
from google.colab import drive
import os

# Mount Google Drive first so files under /content/drive become reachable.
drive.mount("/content/drive")

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file.
load_dotenv("/content/.env")

# Read the API key and fail early with a clear message when it is missing or
# empty: os.getenv returns None for an unset variable, and assigning None to
# os.environ would otherwise raise an opaque TypeError.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in /content/.env or in the environment."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Path of the source PDF under the /content/drive mount point.
file_path = "/content/drive/MyDrive/langchain-tutorial/Ch04. Advanced Rag/Data/투자설명서.pdf"

# Splitter configured with chunk_size=2000 and chunk_overlap=200.
doc_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

# Load the PDF and split it into chunk documents with the splitter above.
loader = PyPDFLoader(file_path)
docs = loader.load_and_split(doc_splitter)

# **bm25 리트리버**

In [None]:
from langchain_community.retrievers import BM25Retriever

from kiwipiepy import Kiwi

# Shared Kiwi analyzer instance, created once and reused by every call below.
kiwi_tokenizer = Kiwi()


def kiwi_tokenize(text: str) -> list[str]:
    """Tokenize ``text`` with Kiwi and return the surface form of each token.

    Args:
        text: The string to tokenize.

    Returns:
        The ``form`` attribute of every token produced by
        ``kiwi_tokenizer.tokenize(text)``, in order.
    """
    # The token attribute is `form`; `from` is a reserved keyword and made the
    # original expression a SyntaxError.
    return [token.form for token in kiwi_tokenizer.tokenize(text)]

In [None]:
# Build the BM25 retriever over the chunked documents, tokenizing with Kiwi.
bm25_retriever = BM25Retriever.from_documents(
    documents=docs,
    preprocess_func=kiwi_tokenize,
)
# Set the retriever's k to 4.
bm25_retriever.k = 4

# **FAISS 리트리버**

In [None]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Use OpenAI's embedding model ("text-embedding-3-large").
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# FAISS 라이브러리 임포트
from langchain_community.vectorstores import FAISS

# Create the FAISS DB from the chunked documents, then save it to local disk.
faiss_store = FAISS.from_documents(documents=docs, embedding=embedding)
faiss_store.save_local(folder_path="/content/DB")

In [None]:
# Directory of the saved DB.
persist_directory = "/content/DB"

# Load the DB from that directory.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading; only load index files that you created yourself and trust.
vectordb = FAISS.load_local(
    folder_path=persist_directory,
    embeddings=embedding,
    allow_dangerous_deserialization=True,
)

In [None]:
# Create the FAISS retriever, passing k=4 as a search argument.
faiss_retriever = vectordb.as_retriever(search_kwargs={"k": 4})

# **앙상블 리트리버**

In [None]:
from langchain.retrievers import EnsembleRetriever

# Combine the BM25 and FAISS retrievers, giving each a weight of 0.5.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5],
)