In [1]:
import hashlib
import os

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_upstage import UpstageEmbeddings # Upstage 임베딩 모델
from langchain_community.vectorstores import Chroma # Chroma DB
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI / Upstage API keys used by
# the later cells come from — confirm.
load_dotenv()

True

In [2]:
# Folder holding the source PDFs; the path is relative to the working directory.
pdf_folder_path = "./w_total"

def load_pdfs_from_folder(folder_path):
    """Load every PDF located directly inside ``folder_path``.

    Entries are visited in sorted filename order so the returned list (and
    everything derived from it) is the same on every run; ``os.listdir`` on
    its own returns names in arbitrary order. The extension check is
    case-insensitive (``.pdf`` and ``.PDF`` both match) and entries that are
    not regular files are skipped. Sub-folders are not searched.

    Args:
        folder_path: Directory that contains the PDF files.

    Returns:
        A flat list made of the items returned by ``PyPDFLoader(path).load()``
        for each PDF, concatenated in filename order. Empty when the folder
        holds no PDFs.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` does not
            exist or is not a directory.
    """
    documents = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Skip non-PDF names and anything that is not a regular file
        # (e.g. a directory that happens to be named "something.pdf").
        if not filename.lower().endswith('.pdf') or not os.path.isfile(file_path):
            continue
        documents.extend(PyPDFLoader(file_path).load())
    print(f"loaded {len(documents)} documents")
    return documents
# Load all PDFs from the configured folder (2098 documents in the recorded run).
all_documents = load_pdfs_from_folder(pdf_folder_path)

# 2. Split documents (reuses the split_documents function from the earlier code)
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split (as produced by the PDF loading step).
        chunk_size: Maximum chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000, the value
            this notebook has always used.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 200.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``documents``. ``add_start_index=True`` is always set, so each
        chunk's metadata carries a ``start_index`` entry.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    return text_splitter.split_documents(documents)
# Chunk the loaded documents and report the count. In the recorded run 2098
# documents became 2103 chunks, i.e. only a handful of them were actually split.
split_docs = split_documents(all_documents)
print(f"split {len(split_docs)} documents")

loaded 2098 documents
split 2103 documents


In [3]:
from langchain.chat_models import init_chat_model

# Chat model handle (OpenAI "gpt-4o-mini"). None of the later cells visible
# here call it except the commented-out example below.
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [4]:
# from langchain_core.messages import HumanMessage, SystemMessage

# messages = [
#     SystemMessage("Translate the following from English into Korean"),
#     HumanMessage("hi!"),
# ]

# llm.invoke(messages)

In [5]:
from langchain_upstage import UpstageEmbeddings

# Embedding model used by the vector store below for both indexing and queries.
embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

In [6]:
from langchain_chroma import Chroma

# Open (or create) the persisted Chroma collection that stores the chunks.
# Note: `Chroma` here is the class from `langchain_chroma` imported just above,
# which shadows the `langchain_community.vectorstores.Chroma` import from the
# first cell.
vector_store = Chroma(
    collection_name="sermon_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_w_2024_2025",  # On-disk location of the collection; remove to keep it in memory only
)

In [7]:
# Index the chunks idempotently.
#
# The collection is persisted on disk, so calling add_documents() without IDs
# on every run would embed all chunks again and append duplicate copies.
# Instead each chunk gets a deterministic ID and only chunks whose ID is not
# yet stored are embedded and added.
# NOTE(review): entries written by earlier runs with auto-generated IDs are
# not recognised by this check; rebuild the collection once if it already
# contains such data.
def _chunk_id(doc):
    """Return a stable ID derived from the chunk's origin metadata and text."""
    meta = doc.metadata
    key = "|".join(
        str(part)
        for part in (
            meta.get("source"),
            meta.get("page"),
            meta.get("start_index"),
            doc.page_content,
        )
    )
    return hashlib.sha256(key.encode("utf-8")).hexdigest()


# Keep the first chunk per ID: identical chunks collapse to one entry, and
# Chroma rejects duplicate IDs inside a single batch.
chunks_by_id = {}
for doc in split_docs:
    chunks_by_id.setdefault(_chunk_id(doc), doc)

# include=[] asks the store for IDs only (no documents / embeddings).
already_stored = set(vector_store.get(include=[])["ids"])
pending = {
    chunk_id: doc
    for chunk_id, doc in chunks_by_id.items()
    if chunk_id not in already_stored
}

# Skip the call entirely when nothing is new (avoids an empty-batch request).
if pending:
    vector_store.add_documents(
        documents=list(pending.values()),
        ids=list(pending.keys()),
    )
print(
    f"added {len(pending)} new chunks, "
    f"skipped {len(chunks_by_id) - len(pending)} already stored"
)

# IDs of every (unique) chunk of split_docs that is now in the collection.
ids = list(chunks_by_id)

In [None]:
# Sanity-check retrieval with a single natural-language query.
# (The query is Korean; roughly: "Find the sermon on the theme 'know the
# times and the signs'.")
query = "시대와 표적을 알아라 주제의 말씀을 찾아줘."
results = vector_store.similarity_search(query)

# An empty collection yields no hits; report that instead of failing with
# IndexError on results[0].
if results:
    print(results[0])
else:
    print("no documents matched the query")

page_content='3 240107- 주일말씀 .hwp
- 3 -◎ 오늘 말씀의 ‘핵심’을 먼저 전해 주겠습니다 .
   ① 주가 오면 ‘표적’이 일어납니다 . 
      그것을 통해 알게 하려 함입니다 .
   ② 시대가 오고, 주가 와도 
      행할 것을 행하지 않으면 온 것 같지 않습니다 .
      애인이 되고 결혼을 하여도 
      사랑하고 행하지 않으면  
      애인이나 부부 같지 않은 것입니다 .
   ③ 죄를 짓고 회개하지 않으면 , 사망에 있습니다 . 
      이는 하나님과 상관없는 자라는 것입니다 .
   이렇게 3가지 핵심을 잘 기억하며 , 말씀 듣겠습니다 .' metadata={'creationdate': '2024-01-07T15:31:39+09:00', 'start_index': 0, 'total_pages': 23, 'creator': 'Hwp 2022 12.0.0.3146', 'author': '생명의 말씀', 'pdfversion': '1.4', 'page': 2, 'producer': 'Hancom PDF 1.3.0.546', 'page_label': '3', 'source': './w_total/20240107_주일말씀.pdf', 'moddate': '2024-01-07T15:31:39+09:00'}


In [None]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict


class State(TypedDict):
    """Typed dictionary with a question, its context documents and an answer.

    NOTE(review): presumably the shared state of a retrieval-augmented
    question-answering pipeline built in later cells — confirm where it is used.
    """
    question: str  # question text
    context: List[Document]  # documents attached as context (presumably retrieval hits — confirm)
    answer: str  # answer text

4
