In [None]:
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

# Load every Markdown note under the vault directory as a plain-text Document.
loader = DirectoryLoader(
    "/Users/anpigon/ObsidianVault/Documents",
    glob="**/*.md",  # only target files with the .md extension
    recursive=True,  # descend into every sub-directory
    show_progress=True,  # show loading progress
    loader_cls=TextLoader,
    # Read the notes as UTF-8 explicitly instead of relying on the platform's
    # default encoding; this matches the encoding given to MyObsidianLoader.
    loader_kwargs={"encoding": "utf-8"},
)
documents = loader.load()
documents[0]

In [47]:
from pathlib import Path
from typing import Iterator
from langchain_community.document_loaders.obsidian import ObsidianLoader
from langchain_core.documents import Document


class MyObsidianLoader(ObsidianLoader):
    """ObsidianLoader variant that keeps loading when a note's metadata fails to parse.

    Every note is still yielded; when front matter, tags or dataview fields
    cannot be processed, the Document carries file-system metadata only.
    """

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per ``*.md`` file found recursively under ``self.file_path``.

        Metadata always contains ``source`` (file name), ``path``, ``created``,
        ``last_modified`` and ``last_accessed``. On successful parsing it is
        extended with the converted front matter, the dataview fields and a
        comma-joined ``tags`` entry; front-matter keys override the file-system
        ones because they are merged afterwards.

        Yields:
            Document: the note text and its metadata. The front matter is
            stripped from the text only if parsing got as far as
            ``_remove_front_matter``.
        """
        import logging

        # Materialise the listing before reading any file.
        paths = list(Path(self.file_path).glob("**/*.md"))
        for path in paths:
            with open(path, encoding=self.encoding) as f:
                text = f.read()

            # One stat() call serves all three timestamps.
            file_stat = path.stat()
            metadata = {
                "source": str(path.name),
                "path": str(path),
                "created": file_stat.st_ctime,
                "last_modified": file_stat.st_mtime,
                "last_accessed": file_stat.st_atime,
            }

            try:
                front_matter = self._parse_front_matter(text)
                tags = self._parse_document_tags(text)
                dataview_fields = self._parse_dataview_fields(text)
                text = self._remove_front_matter(text)
                enriched = {
                    **metadata,
                    **self._to_langchain_compatible_metadata(front_matter),
                    **dataview_fields,
                }

                if tags or front_matter.get("tags"):
                    enriched["tags"] = ",".join(
                        tags | set(front_matter.get("tags", []) or [])
                    )
                # Commit only after every step succeeded, so a failure leaves
                # the file-system-only metadata in place.
                metadata = enriched
            except Exception:
                # Best-effort fallback. `Exception` (not a bare `except`) lets
                # KeyboardInterrupt / SystemExit still abort a long load.
                logging.getLogger(__name__).warning(
                    "Could not parse metadata for %s; keeping file metadata only",
                    path,
                    exc_info=True,
                )

            yield Document(page_content=text, metadata=metadata)


# Load the vault again, this time through MyObsidianLoader so that each
# Document carries the note's parsed metadata.
loader = MyObsidianLoader(
    "/Users/anpigon/ObsidianVault/Documents",
    encoding="utf-8",
    collect_metadata=True,
)
documents = loader.load()
documents[0]

Encountered non-yaml frontmatter
Encountered non-yaml frontmatter
Encountered non-yaml frontmatter


Document(metadata={'source': 'Dashboard.md', 'path': '/Users/anpigon/ObsidianVault/Documents/Dashboard.md', 'created': '2024-05-21 10:11:24', 'last_modified': 1721313244.8919904, 'last_accessed': 1721391469.7960355, 'date': '2024-05-21 10:11:24', 'updated': '2024-07-18 11:34:03', 'cssclasses': "['wide']"}, page_content='\n### 프로젝트\n```dataview\nTABLE WITHOUT ID \n\t"[["+ file.path + "|"+ default(file.aliases[0], file.name) +"]]" AS 제목, \n\tstatus as 상태 \nWHERE type = "project"\nAND !contains(status, "중단")\nAND !contains(status, "완료")\nSORT status ASC, category DESC\n```\n\n### 최근 메모\n```dataview\nTABLE WITHOUT ID \n\t"[["+ file.path + "|"+ default(file.aliases[0], file.name) +"]]" AS 제목,\n\tdateformat(file.ctime,"yyyy-MM-dd") as 날짜\nFROM "5 Inbox"\nWHERE file.name != file.folder\nSORT file.ctime DESC\nLIMIT 10\n```\n\n### 최근 스크랩\n```dataview\nTABLE WITHOUT ID \n\t"[["+ file.path + "|"+ default(file.aliases[0], file.name) +"]]" AS 제목,\n\tchoice(contains(title, "GeekNews"), "`GeekNews`,"

In [44]:
# Number of Documents returned by the loader.
len(documents)

6631

In [48]:
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter

# Split the loaded notes into chunks of at most 1024 characters (24 characters
# of overlap) using the splitter's Markdown language preset.
text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=1024,
    chunk_overlap=24,
)
splitted_documents = text_splitter.split_documents(documents)
splitted_documents[0]

Document(metadata={'source': 'Dashboard.md', 'path': '/Users/anpigon/ObsidianVault/Documents/Dashboard.md', 'created': '2024-05-21 10:11:24', 'last_modified': 1721313244.8919904, 'last_accessed': 1721391469.7960355, 'date': '2024-05-21 10:11:24', 'updated': '2024-07-18 11:34:03', 'cssclasses': "['wide']"}, page_content='### 프로젝트\n```dataview\nTABLE WITHOUT ID \n\t"[["+ file.path + "|"+ default(file.aliases[0], file.name) +"]]" AS 제목, \n\tstatus as 상태 \nWHERE type = "project"\nAND !contains(status, "중단")\nAND !contains(status, "완료")\nSORT status ASC, category DESC\n```\n\n### 최근 메모\n```dataview\nTABLE WITHOUT ID \n\t"[["+ file.path + "|"+ default(file.aliases[0], file.name) +"]]" AS 제목,\n\tdateformat(file.ctime,"yyyy-MM-dd") as 날짜\nFROM "5 Inbox"\nWHERE file.name != file.folder\nSORT file.ctime DESC\nLIMIT 10\n```\n\n### 최근 스크랩\n```dataview\nTABLE WITHOUT ID \n\t"[["+ file.path + "|"+ default(file.aliases[0], file.name) +"]]" AS 제목,\n\tchoice(contains(title, "GeekNews"), "`GeekNews`,", 

In [46]:
len(splitted_documents)

33752

In [54]:
from FlagEmbedding import BGEM3FlagModel, FlagModel

model_name = "BAAI/bge-m3"
# Setting use_fp16 to True speeds up computation at the cost of a slight
# degradation in quality.
bge_embeddings = BGEM3FlagModel(model_name, use_fp16=True)

Fetching 30 files: 100%|██████████| 30/30 [01:11<00:00,  2.37s/it]


In [60]:
from pathlib import Path
from langchain.storage import LocalFileStore
from langchain.embeddings import CacheBackedEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings

# Embeddings are cached on disk next to the notebook's working directory.
root_path = Path.cwd()
store = LocalFileStore(root_path / ".cached_embeddings")

model_name = "BAAI/bge-m3"  # alternative: "intfloat/multilingual-e5-large-instruct"
model_kwargs = {"device": "mps"}  # Apple-silicon GPU backend; change on other hardware
encode_kwargs = {"normalize_embeddings": True}
# NOTE(review): HuggingFaceBgeEmbeddings presumably prefixes queries with its
# default BGE instruction; confirm that is wanted for bge-m3.
underlying_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Key the cache by model so that switching `model_name` can never return
# vectors computed by a different model. "/" is replaced to keep the cache
# keys flat. Entries cached earlier without a namespace are not reused and
# will be recomputed once.
cache_namespace = model_name.replace("/", "_")
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings=underlying_embeddings,
    document_embedding_cache=store,
    namespace=cache_namespace,
)

In [61]:
from langchain_chroma import Chroma

# Embed every chunk with the cache-backed embedder and index the result in a
# Chroma store persisted in the "vectorstore" directory.
# NOTE(review): re-running this cell against an existing directory presumably
# adds the same chunks again -- confirm before re-executing.
vectorstore = Chroma.from_documents(
    documents=splitted_documents,
    embedding=cached_embeddings,
    persist_directory="vectorstore",
)

In [None]:
 vectorstore_retriever = vectorstore.as_retriever()

In [None]:
from langchain_teddynote.retrievers import KiwiBM25Retriever

# `splitted_documents` holds Document objects, not strings, so build the BM25
# index with `from_documents` (which reads each document's page_content and
# metadata) rather than `from_texts`, which expects raw strings.
bm25_retriever = KiwiBM25Retriever.from_documents(splitted_documents)

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Combine keyword (BM25) and dense (Chroma) retrieval, weighting the dense
# results slightly higher.
# `search_type` is not an EnsembleRetriever parameter, so the previous
# `search_type="mmr"` argument had no effect on ranking. If MMR is wanted,
# request it on the vector retriever instead:
# `vectorstore.as_retriever(search_type="mmr")`.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vectorstore_retriever],
    weights=[0.4, 0.6],
)

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat model shared by the multi-query retriever and the QA chain.
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.1,
    max_tokens=500,
)

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Wrap the ensemble retriever in a MultiQueryRetriever driven by `llm`.
multi_query_retriever = MultiQueryRetriever.from_llm(
    llm=llm,
    retriever=ensemble_retriever,
)

In [None]:
from langchain_core.prompts import PromptTemplate

# Korean-language RAG prompt. It has two template variables:
#   {input}   - the user's question
#   {context} - the retrieved snippets, wrapped in <CONTEXT> tags
prompt_template = """당신은 마크다운으로 작성한 노트에 대한 사용자의 의문점이나 질문에 도움을 주는 것이 주된 목적인 어시스턴트입니다. 제공된 CONTEXT를 바탕으로 답변을 작성하세요.

다음 지침에 따라 질문에 대한 답변을 생성하세요.
질문: {input}

검색된 다음 컨텍스트 스니펫을 사용해 질문에 답하세요:
<CONTEXT>
{context}
</CONTEXT>

Answer:"""

# Compile the string into a PromptTemplate and display it.
prompt = PromptTemplate.from_template(prompt_template)
prompt

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

def _format_docs(docs):
    """Join the text of the retrieved Documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Question string in, answer string out.
# The prompt's variables are `context` and `input`, so the question must be
# supplied under the key "input"; the previous "question" key left `{input}`
# unfilled and the prompt failed on a missing variable. Retrieved Documents are
# flattened to their text so the prompt is not fed a raw list repr.
qa_chain = (
    {
        "context": multi_query_retriever | _format_docs,
        "input": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
import streamlit as st
from langchain_community.chat_message_histories import StreamlitChatMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory

# Optionally, specify your own session_state key for storing messages
msgs = StreamlitChatMessageHistory(key="chat_messages")

chain = (
    qa_chain
    | ChatPromptTemplate.from_messages(
        [
            ("system", "You are an AI chatbot having a conversation with a human."),
            MessagesPlaceholder(variable_name="history"),
            ("human", "{question}"),
        ]
    )
    | ChatOpenAI(model_name="gpt-4o-mini")
)

chain_with_history = RunnableWithMessageHistory(
    chain,
    lambda session_id: msgs,  # Always return the instance created earlier
    input_messages_key="question",
    history_messages_key="history",
)

if len(msgs.messages) == 0:
    msgs.add_ai_message("How can I help you?")

for msg in msgs.messages:
    st.chat_message(msg.type).write(msg.content)

if prompt := st.chat_input():
    st.chat_message("human").write(prompt)

    # As usual, new messages are added to StreamlitChatMessageHistory when the Chain is called.
    config = {"configurable": {"session_id": "any"}}
    response = chain_with_history.invoke({"question": prompt}, config)
    st.chat_message("ai").write(response.content)

In [3]:
from langchain_core.prompts.chat import ChatPromptTemplate, MessagesPlaceholder

# System message for the history-aware prompt; its only template variable is
# {context}, wrapped in <CONTEXT> tags.
prompt_template = """You are an assistant whose primary purpose is to help with questions or inquiries about notes written in Markdown. Base your answer on the provided CONTEXT and the chat history.

Use the following context snippets to answer the question:
<CONTEXT>
{context}
</CONTEXT>"""

# Message order: system instructions (with context), then the prior turns
# injected under "chat_history", then the new user question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", prompt_template),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
prompt

ChatPromptTemplate(input_variables=['chat_history', 'context', 'question'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template='You are an assistant whose primary purpose is to help with questions or inquiries about notes written in Markdown. Base your answer on the provided CONTEXT and the chat history.\n\nUse the following context snippets to answer the question:\n<CONTEXT>\n{context}\n</CONTEXT>')), MessagesPlaceholder(variable_name='chat_history'), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))])