# Logseq semantic search and retrieval 
A test notebook for semantic search and retrieval across a Logseq note database.
Based on the DeepLearning.AI course at https://learn.deeplearning.ai/langchain-chat-with-your-data/.


## Preamble

In [None]:
import os
import dotenv
import openai
from pathlib import Path
# Let python-dotenv populate the process environment, then read the OpenAI
# key from it and hand that key to the openai client module.
dotenv.load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY
# Directory where the Chroma vector store is persisted. It is wiped on every
# run so the index is always rebuilt from scratch.
import shutil

CHROMA_PERSIST_DIR = Path('../data/chroma')
if CHROMA_PERSIST_DIR.is_dir() and not CHROMA_PERSIST_DIR.is_symlink():
    # Path.unlink() cannot remove a directory (missing_ok only silences
    # FileNotFoundError), so an existing store has to be removed recursively.
    shutil.rmtree(CHROMA_PERSIST_DIR)
else:
    # A stray file or symlink at this path would make mkdir() fail below.
    CHROMA_PERSIST_DIR.unlink(missing_ok=True)
CHROMA_PERSIST_DIR.mkdir(parents=True, exist_ok=True)

## Load the documents from the Logseq database

In [None]:
# Load docs
import logging
# logging.basicConfig(level=logging.DEBUG)
from langchain.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader

# Root directory of the Logseq graph, read from the environment.
logseq_location = os.getenv("LOGSEQ_DIR")
if not logseq_location:
    # Fail early with an actionable message instead of an opaque error
    # raised from inside the loader.
    raise RuntimeError(
        "LOGSEQ_DIR is not set; point it at the root of the Logseq graph."
    )

# Recursively pick up every Markdown file below the graph root; files that
# fail to parse are skipped rather than aborting the whole load.
loader = DirectoryLoader(
    logseq_location,
    glob="**/*.md",
    loader_cls=UnstructuredMarkdownLoader,
    silent_errors=True,
)

# The Logseq data dir contains a 'logseq/bak' subdirectory that holds old
# versions of the files. Drop those copies so stale text is not indexed.
# TODO: move this into the loader once DirectoryLoader can exclude paths:
# https://github.com/langchain-ai/langchain/pull/11831
logseq_backup_dir = Path(logseq_location) / 'logseq' / 'bak'
docs = [
    doc
    for doc in loader.load()
    # Assumes each document records its file path in metadata['source'];
    # documents without it are kept.
    if logseq_backup_dir not in Path(doc.metadata.get('source', '')).parents
]

## Split the documents into overlapping chunks

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Target chunk length and the overlap shared by neighbouring chunks,
# both measured in characters.
chunk_size = 1000
chunk_overlap = 100

# Separators are tried in order: paragraph breaks, Logseq bullet lines, single
# newlines, sentence-ending periods, spaces, and finally individual characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    # r'\.' is a regex for a literal period. It must be a raw string (a bare
    # '\.' is an invalid escape sequence) and the splitter must be told the
    # separators are regexes, otherwise the pattern is escaped and never
    # matches a sentence end.
    separators=['\n\n', '\n- ', '\n', r'\.', ' ', ''],
    is_separator_regex=True,
)
splits = splitter.split_documents(docs)
len(splits)

## Embed the documents into vectors and store them in a vector store

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Embed every chunk with OpenAI embeddings and store the vectors in a Chroma
# collection backed by CHROMA_PERSIST_DIR.
embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=str(CHROMA_PERSIST_DIR),
)
vectordb.persist()

# Smoke-test retrieval with one query, once with plain similarity search and
# once with max-marginal-relevance search.
# NOTE: `docs` is rebound here from the loaded documents to the search hits.
sample_query = "What potential projects do I have?"
docs = vectordb.similarity_search(sample_query, k=3)
docs_mmr = vectordb.max_marginal_relevance_search(sample_query, k=2, fetch_k=5)


## Create a retrieval chain with the retriever

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Question-answering chain: the chat model answers using passages fetched
# from the vector store by an MMR retriever.
llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
mmr_retriever = vectordb.as_retriever(search_type='mmr')
qa_chain = RetrievalQA.from_chain_type(llm, retriever=mmr_retriever)

In [None]:
# Sample prompts for exercising the chain; change the index below to try
# a different one.
questions = [
    'What potential projects do I have?',
    'How to train a model?',
    'Write a Wikipedia-style article about Iltar',
]
selected_question = questions[2]
result = qa_chain({'query': selected_question})
print(result['result'])