In [None]:
import os 

from IPython.display import display_markdown
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_vertexai import VertexAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_google_vertexai import VertexAI

import vertexai

# Path to a service-account key file. Leave empty to keep whatever value
# GOOGLE_APPLICATION_CREDENTIALS already has in the environment.
GOOGLE_CREDENTIALS_PATH = ""
GCP_PROJECT = ""
GCP_LOCATION = "europe-west3"

# Only export the variable when a path is actually configured: assigning ""
# unconditionally would overwrite a value that was set outside the notebook.
if GOOGLE_CREDENTIALS_PATH:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDENTIALS_PATH

vertexai.init(project=GCP_PROJECT, location=GCP_LOCATION)



In [None]:

# Connection settings are read from the environment so the access token does
# not have to be typed into (and saved with) the notebook. Variables that are
# not set fall back to the empty placeholders.
CONFLUENCE_URL = os.environ.get("CONFLUENCE_URL", "")
CONFLUENCE_TOKEN = os.environ.get("CONFLUENCE_TOKEN", "")
CONFLUENCE_SPACE_KEY = os.environ.get("CONFLUENCE_SPACE_KEY", "")

loader = ConfluenceLoader(
    url=CONFLUENCE_URL,
    token=CONFLUENCE_TOKEN
)
docs = loader.load(
    space_key=CONFLUENCE_SPACE_KEY,
    # include_attachments=True, # uncomment to include png, jpeg, ..
    keep_markdown_format=True
)

# Preview the last loaded page. Guarded because docs[-1] raises IndexError
# when the load returned nothing.
if docs:
    print("Content: \n ------- \n" + docs[-1].page_content)
    print("Metadatas: \n ------- \n" + str(docs[-1].metadata))
else:
    print("No documents were loaded from Confluence.")

In [None]:
def pretty_print(chunks):
    """Print every chunk's text and metadata in a readable layout.

    Each chunk is rendered as its ``page_content``, a 50-character dashed
    rule, then the string form of its ``metadata``. Consecutive chunks are
    separated by a 50-character rule of ``=`` signs.

    Args:
        chunks: iterable of objects exposing ``page_content`` (str) and
            ``metadata`` attributes.

    Returns:
        None. The formatted text is written to standard output.
    """
    chunk_rule = '\n' + '=' * 50 + '\n'
    field_rule = '\n' + '-' * 50 + '\n'

    rendered = []
    for chunk in chunks:
        rendered.append(chunk.page_content + field_rule + str(chunk.metadata))

    print(chunk_rule.join(rendered))

In [None]:
# Small markdown sample: one title, one subtitle and a paragraph that is
# deliberately longer than the chunk size used further down.
text = """
# I am a title 
## I am a subtitle
I am a block of text. However, my size is quite long. First of all, I'd like the MarkdownHeaderTextSplitter
to identify my title and subtitle in its metadata.
Then I'd like the RecursiveCharacterTextSplitter to identify the two parts that make up my text 
because my size would be too large to feed a language model. 
Finally, I'd like the metadata corresponding to my origins, i.e. the url, to be merged with my
title and subtitle information.

"""

# Origin metadata attached to the sample; it is merged with the header
# metadata after the markdown split.
metadata = dict(url='https://abdul-mateen.com')
sample = Document(metadata=metadata, page_content=text)

# Markdown header levels to split on, each paired with the metadata key that
# receives the header text.
headers_to_split_on = [("#", "Title 1"), ("##", "Sub-title 1"), ("###", "Sub-title 2")]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split each document on its headers, then fold the document's own metadata
# into every resulting section.
md_docs = []
for source_doc in [sample]:
    sections = markdown_splitter.split_text(source_doc.page_content)
    for section in sections:
        # On a key clash the source document's value wins (right-hand operand of |).
        section.metadata = section.metadata | source_doc.metadata
    md_docs += sections


# Character-based splitter applied after the header split.
# The sentence-boundary separator is a regex look-behind, so the separators
# must be declared as regular expressions: without is_separator_regex=True the
# pattern is escaped and searched for literally, which never matches.
# The raw string also avoids the invalid "\." escape sequence.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=20,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)

splitted_docs = splitter.split_documents(md_docs)

pretty_print(splitted_docs)

In [None]:
# NOTE(review): the header-aware markdown split demonstrated on the sample is
# not applied to the Confluence pages here — confirm that this is intentional.
chunks = splitter.split_documents(docs)

# Show the total and a short preview instead of dumping every chunk into the
# cell output.
print(f"{len(chunks)} chunks")
chunks[:3]

In [None]:
# Vertex AI embedding model used to vectorise the chunks
EMBEDDING_MODEL_NAME = "textembedding-gecko@003"
embeddings = VertexAIEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# Build the vector store once and reuse it on later runs.
# Calling Chroma.from_documents on every run re-embeds all chunks and adds
# them again to the collection persisted in the same directory, so results
# fill up with duplicates. Delete the directory to force a rebuild after the
# source pages change.
PERSIST_DIRECTORY = "./chroma_db"

if os.path.isdir(PERSIST_DIRECTORY) and os.listdir(PERSIST_DIRECTORY):
    # A persisted store already exists: open it instead of re-indexing.
    db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)
else:
    db = Chroma.from_documents(chunks, embeddings, persist_directory=PERSIST_DIRECTORY)
    db.persist()
# db._collection.count()

In [None]:
# db.get()
# Retriever over the vector store with its default search settings.
retriever = db.as_retriever()
# retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 5, "score_threshold": 0.3})

# Prompt for the "stuff" chain: {context} receives the retrieved extracts and
# {question} the user query.
template = """Given this text extracts:
    -----
    {context}
    -----
    Please answer with to the following question:
    Question: {question}
    Answer: 
    """

prompt = PromptTemplate(template=template, input_variables=["context", "question"])
# LLM
llm = VertexAI(model_name="gemini-pro", temperature=0.6)

# The custom prompt only takes effect when it is handed to the chain; it was
# previously built but never passed, so the chain ran with its default prompt.
chain_type_kwargs = {"prompt": prompt}
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
    verbose=True
)

In [None]:
# Example questions; change the one passed to the chain below to try another.
q1 = "How to create service account for sisense?"
q2 = "Tell me about Tech lead role?"
q3= "what is daily duty person checklist?"
q4= "How to offboard a team member?"


# invoke() replaces the deprecated direct call on the chain object; the
# returned dict is read the same way ("result" holds the answer text).
answer = qa.invoke({"query": q4})

display_markdown(answer["result"], raw=True)

In [None]:
# Inspect the documents the retriever returns for the question.
# invoke() replaces the deprecated get_relevant_documents().
retriever.invoke(q4)