In [38]:
from IPython.display import display, Markdown, Latex

In [39]:
import os
import chromadb
import pickle
import pytesseract as pt
from dotenv import load_dotenv
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.summarize import load_summarize_chain

# Load variables from a local .env file (if any) into the process environment,
# then read the OpenAI key from there so it never appears in the notebook.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Module-level clients shared by the ChatDoc class defined below.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
llm = OpenAI(temperature=0.7, openai_api_key=OPENAI_API_KEY)

# Location of the Tesseract binary. The TESSERACT_CMD environment variable
# takes precedence so the notebook is not tied to one machine's install path;
# when it is unset the original Windows default is used unchanged.
pt.pytesseract.tesseract_cmd = os.getenv(
    "TESSERACT_CMD", "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
)


In [27]:
class ChatDoc:
    """Question answering and summarisation over one PDF document.

    The document ``<data_dir>/<fileName>.pdf`` is loaded with PyMuPDFLoader,
    cached as ``<data_dir>/<fileName>.pkl``, split into chunks and embedded
    into a Chroma collection persisted under ``<data_dir>/chroma/<fileName>``.
    When that Chroma directory already exists it is reopened instead of being
    rebuilt; the document chunks are then loaded lazily, only if
    ``summarize`` needs them.

    Relies on the module-level ``embeddings`` and ``llm`` objects.
    """

    # Default data root; lets instances that bypass __init__ still resolve paths.
    data_dir = "../data"

    def __init__(self, fileName: str, data_dir: str = "../data"):
        """Create the helper and immediately build or reopen its vector store.

        Args:
            fileName: Base name of the document, without directory or extension.
            data_dir: Directory holding the PDF, the pickle cache and the
                ``chroma`` sub-directory. Defaults to ``"../data"``.
        """
        self.fileName = fileName
        self.data_dir = data_dir
        self.init()

    def _pdf_path(self) -> str:
        """Return the path of the source PDF."""
        return f"{self.data_dir}/{self.fileName}.pdf"

    def _pkl_path(self) -> str:
        """Return the path of the pickled document cache."""
        return f"{self.data_dir}/{self.fileName}.pkl"

    def _chroma_dir(self) -> str:
        """Return the persistence directory of this document's Chroma collection."""
        return f"{self.data_dir}/chroma/{self.fileName}"

    def load_from_pdf(self):
        """Parse the PDF into ``self.data`` and write the pickle cache."""
        loader = PyMuPDFLoader(self._pdf_path())
        self.data = loader.load()
        self.save_to_pkl()

    def load_from_pkl(self):
        """Restore ``self.data`` from the pickle cache.

        SECURITY: unpickling executes arbitrary code embedded in the file.
        Only load caches this notebook wrote itself; never a file obtained
        from an untrusted source.
        """
        with open(self._pkl_path(), "rb") as f:
            self.data = pickle.load(f)

    def save_to_pkl(self):
        """Write ``self.data`` to the pickle cache."""
        with open(self._pkl_path(), "wb") as f:
            pickle.dump(self.data, f)

    def _load_documents(self):
        """Populate ``self.data`` from the pickle cache if present, else from the PDF."""
        if os.path.exists(self._pkl_path()):
            self.load_from_pkl()
        else:
            self.load_from_pdf()

    def _ensure_texts(self):
        """Return the document chunks, loading and splitting on first use.

        ``init`` skips loading when it reopens an existing Chroma directory,
        so ``self.texts`` may not exist yet when ``summarize`` is called.
        """
        if getattr(self, "texts", None) is None:
            if getattr(self, "data", None) is None:
                self._load_documents()
            self.text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            self.texts = self.text_splitter.split_documents(self.data)
        return self.texts

    def init(self):
        """Set ``self.docsearch`` by reopening or building the Chroma collection."""
        chroma_dir = self._chroma_dir()

        # Fast path: a persisted collection exists, so skip loading and embedding.
        if os.path.exists(chroma_dir):
            self.docsearch = Chroma(
                persist_directory=chroma_dir,
                embedding_function=embeddings,
                collection_name=self.fileName,
            )
            return

        texts = self._ensure_texts()

        self.docsearch = Chroma.from_texts(
            [t.page_content for t in texts],
            embeddings,
            collection_name=self.fileName,
            persist_directory=chroma_dir,
        )

        self.docsearch.persist()

    def query(self, q: str):
        """Answer ``q`` from the chunks most similar to it.

        Args:
            q: The question (or instruction) to send to the model.

        Returns:
            The model's answer as a stripped string.
        """
        docs = self.docsearch.similarity_search(q)
        chain = load_qa_chain(llm, chain_type="stuff")
        return str(chain.run(input_documents=docs, question=q)).strip()

    def summarize(self, max_chunks: int = 5):
        """Summarise the first ``max_chunks`` chunks of the document.

        Args:
            max_chunks: Number of leading chunks passed to the map-reduce
                summarisation chain. Defaults to 5.

        Returns:
            The summary as a stripped string.
        """
        chain = load_summarize_chain(llm, chain_type="map_reduce")
        return str(chain.run(self._ensure_texts()[:max_chunks])).strip()


In [28]:
# Build the index for ../data/embryology.pdf, or reopen ../data/chroma/embryology if it already exists.
embryology = ChatDoc("embryology")

Using embedded DuckDB with persistence: data will be stored in: ../data/chroma/embryology


In [24]:
# Short factual question answered from the retrieved document chunks.
embryology.query("From what dermal layer is the neural tube derived?")

'The neural tube is derived from the ectoderm.'

In [25]:
# Open-ended explanation request with no constraint on tone or vocabulary.
embryology.query("Explain the process of gastrulation")

'Gastrulation is a formative process by which the three germ layers, which are precursors of all embryonic tissues, and the axial orientation are established in embryos. During gastrulation, the bilaminar embryonic disc is converted into a trilaminar embryonic disc. This process begins with the appearance of the primitive streak, which appears at the beginning of the third week as a thickening of the epiblast at the caudal end of the embryonic disc. The primitive streak results from migration of epiblastic cells to the median plane of the disc. Invagination of epiblastic cells from the primitive streak gives rise to the three germ layers (ectoderm, mesoderm, and endoderm). Extensive cell shape changes, rearrangement, movement, and alterations in adhesive properties contribute to the process of gastrulation. Bone morphogenetic proteins and other signaling molecules such as FGFs, Shh (sonic hedgehog), Tgifs, and Wnts play a crucial role in gastrulation.'

In [32]:
# Same topic, with the prompt asking for simpler wording.
embryology.query("Explain the process of gastrulation in simple terms")

'Gastrulation is the process of development when the bilaminar embryonic disc is converted into a trilaminar embryonic disc. During this process, there is extensive cell shape changes, rearrangement, movement, and alterations in adhesive properties. This process is also when the three germ layers (ectoderm, mesoderm, and endoderm) are established. The ectoderm gives rise to the epidermis, central and peripheral nervous systems, eyes and internal ears, neural crest cells, and many connective tissues of the head. The mesoderm gives rise to muscle, bone, cartilage, and connective tissue and the endoderm gives rise to the digestive system.'

In [33]:
# Same topic, with a humorous-tone instruction in the prompt.
embryology.query("Explain the process of gastrulation in a very funny way")

"Gastrulation is like a cell party! Cells rearrange, change shape, and move around to form the three germ layers - ectoderm, mesoderm, and endoderm - like guests at a big bash. They show up, dance, and mingle, and by the time the party's over everyone's got new friends and new places to be!"

In [37]:
# Role prompt combined with explicit vocabulary constraints.
embryology.query("Act as a teacher of elementary school. Explain the process of gastrulation. Use only basic words. Replace all complex words with very basic words.")

'Gastrulation is a process that happens in the early stages of development. Tiny cells in the embryo move around and arrange into three different layers. These layers are called the ectoderm, mesoderm, and endoderm. These layers will become the different parts of the body, like the skin, bones, and organs.'

In [64]:
# Role prompt asking for a metaphor instead of a literal explanation.
embryology.query("Act as a teacher of elementary school. Find a metaphor to illustrate gastrulation.")

'Gastrulation is like building a house. Just like how a house starts out with a blueprint and then builders lay down the foundation and the walls, during gastrulation, cells move around and become organized to form the basis of the body.'

In [63]:
# Request markdown/HTML-formatted text and wrap the answer in IPython's Markdown
# display object so the cell shows it rendered instead of as a raw string.
Markdown(
    embryology.query(
        "Give me a bullet list of two processes during embryonic development in markdown format. For each bullet point, make the text a different color using html tags."
    )
)


- <font color="blue">Formation of the trilaminar embryonic disc</font> 
- <font color="green">Development of the notochordal process</font>

In [54]:
# Display check for LaTeX inside Markdown; the raw string (r"""...""") keeps the
# backslashes of the LaTeX commands from being treated as Python escapes.
Markdown(r"""$\begin{aligned}
S(\omega) 
&= \frac{\alpha g^2}{\omega^5} e^{[ -0.74\bigl\{\frac{\omega U_\omega 19.5}{g}\bigr\}^{\!-4}\,]} \\
&= \frac{\alpha g^2}{\omega^5} \exp\Bigl[ -0.74\Bigl\{\frac{\omega U_\omega 19.5}{g}\Bigr\}^{\!-4}\,\Bigr] 
\end{aligned}$""")

$\begin{aligned}
S(\omega) 
&= \frac{\alpha g^2}{\omega^5} e^{[ -0.74\bigl\{\frac{\omega U_\omega 19.5}{g}\bigr\}^{\!-4}\,]} \\
&= \frac{\alpha g^2}{\omega^5} \exp\Bigl[ -0.74\Bigl\{\frac{\omega U_\omega 19.5}{g}\Bigr\}^{\!-4}\,\Bigr] 
\end{aligned}$