In [26]:
import os
from getpass import getpass

# Never paste the key into the notebook: it would be saved with the file and
# leak when the notebook is shared. Use the key already present in the
# environment, and only prompt (without echo) when none is set. The original
# unconditional assignment of "" also overwrote a key exported in the shell.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [27]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader

In [28]:
# Read every PDF in the papers folder, using PyPDFLoader for each matching file.
pdf_dir = "C:/DS/FA23 DATA SCIENCE IN PRACTICE/James A. Glazier"
loader = DirectoryLoader(pdf_dir, glob="./*.pdf", loader_cls=PyPDFLoader)

# The capitalised name `Documents` is kept because later cells refer to it.
Documents = loader.load()

In [43]:
# Count the Document objects returned by loader.load().
len(Documents)

9

In [30]:
# Break the loaded documents into overlapping chunks ready for embedding.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(documents=Documents)


In [31]:
# Count the chunks produced by text_splitter.split_documents().
len(texts)

50

In [32]:
# Display the first loaded Document to inspect the text extracted from the PDF.
Documents[0]

Document(page_content='BIOINFORMATICSVol. 20 no. 7 2004, pages 1129–1137\nDOI: 10.1093/bioinformatics/bth050\nCOMPUCELL, a multi-model framework for\nsimulation of morphogenesis\nJ. A. Izaguirre1,∗, R. Chaturvedi1, C. Huang1, T. Cickovski1,\nJ. Cofﬂand1, G. Thomas2, G. Forgacs3, M. Alber4, G. Hentschel5,\nS. A. Newman6and J. A. Glazier7\n1Department of Computer Science and Engineering,2Department of Physics,\nUniversity of Notre Dame, Notre Dame, IN 46556, USA,3Department of Physics and\nBiology, University of Missouri, Columbia, MO 65211, USA,4Department of\nMathematics, University of Notre Dame, Notre Dame, IN 46556, USA,5Department of\nPhysics, Emory University, Atlanta, GA 30332, USA,6Department of Cell Biology and\nAnatomy, New York Medical College, Valhalla, NY 10595, USA and7Departments of\nPhysics and Biology and Biocomplexity Institute, Indiana University, Bloomington,\nIN 47405, USA\nReceived on March 12, 2003; revised on August 5, 2003; accepted on August 29, 2003\nAdvance A

In [33]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

## here we are using OpenAI embeddings but in future we will swap out to local embeddings
embedding = OpenAIEmbeddings()

# Re-running Chroma.from_documents against an existing persist_directory embeds
# every chunk again (paid API calls) and adds the chunks to the stored
# collection a second time, so retrieval starts returning duplicates.
# Reuse the on-disk index when it already exists; delete the directory to force
# a rebuild whenever the source PDFs or the chunking settings change.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embedding)
else:
    vectordb = Chroma.from_documents(documents=texts,
                                     embedding=embedding,
                                     persist_directory=persist_directory)
    # Flush the new index to disk so the branch above can load it next time.
    vectordb.persist()

In [35]:
# Smoke-test retrieval with a question the loaded papers can actually answer
# (the previous query was left over from a different dataset), then report how
# many chunks the retriever returns when no search_kwargs are supplied.
retriever = vectordb.as_retriever()
docs = retriever.get_relevant_documents("What is CompuCell?")
len(docs)

4

In [36]:
# Rebuild the retriever with k=2 in its search kwargs, then show which
# search type it is using.
top_k = {"k": 2}
retriever = vectordb.as_retriever(search_kwargs=top_k)
retriever.search_type

'similarity'

In [37]:
# Display the search kwargs configured on the retriever.
retriever.search_kwargs

{'k': 2}

In [38]:
# Assemble the question-answering chain: an OpenAI LLM on top of the retriever,
# configured to hand back the source documents together with the answer.
chain_options = dict(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)
qa_chain = RetrievalQA.from_chain_type(**chain_options)

In [39]:
## Cite sources
def process_llm_response(llm_response):
    """Print a QA chain answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer text) and,
            optionally, a 'source_documents' entry: an iterable of objects
            whose ``metadata`` mapping may carry a 'source' value.

    Returns:
        None. Everything is written to stdout.

    Raises:
        KeyError: If 'result' is missing from ``llm_response``.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # 'source_documents' may be absent, and a document's metadata may lack a
    # 'source' entry; fall back to a placeholder instead of raising KeyError
    # after the answer has already been printed.
    for source in llm_response.get("source_documents", []):
        metadata = getattr(source, "metadata", None) or {}
        print(metadata.get('source', '<unknown source>'))

In [40]:
# full example using RAG with LLM
# Run the retrieval QA chain on one question, then print the answer and the
# source of each supporting document.
query = "What is CompuCell?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

 CompuCell is a multi-model software framework for simulation of the development of multicellular organisms known as morphogenesis.


Sources:
C:\DS\FA23 DATA SCIENCE IN PRACTICE\James A. Glazier\C_scp_ompu_scp_C_scp_ell_scp_a_multi_mod.pdf
C:\DS\FA23 DATA SCIENCE IN PRACTICE\James A. Glazier\C_scp_ompu_scp_C_scp_ell_scp_a_multi_mod.pdf


In [41]:
# Inspect the raw response dict (query, result, source_documents) for the
# question asked in the previous cell. Reusing `llm_response` instead of
# calling qa_chain again avoids a second paid request and guarantees that the
# dict shown here is the one whose answer was printed there.
llm_response

{'query': 'What is CompuCell?',
 'result': ' CompuCell is a multi-model software framework for simulation of the development of multicellular organisms known as morphogenesis.',
 'source_documents': [Document(page_content='IN 47405, USA\nReceived on March 12, 2003; revised on August 5, 2003; accepted on August 29, 2003\nAdvance Access publication February 5, 2004\nABSTRACT\nMotivation: Compu Cellis a multi-model software framework\nfor simulation of the development of multicellular organisms\nknown as morphogenesis. It models the interaction of the gene\nregulatory network with generic cellular mechanisms, such as\ncell adhesion, division, haptotaxis and chemotaxis. A combin-\nation of a state automaton with stochastic local rules and a\nset of differential equations, including subcellular ordinary dif-\nferential equations and extracellular reaction–diffusion partial\ndifferential equations, model gene regulation. This automaton\nin turn controls the differentiation of the cells, and 

In [42]:
# Baseline for comparison: put the same question to the OpenAI LLM directly,
# without going through the retrieval chain.
llm = OpenAI(temperature=0.9)
baseline_answer = llm(query)
print(baseline_answer)



CompuCell is a multi-scale modeling software designed to simulate the behavior of cells in various biological systems. It uses a cellular automata approach to simulate complex behaviors of cells and their interactions with surrounding cells and the environment. CompuCell is used for research in areas such as cancer, wound healing, tissue engineering, and stem cells. It is an open source software and is available for free download.
