# Loading and Preprocessing Data

In [113]:
# ! pip install langchain
# ! pip install -qU langchain_community beautifulsoup4
# ! pip install tiktoken
# ! pip install sentence-transformers
# ! pip install nltk
! pip install regrex

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [114]:
from langchain_ollama import OllamaLLM

class Models:
    def __init__(self):
        """Initialize the Models class."""
        self.llm = None  # Initialize llm attribute as None

    def mistral(self):
        """Create and return an instance of the Mistral LLM."""
        try:
            self.llm = OllamaLLM(base_url="http://localhost:11434", model="llama3.1", temperature=0)
            print("LLM instance created successfully.")
            return self.llm
        except Exception as e:
            print(f"An error occurred while creating the LLM instance: {e}")
            return None


# Create an instance of the Models class
llm_check = Models()  # Create an instance of the Models class

# Initialize the Mistral LLM using the instance
llm = llm_check.mistral()

if llm:
    print("LLM is ready for use!")
else:
    print("Failed to initialize the LLM.")


LLM instance created successfully.
LLM is ready for use!


In [144]:
from langchain_community.document_loaders import WebBaseLoader
#Initialize the WebBaseLoader
loader = WebBaseLoader("https://pmc.ncbi.nlm.nih.gov/articles/PMC5404248/")
#$Load data and store it in a variabe
data = loader.load()
#display data
# data

In [116]:
# import regex as re
# import nltk

# stopwords = set(nltk.corpus.stopwords.words('English'))

# def remove__stop(tokens):
#     return [t for t in tokens if t.lower() not in stopwords]

In [117]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
#Initialize the text splitter with specified parameter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap= 20
)

In [143]:
#chunk the loaded document into smaller chunk
chunks = text_splitter.split_documents(data)

## Data Embedding and Vector Databases

- Once document has been successfully loaded and chunks into smaller part, we can:
1 :
  Choose an Embedding model to transform this human text  into vector, there, will be store in:
2 : Vector database, the vector embeddings  will be stored in a vector store. For this   purpose, we will be using ChromaDB.

In [119]:
from langchain.embeddings import HuggingFaceEmbeddings

# Specify the model name and additional arguments
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device' : 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

# Initialize HuggingFace Embeddings
hf = HuggingFaceEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)

In [120]:
embeb = hf.embed_documents(texts=['h','e'])
#print lenght of one of embedding to check its dimension
# print(embeb)
print(len(embeb[1]))

384


In [137]:
#ChromaDB installation
# ! pip install chromadb

In [122]:
from langchain.vectorstores import Chroma

# Initialize chroma DB to save the the vectors embedding
vectordb = Chroma.from_documents(chunks, hf)

In [126]:
# let's perform a similarity search
vectordb.similarity_search("tell me about deseater", k= 3)

[Document(metadata={'description': 'In recent years, growth of international travel and trade, as well as climate change, has resulted in the frequent emergence and reemergence of infectious diseases such as Ebola, Zika, and MERS. In 2016, Taiwan used the Joint External Evaluation ...', 'language': 'en', 'source': 'https://pmc.ncbi.nlm.nih.gov/articles/PMC5404248/', 'title': '\n            Public Health Emergency Response in Taiwan - PMC\n        '}, page_content='November20, 2016'),
 Document(metadata={'description': 'In recent years, growth of international travel and trade, as well as climate change, has resulted in the frequent emergence and reemergence of infectious diseases such as Ebola, Zika, and MERS. In 2016, Taiwan used the Joint External Evaluation ...', 'language': 'en', 'source': 'https://pmc.ncbi.nlm.nih.gov/articles/PMC5404248/', 'title': '\n            Public Health Emergency Response in Taiwan - PMC\n        '}, page_content='November20, 2016'),
 Document(metadata={'d

# Retrieve and Generate 

In [None]:
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import LLMChain

# Define the prompt template and create an LLM chain (if needed)
prompt_template = "Answer the question based on the context: {context}\nQuestion: {question}\nAnswer:"
llm_chain = LLMChain(llm=llm, prompt=PromptTemplate.from_template(prompt_template))

# Initialize the retriever with similarity score threshold
retriever = vectordb.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.8})

# Initialize the ConversationalRetrievalChain
qa_chain = ConversationalRetrievalChain(
    llm_chain=llm_chain,
    retriever=retriever,
    combine_docs_chain=load_qa_chain(llm_chain, chain_type="stuff"),  
    question_generator=llm_chain  
)



In [None]:
# Perform a query using the chain
response = qa_chain({"query": "What factors have contributed to the frequent emergence and reemergence of infectious diseases in recent years?"})

# Print the response
print(response)