In [25]:
# for preprocessing html data  
!pip install beautifulsoup4

# For RAG
!pip install langchain
!pip install langchainhub
!pip install chromadb
!pip install gpt4all

!pip install tqdm

# Needed if using LlamaCpp from LangChain
# !pip install llama-cpp-python




In [1]:
from tqdm import tqdm

# To deal with emails 
import email
from email.policy import default
from bs4 import BeautifulSoup

In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import LlamaCpp
from langchain_core.output_parsers import StrOutputParser
from langchain.docstore.document import Document
from langchain import hub
from langchain_core.runnables import RunnablePassthrough

# Preprocess the ArXiv Email Digests

In [3]:
# code adapted from Stack Overflow:
# https://stackoverflow.com/questions/59681461/read-a-big-mbox-file-with-python
class MboxReader:
    """Stream the messages of an mbox file one at a time.

    The file is read line by line. A line starting with ``b'From '`` is treated
    as the separator that begins the next message, so only the lines of the
    message currently being assembled are held in memory.

    Use it as a context manager so the file handle is closed afterwards::

        with MboxReader(path) as mbox:
            for message in mbox:
                ...
    """

    def __init__(self, filename):
        """Open ``filename`` in binary mode and consume its first separator line.

        Raises:
            ValueError: if the first line does not start with ``b'From '``.
        """
        self.handle = open(filename, 'rb')
        try:
            # Explicit check instead of `assert`, which is stripped under `python -O`.
            if not self.handle.readline().startswith(b'From '):
                raise ValueError(
                    f"{filename!r} does not look like an mbox file: "
                    "the first line must start with 'From '"
                )
        except BaseException:
            # Do not leak the open handle when validation (or the read) fails.
            self.handle.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        """Yield each remaining message, parsed with ``email.policy.default``."""
        lines = []
        while True:
            line = self.handle.readline()
            at_eof = line == b''
            if at_eof or line.startswith(b'From '):
                # A separator line (or end of file) completes the current message.
                yield email.message_from_bytes(b''.join(lines), policy=default)
                if at_eof:
                    return
                lines = []
            else:
                lines.append(line)

    def __next__(self):
        """Return a generator over the remaining messages.

        Kept for backward compatibility only: this method has always returned
        a generator rather than a single message. Iterate over the reader
        itself (``for message in reader``) instead of calling ``next(reader)``.
        """
        return self.__iter__()

In [56]:
path = "./Takeout/Mail/Arxiv.mbox"
emails_to_process = 10

current_mails = 0
body_texts = []
# The context manager closes the mbox file handle, also when the loop stops early.
with MboxReader(path) as mbox:
    for message in tqdm(mbox):
        # decode=True returns the decoded payload bytes, or None for multipart containers
        payload = message.get_payload(decode=True)
        if not payload:
            continue
        soup = BeautifulSoup(payload, 'html.parser')
        # Collapse every run of whitespace (newlines, tabs, repeated spaces) into a
        # single space. Simply deleting newlines would fuse the last word of a line
        # with the first word of the next one.
        body_text = " ".join(soup.get_text().replace('"', '').split())
        body_texts.append(body_text)
        current_mails += 1
        if current_mails >= emails_to_process:
            break

# Join once at the end instead of growing a string inside the loop;
# every email body is followed by a single space.
arxiv_contents = "".join(text + " " for text in body_texts)

10it [00:00, 63.80it/s]


In [57]:
# Preview only the first 1000 characters instead of dumping the whole corpus into the output
arxiv_contents[:1000]



In [8]:
# Persist the concatenated arXiv email text to disk as UTF-8
with open("arxiv_contents.txt", "w", encoding="utf-8") as out_file:
    out_file.write(arxiv_contents)

### Convert the text document to LangChain document format

In [10]:
# Wrap the raw email text in a single LangChain Document
doc = Document(page_content=arxiv_contents, metadata={"source": "local"})

# chunk_size and chunk_overlap are hyperparameters we choose
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
)

# Split the document into chunks, then embed the chunks into a Chroma vector store
all_splits = text_splitter.split_documents([doc])
vector_store = Chroma.from_documents(
    documents=all_splits,
    embedding=GPT4AllEmbeddings(),
)

Downloading: 100%|███████████████████████████████████████████████████████████████████████████████████████| 45.9M/45.9M [00:02<00:00, 18.4MiB/s]
Verifying: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 45.9M/45.9M [00:00<00:00, 399MiB/s]


### If using fine-tuned model, quantize to GGUF Format
To fine-tune a given LLM, check out [this video](https://youtu.be/_xxGMSVLwU8?feature=shared).

For quantizing, we can either:
- Use the llama.cpp library, written in C/C++. Check out [this video](https://youtu.be/j7ahltwlFH0?feature=shared)
- Or use LangChain's functionality.

In either case, the model needs to be converted to `gguf` format to run on the CPU.

## Load the quantized LLM
To load the quantized model, we can either:
- Use the LlamaCpp class from LangChain
- Use the GPT4All library

In either case, the model must be quantized.

### Either use LlamaCpp as below

In [62]:
# Location of the quantized GGUF model file
quantized_gguf_model = "../generative-ai-course/quantized_models/ft-Q8_K_M.gguf"
n_gpu_layers = 1
n_batch = 512

# Instantiate LangChain's LlamaCpp wrapper to run the LLM.
# This needs the llama-cpp-python package to be installed.
llm = LlamaCpp(
    model_path=quantized_gguf_model,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    f16_kv=True,
    n_ctx=1024,
    verbose=True,
)

ImportError: Could not import llama-cpp-python library. Please install the llama-cpp-python library to use this embedding model: pip install llama-cpp-python

### Or use GPT4All as below

In [61]:
from langchain_community.llms import GPT4All

# Local GGUF model file that GPT4All should load
gpt4all_model_path = r"./models/orca-mini-3b-gguf2-q4_0.gguf"

llm = GPT4All(model=gpt4all_model_path, max_tokens=2048)
# Quick smoke test of the bare LLM
llm.invoke("what is Retrieval Augmented Generation?")

'\nRetrieval augmented generation (RAG) is a technique used in natural language processing to generate new text by retrieving and combining existing text. It involves using a large corpus of text to train a machine learning model that can predict the most likely next word or phrase in a given sentence. The model is then used to generate new text by selecting the most probable words or phrases from the training data. RAG has been applied to many different domains, including news summarization, language translation, and chatbot generation.'

## Create the LangChain chain with and without RAG
For a given prompt:
- Create a chain without retrieval and see the response
- Create a chain with the retriever

Then compare how the two responses differ.

### LLM response without RAG

In [43]:
# Baseline: same prompt template and LLM, but no retrieval step
rag_prompt = hub.pull("rlm/rag-prompt")

# The chain still has to provide a `context` value for the prompt. The baseline
# passes an explicit empty string, so the LLM can only answer from its own
# knowledge (a bare `{}` here would be rendered into the prompt as the text "{}").
qa_chain = (
    {"context": lambda _question: "", "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)
qa_chain.invoke("what is the title of the paper that talks about awareness")

' There is no specific information provided in the context to determine the title of a paper.'

### LLM response with RAG

In [63]:
# Pull the "rlm/rag-prompt" prompt from the LangChain hub and display its
# message templates (no documents are retrieved in this cell)
rag_prompt = hub.pull("rlm/rag-prompt")
rag_prompt.messages

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]

In [None]:
def format_documents(documents):
    """Join the text of the given documents into a single context string.

    Args:
        documents: iterable of objects exposing a ``page_content`` string
            attribute (e.g. the documents returned by the retriever).

    Returns:
        The ``page_content`` values separated by blank lines; an empty string
        when ``documents`` is empty.
    """
    # Iterate over the parameter itself and read the attribute from each item
    return "\n\n".join(doc.page_content for doc in documents)

In [40]:
retriever = vector_store.as_retriever()

# Inputs for the prompt: the retrieved chunks merged into one context string,
# and the question itself passed through unchanged
rag_inputs = {
    "context": retriever | format_documents,
    "question": RunnablePassthrough(),
}

# Full RAG chain: inputs -> prompt template -> LLM -> plain string
qa_chain = rag_inputs | rag_prompt | llm | StrOutputParser()
qa_chain.invoke("what is the title of the paper that talks about awareness?")

' The title of the paper that talks about awareness is "I Think, Therefore I am: Awareness in Large Language Models".'