**Prerequisite**
1. Download the Llama model locally
  1. https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/tree/main
2. Preload Sentence Transformer model (run the preload code below)

**Plan**

1. Use PDF document (e.g. a financial report)
2. Split using SentenceTransformer
3. Load to MongoDB
4. Search 
5. Add a prompt
6. Generate

In [None]:
# Install the notebook's dependencies (all pinned to exact versions except pymongo).
!pip install langchain==0.1.3
!pip install sentence-transformers==2.2.2
# The "srv" extra is presumably needed for mongodb+srv:// connection strings — confirm against MONGODB_URI.
!pip install "pymongo[srv]"
!pip install typing-inspect==0.8.0 typing_extensions==4.5.0
!pip install pypdf==3.17.4

In [None]:
# Force a clean reinstall of llama-cpp-python 0.2.25, passing -DLLAMA_CUBLAS=on to its CMake build
# (presumably to enable cuBLAS/GPU offload — confirm against the llama-cpp-python install docs).
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.25 --force-reinstall --upgrade --no-cache-dir

In [None]:
# Delete the numpy 1.26.3 / 1.26.4 dist-info metadata directories from the conda site-packages
# (presumably stale metadata that gets in the way of the numpy install below — TODO confirm).
!rm -rf /opt/conda/lib/python3.10/site-packages/numpy-1.26.3.dist-info
!rm -rf /opt/conda/lib/python3.10/site-packages/numpy-1.26.4.dist-info

In [None]:
# !pip install --force-reinstall --no-deps numpy==1.26.3
# Install numpy 1.24.1 (the 1.26.3 reinstall above is kept only as a disabled alternative).
!pip install -U  numpy==1.24.1

### Imports

In [None]:
from pymongo import MongoClient
import os
from llama_cpp import Llama
from langchain_community.llms import LlamaCpp
import torch

# https://www.sbert.net/docs/pretrained_models.html#model-overview
# Sentence BERT, based on BERT
from sentence_transformers import SentenceTransformer

# https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.RecursiveCharacterTextSplitter.html
# https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.SentenceTransformersTokenTextSplitter.html
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter, 
    SentenceTransformersTokenTextSplitter
)
from pypdf import PdfReader

import ctypes
from llama_cpp import llama_log_set
def my_log_callback(level, message, user_data):
    """Discard a log record.

    Takes the three arguments of the log-callback signature registered with
    ``llama_log_set`` and deliberately does nothing with them.
    """
    return None

# Wrap the no-op Python function as a C function pointer of type
# void (*)(int, const char *, void *) and install it as the llama_cpp log callback.
# NOTE: the wrapper is bound to a module-level name; ctypes expects callback
# objects to stay referenced for as long as C code may call them.
log_callback = ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)(my_log_callback)
llama_log_set(log_callback, ctypes.c_void_p())

# We will keep all global variables in an object to not pollute the global namespace.
class Object:
    """Empty attribute container; notebook state is attached to instances as plain attributes."""

In [None]:
# Namespace for the pipeline state; later cells attach everything as t.<name>.
t = Object()

In [None]:
# True when KAGGLE_KERNEL_RUN_TYPE is set to a non-empty value; the notebook
# uses this to switch between Kaggle and local paths/secrets.
KAGGLE = bool(os.environ.get('KAGGLE_KERNEL_RUN_TYPE', ''))

## MongoDB Config

In [None]:
# Resolve the connection string: from the MONGODB_URI environment variable
# locally, or from the Kaggle secret of the same name when running on Kaggle.
if not KAGGLE:
    t.uri = os.environ["MONGODB_URI"]
else:
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    t.uri = user_secrets.get_secret("MONGODB_URI")

# Create the client, then ping the server to confirm the connection works.
t.client = MongoClient(t.uri)
try:
    t.client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as err:
    # Best-effort check: report the failure and let the notebook continue.
    print(err)

In [None]:
# Select the "rag_llama" database and, inside it, the "mdb" collection.
t.db = t.client["rag_llama"]
t.coll = t.db["mdb"]

In [None]:
def preload():
    """Construct the default token splitter and the embedding model, then discard them.

    Nothing is returned or stored. Presumably the point is to trigger the
    model downloads ahead of time (see "Preload Sentence Transformer model"
    in the prerequisites).
    """
    SentenceTransformersTokenTextSplitter()
    SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

In [None]:
# On Kaggle only: fetch the Q6_K GGUF build of Llama-2-13B-chat with wget,
# then run preload() for the sentence-transformer components.
if KAGGLE:
    !wget https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q6_K.gguf    
    preload()

## Llama Config

In [None]:
# t.model_path = "../../data"
# The layer count (passed later as n_gpu_layers) is the same in both
# environments; only the location of the model file differs.
t.layers = 50
if KAGGLE:
    t.llm_path = "/kaggle/working/llama-2-13b-chat.Q6_K.gguf"
else:
    t.model_path = "../../../../data"
    t.llm_path = f"{t.model_path}/llama/llama-2-13b-chat.Q6_K.gguf"

## Load and Parse Documents

In [None]:
# t.reader = PdfReader("data/brk-2023-q3.pdf")
# t.reader = PdfReader("data/msft-2022.pdf")
# The report file is named after the collection; only its directory depends
# on where the notebook runs.
t.reader = PdfReader(
    f"../input/mdb-pdf/{t.coll.name}-2022.pdf"
    if KAGGLE
    else f"data/{t.coll.name}-2022.pdf"
)
# Text of every page, stripped of leading/trailing whitespace.
t.pages = [page.extract_text().strip() for page in t.reader.pages]

Pages vary in size. We need to split the text into chunks that fit into the model window — specifically, the 256-token window of the BERT-based embedding model.

So we'll join all the pages and use the SentenceTransformer splitter to split the document into chunks of the right size.

In [None]:
# print(t.pages[10])

In [None]:
# First pass: character-based split with chunk_size=1024 and no overlap,
# trying the separators in the order listed (blank line, newline, sentence
# end, space, and finally any position).
t.ch_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)
t.ch_chunks = t.ch_splitter.split_text("\n".join(t.pages))
len(t.ch_chunks)

In [None]:
# Second pass: re-split every character chunk by token count
# (256 tokens per chunk, 10 tokens of overlap) and flatten the results.
t.token_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=256, chunk_overlap=10)
t.token_chunks = [
    piece
    for ch in t.ch_chunks
    for piece in t.token_splitter.split_text(ch)
]
len(t.token_chunks)

## Embedding Model

In [None]:
# Embedding model used to encode the queries below; same checkpoint name that preload() instantiates.
t.emb_model = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-cos-v1')

In [None]:
# Sanity check: length of the embedding of one sample chunk
# (the index definition further down declares numDimensions 768).
len(t.emb_model.encode(t.token_chunks[21]).tolist())

## Upload documents

In [None]:
# Peek at up to 10 documents; the length shows whether the collection already holds data.
len(list(t.coll.find().limit(10)))

In [None]:
# NOTE(review): `t.docs` is not built anywhere in the visible cells and the insert is
# commented out — presumably the documents (with "text" and "embedding" fields, as read
# by the search cells) were uploaded in an earlier run; confirm before re-enabling.
# _ = t.coll.insert_many(t.docs)

In [None]:
# Same peek again, for comparison with the result before the (disabled) insert.
len(list(t.coll.find().limit(10)))

## Query Index

Index definition:

```
{
  "fields": [
    {
      "type": "vector",
      "path": "embedding",
      "numDimensions": 768,
      "similarity": "dotProduct"
    }
  ]
}
```

In [None]:
t.query = "What was the total revenue?"

# Embed the query and run a $vectorSearch stage against the collection's
# vector index (named "<collection>_vector_index").
t.results = t.coll.aggregate([
    {
        "$vectorSearch": {
            "queryVector": t.emb_model.encode(t.query).tolist(),
            "path": "embedding",
            "numCandidates": 100,
            "limit": 8,
            "index": f"{t.coll.name}_vector_index",
        }
    }
])

# Join the text of the hits; iterating here consumes the t.results cursor.
t.context = "\n\n".join(d['text'] for d in t.results)

In [None]:
# Preview the first 1000 characters of the retrieved context.
print(t.context[:1000])

## Load Llama

In [None]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# https://python.langchain.com/docs/guides/local_llms
# Load the GGUF model through LangChain's LlamaCpp wrapper. The stdout
# streaming handler prints output as it is generated.
t.llm = LlamaCpp(
    model_path=t.llm_path,
    n_ctx=4096,
    n_batch=512,
    n_threads=2,
    n_gpu_layers=t.layers,
    f16_kv=True,
    verbose=False,
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
)

## Query Llama

In [None]:
def ask(prompt, temp=0.8, top_p=0.95):
    """Send `prompt` to the shared model (`t.llm`) and return its result.

    Args:
        prompt: Fully formatted prompt string.
        temp: Sampling temperature forwarded as `temperature`.
        top_p: Nucleus-sampling threshold forwarded as `top_p`.

    Returns:
        Whatever `t.llm.invoke` returns for the prompt.
    """
    generation_options = {
        "max_tokens": 512,
        "stop": ["Q:"],
        "temperature": temp,
        "top_p": top_p,
        "top_k": 10,
        "repeat_penalty": 1.2,
    }
    return t.llm.invoke(prompt, **generation_options)

Prompt Format:
```
<s>[INST] <<SYS>>
{{ system_prompt }}
<</SYS>>

{{ user_message }} [/INST]
```

### Query with RAG

In [None]:
def ask_with_context(question, context):
    """Ask the model `question`, restricted to the supplied `context`.

    Builds a Llama-2 chat prompt (system block, then the context and the
    question) and passes it to `ask`.

    Args:
        question: The user's question.
        context: Text retrieved from the financial report.

    Returns:
        None. The result of `ask` is not returned; any visible output comes
        from `ask` and the model it calls.
    """
    # Sentences are separated by spaces so they do not run together
    # ("assistant.You answer ...") in the prompt the model sees.
    system_prompt = (
        "You are a helpful expert financial research assistant. "
        "You answer questions about information contained in a financial report. "
        "You will be given the user's question, and the relevant information from "
        "the financial report. Answer the question using only this information"
    )
    # Interpolate both values in a single pass. Chained str.replace calls would
    # also substitute a literal "{question}" that occurs inside `context`.
    full_prompt = (
        "<s>[INST]<<SYS>>\n"
        f"{system_prompt}"
        "\n<</SYS>>\n\n"
        f"Information: {context}\n"
        f"Question: {question}\n"
        "Answer:\n"
        "[/INST]"
    )
    ask(full_prompt)

In [None]:
def find_context(question, limit=8, num_candidates=200):
    """Retrieve the report passages most similar to `question`.

    Embeds the question with `t.emb_model`, runs a `$vectorSearch` stage on
    `t.coll` against the index named "<collection>_vector_index", and joins
    the "text" field of the hits.

    Args:
        question: The question to embed and search for.
        limit: Value of the stage's `limit` option (default 8).
        num_candidates: Value of the stage's `numCandidates` option (default 200).

    Returns:
        The hit texts joined by blank lines.

    Raises:
        AssertionError: If the search returns no documents.
    """
    results = t.coll.aggregate([{
        "$vectorSearch": {
            "queryVector": t.emb_model.encode(question).tolist(),
            "path": "embedding",
            "numCandidates": num_candidates,
            "limit": limit,
            "index": f"{t.coll.name}_vector_index",
        }}])
    result_texts = [d['text'] for d in results]
    if not result_texts:
        # Raised explicitly rather than via `assert`, so the check survives
        # `python -O`; the exception type stays AssertionError for callers.
        raise AssertionError(
            f"vector search returned no documents for question: {question!r}"
        )
    return "\n\n".join(result_texts)

In [None]:
def ask_with_rag(question):
    """Look up context for `question` with `find_context`, then ask the model via `ask_with_context`."""
    ask_with_context(question, find_context(question))

In [None]:
%%time
# Retrieval-augmented questions about the report; each cell times retrieval and generation together.
ask_with_rag("What was the total revenue?")

In [None]:
%%time
ask_with_rag("What was the operating income or loss?")

In [None]:
%%time
# Same question as the previous cell, narrowed to a specific year.
ask_with_rag("What was the operating income or loss in year 2022?")

In [None]:
%%time
ask_with_rag("Compare the total revenue between the years 2023 and 2022")

In [None]:
%%time
ask_with_rag("What time period does the report cover?")

In [None]:
%%time
ask_with_rag("Were there any changes to the executive team?")

In [None]:
# Same question as the previous cell, with the two steps that ask_with_rag
# performs (find_context, then ask_with_context) written out explicitly.
ask_with_context(
    "Were there any changes to the executive team?", 
    find_context("Were there any changes to the executive team?"))

### Query Embedded Knowledge

In [None]:
def ask_llm(question):
    """Ask the model `question` directly, with no retrieved context in the prompt.

    The result of `ask` is not returned.
    """
    prompt = "".join([
        "<s>[INST]<<SYS>>\n",
        "You are a helpful expert financial research assistant.",
        "\n<</SYS>>\n\n",
        f"Question: {question}\n",
        "Answer:\n",
        "[/INST]",
    ])
    ask(prompt)

In [None]:
%%time
# No retrieval here: the question names the company and period itself, so the
# answer can only come from what the model already knows.
ask_llm("What was the total revenue of MongoDB in the year ended January 31, 2023?")

In [None]:
%%time
ask_llm("Were there any changes to the executive team at MongoDB in the year ended January 31, 2023?")

## Visualize Embeddings


In [None]:
# Separate namespace for the visualization state (attached as v.<name>).
v = Object()

In [None]:
# Load every document from the collection, projecting only the "embedding" field.
v.embeddings = list(t.coll.find({}, {"embedding": 1}))

In [None]:
# Keep just the raw vectors, in the order the documents were returned.
v.emb_values = [doc['embedding'] for doc in v.embeddings]

In [None]:
def umap_embeddings(emb, umap_t, n=None):
    """Project the first `n` embeddings to 2-D, one vector at a time.

    Args:
        emb: Sized iterable of embedding vectors.
        umap_t: Fitted transformer exposing `transform(list_of_vectors)`.
        n: Maximum number of leading embeddings to project. `None` (the
            default) means all of them. Values larger than `len(emb)` are
            clamped, and negative values are treated as 0.

    Returns:
        A NumPy array of shape `(k, 2)` where `k = min(n, len(emb))`; row `i`
        holds the projection of `emb[i]`.
    """
    from itertools import islice

    # Normalize to an int: np.empty rejects float sizes, and 0 must yield an
    # empty result instead of writing to row 0 of an empty array.
    count = len(emb) if n is None else max(0, min(int(n), len(emb)))
    umap_emb = np.empty((count, 2))
    # islice limits the iteration itself, so the progress bar runs to
    # completion rather than being abandoned by an early `break`.
    for i, e in enumerate(tqdm(islice(emb, count), total=count)):
        umap_emb[i] = umap_t.transform([e])
    return umap_emb

In [None]:
import umap
import numpy as np
from tqdm import tqdm
import pickle

# Projections are slow, so we'll cache them
v.umap_all_emb_path = "umap_emb_all_docs.temp.pickle"

# The transformer is fitted on every run, cached projections or not: later
# cells use it to project the query and the search results.
v.umap_transform = umap.UMAP(
    random_state=0,
    transform_seed=0,
    low_memory=False,
).fit(v.emb_values)

if not os.path.exists(v.umap_all_emb_path):
    # Cache miss: project the first 120 embeddings and store them.
    v.umap_all = umap_embeddings(v.emb_values, v.umap_transform, 120)
    with open(v.umap_all_emb_path, "wb") as f:
        pickle.dump(v.umap_all, f)
else:
    # Cache hit: reuse the projections written by an earlier run of this cell.
    with open(v.umap_all_emb_path, "rb") as f:
        v.umap_all = pickle.load(f)

In [None]:
# Question whose embedding and search hits are plotted below.
v.question = "What was the total revenue?"

In [None]:
import matplotlib.pyplot as plt
# Scatter plot of the projected document embeddings.
plt.figure()
plt.scatter(x=v.umap_all[:, 0], y=v.umap_all[:, 1], s=15)

In [None]:
# Vector search for v.question, materialized into a list so the hits
# (including their "embedding" field) can be reused by the cells below.
v.results = list(t.coll.aggregate([
    {
        "$vectorSearch": {
            "queryVector": t.emb_model.encode(v.question).tolist(),
            "path": "embedding",
            "numCandidates": 200,
            "limit": 8,
            "index": f"{t.coll.name}_vector_index",
        }
    }
]))

In [None]:
# Number of hits returned (the search above sets "limit": 8).
len(v.results)

In [None]:
# Project the query embedding itself, passed as a one-element list.
v.umap_query = umap_embeddings([t.emb_model.encode(v.question)], v.umap_transform)

In [None]:
# Display the projected query point.
v.umap_query

In [None]:
# Project the embeddings of the retrieved documents.
v.umap_results = umap_embeddings(
    emb=[hit['embedding'] for hit in v.results],
    umap_t=v.umap_transform,
)

In [None]:
# Display the shape of the projected results array.
v.umap_results.shape

In [None]:
# Overlay plot, drawn in this order: all projected documents (gray), the
# retrieved documents (blue), and the query (red x).
plt.figure()
plt.scatter(x=v.umap_all[:, 0], y=v.umap_all[:, 1], s=15, color="gray")
plt.scatter(x=v.umap_results[:, 0], y=v.umap_results[:, 1], s=15, color="blue")
plt.scatter(x=v.umap_query[:, 0], y=v.umap_query[:, 1], marker="x", s=100, color="r")

## LangChain

We'll use LangChain to tie this all together into a simple API.

In [None]:
# https://python.langchain.com/docs/integrations/vectorstores/mongodb_atlas

from langchain.chains import RetrievalQA
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
# Namespace for the LangChain section; it reuses the model already loaded as t.llm.
l = Object()
l.llm = t.llm

In [None]:
# LangChain wrapper around the embedding model.
# NOTE(review): the other cells load 'sentence-transformers/multi-qa-mpnet-base-cos-v1';
# confirm that this bare model name resolves to the same checkpoint.
l.lang_emb = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-cos-v1")

Check that the embeddings model returns embeddings of the correct size of 768:

In [None]:
# Length of the embedding of one test document.
len(l.lang_emb.embed_documents(['This is a test document'])[0])

In [None]:
# Expose the existing collection as a LangChain vector store.
# The index name is derived from the collection name, exactly as in the
# hand-written $vectorSearch cells, so it cannot drift if t.coll changes.
# For the current collection this evaluates to "mdb_vector_index".
l.vector_search = MongoDBAtlasVectorSearch(
    t.coll,
    l.lang_emb,
    index_name=f"{t.coll.name}_vector_index",
    embedding_key="embedding",
)

In [None]:
# Maximal-marginal-relevance search for the same question, keeping 8 results.
l.results = list(
    l.vector_search.max_marginal_relevance_search("What was the total revenue?", k=8)
)

In [None]:
# Number of results returned (the search above asks for k = 8).
len(l.results)

### Make a Retriever Object

In [None]:
# Retriever view of the vector store, configured with k=8.
l.retriever = l.vector_search.as_retriever(search_kwargs=dict(k=8))

### Make the end-to-end chain object

In [None]:
# Combine the retriever and the model into one question-answering chain.
l.qa = RetrievalQA.from_chain_type(retriever=l.retriever, llm=l.llm)

### Query LLM with LangChain

In [None]:
%%time
# Two of the questions asked earlier through ask_with_rag, now answered by the LangChain chain.
l.qa.invoke("What was the total revenue?")

In [None]:
%%time
l.qa.invoke("What time period does the report cover?")