 # Test langchain integration with Llama Cpp

In [11]:
from pathlib import Path

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.llms import LlamaCpp
from langchain_community.vectorstores import Chroma

# Directory containing the locally stored GGUF model files.
dirpath = Path("/Users/cgebbe/git/_private/code_rag_prototype/models")
# Candidate models with notes from earlier experiments; the trailing subscript
# selects the one used for the rest of the notebook.
MODEL_PATH = {
    # codellama yielded poor results
    "codellama": dirpath / "codellama-7b.Q4_K_M.gguf",
    # deepseek performed pretty well on leaderboard and is fast with llama.cpp.
    # However, it doesn't seem to work with langchain, see https://github.com/langchain-ai/langchain/issues/14593
    "deepseek": dirpath / "deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
    # llama works somewhat
    "llama": dirpath / "llama-2-7b-chat.Q2_K.gguf",
}["llama"]
# Fail fast and name the missing file, instead of raising a bare AssertionError.
assert MODEL_PATH.exists(), f"Model file not found: {MODEL_PATH}"

# Route LLM callbacks through a stdout-streaming handler.
stdout_handler = StreamingStdOutCallbackHandler()
callback_manager = CallbackManager([stdout_handler])

llm = LlamaCpp(
    model_path=str(MODEL_PATH),
    n_ctx=4000,
    n_gpu_layers=-1,
    temperature=0.75,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
    # Options that were tried but are currently disabled:
    #   max_tokens=2000
    #   n_batch=512  (should be between 1 and n_ctx ?!)
    #   f16_kv=True
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/cgebbe/git/_private/code_rag_prototype/models/llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:            

In [2]:
# Smoke test: send a plain prompt to the LLM without any retrieval.
prompt = """
Write a fastapi app
"""
# Use the Runnable `invoke` entry point; calling the LLM object directly is deprecated.
llm.invoke(prompt)

NameError: name 'llm' is not defined

# Index documents

In [3]:
# Repository whose Python sources get indexed; kept as a plain string for later cells.
repo_path = "/Users/cgebbe/git/_private/elliptio_data_lake"
# Displays whether the checkout is present on this machine.
repo_dir = Path(repo_path)
repo_dir.exists()

True

In [4]:
# Count the Python files below the repo root. Done in pure Python so the cell does
# not need a POSIX shell and is not broken by spaces in the path.
sum(1 for _ in Path(repo_path).rglob("*.py"))

      10


In [5]:
# The repo holds 10 python files, yet the loader yields more documents than that:
# each (top-level) function and class gets its own document.
python_parser = LanguageParser(language=Language.PYTHON)
loader = GenericLoader.from_filesystem(
    repo_path,
    parser=python_parser,
    glob="**/*",
    suffixes=[".py"],
    exclude=("__init__.py",),
)
documents = loader.load()

# Show the number of documents, then the first one as a sample.
print(len(documents))
documents[0]

40


Document(page_content='def rm_s3(c):\n    s3_bucket_url = _get_s3_bucket_url()\n    c.run(f"aws s3 rm --recursive {s3_bucket_url}")', metadata={'source': '/Users/cgebbe/git/_private/elliptio_data_lake/tasks.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>})

In [6]:
# Split the loaded documents with a Python-aware splitter. In the recorded run
# 40 documents became 42 chunks, i.e. most documents fit into a single chunk.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_overlap=200,
    chunk_size=2000,
)
texts = python_splitter.split_documents(documents)

# Show the number of chunks, then the first one as a sample.
print(len(texts))
texts[0]

42


Document(page_content='def rm_s3(c):\n    s3_bucket_url = _get_s3_bucket_url()\n    c.run(f"aws s3 rm --recursive {s3_bucket_url}")', metadata={'source': '/Users/cgebbe/git/_private/elliptio_data_lake/tasks.py', 'content_type': 'functions_classes', 'language': <Language.PYTHON: 'python'>})

In [12]:
# from langchain_openai import OpenAIEmbeddings # Wouldn't work locally!!!
# NOTE: How interchangeable are embeddings?!

# see https://medium.com/international-school-of-ai-data-science/implementing-rag-with-langchain-and-hugging-face-28e3ea66c5f7

# Embedding models considered so far; the lookup below picks the active one.
embedding_model_names = {
    # this downloads all ~10 models for different quantizations (~40GB)
    "llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF",
    # pretrained model from `sentence_transformers`, see https://www.sbert.net/docs/pretrained_models.html
    "pretrained": "all-mpnet-base-v2",
}
model_name = embedding_model_names["pretrained"]

embeddings = HuggingFaceEmbeddings(model_name=model_name)
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)

In [13]:
# Smoke test for the embedding model: embed one sentence and display the vector length.
text = "This is a test document."
query_result = embeddings.embed_query(text)
embedding_dim = len(query_result)
embedding_dim

768

In [14]:
# Build the Chroma vector store from the chunks and the embedding model.
# Persistence is currently disabled:
# db.persist()
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# Retriever handed to the QA step; presumably `k` caps the number of returned
# chunks (confirm against the Chroma retriever docs).
retriever_search_kwargs = {"k": 8}
retriever = db.as_retriever(
    search_kwargs=retriever_search_kwargs,
    search_type="mmr",  # Also test "similarity"
)

# Retrieve

In [15]:
# Prompt for the QA chain. The first three lines each end with a space before the
# newline; spelling the string out line by line keeps those spaces visible so an
# editor cannot silently strip them.
template = (
    "Use the following pieces of context to answer the question at the end. \n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer. \n"
    "Use three sentences maximum and keep the answer as concise as possible. \n"
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)
QA_CHAIN_PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [16]:
# Doesn't work: connection error. Likely need to set up a LangSmith API key.
# QA_CHAIN_PROMPT = hub.pull("rlm/rag-prompt-default")

In [17]:
class RAG:
    """Answer questions about the indexed code with a "stuff" QA chain.

    Relies on the notebook-level names ``llm``, ``QA_CHAIN_PROMPT`` and
    ``retriever`` being defined before the class is used.
    """

    def __init__(self) -> None:
        """Build the QA chain once from the notebook-level LLM and prompt."""
        self.chain = load_qa_chain(
            llm, chain_type="stuff", prompt=QA_CHAIN_PROMPT, verbose=True
        )

    def answer(self, question: str) -> dict:
        """Retrieve context for ``question`` and run the QA chain on it.

        Args:
            question: Natural-language question about the indexed code.

        Returns:
            The chain's result mapping. ``return_only_outputs`` is False, so
            it contains the chain inputs as well as the generated output.
        """
        # `retriever` is looked up at call time, so re-running the retriever
        # cell (e.g. to try another search type) affects later answers.
        # `invoke` replaces the deprecated `get_relevant_documents` call.
        docs = retriever.invoke(question)
        # Likewise, `invoke` replaces calling the chain object directly.
        return self.chain.invoke(
            {"input_documents": docs, "question": question},
            return_only_outputs=False,
        )


# Instantiate once; constructing RAG builds the QA chain in __init__.
rag = RAG()

In [21]:
# rag.answer("What filesystems are implemented in elliptio?")  # good answer

# Observed for the question below: bad answer first time, very good next time?!
# NOTE(review): the LLM is created with temperature=0.75, so sampling presumably
# explains why answers differ between runs - confirm.
question = "What automatically generated metadata does elliptio store?"
rag.answer(question)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Use three sentences maximum and keep the answer as concise as possible. 
from __future__ import annotations

import os
import shutil
import typing
from pathlib import Path, PurePosixPath

from elliptio.filetypes import RemoteFileInterface

if typing.TYPE_CHECKING:
    from elliptio.metadata import Metadata


# Code for: class LocalFile(RemoteFileInterface):

import abc
from dataclasses import dataclass
from pathlib import Path, PurePosixPath

from elliptio.metadata import Metadata


@dataclass
# Code for: class RemoteFileInterface(abc.ABC):

def get_metadata(run_id: str) -> Metadata:
    return Metadata(
        artifact_id=get_id(prefix="artifact_"),
        run_id=run_id,
        username=

Llama.generate: prefix-match hit


ptio stores the following automatically generated metadata for each file:
* creation time
* owner (the username of the user who uploaded it)
* hostname (the hostname of the machine where the file was saved)
* argv (the command-line arguments used to run the file)
* python packages (the Python packages required by the file, such as NumPy or pandas)

Additionally, the following metadata may be automatically generated depending on the environment:
* labels (labels associated with the file, such as "private" or "sensitive")
* file format (e.g., yml, yaml,json, etc.)
* file size (in bytes)
* last modified time (the date and time when the file was most recently modified)
* file type (e.g., "file", "directory", "symlink")

The metadata is retrieved from the underlying storage system (such as AWS S3 or MongoDB) using the `get_metadata` method of the `RemoteFileInterface` class.
[1m> Finished chain.[0m

[1m> Finished chain.[0m



llama_print_timings:        load time =     341.78 ms
llama_print_timings:      sample time =      22.78 ms /   223 runs   (    0.10 ms per token,  9788.00 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    9741.57 ms /   223 runs   (   43.68 ms per token,    22.89 tokens per second)
llama_print_timings:       total time =   10159.16 ms /   224 tokens


{'input_documents': [Document(page_content='from __future__ import annotations\n\nimport os\nimport shutil\nimport typing\nfrom pathlib import Path, PurePosixPath\n\nfrom elliptio.filetypes import RemoteFileInterface\n\nif typing.TYPE_CHECKING:\n    from elliptio.metadata import Metadata\n\n\n# Code for: class LocalFile(RemoteFileInterface):', metadata={'content_type': 'simplified_code', 'language': 'python', 'source': '/Users/cgebbe/git/_private/elliptio_data_lake/src/elliptio/filetypes/local.py'}),
  Document(page_content='import abc\nfrom dataclasses import dataclass\nfrom pathlib import Path, PurePosixPath\n\nfrom elliptio.metadata import Metadata\n\n\n@dataclass\n# Code for: class RemoteFileInterface(abc.ABC):', metadata={'content_type': 'simplified_code', 'language': 'python', 'source': '/Users/cgebbe/git/_private/elliptio_data_lake/src/elliptio/filetypes/interface.py'}),
  Document(page_content='def get_metadata(run_id: str) -> Metadata:\n    return Metadata(\n        artifact_i