# core

> Core functionality for `onprem`

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export

from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.llms import GPT4All, LlamaCpp
import chromadb
import os
import argparse
import time
import warnings
import sys
from typing import Any, Dict, Generator, List, Optional, Tuple, Union


In [None]:
#| export

from onprem import utils as U
# URL of the model that `LLM.download_model` fetches when no URL is supplied.
DEFAULT_MODEL_URL = 'https://huggingface.co/TheBloke/Wizard-Vicuna-7B-Uncensored-GGML/resolve/main/Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_0.bin'
# File name of the default model (the last path component of the URL above).
DEFAULT_MODEL_NAME = os.path.basename(DEFAULT_MODEL_URL)
# Name of the default sentence-transformers embedding model.
DEFAULT_EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

class LLM:
    """
    Wrapper around a locally-stored LLM run through `LlamaCpp`.

    Supports direct prompting (`LLM.prompt`) and question answering over
    ingested documents (`LLM.ingest` followed by `LLM.ask`).
    """
    def __init__(self, 
                 model_name=DEFAULT_MODEL_NAME,
                 n_gpu_layers:Optional[int]=None, 
                 max_tokens:int=512, 
                 n_ctx:int=2048, 
                 n_batch:int=1024,
                 mute_stream=False,
                 embedding_model_name:str=DEFAULT_EMBEDDING_MODEL,
                 embedding_model_kwargs:Optional[dict]=None,
                 verbose=False):
        """
        LLM Constructor
        
        **Args:**

        - *model_name*: Name of the model.  Must be downloaded with a call to `LLM.download_model`.
        - *n_gpu_layers*: Number of layers to be loaded into gpu memory. Default is `None`.
        - *max_tokens*: The maximum number of tokens to generate.
        - *n_ctx*: Token context window.
        - *n_batch*: Number of tokens to process in parallel.
        - *mute_stream*: Mute ChatGPT-like token stream output during generation
        - *embedding_model_name*: name of sentence-transformers model. Used for `LLM.ingest` and `LLM.ask`.
        - *embedding_model_kwargs*: arguments to embedding model (e.g., `{'device':'cpu'}`).
                                    If `None`, `{'device': 'cpu'}` is used.
        - *verbose*: Verbosity
        """
        self.model_name = model_name
        datadir = U.get_datadir()
        # Only warn here: the model may legitimately be downloaded after construction.
        # `check_model` raises later if the file is still missing when it is needed.
        if not os.path.isfile(os.path.join(datadir, model_name)):
            warnings.warn(f'The model {model_name} does not exist in {datadir}. '
                          'Please execute LLM.download_model() to download it.')
        self.llm = None
        self.ingester = None
        self.n_gpu_layers = n_gpu_layers
        self.max_tokens = max_tokens
        self.n_ctx = n_ctx
        self.n_batch = n_batch
        self.callbacks = [] if mute_stream else [StreamingStdOutCallbackHandler()]
        self.embedding_model_name = embedding_model_name
        # Build a fresh dict per instance so no mutable default (or caller-owned
        # dict) is shared between LLM instances.
        if embedding_model_kwargs is None:
            embedding_model_kwargs = {'device': 'cpu'}
        self.embedding_model_kwargs = dict(embedding_model_kwargs)
        self.verbose = verbose
 
    @classmethod
    def download_model(cls, model_url=DEFAULT_MODEL_URL, confirm=True, ssl_verify=True):
        """
        Download an LLM in GGML format supported by [lLama.cpp](https://github.com/ggerganov/llama.cpp).
        
        **Args:**
        
        - *model_url*: URL of model
        - *confirm*: whether or not to confirm with user before downloading
        - *ssl_verify*: If True, SSL certificates are verified. 
                        You can set to False if corporate firewall gives you problems.
        """
        datadir = U.get_datadir()
        model_name = os.path.basename(model_url)
        filename = os.path.join(datadir, model_name)
        if os.path.isfile(filename):
            confirm_msg = f'There is already a file {model_name} in {datadir}.\n Do you want to still download it?'
        else:
            confirm_msg = f"You are about to download the LLM {model_name} to the {datadir} folder. Are you sure?"

        # Without confirmation the download always proceeds; otherwise the user
        # must answer "Y" (case and surrounding whitespace are ignored).
        shall = True
        if confirm:
            shall = input(f"{confirm_msg} (Y/n) ").strip().lower() == "y"
        if shall:
            U.download(model_url, filename, verify=ssl_verify)
        else:
            warnings.warn(f'{model_name} was not downloaded because "Y" was not selected.')

    def load_ingester(self):
        """
        Get the `Ingester` instance, creating and caching it on first use.
        """
        if not self.ingester:
            # Imported lazily, on first use.
            from onprem.ingest import Ingester
            self.ingester = Ingester(embedding_model_name=self.embedding_model_name,
                                     embedding_model_kwargs=self.embedding_model_kwargs)
        return self.ingester
        
        
    def ingest(self, 
               source_directory:str,
              ):
        """
        Ingests all documents in `source_directory` into vector database.
        Previously-ingested documents are ignored.

        **Args:**
        
        - *source_directory*: path to folder containing document store

        
        **Returns:** `None`
        """
        ingester = self.load_ingester()
        ingester.ingest(source_directory)

 
        
    def check_model(self):
        """
        Return the path to the model file inside the data directory.

        **Raises:** `ValueError` if the model file does not exist.
        """
        datadir = U.get_datadir()
        model_path = os.path.join(datadir, self.model_name)
        if not os.path.isfile(model_path):
            raise ValueError(f'The LLM model {self.model_name} does not appear to have been downloaded. '
                             'Execute the download_model() method to download it.')
        return model_path
        
 
    def load_llm(self):
        """
        Get the `LlamaCpp` instance, creating and caching it on first use.

        The model file is checked on every call, so a `ValueError` is raised
        if the file is missing.
        """
        model_path = self.check_model()
        
        if not self.llm:
            self.llm = LlamaCpp(model_path=model_path, 
                                max_tokens=self.max_tokens, 
                                n_batch=self.n_batch, 
                                callbacks=self.callbacks, 
                                verbose=self.verbose, 
                                n_gpu_layers=self.n_gpu_layers, 
                                n_ctx=self.n_ctx)    

        return self.llm
        
        
    def prompt(self, prompt):
        """
        Send prompt to LLM to generate a response
        """
        llm = self.load_llm()
        return llm(prompt)  
    
    def ask(self, question, num_source_docs=4):
        """
        Answer a question based on source documents fed to the `ingest` method.
        
        **Args:**

        - *question*: a question you want to ask
        - *num_source_docs*: the number of ingested source documents to use to generate the answer

        **Returns:** a tuple of the form `(result, source_documents)`

        **Raises:** `ValueError` if no vector database is available.
        """
        ingester = self.load_ingester()
        db = ingester.get_db()
        if not db:
            raise ValueError('A vector database has not yet been created. Please call the LLM.ingest method.')
        retriever = db.as_retriever(search_kwargs={"k": num_source_docs})
        llm = self.load_llm()
        qa = RetrievalQA.from_chain_type(llm=llm, 
                                         chain_type="stuff", 
                                         retriever=retriever, 
                                         return_source_documents=True)
        # The chain is called directly because it produces two output keys.
        res = qa(question)
        return res['result'], res['source_documents']

In [None]:
show_doc(LLM.download_model)

---

[source](https://github.com/amaiya/onprem/blob/main/onprem/core.py#L68){target="_blank" style="float:right; font-size:smaller"}

### LLM.download_model

>      LLM.download_model (model_url='https://huggingface.co/TheBloke/Wizard-
>                          Vicuna-7B-Uncensored-GGML/resolve/main/Wizard-
>                          Vicuna-7B-Uncensored.ggmlv3.q4_0.bin', confirm=True,
>                          ssl_verify=True)

Download an LLM in GGML format supported by [lLama.cpp](https://github.com/ggerganov/llama.cpp).

**Args:**

- *model_url*: URL of model
- *confirm*: whether or not to confirm with user before downloading
- *ssl_verify*: If True, SSL certificates are verified. 
                You can set to False if corporate firewall gives you problems.

In [None]:
llm = LLM(model_name=DEFAULT_MODEL_NAME)

In [None]:
# Download the default model only if it is not already in the data directory.
# confirm=False skips the interactive confirmation prompt.
if not os.path.isfile( os.path.join(U.get_datadir(), DEFAULT_MODEL_NAME) ):
    LLM.download_model(DEFAULT_MODEL_URL, confirm=False)

In [None]:
assert os.path.isfile(os.path.join(U.get_datadir(), DEFAULT_MODEL_NAME))

In [None]:
show_doc(LLM.prompt)

---

[source](https://github.com/amaiya/onprem/blob/main/onprem/core.py#L148){target="_blank" style="float:right; font-size:smaller"}

### LLM.prompt

>      LLM.prompt (prompt)

Send prompt to LLM to generate a response

In [None]:
prompt = """Extract the names of people in the supplied sentences. Here is an example:
Sentence: James Gandolfini and Paul Newman were great actors.
People:
James Gandolfini, Paul Newman
Sentence:
I like Cillian Murphy's acting. Florence Pugh is great, too.
People:"""

In [None]:
saved_output = llm.prompt(prompt)

llama.cpp: loading model from /home/amaiya/onprem_data/Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 5407.72 MB (+ 1026.00 MB per state)
llama_new_context_with_model: kv self size  = 1024.00 MB



Cillian Murphy, Florence Pugh

In [None]:
show_doc(LLM.ingest)

---

[source](https://github.com/amaiya/onprem/blob/main/onprem/core.py#L104){target="_blank" style="float:right; font-size:smaller"}

### LLM.ingest

>      LLM.ingest (source_directory:str)

Ingests all documents in `source_directory` into vector database.
Previously-ingested documents are ignored.

**Args:**

- *source_directory*: path to folder containing document store

**Returns:** `None`

In [None]:
llm.ingest('./sample_data')

Appending to existing vectorstore at /home/amaiya/onprem_data/vectordb
Loading documents from ./sample_data


Loading new documents: 100%|██████████████████████| 2/2 [00:00<00:00, 12.71it/s]


Loaded 11 new documents from ./sample_data
Split into 62 chunks of text (max. 500 tokens each)
Creating embeddings. May take some minutes...
Ingestion complete! You can now query your documents using the prompt method


In [None]:
show_doc(LLM.ask)

---

[source](https://github.com/amaiya/onprem/blob/main/onprem/core.py#L155){target="_blank" style="float:right; font-size:smaller"}

### LLM.ask

>      LLM.ask (question, num_source_docs=4)

Answer a question based on source documents fed to the `ingest` method.

**Args:**
- question: a question you want to ask
- num_source_docs: the number of ingested source documents to use to generate the answer

In [None]:
# Ask a question against the ingested documents; `ask` returns the answer
# together with the source documents retrieved for it.
question = """What is ktrain in a single sentence?""" 
answer, docs = llm.ask(question)
print('\n\nReferences:\n\n')
# Print each source document: its `source` metadata followed by its text.
for i, document in enumerate(docs):
    print(f"\n{i+1}.> " + document.metadata["source"] + ":")
    print(document.page_content)

llama.cpp: loading model from /home/amaiya/onprem_data/Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 2048
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 5407.72 MB (+ 1026.00 MB per state)
llama_new_context_with_model: kv self size  = 1024.00 MB


ValueError: `run` not supported when there is not exactly one output key. Got ['result', 'source_documents'].

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()