In [1]:
from llama_cpp import Llama
from langchain.embeddings import LlamaCppEmbeddings
import numpy as np

In [2]:
# Default model file used by the wrappers in this notebook.
MODEL_PATH = './gpt4all-lora-quantized-ggml.bin'

class LLama:
    """Thin wrapper around a ``llama_cpp.Llama`` model for one-shot text completion."""

    def __init__(
        self,
        model_path = MODEL_PATH,
        n_threads  = 24
    ):
        """Load the model.

        Args:
            model_path: Path of the model file passed to ``Llama``.
            n_threads: Thread count passed to ``Llama``.
        """
        self.model = Llama(
            model_path = model_path,
            n_threads  = n_threads
        )

    def predict(self, prompt, temperature = 0.5, max_len=512):
        """Complete ``prompt`` and return the generated text on a single line.

        Args:
            prompt: Text to complete.
            temperature: Sampling temperature forwarded to the model.
            max_len: Total budget, in tokens, shared by the prompt and the
                completion.

        Returns:
            The text of the first returned choice with every newline
            character removed (no separator is inserted in its place).

        Raises:
            ValueError: If the tokenized prompt already uses up ``max_len``,
                leaving no room to generate anything.
        """
        # The budget is expressed in tokens, so measure the prompt with the
        # model's own tokenizer. len(prompt) would count characters and can
        # push max_tokens to zero or below for prompts that still fit.
        prompt_tokens = len(self.model.tokenize(prompt.encode('utf-8')))
        max_tokens = max_len - prompt_tokens
        if max_tokens <= 0:
            raise ValueError(
                f'prompt uses {prompt_tokens} tokens, which leaves no room '
                f'for a completion within max_len={max_len}'
            )

        output = self.model(
            prompt      = prompt,
            max_tokens  = max_tokens,
            echo        = False,
            temperature = temperature
        )
        return output['choices'][0]['text'].replace('\n', '')

    
class LlamaEmbeddingGenerator:
    """Turns text into embedding vectors through a ``LlamaCppEmbeddings`` model."""

    def __init__(self, model_path = MODEL_PATH):
        """Create the underlying ``LlamaCppEmbeddings`` from ``model_path``."""
        embedder = LlamaCppEmbeddings(model_path=model_path)
        self.model = embedder

    def embed(self, promp):
        """Embed ``promp`` via ``embed_query`` and return the result as a NumPy array."""
        vector = self.model.embed_query(promp)
        return np.array(vector)

In [3]:
# Build the completion wrapper with its defaults (MODEL_PATH, n_threads=24).
model = LLama()

llama.cpp: loading model from ./gpt4all-lora-quantized-ggml.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = ggmf v1 (old version with no mmap support)
llama_model_load_internal: n_vocab    = 32001
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113744.11 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 1026.00 MB per state)
...................................................................................................
.
llama_init_from_file: kv

In [4]:
%%time
# Single completion using the defaults temperature=0.5 and max_len=512.
model.predict('What is overfitting?')

CPU times: user 4min 34s, sys: 1 s, total: 4min 35s
Wall time: 11.9 s



llama_print_timings:        load time =   247.91 ms
llama_print_timings:      sample time =    28.64 ms /    53 runs   (    0.54 ms per run)
llama_print_timings: prompt eval time =   247.88 ms /     7 tokens (   35.41 ms per token)
llama_print_timings:        eval time = 11577.34 ms /    52 runs   (  222.64 ms per run)
llama_print_timings:       total time = 11859.30 ms


'Overfitting is when a machine learning model learns too much from the training data, resulting in poor performance on unseen data. It can happen when there are too many features or if the model is not properly tuned for the task at hand.'

In [5]:
# Separate LlamaCppEmbeddings instance built from MODEL_PATH; it does not reuse `model`.
generator = LlamaEmbeddingGenerator()

llama.cpp: loading model from ./gpt4all-lora-quantized-ggml.bin
llama.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this
llama_model_load_internal: format     = ggmf v1 (old version with no mmap support)
llama_model_load_internal: n_vocab    = 32001
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size = 4113744.11 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 2052.00 MB per state)
...................................................................................................
.
llama_init_from_file: kv

In [6]:
%%time
# embed() wraps the embed_query result in np.array.
embedding = generator.embed('What is overfitting?')
# Bare last expression so the cell displays the array's shape.
embedding.shape

CPU times: user 2.65 s, sys: 26.7 ms, total: 2.68 s
Wall time: 226 ms



llama_print_timings:        load time =   224.97 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings: prompt eval time =   224.72 ms /     7 tokens (   32.10 ms per token)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per run)
llama_print_timings:       total time =   225.07 ms


(4096,)