From 125ebe85db5653f060b38485f5a886ebfd8dc66a Mon Sep 17 00:00:00 2001
From: zocainViken <75504411+zocainViken@users.noreply.github.com>
Date: Sat, 18 Nov 2023 00:10:35 +0100
Subject: [PATCH 1/4] bug fixing

---
 .../low_level_api/low_level_api_llama_cpp.py  | 33 +++++++---
 .../readme/low_level_api_llama_cpp.md         | 61 +++++++++++++++++++
 2 files changed, 86 insertions(+), 8 deletions(-)
 create mode 100644 examples/low_level_api/readme/low_level_api_llama_cpp.md

diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py
index e3cff3251..3d8bbe142 100644
--- a/examples/low_level_api/low_level_api_llama_cpp.py
+++ b/examples/low_level_api/low_level_api_llama_cpp.py
@@ -11,20 +11,34 @@

 prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"

-lparams = llama_cpp.llama_context_default_params()
+lparams = llama_cpp.llama_model_default_params()
+cparams = llama_cpp.llama_context_default_params()
 model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode('utf-8'), lparams)
-ctx = llama_cpp.llama_new_context_with_model(model, lparams)
+ctx = llama_cpp.llama_new_context_with_model(model, cparams)

 # determine the required inference memory per token:
 tmp = [0, 1, 2, 3]
-llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS)
+llama_cpp.llama_eval(
+    ctx = ctx,
+    tokens=(llama_cpp.c_int * len(tmp))(*tmp),
+    n_tokens=len(tmp),
+    n_past=0
+    )# Deprecated

 n_past = 0

 prompt = b" " + prompt

 embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
-n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True)
+n_of_tok = llama_cpp.llama_tokenize(
+    model=model,
+    text=bytes(str(prompt),'utf-8'),
+    text_len=len(embd_inp),
+    tokens=embd_inp,
+    n_max_tokens=len(embd_inp),
+    add_bos=False,
+    special=False
+)
 embd_inp = embd_inp[:n_of_tok]

 n_ctx = llama_cpp.llama_n_ctx(ctx)
@@ -49,8 +63,11 @@
 while remaining_tokens > 0:
     if len(embd) > 0:
         llama_cpp.llama_eval(
-            ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS
-        )
+            ctx = ctx,
+            tokens=(llama_cpp.c_int * len(embd))(*embd),
+            n_tokens=len(embd),
+            n_past=n_past
+        )# Deprecated

     n_past += len(embd)
     embd = []
@@ -93,7 +110,7 @@
     for id in embd:
         size = 32
         buffer = (ctypes.c_char * size)()
-        n = llama_cpp.llama_token_to_piece_with_model(
+        n = llama_cpp.llama_token_to_piece(
             model, llama_cpp.llama_token(id), buffer, size)
         assert n <= size
         print(
@@ -109,4 +126,4 @@

 llama_cpp.llama_print_timings(ctx)

-llama_cpp.llama_free(ctx)
+llama_cpp.llama_free(ctx)
\ No newline at end of file
diff --git a/examples/low_level_api/readme/low_level_api_llama_cpp.md b/examples/low_level_api/readme/low_level_api_llama_cpp.md
new file mode 100644
index 000000000..5f350ffe9
--- /dev/null
+++ b/examples/low_level_api/readme/low_level_api_llama_cpp.md
@@ -0,0 +1,61 @@
+# Low-Level API for Llama_cpp
+
+## Overview
+This Python script, low_level_api_llama_cpp.py, demonstrates how to use the low-level API of the llama_cpp library. It runs inference on a given prompt with a .gguf model and prints the generated text.
+
+### Prerequisites
+Before running the script, ensure that you have the following dependencies installed:
+
+- Python 3.6 or higher
+- llama_cpp: A C++ library with Python bindings for working with .gguf models
+- NumPy: A fundamental package for scientific computing with Python
+- multiprocessing: A Python standard-library module for parallel computing
+
+### Usage
+Install the dependencies (ctypes, os, and multiprocessing are part of the Python standard library):
+```bash
+python -m pip install llama-cpp-python numpy
+```
+Run the script:
+```bash
+python low_level_api_llama_cpp.py
+```
+
+## Code Structure
+The script is organized as follows:
+
+### 1. Initialization
+- Load the model from the specified path.
+- Create a context for model evaluation.
+
+### 2. Tokenization
+- Tokenize the input prompt using the llama_tokenize function.
+- Prepare the input tokens for model evaluation.
+
+### 3. Inference
+- Perform model evaluation to generate responses.
+- Sample from the model's output using various strategies (top-k, top-p, temperature).
+
+### 4. Output
+- Print the generated tokens and the corresponding decoded text.
+
+### 5. Cleanup
+- Free resources and print timing information.
+
+## Configuration
+Customize the inference behavior by adjusting the following variables:
+
+- N_THREADS: Number of CPU threads to use for model evaluation.
+- MODEL_PATH: Path to the model file.
+- prompt: Input prompt for the model.
+
+## Notes
+- Ensure that the llama_cpp library is built and available on the system. Follow the instructions in the llama_cpp repository for building and installing the library.
+
+- This script is designed to work with .gguf models and may require modifications for compatibility with other models.
+
+## Acknowledgments
+This code is based on the llama_cpp library developed by the community. Special thanks to the contributors for their efforts.
+
+## License
+This project is licensed under the MIT License - see the LICENSE file for details.
\ No newline at end of file
From b38278a54980f734cef14e34e307371473f5c544 Mon Sep 17 00:00:00 2001
From: zocainViken <75504411+zocainViken@users.noreply.github.com>
Date: Fri, 1 Dec 2023 00:36:15 +0100
Subject: [PATCH 2/4] llava from readme got this error: UnsupportedOperation: fileno quick fix by checking hasattr

---
 llama_cpp/_utils.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py
index 1b61eec69..171f357f5 100644
--- a/llama_cpp/_utils.py
+++ b/llama_cpp/_utils.py
@@ -17,14 +17,18 @@ def __enter__(self):
         if self.disable:
             return self

+        # Check if sys.stdout and sys.stderr have fileno method
+        if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
+            return self # Return the instance without making changes
+
         self.outnull_file = self.open(self.os.devnull, "w")
         self.errnull_file = self.open(self.os.devnull, "w")

         self.old_stdout_fileno_undup = self.sys.stdout.fileno()
         self.old_stderr_fileno_undup = self.sys.stderr.fileno()

-        self.old_stdout_fileno = self.os.dup(self.sys.stdout.fileno())
-        self.old_stderr_fileno = self.os.dup(self.sys.stderr.fileno())
+        self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup)
+        self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup)

         self.old_stdout = self.sys.stdout
         self.old_stderr = self.sys.stderr
@@ -40,14 +44,16 @@ def __exit__(self, *_):
         if self.disable:
             return

-        self.sys.stdout = self.old_stdout
-        self.sys.stderr = self.old_stderr
+        # Check if sys.stdout and sys.stderr have fileno method
+        if hasattr(self.sys.stdout, 'fileno') and hasattr(self.sys.stderr, 'fileno'):
+            self.sys.stdout = self.old_stdout
+            self.sys.stderr = self.old_stderr

-        self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
-        self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+            self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+            self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

-        self.os.close(self.old_stdout_fileno)
-        self.os.close(self.old_stderr_fileno)
+            self.os.close(self.old_stdout_fileno)
+            self.os.close(self.old_stderr_fileno)

-        self.outnull_file.close()
-        self.errnull_file.close()
+            self.outnull_file.close()
+            self.errnull_file.close()
From 86f1ee1f77009751dc260810251b4b1a3a05fde1 Mon Sep 17 00:00:00 2001
From: zocainViken <75504411+zocainViken@users.noreply.github.com>
Date: Sat, 2 Dec 2023 11:55:29 +0100
Subject: [PATCH 3/4] multi modal params fix: add logits = True -> to make llava work

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dd4eb52ed..8920f0e3b 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,8 @@ Then you'll need to use a custom chat handler to load the clip model and process
 >>> llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
-  n_ctx=2048 # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  logits_all=True,# needed to make llava work
 )
 >>> llm.create_chat_completion(
     messages = [
From 077155392c1bf70766bb1845a07a567a84f0c204 Mon Sep 17 00:00:00 2001
From: zocainViken <75504411+zocainViken@users.noreply.github.com>
Date: Sat, 2 Dec 2023 12:09:18 +0100
Subject: [PATCH 4/4] multi modal params fix: add logits = True -> to make llava work

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8920f0e3b..a79fafc4c 100644
--- a/README.md
+++ b/README.md
@@ -265,8 +265,8 @@ Then you'll need to use a custom chat handler to load the clip model and process
 >>> llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
   logits_all=True,# needed to make llava work
+  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
 )
 >>> llm.create_chat_completion(
     messages = [
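
Applied in order, patches 3 and 4 leave the README's llava example passing both logits_all=True and a larger n_ctx to Llama. A minimal end-to-end sketch of that configuration is below; it assumes the Llava15ChatHandler helper from llama_cpp.llama_chat_format, and the model paths and image URL are placeholders, not values taken from these patches.

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Placeholder paths: point these at a llava .gguf model and its clip/mmproj file.
chat_handler = Llava15ChatHandler(clip_model_path="./path/to/llava/mmproj.bin")

llm = Llama(
    model_path="./path/to/llava/llama-model.gguf",
    chat_handler=chat_handler,
    logits_all=True,  # needed to make llava work
    n_ctx=2048,       # n_ctx should be increased to accommodate the image embedding
)

# OpenAI-style multimodal message: one image part and one text part.
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are an assistant who perfectly describes images."},
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
                {"type": "text", "text": "Describe this image in detail please."},
            ],
        },
    ],
)
print(response["choices"][0]["message"]["content"])
```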