From 125ebe85db5653f060b38485f5a886ebfd8dc66a Mon Sep 17 00:00:00 2001
From: zocainViken <75504411+zocainViken@users.noreply.github.com>
Date: Sat, 18 Nov 2023 00:10:35 +0100
Subject: [PATCH 1/4] bug fixing

---
 .../low_level_api/low_level_api_llama_cpp.py  | 33 +++++++---
 .../readme/low_level_api_llama_cpp.md         | 61 +++++++++++++++++++
 2 files changed, 86 insertions(+), 8 deletions(-)
 create mode 100644 examples/low_level_api/readme/low_level_api_llama_cpp.md

diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py
index e3cff3251..3d8bbe142 100644
--- a/examples/low_level_api/low_level_api_llama_cpp.py
+++ b/examples/low_level_api/low_level_api_llama_cpp.py
@@ -11,20 +11,34 @@

 prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n"

-lparams = llama_cpp.llama_context_default_params()
+lparams = llama_cpp.llama_model_default_params()
+cparams = llama_cpp.llama_context_default_params()
 model = llama_cpp.llama_load_model_from_file(MODEL_PATH.encode('utf-8'), lparams)
-ctx = llama_cpp.llama_new_context_with_model(model, lparams)
+ctx = llama_cpp.llama_new_context_with_model(model, cparams)

 # determine the required inference memory per token:
 tmp = [0, 1, 2, 3]
-llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS)
+llama_cpp.llama_eval(
+    ctx = ctx,
+    tokens=(llama_cpp.c_int * len(tmp))(*tmp),
+    n_tokens=len(tmp),
+    n_past=0
+    )# Deprecated

 n_past = 0

 prompt = b" " + prompt

 embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))()
-n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True)
+n_of_tok = llama_cpp.llama_tokenize(
+    model=model,
+    text=bytes(str(prompt),'utf-8'),
+    text_len=len(embd_inp),
+    tokens=embd_inp,
+    n_max_tokens=len(embd_inp),
+    add_bos=False,
+    special=False
+)
 embd_inp = embd_inp[:n_of_tok]

 n_ctx = llama_cpp.llama_n_ctx(ctx)
@@ -49,8 +63,11 @@
 while remaining_tokens > 0:
     if len(embd) > 0:
         llama_cpp.llama_eval(
-            ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS
-        )
+            ctx = ctx,
+            tokens=(llama_cpp.c_int * len(embd))(*embd),
+            n_tokens=len(embd),
+            n_past=n_past
+        )# Deprecated

     n_past += len(embd)
     embd = []
@@ -93,7 +110,7 @@
     for id in embd:
         size = 32
         buffer = (ctypes.c_char * size)()
-        n = llama_cpp.llama_token_to_piece_with_model(
+        n = llama_cpp.llama_token_to_piece(
             model, llama_cpp.llama_token(id), buffer, size)
         assert n <= size
         print(
@@ -109,4 +126,4 @@

 llama_cpp.llama_print_timings(ctx)

-llama_cpp.llama_free(ctx)
+llama_cpp.llama_free(ctx)
\ No newline at end of file
diff --git a/examples/low_level_api/readme/low_level_api_llama_cpp.md b/examples/low_level_api/readme/low_level_api_llama_cpp.md
new file mode 100644
index 000000000..5f350ffe9
--- /dev/null
+++ b/examples/low_level_api/readme/low_level_api_llama_cpp.md
@@ -0,0 +1,61 @@
+# Low-Level API for Llama_cpp
+
+## Overview
+This Python script, low_level_api_llama_cpp.py, demonstrates how to use the low-level API of the llama_cpp library. It runs inference on a given prompt with a .gguf model and prints the generated text.
+
+### Prerequisites
+Before running the script, ensure that you have the following dependencies installed:
+
+- Python 3.6 or higher
+- llama_cpp: A C++ library with Python bindings for working with .gguf models
+- NumPy: A fundamental package for scientific computing with Python
+- multiprocessing: A Python standard-library module for parallel computing
+
+### Usage
+Install the dependencies (ctypes, os, and multiprocessing are part of the Python standard library):
+```bash
+python -m pip install llama-cpp-python numpy
+```
+Run the script:
+```bash
+python low_level_api_llama_cpp.py
+```
+
+## Code Structure
+The script is organized as follows:
+
+### 1. Initialization
+- Load the model from the specified path.
+- Create a context for model evaluation.
+
+### 2. Tokenization
+- Tokenize the input prompt using the llama_tokenize function.
+- Prepare the input tokens for model evaluation.
+
+### 3. Inference
+- Perform model evaluation to generate responses.
+- Sample from the model's output using various strategies (top-k, top-p, temperature).
+
+### 4. Output
+- Print the generated tokens and the corresponding decoded text.
+
+### 5. Cleanup
+- Free resources and print timing information.
+
+## Configuration
+Customize the inference behavior by adjusting the following variables:
+
+- N_THREADS: Number of CPU threads to use for model evaluation.
+- MODEL_PATH: Path to the model file.
+- prompt: Input prompt for the model.
+
+## Notes
+- Ensure that the llama_cpp library is built and available on the system. Follow the instructions in the llama_cpp repository for building and installing the library.
+
+- This script is designed to work with .gguf models and may require modifications for compatibility with other models.
+
+## Acknowledgments
+This code is based on the llama_cpp library developed by the community. Special thanks to the contributors for their efforts.
+
+## License
+This project is licensed under the MIT License - see the LICENSE file for details.
\ No newline at end of file
From b38278a54980f734cef14e34e307371473f5c544 Mon Sep 17 00:00:00 2001
From: zocainViken <75504411+zocainViken@users.noreply.github.com>
Date: Fri, 1 Dec 2023 00:36:15 +0100
Subject: [PATCH 2/4] llava from readme got this error: UnsupportedOperation: fileno quick fix by checking hasattr

---
 llama_cpp/_utils.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/llama_cpp/_utils.py b/llama_cpp/_utils.py
index 1b61eec69..171f357f5 100644
--- a/llama_cpp/_utils.py
+++ b/llama_cpp/_utils.py
@@ -17,14 +17,18 @@ def __enter__(self):
         if self.disable:
             return self

+        # Check if sys.stdout and sys.stderr have fileno method
+        if not hasattr(self.sys.stdout, 'fileno') or not hasattr(self.sys.stderr, 'fileno'):
+            return self # Return the instance without making changes
+
         self.outnull_file = self.open(self.os.devnull, "w")
         self.errnull_file = self.open(self.os.devnull, "w")

         self.old_stdout_fileno_undup = self.sys.stdout.fileno()
         self.old_stderr_fileno_undup = self.sys.stderr.fileno()

-        self.old_stdout_fileno = self.os.dup(self.sys.stdout.fileno())
-        self.old_stderr_fileno = self.os.dup(self.sys.stderr.fileno())
+        self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup)
+        self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup)

         self.old_stdout = self.sys.stdout
         self.old_stderr = self.sys.stderr
@@ -40,14 +44,16 @@ def __exit__(self, *_):
         if self.disable:
             return

-        self.sys.stdout = self.old_stdout
-        self.sys.stderr = self.old_stderr
+        # Check if sys.stdout and sys.stderr have fileno method
+        if hasattr(self.sys.stdout, 'fileno') and hasattr(self.sys.stderr, 'fileno'):
+            self.sys.stdout = self.old_stdout
+            self.sys.stderr = self.old_stderr

-        self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
-        self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)
+            self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
+            self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

-        self.os.close(self.old_stdout_fileno)
-        self.os.close(self.old_stderr_fileno)
+            self.os.close(self.old_stdout_fileno)
+            self.os.close(self.old_stderr_fileno)

-        self.outnull_file.close()
-        self.errnull_file.close()
+            self.outnull_file.close()
+            self.errnull_file.close()
From 86f1ee1f77009751dc260810251b4b1a3a05fde1 Mon Sep 17 00:00:00 2001
From: zocainViken <75504411+zocainViken@users.noreply.github.com>
Date: Sat, 2 Dec 2023 11:55:29 +0100
Subject: [PATCH 3/4] multi modal params fix: add logits = True -> to make llava work

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dd4eb52ed..8920f0e3b 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,8 @@ Then you'll need to use a custom chat handler to load the clip model and process
 >>> llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
-  n_ctx=2048 # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  logits_all=True,# needed to make llava work
 )
 >>> llm.create_chat_completion(
     messages = [
From 077155392c1bf70766bb1845a07a567a84f0c204 Mon Sep 17 00:00:00 2001
From: zocainViken <75504411+zocainViken@users.noreply.github.com>
Date: Sat, 2 Dec 2023 12:09:18 +0100
Subject: [PATCH 4/4] multi modal params fix: add logits = True -> to make llava work

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8920f0e3b..a79fafc4c 100644
--- a/README.md
+++ b/README.md
@@ -265,8 +265,8 @@ Then you'll need to use a custom chat handler to load the clip model and process
 >>> llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
   logits_all=True,# needed to make llava work
+  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
 )
 >>> llm.create_chat_completion(
     messages = [
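
Applied in order, patches 3 and 4 leave the README's llava example passing both logits_all=True and a larger n_ctx to Llama. A minimal end-to-end sketch of that configuration is below; it assumes the Llava15ChatHandler helper from llama_cpp.llama_chat_format, and the model paths and image URL are placeholders, not values taken from these patches.

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Placeholder paths: point these at a llava .gguf model and its clip/mmproj file.
chat_handler = Llava15ChatHandler(clip_model_path="./path/to/llava/mmproj.bin")

llm = Llama(
    model_path="./path/to/llava/llama-model.gguf",
    chat_handler=chat_handler,
    logits_all=True,  # needed to make llava work
    n_ctx=2048,       # n_ctx should be increased to accommodate the image embedding
)

# OpenAI-style multimodal message: one image part and one text part.
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are an assistant who perfectly describes images."},
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
                {"type": "text", "text": "Describe this image in detail please."},
            ],
        },
    ],
)
print(response["choices"][0]["message"]["content"])
```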