In [1]:
pip install llama-cpp-python 

Collecting llama-cpp-python
  Using cached llama_cpp_python-0.3.7.tar.gz (66.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml): started
  Building wheel for llama-cpp-python (pyproject.toml): still running...
  Building wheel for llama-cpp-python (pyproject.toml): finished with status 'done'
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.7-cp311-cp311-win_amd64.whl size=3791368 sha256=8b9edf04de828320e477d6180b9668b3b91598aad90f5afed6dea2cc8afe0d29
  Stored in directory:

In [3]:
from llama_cpp import Llama
import gradio as gr
import os

# Function to load the GGUF model
def load_gguf_model(model_path, n_ctx=2048, n_gpu_layers=0):
    try:
        model = Llama(
            model_path=model_path,
            n_ctx=n_ctx,
            n_gpu_layers=n_gpu_layers
        )
        return model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None

# Load model globally
model_path = r"llama-3-8b-Instruct-bnb-4bit-scrapegraph-companion-unsloth.Q4_K_M.gguf"
llm = load_gguf_model(model_path)

# File to store conversation history
CONTEXT_FILE = "conversation_history.txt"
PROMPT_FILE = "custom_prompt.txt"

# Default prompt section (Modify this as needed)
CUSTOM_PROMPT = """
You are Sora, Arnav Mishra's personal assistant. Arnav Mishra is a human. Arnav is busy till 10:00 AM.
"""

# Load custom prompt if available
def load_custom_prompt():
    """Loads the custom prompt from a file if available."""
    if os.path.exists(PROMPT_FILE):
        with open(PROMPT_FILE, "r", encoding="utf-8") as file:
            return file.read().strip()
    return CUSTOM_PROMPT  # Fallback to default prompt

def load_conversation_history():
    """Loads the saved conversation history from a text file."""
    if os.path.exists(CONTEXT_FILE):
        with open(CONTEXT_FILE, "r", encoding="utf-8") as file:
            return file.read().strip()
    return ""

def save_conversation(message, response):
    """Appends user input and model response to the history file."""
    with open(CONTEXT_FILE, "a", encoding="utf-8") as file:
        file.write(f"User: {message}\nAssistant: {response}\n\n")

def chat_with_model(message, history):
    """
    Chat function for Gradio with a manual prompt and context saving.
    """
    # Load past conversation history
    past_conversation = load_conversation_history()
    
    # Load custom prompt
    system_prompt = load_custom_prompt()

    # Build conversation prompt
    conversation = f"{system_prompt}\n{past_conversation}\nUser: {message}\nAssistant:"

    # Get model response
    response = llm.create_completion(
        conversation,
        max_tokens=150,
        temperature=0.7,
        stop=["User:", "Assistant:"]
    )

    response_text = response['choices'][0]['text'].strip()

    # Save the new interaction
    save_conversation(message, response_text)

    return response_text

# Create Gradio interface
demo = gr.ChatInterface(
    fn=chat_with_model,
    title="GGUF Model Chatbot",
    description="Chat with your local GGUF model",
    examples=["Tell me a joke", "What is machine learning?", "Write a short poem"],
    theme="default"
)

# Launch the interface
demo.launch(share=True)  # Set share=False to disable public URL


llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from llama-3-8b-Instruct-bnb-4bit-scrapegraph-companion-unsloth.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = llama-3-8b-Instruct-bnb-4bit-scrapegr...
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_mod

* Running on local URL:  http://127.0.0.1:7862

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.




llama_perf_context_print:        load time =   29862.11 ms
llama_perf_context_print: prompt eval time =   29861.55 ms /   793 tokens (   37.66 ms per token,    26.56 tokens per second)
llama_perf_context_print:        eval time =   13611.87 ms /    69 runs   (  197.27 ms per token,     5.07 tokens per second)
llama_perf_context_print:       total time =   43567.16 ms /   862 tokens
Llama.generate: 770 prefix-match hit, remaining 102 prompt tokens to eval
llama_perf_context_print:        load time =   29862.11 ms
llama_perf_context_print: prompt eval time =    3744.84 ms /   102 tokens (   36.71 ms per token,    27.24 tokens per second)
llama_perf_context_print:        eval time =   11972.78 ms /    60 runs   (  199.55 ms per token,     5.01 tokens per second)
llama_perf_context_print:       total time =   15795.58 ms /   162 tokens
Llama.generate: 860 prefix-match hit, remaining 83 prompt tokens to eval
llama_perf_context_print:        load time =   29862.11 ms
llama_perf_context_print

In [5]:
demo = gr.ChatInterface(
    fn=chat_with_model,
    title="GGUF Model Chatbot",
    description="Chat with your local GGUF model",
    examples=["Tell me a joke", "What is machine learning?", "Write a short poem"],
    theme=gr.themes.Soft(),  # Different theme
    chatbot=gr.Chatbot(height=600),  # Taller chat window
    textbox=gr.Textbox(placeholder="Type your message here...", container=False),
    retry_btn="Retry",
    undo_btn="Undo Last",
    clear_btn="Clear",
)



TypeError: ChatInterface.__init__() got an unexpected keyword argument 'retry_btn'

llama_perf_context_print:        load time =    1136.33 ms
llama_perf_context_print: prompt eval time =    1136.23 ms /     7 tokens (  162.32 ms per token,     6.16 tokens per second)
llama_perf_context_print:        eval time =    3300.26 ms /    15 runs   (  220.02 ms per token,     4.55 tokens per second)
llama_perf_context_print:       total time =    4458.44 ms /    22 tokens
Llama.generate: 20 prefix-match hit, remaining 14 prompt tokens to eval
llama_perf_context_print:        load time =    1136.33 ms
llama_perf_context_print: prompt eval time =     564.73 ms /    14 tokens (   40.34 ms per token,    24.79 tokens per second)
llama_perf_context_print:        eval time =   11646.62 ms /    51 runs   (  228.37 ms per token,     4.38 tokens per second)
llama_perf_context_print:       total time =   12285.00 ms /    65 tokens
Llama.generate: 82 prefix-match hit, remaining 21 prompt tokens to eval
llama_perf_context_print:        load time =    1136.33 ms
llama_perf_context_print: p