In [20]:
from langchain_community.chat_models.llamacpp import ChatLlamaCpp
from langchain.schema import SystemMessage, HumanMessage, AIMessage
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    AIMessagePromptTemplate
)

from langchain.tools import Tool
from langchain.prompts import MessagesPlaceholder

import sys
import os
sys.path.append('../../littleSeven/')
from common.config import cfg

In [2]:
from os.path import expanduser

# Expand a leading "~" in the configured model path before passing it on.
model_path = expanduser(cfg.llm_gguf_model_path_2)

# Chat-model wrapper around the local llama.cpp model. With streaming enabled
# and the stdout callback attached, generated tokens are written to stdout as
# they arrive; verbose=True also surfaces the llama.cpp load/perf logs.
llm = ChatLlamaCpp(
    model_path=model_path,
    callbacks=[StreamingStdOutCallbackHandler()],
    streaming=True,
    temperature=0.7,
    verbose=True,
)

llama_load_model_from_file: using device Metal (Apple M2 Max) - 21845 MiB free
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../model_gguf/llama-2-7b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
l

In [3]:
# Build a minimal conversation: one system instruction plus one user question.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="What is the capital of France?"),
]

# Call the model through .invoke(). Calling the model object directly
# (llm(messages)) is the deprecated entry point and emits a deprecation
# warning on every run.
response = llm.invoke(messages)

# NOTE: the streaming stdout callback attached to `llm` already echoes the
# tokens while they are generated, so this prints the final text a second time.
print(response.content)

  response = llm(messages)


  Of course! The capital of France is Paris. 🇫🇷

llama_perf_context_print:        load time =     992.78 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    32 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    20 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    1796.43 ms /    52 tokens


  Of course! The capital of France is Paris. 🇫🇷


In [4]:
# Bare expression: display the full message object (content plus metadata),
# not just the text that print(response.content) shows.
response

AIMessage(content='  Of course! The capital of France is Paris. 🇫🇷', additional_kwargs={}, response_metadata={'finish_reason': 'stop'}, id='run-d0c107a8-e39c-421f-92f0-9d9ca69dec06-0')

#### add chat prompt template

In [8]:
# System turn: a fixed instruction with no placeholders.
system_message = SystemMessagePromptTemplate.from_template(
    "You are a helpful assistant. Respond concisely to the user's query."
)

# User turn: the whole message is the {user_input} placeholder.
human_message = HumanMessagePromptTemplate.from_template("{user_input}")

# Combine both turns into a chat prompt; no conversation history is included.
chat_prompt = ChatPromptTemplate.from_messages(
    [
        system_message,
        human_message,
    ]
)

In [9]:
# Bare expression: display the template's repr to inspect its input variables
# and the nested message templates.
chat_prompt

ChatPromptTemplate(input_variables=['user_input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template="You are a helpful assistant. Respond concisely to the user's query."), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['user_input'], input_types={}, partial_variables={}, template='{user_input}'), additional_kwargs={})])

In [12]:
# Question that will be substituted into the {user_input} placeholder.
user_input = "Who is Stephen Curry in NBA?"

# Render the template with the question, then turn the resulting prompt value
# into the list of chat messages that is sent to the model.
prompt_values = chat_prompt.format_prompt(user_input=user_input)
messages = prompt_values.to_messages()

# Print every message as "<type>: <content>" to check the rendered prompt.
for msg in messages:
    print(msg.type, msg.content, sep=": ")

system: You are a helpful assistant. Respond concisely to the user's query.
human: Who is Stephen Curry in NBA?


In [13]:
# Send the rendered messages to the model. .invoke() replaces the deprecated
# direct call llm(messages), which emits a deprecation warning.
response = llm.invoke(messages)

# Print the final answer text with a speaker label.
print("Assistant:", response.content)

Llama.generate: 33 prefix-match hit, remaining 12 prompt tokens to eval


  Stephen Curry is a professional basketball player who currently plays for the Golden State Warriors in the National Basketball Association (NBA). He is a shooting guard and point guard known for his incredible shooting ability and quick release. Curry has won numerous awards and accolades throughout his career, including two NBA championships, two NBA Finals MVPs, and four NBA scoring titles. He is widely considered one of the greatest shooters in NBA history and is often referred to as "The Baby-Faced Assassin."

llama_perf_context_print:        load time =     992.78 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    12 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   110 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    4842.55 ms /   122 tokens


Assistant:   Stephen Curry is a professional basketball player who currently plays for the Golden State Warriors in the National Basketball Association (NBA). He is a shooting guard and point guard known for his incredible shooting ability and quick release. Curry has won numerous awards and accolades throughout his career, including two NBA championships, two NBA Finals MVPs, and four NBA scoring titles. He is widely considered one of the greatest shooters in NBA history and is often referred to as "The Baby-Faced Assassin."


In [14]:
# Both turns are templated here: the system text comes from {role_description}
# and the user text from {user_input}.
chat_prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template("{role_description}"),
        HumanMessagePromptTemplate.from_template("{user_input}"),
    ]
)

# Fill in both placeholders to obtain a concrete prompt value.
prompt_values = chat_prompt.format_prompt(
    user_input="What is the largest desert in the world?",
    role_description="You are a knowledgeable assistant specializing in geography.",
)

# Print every rendered message as "<type>: <content>".
for msg in prompt_values.to_messages():
    print(msg.type, msg.content, sep=": ")

system: You are a knowledgeable assistant specializing in geography.
human: What is the largest desert in the world?


In [16]:
# Convert the formatted prompt value into a list of chat message objects.
messages = prompt_values.to_messages()

# Bare expression: display the message list as the cell output.
messages

[SystemMessage(content='You are a knowledgeable assistant specializing in geography.', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='What is the largest desert in the world?', additional_kwargs={}, response_metadata={})]

In [17]:
# Call the LLM. .invoke() replaces the deprecated direct call llm(messages),
# which emits a deprecation warning.
response = llm.invoke(messages)

# Print the model's response.
print("Assistant:", response.content)

Llama.generate: 12 prefix-match hit, remaining 28 prompt tokens to eval


  The largest desert in the world is the Sahara Desert, which covers much of North Africa. It spans across several countries including Algeria, Chad, Egypt, Libya, Mali, Mauritania, Morocco, Niger, and Tunisia. The Sahara Desert is approximately 9.4 million square kilometers (3.6 million square miles) in size, making it the largest hot desert in the world. It is known for its extreme heat and arid conditions, with some areas receiving less than 20 millimeters (0.8 inches) of rainfall per year. The Sahara is also home to several oases, which are fertile areas that support vegetation and wildlife despite the harsh desert environment.

llama_perf_context_print:        load time =     992.78 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    28 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   161 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    7928.30 ms /   189 tokens


Assistant:   The largest desert in the world is the Sahara Desert, which covers much of North Africa. It spans across several countries including Algeria, Chad, Egypt, Libya, Mali, Mauritania, Morocco, Niger, and Tunisia. The Sahara Desert is approximately 9.4 million square kilometers (3.6 million square miles) in size, making it the largest hot desert in the world. It is known for its extreme heat and arid conditions, with some areas receiving less than 20 millimeters (0.8 inches) of rainfall per year. The Sahara is also home to several oases, which are fertile areas that support vegetation and wildlife despite the harsh desert environment.


#### fill the prompt template that can invoke tools

In [33]:



# return local time
def get_local_time(input: str = None) -> str:
    """Report the current local date and time as a sentence.

    Args:
        input: Ignored. Accepted only so the function can be called with a
            single positional argument as well as with none.

    Returns:
        "Current local time is: YYYY-MM-DD HH:MM:SS" on success, or
        "Error retrieving local time: <message>" if reading or formatting
        the clock raises an exception.
    """
    import datetime as _dt

    try:
        now = _dt.datetime.now()
        stamp = now.strftime("%Y-%m-%d %H:%M:%S")
    except Exception as exc:
        # Errors are reported as a string instead of being raised.
        return f"Error retrieving local time: {exc}"
    return f"Current local time is: {stamp}"

# Tool definitions: each entry pairs a name with the callable to run and a
# human-readable description of when to use it.
tools = [
    Tool(
        name="LocalTime",
        description="Use this tool to get the current local time. It does not require any input.",
        func=get_local_time,
    ),
    Tool(
        name="Search",
        description="Use this tool for general information search.",
        # Placeholder: echoes the query back instead of performing a search.
        func=lambda x: f"Searching for: {x}",
    ),
]

In [37]:
# One bullet per tool, "- <name>: <description>", joined with newlines.
tool_descriptions = "\n".join(
    f"- {tool.name}: {tool.description}" for tool in tools
)

# Prompt that lists the available tools and tells the model to answer with an
# "action: ..." / "action_input: ..." pair when a tool is needed.
chat_prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(
            "You are a smart assistant. Here are the tools you can use:\n"
            "{tool_descriptions}\n\n"
            "When a user asks a question, decide if you need to use a tool. "
            "If the question is about the current local time, use the `LocalTime` tool directly. "
            "Respond with the tool's name and the input needed in the following format:\n"
            "action: <tool_name>\n"
            "action_input: <input>\n"
        ),
        HumanMessagePromptTemplate.from_template("{user_input}"),
    ]
)

In [38]:
# Question that should lead the model to request the LocalTime tool.
user_input = "What is the local time?"

# Substitute the tool list and the question into the template.
prompt_values = chat_prompt.format_prompt(
    user_input=user_input,
    tool_descriptions=tool_descriptions,
)

# Turn the prompt value into chat messages and print each one as
# "<type>: <content>" to check the rendered prompt.
messages = prompt_values.to_messages()
for msg in messages:
    print(msg.type, msg.content, sep=": ")

system: You are a smart assistant. Here are the tools you can use:
- LocalTime: Use this tool to get the current local time. It does not require any input.
- Search: Use this tool for general information search.

When a user asks a question, decide if you need to use a tool. If the question is about the current local time, use the `LocalTime` tool directly. Respond with the tool's name and the input needed in the following format:
action: <tool_name>
action_input: <input>

human: What is the local time?


In [39]:
# Invoke the LLM. .invoke() replaces the deprecated direct call llm(messages),
# which emits a deprecation warning.
response = llm.invoke(messages)
reply_text = response.content

# A tool request is expected to look like:
#   action: <tool_name>
#   action_input: <input>
# str.partition() never raises, so a reply that contains "action:" but lacks
# "action_input:" (or has the two markers out of order) falls through to the
# plain-answer branch instead of failing on tuple unpacking or indexing.
before_input, input_marker, after_input = reply_text.partition("action_input:")
_, action_marker, after_action = before_input.partition("action:")

if action_marker and input_marker:
    # Only the first line after "action:" is taken as the tool name; any extra
    # text the model wrote before "action_input:" is ignored.
    name_lines = after_action.strip().splitlines()
    action_name = name_lines[0].strip() if name_lines else ""
    action_input = after_input.strip()

    # Find and call the corresponding tool.
    for tool in tools:
        if tool.name == action_name:
            tool_result = tool.func(action_input)
            print(f"Tool '{action_name}' output:", tool_result)
            break
    else:
        # No registered tool matched: report it rather than finishing silently.
        print(f"Unknown tool '{action_name}' requested. Raw response:", reply_text)
else:
    # No well-formed tool request: treat the reply as a direct answer.
    print("Assistant:", response.content)

Llama.generate: 76 prefix-match hit, remaining 68 prompt tokens to eval


  action: LocalTime
action_input: None

llama_perf_context_print:        load time =     992.78 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    68 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    11 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    2550.88 ms /    79 tokens


Tool 'LocalTime' output: Current local time is: 2025-01-10 21:35:53


In [40]:
# Bare expression: display the raw message object to see the exact
# "action: ..." / "action_input: ..." text the model produced.
response

AIMessage(content='  action: LocalTime\naction_input: None', additional_kwargs={}, response_metadata={'finish_reason': 'stop'}, id='run-d98710ac-159c-4fb9-8188-a3b9797387c0-0')