In [40]:
from typing import List,Any,Dict,Tuple,Optional
from llama_cpp import Llama,llama_types,llama_chat_format

def _format_add_colon_single(
    system_message: str, messages: List[Tuple[str, Optional[str]]], sep: str
) -> str:
    """Format the prompt with the add-colon-single style."""
    ret = system_message + sep
    for role, message in messages:
        if message:
            ret += role + ": " + message + sep
        else:
            ret += role + ":"
    return ret
    
def _map_roles(
    messages: List[llama_types.ChatCompletionRequestMessage], role_map: Dict[str, str]
) -> List[Tuple[str, Optional[str]]]:
    """Map the message roles."""
    output: List[Tuple[str, Optional[str]]] = []
    for message in messages:
        role = message["role"]
        if role in role_map:
            output.append((role_map[role], message["content"]))
    return output

def format_deepseek(
    messages: List[llama_types.ChatCompletionRequestMessage],
    **kwargs: Any,
) -> llama_chat_format.ChatFormatterResponse:
    _roles = dict(user="### Instruction:", assistant="### Assistant:")
    _sep = "\n"
    _system_message = f'''You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.'''
    _messages = _map_roles(messages, _roles)
    _messages.append((_roles["assistant"], None))
    _prompt = _format_add_colon_single(_system_message, _messages, _sep)
    return llama_chat_format.ChatFormatterResponse(prompt=_prompt)

model = "/Users/ww5/.cache/huggingface/hub/models--TheBloke--deepseek-coder-6.7B-instruct-GGUF/snapshots/9e221e6b41cb1bf1c5d8f9718e81e3dc781f7557/deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
llm = Llama(
      model_path=model,
      chat_handler=format_deepseek,
  n_ctx=4096,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=8,            # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=35         # The number of layers to offload to GPU, if you have GPU acceleration available

)

# result=llm.create_chat_completion(
#       messages = [
#           {"role": "system", "content": "You are a python code developer. You use the bash command line and git commands to modify the A_GIS package repository according to a request."},
#           {
#               "role": "user",
#               "content": "Implement a function to sample from a uniform distribution."
#           }
#       ]
# )
    

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /Users/ww5/.cache/huggingface/hub/models--TheBloke--deepseek-coder-6.7B-instruct-GGUF/snapshots/9e221e6b41cb1bf1c5d8f9718e81e3dc781f7557/deepseek-coder-6.7b-instruct.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = deepseek-ai_deepseek-coder-6.7b-instruct
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11

In [41]:
print(result)

ChatFormatterResponse(prompt='You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n### Instruction:: Implement a function to sample from a uniform distribution.\n### Assistant::', stop=None)


In [42]:
llm("Implement a function to sample from a uniform distribution.")


llama_print_timings:        load time =     292.04 ms
llama_print_timings:      sample time =       1.47 ms /    16 runs   (    0.09 ms per token, 10876.95 tokens per second)
llama_print_timings: prompt eval time =     291.90 ms /    12 tokens (   24.33 ms per token,    41.11 tokens per second)
llama_print_timings:        eval time =     263.32 ms /    15 runs   (   17.55 ms per token,    56.96 tokens per second)
llama_print_timings:       total time =     579.90 ms /    27 tokens


{'id': 'cmpl-854d9c81-10c3-4f25-9756-afde4f43bd35',
 'object': 'text_completion',
 'created': 1709488977,
 'model': '/Users/ww5/.cache/huggingface/hub/models--TheBloke--deepseek-coder-6.7B-instruct-GGUF/snapshots/9e221e6b41cb1bf1c5d8f9718e81e3dc781f7557/deepseek-coder-6.7b-instruct.Q4_K_M.gguf',
 'choices': [{'text': '\n\n\n```python\ndef generate_uniform(low, high):\n',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 12, 'completion_tokens': 16, 'total_tokens': 28}}

In [37]:
result=llm.create_chat_completion(
      messages = [
          {
              "role": "user",
              "content": "Implement a function to sample from a uniform distribution."
          }
      ]
)

In [38]:
print(result)

ChatFormatterResponse(prompt='You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.\n### Instruction:: Implement a function to sample from a uniform distribution.\n### Assistant::', stop=None)


In [39]:
llm("Implement a function to sample from a uniform distribution.")

Llama.generate: prefix-match hit

llama_print_timings:        load time =     168.92 ms
llama_print_timings:      sample time =       1.26 ms /    16 runs   (    0.08 ms per token, 12738.85 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     692.24 ms /    16 runs   (   43.26 ms per token,    23.11 tokens per second)
llama_print_timings:       total time =     713.90 ms /    17 tokens


{'id': 'cmpl-d2992b7c-40e6-41b8-a247-3104f0a754db',
 'object': 'text_completion',
 'created': 1709488864,
 'model': '/Users/ww5/.cache/huggingface/hub/models--TheBloke--deepseek-coder-6.7B-instruct-GGUF/snapshots/9e221e6b41cb1bf1c5d8f9718e81e3dc781f7557/deepseek-coder-6.7b-instruct.Q4_K_M.gguf',
 'choices': [{'text': '\n<jupyter_code>\nimport random\r\ndef sample',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'length'}],
 'usage': {'prompt_tokens': 12, 'completion_tokens': 16, 'total_tokens': 28}}

In [43]:
prompt="Write a uniform distribution."

In [44]:
output = llm(
  "User: {prompt}\n\nAssistant:", # Prompt
  max_tokens=512,  # Generate up to 512 tokens
  stop=["</s>"],   # Example stop token - not necessarily correct for this specific model! Please check before using.
  echo=True        # Whether to echo the prompt
)

Llama.generate: prefix-match hit

llama_print_timings:        load time =     292.04 ms
llama_print_timings:      sample time =      38.76 ms /   512 runs   (    0.08 ms per token, 13208.47 tokens per second)
llama_print_timings: prompt eval time =     267.17 ms /    11 tokens (   24.29 ms per token,    41.17 tokens per second)
llama_print_timings:        eval time =    9135.23 ms /   511 runs   (   17.88 ms per token,    55.94 tokens per second)
llama_print_timings:       total time =   10081.43 ms /   522 tokens


In [45]:
print(output)

{'id': 'cmpl-5f7576ba-36e1-4fb5-927b-3c428ce24fa1', 'object': 'text_completion', 'created': 1709489024, 'model': '/Users/ww5/.cache/huggingface/hub/models--TheBloke--deepseek-coder-6.7B-instruct-GGUF/snapshots/9e221e6b41cb1bf1c5d8f9718e81e3dc781f7557/deepseek-coder-6.7b-instruct.Q4_K_M.gguf', 'choices': [{'text': 'User: {prompt}\n\nAssistant: {response}\nUser: {next_user_input}\nAssistant: {next_assistant_reply}\n```\n"""\n\n\nclass Conversation:\n    def __init__(self, max_lines=10):\n        self.max_lines = max_lines\n        self.convo = []\n        self.current = 0\n        self.template = """\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line}\\n\\n{line