# Run with HF

In [14]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Seed torch's RNG up front so repeated runs of this cell start from the same state.
torch.random.manual_seed(0)

# The weights and the tokenizer are both pulled from this one Hub checkpoint.
model_id = "microsoft/Phi-3-mini-4k-instruct"

# NOTE(review): trust_remote_code=True allows modelling code shipped with the
# checkpoint repository to be executed locally -- only keep it for sources you trust.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Conversation history as (role, content) pairs, oldest turn first.  The final
# user turn is the question the model is expected to answer.
_turns = (
    ("system", "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."),
    ("user", "Can you provide ways to eat combinations of bananas and dragonfruits?"),
    ("assistant", "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."),
    ("user", "What about solving an 2x + 3 = 7 equation?"),
)

# Chat-format list of {"role", "content"} dicts passed to the pipeline.
messages = [{"role": role, "content": content} for role, content in _turns]

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Keyword arguments forwarded to the pipeline call.
#
# Decoding is greedy (do_sample=False).  `temperature` is a sampling-only
# setting, so it is intentionally not passed: with sampling disabled it has no
# effect on the output and only triggers a generation-config validation warning.
generation_args = {
    "max_new_tokens": 500,      # upper bound on the length of the generated reply
    "return_full_text": False,  # return only the newly generated text, not the prompt
    "do_sample": False,         # deterministic greedy decoding
}

# Run the conversation through the pipeline; `output` is kept as a name because
# the following cell inspects it.
output = pipe(messages, **generation_args)

# Show the text generated for the first (only) result.
reply = output[0]["generated_text"]
print(reply)




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 To solve the equation 2x + 3 = 7, follow these steps:

1. Subtract 3 from both sides of the equation to isolate the term with the variable (x):
   2x + 3 - 3 = 7 - 3
   2x = 4

2. Divide both sides of the equation by 2 to solve for x:
   2x / 2 = 4 / 2
   x = 2

So, the solution to the equation 2x + 3 = 7 is x = 2.


In [18]:
# Inspect which fields the pipeline returned for the first result.
output[0].keys()

dict_keys(['generated_text'])

# Run with llama_cpp

In [None]:
!apt-get update;
!wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64 -O cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-10-0-local-10.0.130-410.48_1.0-1_amd64.deb
!apt-key add /var/cuda-repo-10-0-local/7fa2af80.pub
!apt-get update
!apt-get -y install gcc-7 g++-7
!apt-get -y install cuda

!export PATH=/usr/local/cuda/bin${PATH:+:${PATH}}
!export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

In [21]:
# Build and install llama-cpp-python from pip with CMAKE_ARGS requesting the
# cuBLAS backend, so that GPU offload is available.
# NOTE(review): the CMake option name appears to be version-dependent (newer
# llama-cpp-python documentation uses -DGGML_CUDA=on) -- confirm it matches the
# release that pip resolves.
! CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python



In [7]:
from llama_cpp import Llama

In [None]:
# Options for loading the quantized Phi-3 GGUF checkpoint.
llama_options = dict(
    # Path to the GGUF file.
    model_path="./Phi-3-mini-4k-instruct-q4.gguf",
    # Maximum sequence length; longer sequences require much more resources.
    n_ctx=4096,
    # Number of CPU threads; tailor to your system and the resulting performance.
    n_threads=8,
    # Number of layers to offload to the GPU when GPU acceleration is available.
    # Set to 0 if no GPU acceleration is available on your system.
    n_gpu_layers=35,
)

llm = Llama(**llama_options)

In [9]:
# User question that gets wrapped in the chat markers for the inference call.
prompt = "How to explain Internet to a medieval knight?"

In [10]:
# Simple inference example.
# Wrap the question in the <|user|> / <|assistant|> chat markers first.
chat_prompt = f"<|user|>\n{prompt}<|end|>\n<|assistant|>"

# Generate up to 256 tokens, stopping early at the <|end|> marker.
# echo=True asks for the prompt to be echoed back in the returned text.
output = llm(chat_prompt, max_tokens=256, stop=["<|end|>"], echo=True)


llama_print_timings:        load time =     967.51 ms
llama_print_timings:      sample time =     165.30 ms /   256 runs   (    0.65 ms per token,  1548.68 tokens per second)
llama_print_timings: prompt eval time =     967.20 ms /    16 tokens (   60.45 ms per token,    16.54 tokens per second)
llama_print_timings:        eval time =   38602.00 ms /   255 runs   (  151.38 ms per token,     6.61 tokens per second)
llama_print_timings:       total time =   40738.45 ms /   271 tokens


In [11]:
# Print the text of the first returned choice.
first_choice = output["choices"][0]
print(first_choice["text"])

<|user|>
How to explain Internet to a medieval knight?<|end|>
<|assistant|> To explain the concept of the "Internet" to a medieval knight, we must simplify and draw parallels with familiar concepts:

Imagine, noble knight, that you possess an invisible network connecting all people across vast lands. This magical web allows messengers (much like your own mounted couriers) to carry information at unthinkable speeds, as if by some arcane enchantment. Instead of relying solely on physical letters or word-of-mouth, this mystical realm enables people from distant lands to share knowledge and news instantaneously through a series of interconnected channels – much like a vast network of roads crisscrossing the kingdoms in your time.

The "Internet" is an extraordinary library where every book ever written can be found at one's fingertips, accessible from anywhere within this web-like system. In essence, it connects distant lands and their inhabitants through a shared pool of information, enha

# Run with gguf

In [5]:
# Install llama-cpp-python with pip's defaults (no CMAKE_ARGS are passed in this cell).
!pip install llama-cpp-python



In [6]:
from llama_cpp import Llama

In [23]:
# Set this to the location of the GGUF model file that you downloaded from Hugging Face.
model_path = "/content/drive/MyDrive/LLAMA/llama-cpp/Phi-3-mini-4k-instruct-q4.gguf"