<a href="https://colab.research.google.com/github/aliakyurek/llm/blob/main/Mistral_instruct_ctransformers_llamacpp_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:
* https://medium.com/@scholarly360/mistral-7b-complete-guide-on-colab-129fa5e9a04d
* https://vilsonrodrigues.medium.com/run-llama-2-models-in-a-colab-instance-using-ggml-and-ctransformers-41c1d6f0e6ad

In [None]:
!wget https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q5_K_S.gguf

In [None]:
!wget https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q5_K_S.gguf

Mount the drive first!

In [None]:
!cp "/content/mistral-7b-v0.1.Q5_K_S.gguf" "/content/drive/MyDrive/models"

In [10]:
!cp "/content/mistral-7b-instruct-v0.1.Q5_K_S.gguf" "/content/drive/MyDrive/models"

In [2]:
!pip install -q ctransformers[cuda]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.8/417.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m845.8/845.8 kB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25h

#### Auto-regressive model

In [None]:
from ctransformers import AutoModelForCausalLM

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-v0.1-GGUF", model_file="/content/drive/MyDrive/models/mistral-7b-v0.1.Q5_K_S.gguf",
                                           model_type="mistral", gpu_layers=50)

In [None]:
for text in llm("Name the planets in the solar system?", stream=True):
    print(text, end="", flush=True)

#### Instruct model

In [3]:
from ctransformers import AutoModelForCausalLM

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.1-GGUF", model_file="/content/drive/MyDrive/models/mistral-7b-instruct-v0.1.Q5_K_S.gguf",
                                           model_type="mistral", gpu_layers=50)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

(…)a64a14aea61a4c468bbbf9f258a8/config.json:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Fetching 0 files: 0it [00:00, ?it/s]

In [4]:
template = """<s>[INST] You are a helpful, respectful and honest assistant. Answer exactly in few words from the context
Answer the question below:
{question} [/INST] </s>
"""

In [6]:
prompt = template.format(question="Name the planets in the solar system?")

In [7]:
llm(prompt)

'The eight planets in our solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune.'

In [8]:
tokens = llm.tokenize(prompt)

In [10]:
for text in llm("Name the planets in the solar system?", stream=True):
    print(text, end="", flush=True)


The eight planets in our solar system are, in order from the Sun: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune.

#### Manual streaming

In [9]:
import time
start = time.time()
NUM_TOKENS=0
print('-'*4+'Start Generation'+'-'*4)
for token in llm.generate(tokens):
    print(llm.detokenize(token), end='', flush=True)
    NUM_TOKENS+=1
time_generate = time.time() - start
print('\n')
print('-'*4+'End Generation'+'-'*4)
print(f'Num of generated tokens: {NUM_TOKENS}')
print(f'Time for complete generation: {time_generate}s')
print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

----Start Generation----
The eight planets in our solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune.

----End Generation----
Num of generated tokens: 34
Time for complete generation: 1.4930694103240967s
Tokens per secound: 22.77188171219696
Time per token: 43.913806186002844ms


#### Langchain

In [11]:
!pip install -q langchain

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import CTransformers
config = {'max_new_tokens': 100, 'temperature': 0, 'gpu_layers': 50}
llm = CTransformers(model="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", model_file="/content/drive/MyDrive/models/mistral-7b-instruct-v0.1.Q5_K_S.gguf", config=config)

template = """<s>[INST] You are a helpful, respectful and honest assistant.Answer the question below:
{question} [/INST] </s>
"""

#### Prompt
question = """Name the planets in the solar system?"""

prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
response = llm_chain.run({"question":question})
response

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

'There are eight planets in our solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune.'

In [2]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python


Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.11.tar.gz (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.11-cp310-cp310-manylinux_2_35_x86_64.whl size=6423567 sha256=c7186d6776fd95bd1aa5df40c59d07d6411a11c9799cce0213fe4d5fedd65f55
  Store

In [5]:
prompt = template.format(question="Name the planets in the solar system?")

In [None]:
from llama_cpp import Llama
llm = Llama(model_path="/content/drive/MyDrive/models/mistral-7b-instruct-v0.1.Q5_K_S.gguf")


In [7]:
llm(prompt, max_tokens=100, stop=["Q:", "\n"], echo=True)

Llama.generate: prefix-match hit


{'id': 'cmpl-9ccd6eb8-4ae7-4469-b967-c38fbb2e58ec',
 'object': 'text_completion',
 'created': 1697455276,
 'model': '/content/drive/MyDrive/models/mistral-7b-instruct-v0.1.Q5_K_S.gguf',
 'choices': [{'text': '<s>[INST] You are a helpful, respectful and honest assistant.Answer the question below:\nName the planets in the solar system? [/INST] </s>\nThere are eight planets in our solar system. They are, in order from the sun: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune.',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 41, 'completion_tokens': 43, 'total_tokens': 84}}