In [1]:
!pip install mistral_inference

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting mistral_inference
  Downloading mistral_inference-1.5.0-py3-none-any.whl.metadata (14 kB)
Collecting fire>=0.6.0 (from mistral_inference)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting simple-parsing>=0.1.5 (from mistral_inference)
  Downloading simple_parsing-0.1.6-py3-none-any.whl.metadata (7.3 kB)
Collecting docstring-parser<1.0,>=0.15 (from simple-parsing>=0.1.5->mistral_inference)
  Downloading docstring_parser-0.16-py3-none-any.whl.metadata (3.0 kB)
Downloading mistral_inference-1.5.0-py3-none-any.whl (30 kB)
Downloading simple_parsing-0.1.6-py3-none-any.whl (112 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/112.6 kB[0m [31m201.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading docstring_parser-0.

# Model Information

Source: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3 \
Gated Model so it needs a prior login, access verification and usage of an access token

In [5]:
from huggingface_hub import login
login(token="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")

In [6]:
from huggingface_hub import snapshot_download
from pathlib import Path

mistral_models_path = Path.home().joinpath('mistral_models', '7B-Instruct-v0.3')
mistral_models_path.mkdir(parents=True, exist_ok=True)

snapshot_download(repo_id="mistralai/Mistral-7B-Instruct-v0.3", allow_patterns=["params.json", "consolidated.safetensors", "tokenizer.model.v3"], local_dir=mistral_models_path)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

params.json:   0%|          | 0.00/202 [00:00<?, ?B/s]

tokenizer.model.v3:   0%|          | 0.00/587k [00:00<?, ?B/s]

consolidated.safetensors:   0%|          | 0.00/14.5G [00:00<?, ?B/s]

'/home/jovyan/mistral_models/7B-Instruct-v0.3'

# Request Steps
The basic flow of using the local LLM can be described as follows:
1. Define the human-readable message as a string
2. Encapsulate the message from 1. into a structured object the LLM can work with
3. Encode the object from 2. into tokens (basically numbers) that can interpreted by the LLM
4. Pass the tokens to the LLM with additional parameters and receive the answer in tokens
5. Convert the generated tokens back into a human-readable message

In [17]:
from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate

from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage, AssistantMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

# loads a "dictionary" of how to convert text into numbers and back
# used by the LLM to understand text 
tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tokenizer.model.v3")
model = Transformer.from_folder(mistral_models_path)

In [24]:
# 1.
user_message = "Is 2003 a prime number?"

# 2.
completion_request = ChatCompletionRequest(messages=[UserMessage(content=user_message)])

# 3.
tokens = tokenizer.encode_chat_completion(completion_request).tokens

# 4.
out_tokens, _ = generate([tokens], model, max_tokens=256, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)  # end of sequence ID
# Temperature:
#  -> 0.0 deterministic, always same/similar answer
#  -> 1.0 more creative, varying responses

# 5.
result = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])

print(result)

No, 2003 is not a prime number. A prime number is a natural number greater than 1 that has no positive divisors other than 1 and itself. For 2003, it can be divided evenly by 1, 3, 667, and 2003, so it does not meet the criteria to be a prime number.


# Chat History
To simulate a conversation, meaning a back and forth of different messages between the user and the LLM, we need to save the chat history.\
Then, with each LLM call (`generate(...)`) we pass the previous history as an array of UserMessage and AssistantMessage.

In [25]:
message_history=[
    UserMessage(content=user_message),  # from prior question
    AssistantMessage(content=result),  # from prior question
    UserMessage(content="Are you sure?")
]

completion_request = ChatCompletionRequest(messages=message_history)

tokens = tokenizer.encode_chat_completion(completion_request).tokens

out_tokens, _ = generate([tokens], model, max_tokens=256, temperature=0.0, eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)

result = tokenizer.instruct_tokenizer.tokenizer.decode(out_tokens[0])

print(result)

I apologize for the mistake in my previous response. You are correct that 2003 is a prime number. I made an error in my calculation. A prime number is a natural number greater than 1 that has no positive divisors other than 1 and itself. Since 2003 cannot be divided evenly by any number between 2 and 2002, it is a prime number. I apologize for any confusion my previous response may have caused. Thank you for bringing this to my attention.


# Memory Usage Information
Open Terminal
- `nvidia-smi` to check current status of GPUs
- `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` to make room for the model to run


In [10]:
import torch

print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f}GB")
print(f"Reserved GPU memory: {torch.cuda.memory_reserved(0)/1e9:.2f}GB")
print(f"Allocated GPU memory: {torch.cuda.memory_allocated(0)/1e9:.2f}GB")

import gc
torch.cuda.empty_cache()
gc.collect()

print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f}GB")
print(f"Reserved GPU memory: {torch.cuda.memory_reserved(0)/1e9:.2f}GB")
print(f"Allocated GPU memory: {torch.cuda.memory_allocated(0)/1e9:.2f}GB")

Available GPU memory: 50.82GB
Reserved GPU memory: 1.91GB
Allocated GPU memory: 1.90GB
Available GPU memory: 50.82GB
Reserved GPU memory: 1.91GB
Allocated GPU memory: 0.00GB
