# Mistral-7B-Instruct-v0.3 Model Chat-Box

## Imports and Configuration

In [4]:
from IPython.display import display, HTML
import transformers
import gradio
import os, yaml
import logging
import torch

In [5]:
# The project configuration lives two directories above this notebook.
config_path = os.path.join(os.pardir, os.pardir, 'config', 'config.yml')

# safe_load only builds plain Python objects (no arbitrary object construction).
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

## Model Loading

In [6]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _abort_notebook(message):
    """Log ``message`` as an error, show a red banner in the notebook, and stop execution."""
    logger.error(message)
    display(HTML("<h1 style='color:red;'>Unexpected Error: Notebook execution stopped.</h1>"))
    raise SystemExit(message)


# NOTE: os.getcwd() is absolute, so os.path.join discards the leading '..' and
# app_dir is simply the current working directory.
app_dir = os.path.join('..', os.getcwd())
model_dir = os.path.join('..', config['model']['path'])
print(model_dir)
model_name = config['model']['model_name']

# Keep only sub-directories. os.listdir returns entries in arbitrary order and may
# include stray files, so its first element is not a reliable "latest snapshot".
# A missing model_dir is treated the same as an empty one.
snapshots = (
    [entry for entry in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, entry))]
    if os.path.isdir(model_dir)
    else []
)

if not snapshots:
    # Stop here instead of falling through to the load step with an undefined
    # (or stale, from a previous kernel run) full_model_dir.
    _abort_notebook(f"No snapshots found inside {model_dir}")

# Snapshot directory names carry no ordering, so use the modification time
# to pick the most recent one.
latest_snapshot = max(
    snapshots,
    key=lambda entry: os.path.getmtime(os.path.join(model_dir, entry)),
)
full_model_dir = os.path.join(model_dir, latest_snapshot)
logger.info(f"Model path: {full_model_dir}")

try:
    # Only the tokenizer is loaded in this cell; model_id is just the path that a
    # later cell hands to transformers.pipeline.
    logger.info(f"Attempting to load tokenizer and model from {full_model_dir}")
    tokenizer = transformers.AutoTokenizer.from_pretrained(full_model_dir)
    logger.info("Model and tokenizer successfully loaded from primary directory.")
    model_id = full_model_dir

except Exception as e:
    _abort_notebook(f"An unexpected error occurred: {e}")

INFO:__main__:Model path: ../models/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/0d4b76e1efeb5eb6f6b5e757c79870472e04bd3a
INFO:__main__:Attempting to load tokenizer and model from ../models/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots/0d4b76e1efeb5eb6f6b5e757c79870472e04bd3a


../models/models--mistralai--Mistral-7B-Instruct-v0.3/snapshots


INFO:__main__:Model and tokenizer successfully loaded from primary directory.


In [7]:
# Build the text-generation pipeline from the local snapshot resolved above.
# NOTE(review): the recorded output warns that `torch_dtype` is deprecated in
# favour of `dtype`; it is kept here so behaviour does not depend on the
# installed transformers version.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model_id,
    tokenizer=tokenizer,
    device_map="auto",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
    max_new_tokens=300,
    return_full_text=False,
)

`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.91it/s]
Device set to use cuda:0


## Chat Status Class

In [8]:
class ChatStatus():
    """Manages the conversation history for a turn-based chatbot.

    The history is a list of ``{"role": ..., "content": ...}`` dicts that is
    passed as-is to ``pipeline`` on every turn.

    Args:
        pipeline: Callable invoked with the history list; its result is read as
            ``result[0]["generated_text"]``.
        tokenizer: Callable invoked with the history rendered as text; its
            result is read as ``result["input_ids"]`` (a sequence of token ids).
        max_position_embeddings: Token budget used when trimming old messages.
    """

    def __init__(self, pipeline, tokenizer, max_position_embeddings=32768) -> None:
        self.pipeline = pipeline
        self.tokenizer = tokenizer
        self.history = []
        self.max_position_embeddings = max_position_embeddings

    def add_message(self, role: str, content: str):
        """Appends a message to the history.

        Raises:
            ValueError: If ``role`` is neither ``"user"`` nor ``"assistant"``.
        """
        if role not in ("user", "assistant"):
            raise ValueError("Role must be either 'user' or 'assistant'")
        self.history.append({"role": role, "content": content})

    def get_history_as_text(self) -> str:
        """Returns the chat history as one ``role: content`` line per message."""
        return "\n".join(f"{msg['role']}: {msg['content']}" for msg in self.history)

    def _count_history_tokens(self) -> int:
        """Returns the token count of the history rendered as plain text.

        This is an estimate: the pipeline formats the messages itself, so the
        real prompt length may differ from this ``role: content`` rendering.
        """
        token_ids = self.tokenizer(self.get_history_as_text(), truncation=False)["input_ids"]
        return len(token_ids)

    def _truncate_history(self):
        """
        Drops the oldest messages while the history exceeds the token budget.

        After each drop, any leading non-user messages are removed as well, so
        the remaining history always starts with a user turn; a history that
        starts with an assistant reply breaks strict user/assistant
        alternation. The last remaining message is never removed.
        """
        while len(self.history) > 1 and self._count_history_tokens() > self.max_position_embeddings:
            self.history.pop(0)  # drop oldest message
            while len(self.history) > 1 and self.history[0]["role"] != "user":
                self.history.pop(0)  # drop the orphaned reply that followed it

    def send_message(self, message: str) -> str:
        """
        Sends a user message, gets the model response, and updates history.

        If truncation or generation raises, the history is restored to its
        state before the call (so a failed turn does not leave a dangling user
        message behind) and the exception is re-raised.
        """
        snapshot = list(self.history)
        self.add_message("user", message)
        try:
            self._truncate_history()
            response = self.pipeline(self.history)
            reply = response[0]["generated_text"]
        except Exception:
            # Restore in place so external references to the list stay valid.
            self.history[:] = snapshot
            raise

        self.add_message("assistant", reply)
        return reply

In [9]:
# Single conversation state used by every call to chat().
chat_status = ChatStatus(pipeline, tokenizer)


def chat(prompt):
    """Forward ``prompt`` to the shared ChatStatus and return the model's reply."""
    return chat_status.send_message(prompt)

## NeuralBox App - w/Gradio

In [10]:
# Single-textbox UI around chat(). The title/description name the model that is
# actually loaded (mistralai/Mistral-7B-Instruct-v0.3).
# NOTE: chat() uses one module-level history, so every browser session talking
# to this app shares the same conversation.
app = gradio.Interface(
    fn=chat,
    inputs=gradio.Textbox(lines=2, placeholder="Type here..."),
    outputs="text",
    title="Mistral-7B-Instruct-v0.3 Chat-Box",
    description="Chat with Mistral-7B-Instruct-v0.3.",
)

# server_name="0.0.0.0" listens on all network interfaces, so the app is reachable
# from other machines and has no authentication configured here.
# debug=True keeps this cell running until the kernel is interrupted.
app.launch(server_port=7860, server_name="0.0.0.0", debug=True)

INFO:httpx:HTTP Request: GET http://localhost:7860/gradio_api/startup-events "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD http://localhost:7860/ "HTTP/1.1 200 OK"


* Running on local URL:  http://0.0.0.0:7860
* To create a public link, set `share=True` in `launch()`.


INFO:httpx:HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK"


Keyboard interruption in main thread... closing server.


