# Attempting to use Command-R as our Generator

In [1]:
# One-off dependency install for this notebook; left commented out so a normal run does not reinstall.
# NOTE(review): the list names `torch-transformers`, but the cells below import `torch` and
# `transformers` — confirm the intended package names (and pin versions) before uncommenting.
#!pip install huggingface-hub openai bitsandbytes accelerate tqdm torch-transformers

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint whose tokenizer supplies the grounded-generation prompt template used below.
model_id = "CohereForAI/c4ai-command-r-v01"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Chat history: a single user turn containing the question to answer.
conversation = [
    dict(role="user", content="Whats the biggest penguin in the world?"),
]

# Snippets the answer should be grounded on; each entry has a title and a text body.
documents = [
    dict(
        title="Tall penguins",
        text="Emperor penguins are tallest growing up to 122 cm in  height.",
    ),
    dict(
        title="Penguin habitats",
        text="Emperor penguins only live in Antarctica.",
    ),
]

# Render the grounded-generation prompt as plain text (tokenize=False) so it can be inspected.
grounded_generation_prompt = tokenizer.apply_grounded_generation_template(
    conversation,
    documents=documents,
    citation_mode="accurate",  # the alternative mode is "fast"
    add_generation_prompt=True,
    tokenize=False,
)

print(grounded_generation_prompt)


<BOS_TOKEN><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.

# System Preamble
## Basic Rules
You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.

# User Preamble
## Task and Context
You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or simil

In [3]:
# Render the grounded-generation template for the same conversation and documents,
# this time tokenized (tokenize=True) and returned as a PyTorch tensor.
grounded_generation_tokens = tokenizer.apply_grounded_generation_template(
    conversation,
    documents=documents,
    citation_mode="accurate",  # the alternative mode is "fast"
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
)

# Report the shape of the resulting token-id tensor.
print(grounded_generation_tokens.shape)

torch.Size([1, 579])


In [None]:
import os

# Configure the runtime before torch is imported in this cell so the settings are
# in place when the libraries initialise.
# "0.0" removes the MPS allocator's high-watermark limit (allocations are no longer capped).
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Single source of truth for the checkpoint used by config, tokenizer and model.
MISTRAL_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
# Fixed seed so the sampled completion is repeatable across runs.
GENERATION_SEED = 42

config = AutoConfig.from_pretrained(MISTRAL_MODEL_ID)

# Check compute device: prefer Apple's Metal backend, fall back to CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Compute device is: {'MPS' if device=='mps' else 'CPU'}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MISTRAL_MODEL_ID)

# Model set up.
# `from_config` only builds the architecture with randomly initialised weights, so the
# generated text would be noise; `from_pretrained` actually loads the checkpoint weights.
# Half precision on MPS halves the memory needed for the weights; CPU stays in float32.
model = AutoModelForCausalLM.from_pretrained(
    MISTRAL_MODEL_ID,
    config=config,
    torch_dtype=torch.float16 if device == "mps" else torch.float32,
).to(device)

# Quick test prompt
prompt = "Hello from Metal GPU!"

inputs = tokenizer(prompt, return_tensors="pt").to(device)

torch.manual_seed(GENERATION_SEED)
with torch.no_grad():
    # The returned ids already live on the model's device, so no extra `.to(device)` is needed.
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        # Reuse EOS as the padding id for generation.
        pad_token_id=tokenizer.eos_token_id,
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Compute device is: MPS
