# Launching with NVIDIA Llama 3.1 Nemotron 70B Instruct HF

In [None]:
#@title Select the Huggingface model
# Identifier of the model to run; the cells below pass it to from_pretrained()
# to load both the tokenizer and the model weights.
model_name = "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF"  # @param {type:"string"}

## Install Required Libraries
Note: after the installation finishes, you may need to restart the session (runtime) before running the remaining cells and starting the chat interface.

In [None]:
!pip install transformers accelerate bitsandbytes gradio --quiet

## Import Libraries

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
from accelerate import init_empty_weights, infer_auto_device_map

## Load the Model and Tokenizer

In [None]:
# Fetch the tokenizer that belongs to the selected model
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
# Ask PyTorch how many CUDA devices it can see and report the count;
# num_gpus is reused when the per-GPU memory budget is built.
num_gpus = torch.cuda.device_count()
gpu_report = f"Number of GPUs available: {num_gpus}"
print(gpu_report)

In [None]:
# Build the per-GPU memory budget from what each device actually has instead of
# assuming 80 GB cards: an overstated budget makes the automatic device map
# place more weights on a GPU than it can hold. A fraction of each card is left
# free for activations and the generation cache.
gpu_memory_fraction = 0.9
max_memory = {
    i: int(torch.cuda.get_device_properties(i).total_memory * gpu_memory_fraction)
    for i in range(num_gpus)
}

# Load the model using device_map 'auto' so its layers are spread over the
# available devices within the budget above.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    # With no GPU detected the budget is empty; pass None so the library
    # falls back to its own default memory limits rather than an empty map.
    max_memory=max_memory or None,
    torch_dtype=torch.float16
)

## Define the Chat Function

In [None]:
def chat(input_text, history=None):
    """Generate the assistant's reply to a user message and extend the conversation.

    Args:
        input_text: The new user message.
        history: Previous turns as a list of ``{"role": ..., "content": ...}``
            dicts, oldest first. ``None`` or an empty list starts a new
            conversation. The list passed in is never modified.

    Returns:
        A ``("", history)`` tuple. The empty string is the new value for the
        input box (clearing it); ``history`` is a new list holding the previous
        turns plus the user message and the assistant's reply. If
        ``input_text`` is empty or only whitespace, nothing is generated and
        the history is returned unchanged.
    """
    # Work on a copy so neither the caller's list nor a shared default is mutated.
    history = list(history) if history else []

    # Nothing to answer: skip the (expensive) generation call entirely.
    if not input_text or not input_text.strip():
        return "", history

    history.append({"role": "user", "content": input_text})

    # Format the conversation with the model's own chat template rather than a
    # hand-written "User:/Assistant:" transcript, so the prompt carries the
    # special tokens the instruct model was trained with. Only role/content are
    # forwarded in case a turn carries extra UI-specific keys.
    messages = [
        {"role": turn["role"], "content": turn["content"]} for turn in history
    ]
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors='pt',
        return_dict=True,
    ).to(model.device)
    input_ids = encoded["input_ids"]

    output_ids = model.generate(
        input_ids=input_ids,
        # Passed explicitly because pad and EOS share one token id, so the
        # mask cannot be inferred from the ids alone.
        attention_mask=encoded["attention_mask"],
        max_new_tokens=16000,
        temperature=0.3,
        top_p=0.9,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        use_cache=True
    )

    # generate() returns prompt + continuation; keep only the newly produced tokens.
    new_tokens = output_ids[0][input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    history.append({"role": "assistant", "content": response})
    return "", history

## Create the Gradio Interface

In [None]:
def _reset_chat():
    """Return None as the new chatbot value when the Clear button is pressed."""
    return None


def respond(message, chat_history):
    """Pass the submitted message and the current history on to chat()."""
    return chat(message, chat_history)


# NOTE(review): chat() returns the history as role/content dicts; depending on
# the installed Gradio version, gr.Chatbot may need to be configured for that
# message format - confirm against the Gradio release in use.
with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    # Submitting the textbox updates both the textbox and the chatbot
    # with the two values returned by respond().
    msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    clear.click(fn=_reset_chat, inputs=None, outputs=chatbot, queue=False)

demo.launch()

## Sample Usage
You can now interact with the model using the chat interface above.

### Example Prompts:
- "Can you list five recommended tourist destinations in Japan?"
- "What is the second highest mountain in Japan?"
- "Summarize the following text..."

In [None]:
# Call chat() directly, without the UI, and print the assistant's answer
input_text = "Can you list five recommended tourist destinations in Japan?"
_, history = chat(input_text)
latest_turn = history[-1]  # the last entry is the assistant's reply
print(latest_turn["content"])