In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers

Looking in indexes: https://download.pytorch.org/whl/cu118
[0m

In [2]:
import torch
print("CUDA Available:", torch.cuda.is_available())
print("GPU Device:", torch.cuda.get_device_name(0))

CUDA Available: True
GPU Device: NVIDIA H100 PCIe


In [3]:
import torch
print("Is CUDA available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

Is CUDA available: True
GPU Name: NVIDIA H100 PCIe


In [4]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

True
1
NVIDIA H100 PCIe


In [5]:
import accelerate
print(accelerate.__version__)

1.0.1


In [6]:
!pip install --upgrade accelerate

[0m

In [12]:
def process_query(model, tokenizer, query):
    inputs = tokenizer(query, return_tensors="pt").to("cuda")
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=50,
            pad_token_id=tokenizer.eos_token_id  # Explicitly set pad_token_id
        )
    end_time = time.time()
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response, end_time - start_time

In [13]:
def load_model(model_name):
    print(f"Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(MODELS[model_name], token=HF_TOKEN)
    tokenizer.pad_token_id = tokenizer.eos_token_id  # Set pad_token_id globally
    model = AutoModelForCausalLM.from_pretrained(
        MODELS[model_name],
        token=HF_TOKEN,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    return tokenizer, model


In [14]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [15]:
from transformers import AutoTokenizer

# Define model name and token
model_name = "epfl-llm/meditron-7b"
token = "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"  # Replace with your actual Hugging Face token

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Fix the pad_token_id issue
tokenizer.pad_token_id = tokenizer.eos_token_id

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face repo for Meditron-7b
model_name = "epfl-llm/meditron-7b"
token = "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"

# Define max_memory to manage memory usage explicitly
max_memory = {i: "40GB" for i in range(torch.cuda.device_count())}  # Example: Allocate 40GB per GPU

# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Load model with explicit max_memory
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory=max_memory,  # Explicitly set max_memory per GPU
)

# Example input
input_text = "What is pneumonia?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate response
print("Generating response...")
with torch.no_grad():
    output = model.generate(**inputs, max_length=50)

response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Response:", response)


Loading tokenizer...
Loading model...


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generating response...
Response: What is pneumonia?
Pneumonia is an infection of the lungs. It is caused by bacteria, viruses, or fungi. The infection can spread to the lungs through the air or through the


In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face repo for Meditron-7b
model_name = "epfl-llm/meditron-7b"
token = "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"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Set pad_token explicitly to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Example input
input_text = "What is pneumonia?"
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to("cuda")

# Generate response
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_length=50,
        pad_token_id=tokenizer.pad_token_id  # Explicitly set pad_token_id
    )

response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Response:", response)


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Response: What is pneumonia?
Pneumonia is an infection of the lungs. It is caused by bacteria, viruses, or fungi. The infection can spread to the lungs through the air or through the


In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hugging Face repo for Meditron-7b
model_name = "epfl-llm/meditron-70b"
token = "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"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=token,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Example input
input_text = "What is pneumonia?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate response
with torch.no_grad():
    output = model.generate(**inputs, max_length=50)

response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Response:", response)

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

KeyboardInterrupt: 