In [1]:
!pip install --extra-index-url https://pip.repos.neuron.amazonaws.com transformers-neuronx
!pip install sentencepiece -U
!pip install --upgrade jupyter ipywidgets
!pip install fastapi
!pip install ray
%pip list

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting transformers-neuronx
  Using cached https://pip.repos.neuron.amazonaws.com/transformers-neuronx/transformers_neuronx-0.7.84-py3-none-any.whl (150 kB)
Collecting accelerate (from transformers-neuronx)
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting transformers (from transformers-neuronx)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting huggingface-hub (from accelerate->transformers-neuronx)
  Using cached huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
Collecting filelock (from transformers->transformers-neuronx)
  Using cached filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting regex!=

In [1]:
!pip list | grep neuron

aws-neuronx-runtime-discovery 2.9
libneuronxla                  0.5.476
neuronx-cc                    2.10.0.34+6c8792c6f
neuronx-hwm                   2.10.0.5+7b1976adf
torch-neuronx                 1.13.1.1.11.0
torch-xla                     1.13.1+torchneuronb
transformers-neuronx          0.7.84


In [4]:
from transformers import LlamaForCausalLM

model_name = "NousResearch/Llama-2-13b-chat-hf"
model_split_name = f"{model_name.replace('/','_')}-split"

# Load model from Hugging Face Hub
model = LlamaForCausalLM.from_pretrained(model_name)

import torch
from transformers_neuronx.module import save_pretrained_split

# Split and save model using transformers-neuronx
save_pretrained_split(model, model_split_name)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


In [5]:
!neuron-ls

+--------+--------+--------+-------------+---------+
| NEURON | NEURON | NEURON |  CONNECTED  |   PCI   |
| DEVICE | CORES  | MEMORY |   DEVICES   |   BDF   |
+--------+--------+--------+-------------+---------+
| 0      | 2      | 32 GB  | 12, 3, 4, 1 | 10:1e.0 |
| 1      | 2      | 32 GB  | 13, 0, 5, 2 | 20:1e.0 |
| 2      | 2      | 32 GB  | 14, 1, 6, 3 | 10:1d.0 |
| 3      | 2      | 32 GB  | 15, 2, 7, 0 | 20:1f.0 |
| 4      | 2      | 32 GB  | 0, 7, 8, 5  | 10:1f.0 |
| 5      | 2      | 32 GB  | 1, 4, 9, 6  | 20:1d.0 |
+--------+--------+--------+-------------+---------+


In [6]:
import time
import torch
from transformers import AutoTokenizer
from transformers_neuronx.llama.model import LlamaForSampling
import os
os.environ['NEURON_CC_FLAGS'] = '-O1'

model_name = "NousResearch/Llama-2-13b-chat-hf"
model_split_name = f"{model_name.replace('/','_')}-split"

# load model to the NeuronCores with 32-way tensor parallelism and run compilation
# Note -> this is for trn1.32xl. For inf2 instances, adjust 'tp_degree' accordingly
# I am using inf2.24x hence changed the tp_degree to 12
neuron_model = LlamaForSampling.from_pretrained(model_split_name, batch_size=1, tp_degree=12, amp='f16')
neuron_model.to_neuron()

# construct a tokenizer and encode prompt text
tokenizer = AutoTokenizer.from_pretrained(model_name)

2023-10-25 14:20:07.000816:  2897  INFO ||NEURON_CACHE||: Compile cache path: /var/tmp/neuron-compile-cache
2023-10-25 14:20:07.000827:  2897  INFO ||NEURON_CC_WRAPPER||: Using a cached neff at /var/tmp/neuron-compile-cache/neuronxcc-2.10.0.34+6c8792c6f/MODULE_94f7f205f454ad378c33+33d97156/model.neff. Exiting with a successfully compiled graph.
2023-10-25 14:20:07.000853:  2898  INFO ||NEURON_CACHE||: Compile cache path: /var/tmp/neuron-compile-cache
2023-10-25 14:20:07.000863:  2898  INFO ||NEURON_CC_WRAPPER||: Using a cached neff at /var/tmp/neuron-compile-cache/neuronxcc-2.10.0.34+6c8792c6f/MODULE_97125ec263cc2a0cc559+33d97156/model.neff. Exiting with a successfully compiled graph.
2023-10-25 14:20:07.000889:  2899  INFO ||NEURON_CACHE||: Compile cache path: /var/tmp/neuron-compile-cache
2023-10-25 14:20:07.000899:  2899  INFO ||NEURON_CC_WRAPPER||: Using a cached neff at /var/tmp/neuron-compile-cache/neuronxcc-2.10.0.34+6c8792c6f/MODULE_0737387391f5d9fb8432+33d97156/model.neff. Exi

In [9]:
prompt = "What is data parallelism and explain with more details"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# run inference with top-k sampling
with torch.inference_mode():
    start = time.time()
    generated_sequences = neuron_model.sample(input_ids, sequence_length=2048, top_k=50)
    elapsed = time.time() - start

generated_sequences = [tokenizer.decode(seq) for seq in generated_sequences]
print(f'generated sequences {generated_sequences} in {elapsed} seconds')
print()


generated sequences ['<s> What is data parallelism and explain with more details?\n\nData parallelism is a programming strategy used in parallel computing, where a single process is divided into smaller sub-processes, each of which operates on a separate portion of the data. This allows for the processing of large datasets in parallel, leading to increased performance and scalability.\n\nIn traditional parallel processing, a single process is divided into multiple threads or tasks, each of which operates on a subset of the data. However, this can lead to contention and synchronization issues, as each thread or task may need to access the same data.\n\nData parallelism avoids these issues by ensuring that each sub-process operates on a separate portion of the data, minimizing the need for synchronization and contention. This makes it particularly useful for processing large datasets that do not fit in the memory of a single processor.\n\nThere are several key benefits to using data para