# MarianMT - PyTorch
This notebook shows how to compile a pre-trained MarianMT model in PyTorch for AWS Inferentia (inf1 instances) using the AWS Neuron SDK. The original model implementation is provided by Hugging Face.

This is a simplified example based on the Neuron MarianMT tutorial. For more details about the code, refer to the [tutorial](https://github.com/aws/aws-neuron-sdk/blob/master/src/examples/pytorch/transformers-marianmt.ipynb).

**Reference:** https://huggingface.co/Helsinki-NLP/opus-mt-en-de

## 1) Install dependencies

In [None]:
# Add the Neuron package repository as a global extra index URL for pip
%pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
# Restart the kernel after running this cell

In [None]:
# Upgrade pip, then install Neuron PyTorch (torch-neuron), neuron-cc, transformers and sentencepiece
%pip install -U pip
%pip install -U torch-neuron "transformers==4.26.1" neuron-cc[tensorflow] sentencepiece
# Add --force-reinstall to the command above if you run into issues while loading the modules
# Restart the kernel again after running this cell

## 2) Load a pre-trained model to CPU


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# --- Settings shared by every later cell ---------------------------------
# English -> German model
model_name = 'Helsinki-NLP/opus-mt-en-de'
# Number of input texts to decode
num_texts = 1
# Number of beams per input text
num_beams = 4
# Maximum input token length
max_encoder_length = 32
# Maximum output token length
max_decoder_length = 32

# --- Pre-trained tokenizer and model -------------------------------------
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Load the model in evaluation mode and record the generation length limit
# on its config.
model_cpu = MarianMTModel.from_pretrained(model_name).eval()
model_cpu.config.max_length = max_decoder_length


## 3) Test inference on CPU

In [None]:
def infer(model, tokenizer, text):
    """Translate ``text`` with beam search and print every returned candidate.

    Reads the notebook-level settings ``max_encoder_length``,
    ``max_decoder_length`` and ``num_beams``.

    Args:
        model: Object exposing a ``generate`` method (the CPU model or one of
            the padded generators built later in this notebook).
        tokenizer: Callable tokenizer that also provides ``decode``.
        text: The input text to translate.
    """
    # Truncate and pad the input to the *encoder* length so that the token size
    # is compatible with the fixed-size encoder, which is traced with
    # max_encoder_length tokens (not necessary for pure CPU execution).
    batch = tokenizer(text, max_length=max_encoder_length, truncation=True, padding='max_length', return_tensors="pt")
    # One output sequence is returned per beam.
    output = model.generate(**batch, max_length=max_decoder_length, num_beams=num_beams, num_return_sequences=num_beams)
    results = [tokenizer.decode(t, skip_special_tokens=True) for t in output]

    print('Texts:')
    for i, translation in enumerate(results, start=1):
        print(i, translation)

# Sample sentence used as the translation input by the inference tests in this notebook
sample_text = "I am a small frog."
# Translate it with the unmodified CPU model and print the candidates
infer(model_cpu, tokenizer, sample_text)

## 4) Create padded encoder & decoder wrappers

The following wrappers allow the model to be traced with padded inputs and produce outputs that can be used by the Hugging Face `generate` function.

In [None]:
import torch
from torch.nn import functional as F


class PaddedEncoder(torch.nn.Module):
    """Thin wrapper around ``model.model.encoder`` with a positional, tuple-returning forward."""

    def __init__(self, model):
        super().__init__()
        # Name of the primary input, exposed as an attribute on the wrapper
        self.main_input_name = 'input_ids'
        self.encoder = model.model.encoder

    def forward(self, input_ids, attention_mask):
        """Run the wrapped encoder with ``return_dict=False`` and return its result unchanged."""
        encoded = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        return encoded


class PaddedDecoder(torch.nn.Module):
    """Decoder wrapper that returns logits for a single sequence position.

    The wrapped decoder is run without cache on the full ``input_ids``; the
    hidden state at position ``index`` is then selected and projected with
    ``model.model.shared.weight`` and ``model.final_logits_bias``.
    """

    def __init__(self, model):
        super().__init__()
        self.decoder = model.model.decoder
        # Detached copies of the projection parameters used in forward()
        self.weight = model.model.shared.weight.clone().detach()
        self.bias = model.final_logits_bias.clone().detach()

    def forward(self, input_ids, attention_mask, encoder_outputs, index):
        """Compute logits for position ``index`` of the decoder output.

        Args:
            input_ids: Decoder input token ids.
            attention_mask: Passed to the decoder as ``encoder_attention_mask``.
            encoder_outputs: Passed to the decoder as ``encoder_hidden_states``.
            index: Position along dimension 1 whose hidden state is kept.

        Returns:
            Logits with a length of 1 along dimension 1.
        """
        # The decoder returns a one-element tuple when return_dict=False
        decoder_out = self.decoder(
            input_ids=input_ids,
            encoder_hidden_states=encoder_outputs,
            encoder_attention_mask=attention_mask,
            return_dict=False,
            use_cache=False,
        )
        hidden, = decoder_out

        _batch, seq_len, _width = hidden.shape

        # Selection mask: True only at the requested position, shaped so that
        # it broadcasts over the other two dimensions
        positions = torch.arange(seq_len, dtype=torch.float32)
        selector = (positions == index).view(1, -1, 1)

        # Zero out every other position, then reduce along dimension 1
        selected = torch.sum(torch.multiply(hidden, selector), 1, keepdims=True)

        # Final linear layer producing the per-token scores
        return F.linear(selected, self.weight, bias=self.bias)


## 5) Create a padded GenerationMixin subclass

This generator object allows us to reuse the various sampling methods provided by Hugging Face.

In [None]:
import os

from transformers import GenerationMixin, AutoConfig
from transformers.modeling_outputs import Seq2SeqLMOutput, BaseModelOutput
from transformers.modeling_utils import PreTrainedModel


class PaddedGenerator(PreTrainedModel, GenerationMixin):
    """Generation front-end that feeds fixed-size (padded) inputs to an encoder/decoder pair.

    The ``encoder`` and ``decoder`` attributes are attached after construction,
    either from an in-memory model (``from_model``) or from TorchScript files
    on disk (``from_pretrained``).
    """

    @classmethod
    def from_model(cls, model):
        """Build a generator from ``model``'s config and padded wrappers of its encoder and decoder."""
        instance = cls(model.config)
        instance.encoder = PaddedEncoder(model)
        instance.decoder = PaddedDecoder(model)
        return instance

    def prepare_inputs_for_generation(
            self,
            input_ids,
            encoder_outputs=None,
            attention_mask=None,
            **kwargs,
    ):
        """Assemble the keyword arguments passed to ``forward`` for one decoding step.

        ``input_ids`` is right-padded up to ``config.max_length``, and the
        zero-based position of the most recent token is passed along as
        ``current_length``.
        """
        tokens_so_far = input_ids.shape[1]
        # Pad only on the right of the last dimension (for Neuron)
        right_padding = (0, self.config.max_length - tokens_so_far)
        return {
            'input_ids': F.pad(input_ids, right_padding),
            'attention_mask': attention_mask,
            'encoder_outputs': encoder_outputs.last_hidden_state,
            'current_length': torch.tensor(tokens_so_far - 1),
        }

    def get_encoder(self):
        """Return a callable that runs the encoder and wraps its single output in a ``BaseModelOutput``."""
        def encode(input_ids, attention_mask, **kwargs):
            # The encoder yields a one-element tuple; extra keyword arguments are ignored
            (hidden_state,) = self.encoder(input_ids, attention_mask)
            return BaseModelOutput(last_hidden_state=hidden_state)
        return encode

    def forward(self, input_ids, attention_mask, encoder_outputs, current_length, **kwargs):
        """Run the decoder for the position ``current_length`` and return its logits."""
        step_logits = self.decoder(input_ids, attention_mask, encoder_outputs, current_length)
        return Seq2SeqLMOutput(logits=step_logits)

    @property
    def device(self):  # Attribute required by beam search
        """Always report the CPU device."""
        return torch.device('cpu')

    def save_pretrained(self, directory):
        """Save the encoder, the decoder and the config into ``directory``.

        Prints a message and returns without saving when ``directory`` is an
        existing file.
        """
        if os.path.isfile(directory):
            print(f"Provided path ({directory}) should be a directory, not a file")
            return
        os.makedirs(directory, exist_ok=True)
        encoder_path = os.path.join(directory, 'encoder.pt')
        torch.jit.save(self.encoder, encoder_path)
        decoder_path = os.path.join(directory, 'decoder.pt')
        torch.jit.save(self.decoder, decoder_path)
        self.config.save_pretrained(directory)

    @classmethod
    def from_pretrained(cls, directory):
        """Rebuild a generator from the files written by ``save_pretrained``."""
        restored = cls(AutoConfig.from_pretrained(directory))
        restored.encoder = torch.jit.load(os.path.join(directory, 'encoder.pt'))
        restored.decoder = torch.jit.load(os.path.join(directory, 'decoder.pt'))
        # Attribute required by beam search
        restored.encoder.main_input_name = 'input_ids'
        return restored


## 6) Test padded inference on CPU


In [None]:
# Wrap the CPU model's encoder and decoder in the padded generator
padded_model_cpu = PaddedGenerator.from_model(model_cpu)
# Translate the same sample sentence through the padded wrappers on CPU
infer(padded_model_cpu, tokenizer, sample_text)


## 7) Trace the encoder & decoder for inference on Neuron


In [None]:
import torch
import torch_neuron


def trace(model, num_texts, num_beams, max_decoder_length, max_encoder_length):
    """
    Compile the encoder and decoder of a padded generator for Neuron.

    Tracing fixes both networks to the example input sizes used here, so
    every later call to the compiled modules must use inputs of exactly
    these sizes.

    Args:
        model (PaddedGenerator): The padded generator to compile for Neuron
        num_texts (int): The number of input texts to translate at once
        num_beams (int): The number of beams to compute per text
        max_decoder_length (int): The maximum number of tokens to be generated
        max_encoder_length (int): The maximum number of input tokens that will be encoded

    Returns:
        PaddedGenerator: A new generator sharing ``model.config`` whose
        encoder and decoder are the traced modules.
    """

    # Encoder example inputs: two tensors of ones with the same shape
    # (input_ids, attention_mask)
    encoder_shape = (num_texts, max_encoder_length)
    encoder_example = tuple(
        torch.ones(encoder_shape, dtype=torch.long) for _ in range(2)
    )
    neuron_encoder = torch_neuron.trace(model.encoder, encoder_example)

    # Decoder example inputs use a batch expanded to one row per beam
    expanded_batch = num_texts * num_beams
    decoder_example = (
        torch.ones((expanded_batch, max_decoder_length), dtype=torch.long),
        torch.ones((expanded_batch, max_encoder_length), dtype=torch.long),
        torch.ones((expanded_batch, max_encoder_length, model.config.d_model), dtype=torch.float),
        torch.tensor(0),
    )
    neuron_decoder = torch_neuron.trace(model.decoder, decoder_example)

    # Attribute required by beam search
    neuron_encoder.main_input_name = 'input_ids'

    # Assemble a fresh generator around the compiled modules
    traced = PaddedGenerator(model.config)
    traced.encoder = neuron_encoder
    traced.decoder = neuron_decoder
    return traced


# Compile the padded CPU generator for Neuron using the sizes configured earlier in the notebook
padded_model_neuron = trace(padded_model_cpu, num_texts, num_beams, max_decoder_length, max_encoder_length)


## 8) Test padded inference on Neuron


In [None]:
# Translate the same sample sentence with the traced generator
infer(padded_model_neuron, tokenizer, sample_text)