In [1]:
import os
from datetime import datetime

import torch
import torch.multiprocessing as mp
import wandb
import wget
from megatron.core import parallel_state
from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import \
    MegatronGPTPromptLearningModel
from nemo.collections.nlp.modules.common import VirtualPromptStyle
from nemo.collections.nlp.modules.common.transformer.text_generation import (
    LengthParam, SamplingParam)
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from nemo.utils.exp_manager import exp_manager
from omegaconf import OmegaConf
from omegaconf.omegaconf import open_dict
from pytorch_lightning.plugins.environments import TorchElasticEnvironment
from pytorch_lightning.trainer.trainer import Trainer
from utils import download_file
import argparse


In [5]:
def _str2bool(value):
    """Convert a command-line token into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool(text)``, which is ``True`` for
    every non-empty string (so ``--greedy False`` would enable greedy decoding).
    This parser accepts the usual spellings instead.

    Args:
        value: The raw token from the command line (or an existing bool).

    Returns:
        ``True`` or ``False``.

    Raises:
        argparse.ArgumentTypeError: If the token is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy: it was the only way to get False with `type=bool`.
    if token in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")


def parse_args():
    """Build the command-line interface for the evaluation/export notebook and parse it.

    Returns:
        The ``(namespace, unknown)`` pair produced by ``parse_known_args``:
        the parsed options and the list of arguments the parser did not recognise.
        Unknown arguments are tolerated rather than rejected.
    """
    parser = argparse.ArgumentParser(description="NeMo Megatron PTuning - Evaluation")

    parser.add_argument("--output_dir", default="output", help="Output directory.")

    # Parameters for the `inference` block of the config, with default values.
    # Boolean options use `_str2bool`; `type=bool` would treat "False" as True.
    parser.add_argument('--greedy', type=_str2bool, default=False, help='Use greedy decoding instead of sampling.')
    parser.add_argument('--top_k', type=int, default=0, help='The number of highest probability vocabulary tokens to keep for top-k-filtering.')
    parser.add_argument('--top_p', type=float, default=0.9, help='If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.')
    parser.add_argument('--temperature', type=float, default=1.0, help='Sampling temperature')
    parser.add_argument('--add_BOS', type=_str2bool, default=True, help='Add the bos token at the beginning of the prompt')
    # This value is used as the generation `max_length`, so the help text says maximum.
    parser.add_argument('--tokens_to_generate', type=int, default=30, help='The maximum number of tokens to generate.')
    parser.add_argument('--all_probs', type=_str2bool, default=False, help='Whether return the log prob for all the tokens in vocab')
    parser.add_argument('--repetition_penalty', type=float, default=1.2, help='The parameter for repetition penalty. 1.0 means no penalty.')
    parser.add_argument('--min_tokens_to_generate', type=int, default=0, help='The minimum length of the sequence to be generated.')
    parser.add_argument('--compute_logprob', type=_str2bool, default=False, help='A flag used to compute logprob of all the input text, a very special case of running inference, default False')
    parser.add_argument('--batch_size', type=int, default=5, help='Batch size for inference')

    # Trainer configs; accelerator and precision defaults depend on CUDA availability.
    parser.add_argument(
        "--accelerator",
        default="gpu" if torch.cuda.is_available() else "cpu",
        help="Accelerator - GPU or CPU.",
    )
    parser.add_argument("--devices", default=1, type=int, help="Number of devices.")
    parser.add_argument("--enable_checkpointing", action="store_true",
        help="Whether to enable checkpoints during inference. Default is to not to",
    )
    parser.add_argument(
        "--precision",
        default=16 if torch.cuda.is_available() else 32,
        type=int,
        help="Training precision.",
    )

    # Experiment manager configs
    parser.add_argument(
        "--name", default="NeMo_Megatron_PTuning", help="Name of the experiment."
    )
    parser.add_argument(
        "--resume_if_exists",
        action="store_true",
        help="Whether to resume if exists. Default is to not resume",
    )
    # Stored under `create_wandb_logger`; passing the flag flips it from True to False.
    parser.add_argument(
        "--no_create_wandb_logger",
        action="store_false",
        dest="create_wandb_logger",
        help="Specify this flag to not create wandb logger. Default is to create wandb logger.",
    )
    parser.add_argument(
        "--project", default="NeMo_Megatron_PTuning", help="WandB project name."
    )
    parser.add_argument("--log_model", default="all", help="Log model in WandB.")

    return parser.parse_known_args()
    
# `parse_args` returns the result of `parse_known_args`: the parsed namespace plus
# the list of arguments the parser did not recognise, kept here as `unknown`.
args, unknown = parse_args()

In [6]:
# Start a W&B run named with the current timestamp and record the parsed
# arguments as its config.
# NOTE(review): entity and project are hard-coded here, independent of `--project`.
run = wandb.init(
    entity="a-sh0ts",
    project="NeMo_Megatron_PTuning",
    name=f"export@{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}",
    config=args,
)
# From here on `args` is the run's config object rather than the argparse namespace.
args = run.config
# Download the SQuAD dataset artifact; the data lives under data/SQuAD inside it.
squad_art_path = run.use_artifact("squad:latest", type="datasets").download()
SQUAD_DIR = os.path.join(squad_art_path, "data", "SQuAD")

# TODO: Loop all over artifact versions given a filter and use that to apply an alias for model registry
# Download the checkpoint artifact and derive the paths of the two .nemo files
# used below: the prompt-tuned model and the base GPT model.
final_chkpt_path = run.use_artifact(
    "final_model_checkpoints:latest", type="model"
).download()
tuned_model_path = os.path.join(final_chkpt_path, "NeMo_Megatron_PTuning.nemo")
gpt_model_file = os.path.join(
    final_chkpt_path, "nemo_assets", "megatron_gpt_345m.nemo"
)

# Working directories for NeMo assets and configs, rooted at --output_dir.
OUTPUT_DIR = args.output_dir
NEMO_DIR = os.path.join(OUTPUT_DIR, "nemo_assets")
CONFIG_DIR = os.path.join(NEMO_DIR, "conf")

os.makedirs(NEMO_DIR, exist_ok=True)
os.makedirs(CONFIG_DIR, exist_ok=True)

# Single source of truth for the config file name, so the download URL and the
# local path cannot drift apart.
INFERENCE_CONFIG_NAME = "megatron_gpt_prompt_learning_inference.yaml"

# Download the example config file
download_file(
    "https://raw.githubusercontent.com/NVIDIA/NeMo/stable/examples/nlp/language_modeling/conf/"
    + INFERENCE_CONFIG_NAME,
    CONFIG_DIR,
)

# Load the example config file so we can start editing it
CONFIG_PATH = os.path.join(CONFIG_DIR, INFERENCE_CONFIG_NAME)
cfg = OmegaConf.load(CONFIG_PATH)
# Struct mode is switched off so new keys can be added while editing.
OmegaConf.set_struct(cfg, False)

# Copy each inference-related command line option onto the matching key of
# `cfg.inference`; the option names and the config keys are identical.
for _option in (
    "greedy",
    "top_k",
    "top_p",
    "temperature",
    "add_BOS",
    "tokens_to_generate",
    "all_probs",
    "repetition_penalty",
    "min_tokens_to_generate",
    "compute_logprob",
    "batch_size",
):
    setattr(cfg.inference, _option, getattr(args, _option))

# Point the config at the prompt-tuned checkpoint and the base GPT model.
cfg.virtual_prompt_model_file = tuned_model_path
cfg.model = dict(
    language_model_path=gpt_model_file,
    virtual_prompt_style=VirtualPromptStyle.P_TUNING.value,
)
cfg.gpt_model_file = gpt_model_file

# The SQuAD test split is the only data file used for inference.
test_data_path = os.path.join(SQUAD_DIR, "squad_short_test.jsonl")
cfg.data_paths = [test_data_path]

# Force the "spawn" start method for torch multiprocessing, overriding any
# method that was already set.
mp.set_start_method("spawn", force=True)

# Modify some trainer configs.
# The accelerator comes from the command line; its default is "gpu" when CUDA
# is available and "cpu" otherwise.
accelerator = args.accelerator
cfg.trainer.accelerator = accelerator
cfg.trainer.devices = args.devices
cfg.trainer.enable_checkpointing = args.enable_checkpointing

# for PyTorch Native AMP set precision=16
cfg.trainer.precision = args.precision

# Set up cluster environment parameters.
# use torch elastic cluster environment so `create_process_externally` is True
# the launcher is set to None. It will not try to spawn new processes.
# It won't create the misconfiguration error because of the `interactive session`
os.environ["LOCAL_RANK"] = "0"
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

# Inference below requires CUDA regardless of the accelerator option, so fail
# early with a clear message when no GPU is present.
if not torch.cuda.is_available():
    raise EnvironmentError("GPU is needed for the inference")

strategy = NLPDDPStrategy(
    find_unused_parameters=False, no_ddp_communication_hook=True
)
plugins = [TorchElasticEnvironment()]
# Every key of `cfg.trainer` is forwarded to the Trainer as a keyword argument.
trainer = Trainer(strategy=strategy, plugins=plugins, **cfg.trainer)

# Set name of the experiment
cfg.name = args.name
# Experiment-manager settings taken from the command line options.
cfg.exp_manager = {
    "resume_if_exists": args.resume_if_exists,
    "create_wandb_logger": args.create_wandb_logger,
    "wandb_logger_kwargs": {"project": args.project, "log_model": args.log_model},
}
# Re-enable struct mode now that all new keys have been added; later additions
# go through `open_dict`.
OmegaConf.set_struct(cfg, True)

# Init the experiment manager and view the exp_dir
exp_dir = exp_manager(trainer, cfg.get("exp_manager", None))
exp_dir = str(exp_dir)

# A negative (or, for the split rank, missing) parallelism setting is treated as
# "not set": read the values from the base GPT model's saved config instead.
if (
    cfg.tensor_model_parallel_size < 0
    or cfg.pipeline_model_parallel_size < 0
    or cfg.get("pipeline_model_parallel_split_rank", -1) < 0
):
    # `return_config=True` asks for the configuration rather than the model.
    model_config = MegatronGPTPromptLearningModel.restore_from(
        restore_path=cfg.gpt_model_file,
        trainer=trainer,
        return_config=True,
    )

    # `cfg` is in struct mode here, so `open_dict` is needed to write these keys.
    with open_dict(cfg):
        cfg.tensor_model_parallel_size = model_config.get(
            "tensor_model_parallel_size", 1
        )
        cfg.pipeline_model_parallel_size = model_config.get(
            "pipeline_model_parallel_size", 1
        )
        cfg.pipeline_model_parallel_split_rank = model_config.get(
            "pipeline_model_parallel_split_rank", 0
        )

# Sanity check: the total number of devices must match the model-parallel layout.
assert (
    cfg.trainer.devices * cfg.trainer.num_nodes
    == cfg.tensor_model_parallel_size * cfg.pipeline_model_parallel_size
), "devices * num_nodes should equal tensor_model_parallel_size * pipeline_model_parallel_size"

# Update frozen GPT model path if it is given in case it has changed
# First pass: fetch only the saved config of the prompt-tuned model.
prompt_learning_cfg = MegatronGPTPromptLearningModel.restore_from(
    cfg.virtual_prompt_model_file,
    trainer=trainer,
    return_config=True,
)
if cfg.get("gpt_model_file"):
    with open_dict(prompt_learning_cfg):
        # Point at the downloaded base model and clear the sequence-parallel and
        # activation-checkpointing settings before restoring for inference.
        prompt_learning_cfg.language_model_path = cfg.gpt_model_file
        prompt_learning_cfg.sequence_parallel = False
        prompt_learning_cfg.activations_checkpoint_method = None
        prompt_learning_cfg.activations_checkpoint_granularity = None
        prompt_learning_cfg.activations_checkpoint_num_layers = None
        prompt_learning_cfg.virtual_prompt_style = cfg.model.virtual_prompt_style

# Load prompt tuned model, virtual_prompt_model_file must be provided in config
# Now load prompt learning model with frozen gpt model base
# Second pass: restore the actual model using the edited config as an override.
model = MegatronGPTPromptLearningModel.restore_from(
    restore_path=cfg.virtual_prompt_model_file,
    trainer=trainer,
    override_config_path=prompt_learning_cfg,
)
model.freeze()

# Have to turn off activations_checkpoint_method for inference
# Best effort: if the attribute chain does not exist on this model, there is
# nothing to switch off, so the AttributeError is deliberately ignored.
try:
    model.frozen_model.model.language_model.encoder.activations_checkpoint_method = (
        None
    )
except AttributeError:
    pass

# Check whether the DDP is initialized
# (`is_unitialized` is the spelling exposed by `parallel_state`.)
if parallel_state.is_unitialized():

    # No-op callable handed to the launcher; only the launch itself is wanted here.
    def placeholder():
        return

    # The launcher may be None (see the torch elastic environment set up earlier),
    # in which case only the environment setup is run.
    if model.trainer.strategy.launcher is not None:
        model.trainer.strategy.launcher.launch(placeholder, trainer=model.trainer)
    model.trainer.strategy.setup_environment()

# Generation length limits, taken from the inference section of the config.
length_params: LengthParam = {
    "max_length": cfg.inference.tokens_to_generate,
    "min_length": cfg.inference.min_tokens_to_generate,
}

# Sampling options, again mirrored from the inference section of the config.
sampling_params: SamplingParam = {
    "use_greedy": cfg.inference.greedy,
    "temperature": cfg.inference.temperature,
    "top_k": cfg.inference.top_k,
    "top_p": cfg.inference.top_p,
    "repetition_penalty": cfg.inference.repetition_penalty,
    "add_BOS": cfg.inference.add_BOS,
    "all_probs": cfg.inference.all_probs,
    "compute_logprob": cfg.inference.compute_logprob,
}

# Hand the whole inference section to the model as a plain Python container.
config = OmegaConf.to_container(cfg.inference)
model.set_inference_config(config)

[34m[1mwandb[0m: Currently logged in as: [33ma-sh0ts[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact squad:latest, 1497.99MB. 15 files... 
[34m[1mwandb[0m:   15 of 15 files downloaded.  
Done. 0:0:2.6
[34m[1mwandb[0m: Downloading large artifact final_model_checkpoints:latest, 1372.88MB. 5 files... 
[34m[1mwandb[0m:   5 of 5 files downloaded.  
Done. 0:0:2.7
Using 16bit None Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


megatron_gpt_prompt_learning_inference.yaml already exists. Skipping download.
[NeMo I 2023-08-16 09:36:52 exp_manager:374] Experiments will be logged at /workspace/nemo/demo/nemo_experiments/default/2023-08-16_09-36-52
[NeMo I 2023-08-16 09:36:52 exp_manager:797] TensorboardLogger has been set up


      rank_zero_warn(
    


[NeMo I 2023-08-16 09:36:53 exp_manager:812] WandBLogger has been set up
[NeMo I 2023-08-16 09:36:53 megatron_init:234] Rank 0 has data parallel group: [0]
[NeMo I 2023-08-16 09:36:53 megatron_init:237] All data parallel group ranks: [[0]]
[NeMo I 2023-08-16 09:36:53 megatron_init:238] Ranks 0 has data parallel rank: 0
[NeMo I 2023-08-16 09:36:53 megatron_init:246] Rank 0 has model parallel group: [0]
[NeMo I 2023-08-16 09:36:53 megatron_init:247] All model parallel group ranks: [[0]]
[NeMo I 2023-08-16 09:36:53 megatron_init:257] Rank 0 has tensor model parallel group: [0]
[NeMo I 2023-08-16 09:36:53 megatron_init:261] All tensor model parallel group ranks: [[0]]
[NeMo I 2023-08-16 09:36:53 megatron_init:262] Rank 0 has tensor model parallel rank: 0
[NeMo I 2023-08-16 09:36:53 megatron_init:276] Rank 0 has pipeline model parallel group: [0]
[NeMo I 2023-08-16 09:36:53 megatron_init:288] Rank 0 has embedding group: [0]
[NeMo I 2023-08-16 09:36:53 megatron_init:294] All pipeline model p

[NeMo W 2023-08-16 09:36:55 modelPT:244] You tried to register an artifact under config key=tokenizer.vocab_file but an artifact for it has already been registered.


[NeMo I 2023-08-16 09:36:55 tokenizer_utils:204] Getting Megatron tokenizer for pretrained model name: megatron-gpt-345m, custom vocab file: /tmp/tmpceshfx_g/bfcdca5e44814366bdb5dcd651325152_gpt2-vocab.json, and merges file: /tmp/tmpceshfx_g/315a11fd68be49d6abdb34363e8c4997_gpt2-merge.txt
[NeMo I 2023-08-16 09:36:55 tokenizer_utils:130] Getting HuggingFace AutoTokenizer with pretrained_model_name: gpt2, vocab_file: /tmp/tmpceshfx_g/bfcdca5e44814366bdb5dcd651325152_gpt2-vocab.json, merges_files: /tmp/tmpceshfx_g/315a11fd68be49d6abdb34363e8c4997_gpt2-merge.txt, special_tokens_dict: {}, and use_fast: False


Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using mask_token, but it is not set yet.


[NeMo I 2023-08-16 09:36:55 megatron_base_model:264] Padded vocab_size: 50304, original vocab_size: 50257, dummy tokens: 47.
[NeMo I 2023-08-16 09:36:56 nlp_overrides:401] Model MegatronGPTModel was successfully restored from /workspace/nemo/demo/artifacts/final_model_checkpoints:latest/nemo_assets/megatron_gpt_345m.nemo.
[NeMo I 2023-08-16 09:36:56 auto_tokenizer:172] 15 special tokens added, resize your model accordingly.


Using pad_token, but it is not set yet.
Using mask_token, but it is not set yet.


[NeMo I 2023-08-16 09:36:58 megatron_init:234] Rank 0 has data parallel group: [0]
[NeMo I 2023-08-16 09:36:58 megatron_init:237] All data parallel group ranks: [[0]]
[NeMo I 2023-08-16 09:36:58 megatron_init:238] Ranks 0 has data parallel rank: 0
[NeMo I 2023-08-16 09:36:58 megatron_init:246] Rank 0 has model parallel group: [0]
[NeMo I 2023-08-16 09:36:58 megatron_init:247] All model parallel group ranks: [[0]]
[NeMo I 2023-08-16 09:36:58 megatron_init:257] Rank 0 has tensor model parallel group: [0]
[NeMo I 2023-08-16 09:36:58 megatron_init:261] All tensor model parallel group ranks: [[0]]
[NeMo I 2023-08-16 09:36:58 megatron_init:262] Rank 0 has tensor model parallel rank: 0
[NeMo I 2023-08-16 09:36:58 megatron_init:276] Rank 0 has pipeline model parallel group: [0]
[NeMo I 2023-08-16 09:36:58 megatron_init:288] Rank 0 has embedding group: [0]
[NeMo I 2023-08-16 09:36:58 megatron_init:294] All pipeline model parallel group ranks: [[0]]
[NeMo I 2023-08-16 09:36:58 megatron_init:295]

[NeMo W 2023-08-16 09:36:58 modelPT:244] You tried to register an artifact under config key=tokenizer.vocab_file but an artifact for it has already been registered.


[NeMo I 2023-08-16 09:36:58 tokenizer_utils:204] Getting Megatron tokenizer for pretrained model name: megatron-gpt-345m, custom vocab file: /tmp/tmppig2bkvb/bfcdca5e44814366bdb5dcd651325152_gpt2-vocab.json, and merges file: /tmp/tmppig2bkvb/315a11fd68be49d6abdb34363e8c4997_gpt2-merge.txt
[NeMo I 2023-08-16 09:36:58 tokenizer_utils:130] Getting HuggingFace AutoTokenizer with pretrained_model_name: gpt2, vocab_file: /tmp/tmppig2bkvb/bfcdca5e44814366bdb5dcd651325152_gpt2-vocab.json, merges_files: /tmp/tmppig2bkvb/315a11fd68be49d6abdb34363e8c4997_gpt2-merge.txt, special_tokens_dict: {}, and use_fast: False


Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using pad_token, but it is not set yet.
Using mask_token, but it is not set yet.


[NeMo I 2023-08-16 09:36:58 megatron_base_model:264] Padded vocab_size: 50304, original vocab_size: 50257, dummy tokens: 47.
[NeMo I 2023-08-16 09:36:59 nlp_overrides:401] Model MegatronGPTModel was successfully restored from /workspace/nemo/demo/artifacts/final_model_checkpoints:latest/nemo_assets/megatron_gpt_345m.nemo.
[NeMo I 2023-08-16 09:36:59 auto_tokenizer:172] 15 special tokens added, resize your model accordingly.


Using pad_token, but it is not set yet.
Using mask_token, but it is not set yet.


[NeMo I 2023-08-16 09:36:59 save_restore_connector:249] Model MegatronGPTPromptLearningModel was successfully restored from /workspace/nemo/demo/artifacts/final_model_checkpoints:latest/NeMo_Megatron_PTuning.nemo.


      rank_zero_warn("MASTER_ADDR environment variable is not defined. Set as localhost")
    
      rank_zero_warn("MASTER_PORT environment variable is not defined. Set as 12910")
    
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/1
----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 1 processes
----------------------------------------------------------------------------------------------------



In [7]:
import json

def load_jsonl(filename):
    """Read a JSON Lines file into a list of decoded records.

    The file is decoded as UTF-8 explicitly, so non-ASCII text does not depend
    on the platform's default encoding. Blank or whitespace-only lines (for
    example a trailing newline at the end of the file) are skipped instead of
    raising a JSON decode error.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

# Load the raw test records from the same file that `cfg.data_paths` points at.
data = load_jsonl(test_data_path)

In [8]:
data[0:5]

[{'taskname': 'squad',
  'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
  'question': 'Which NFL team represented the AFC at Super Bowl 50?'},
 {'taskname': 'squad',
  'context': 'Super Bowl 50 was an American football game to determine the champion of the

In [9]:
# Prompt length budget: the frozen model's encoder sequence length minus the
# number of tokens reserved for generation ...
max_seq_length = (
    model.frozen_model.cfg.encoder_seq_length - length_params["max_length"]
)
# ... capped by `max_seq_length` from the config (8192 when the key is absent).
max_seq_length = min(max_seq_length, cfg.get("max_seq_length", 8192))

# Build the inference dataloader over the test file; the dataset object that is
# returned alongside it is not needed here.
_, dataloader = model.build_virtual_prompt_dataset(
    data=cfg.data_paths,
    batch_size=cfg.inference.get("batch_size", 1),
    max_seq_length=max_seq_length,
    min_seq_length=model.cfg.data.get("min_seq_length", 1),
    add_bos=sampling_params["add_BOS"],
    add_eos=False,
    for_train=False,
    tokens_to_generate=length_params["max_length"],
    drop_last=False,
    shuffle=False,
    num_workers=cfg.get("num_workers", 1),
)

[NeMo I 2023-08-16 09:37:24 gpt_prompt_learning_dataset:85] Loading and tokenizing dataset ... 


0it [00:00, ?it/s]

[NeMo I 2023-08-16 09:37:28 gpt_prompt_learning_dataset:196] Skipped 0 sentences, sequence length too short or too long even after truncation


In [10]:
# Run prediction over the dataloader. Later cells index `response` and read
# dict keys such as 'sentences' from its entries.
response = trainer.predict(model, dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
      input_info_tensor = torch.cuda.FloatTensor(input_info)
    
      string_tensor = torch.as_tensor(
    
    


In [23]:
data[0].keys()

dict_keys(['taskname', 'context', 'question'])

In [24]:
dataloader_example = next(iter(dataloader))

In [25]:
# dataloader_example

(tensor([[16485,   324],
         [16485,   324],
         [16485,   324],
         [16485,   324],
         [16485,   324]], device='cuda:0'),
 (tensor([[50256, 50257, 50258,  ..., 50256, 50256, 50256],
          [50256, 50257, 50258,  ..., 50256, 50256, 50256],
          [50256, 50257, 50258,  ..., 50256, 50256, 50256],
          [50256, 50257, 50258,  ..., 50256, 50256, 50256],
          [50256, 50257, 50258,  ..., 50256, 50256, 50256]], device='cuda:0'),
  tensor([192, 192, 189, 189, 196], device='cuda:0')))

In [26]:
response[0].keys()

dict_keys(['sentences', 'tokens', 'logprob', 'full_logprob', 'token_ids', 'offsets'])

In [29]:
from pprint import pprint

In [15]:
from nemo.core.classes.exportable import Exportable

In [None]:
dir(model)

In [148]:
from typing import Optional, Dict
from nemo.core.neural_types import ChannelType, NeuralType

In [149]:
class ExportableSquadModel(torch.nn.Module, Exportable):
    """Wrap the prompt-tuned model behind a (taskname, context, question) interface.

    The wrapper holds the restored model, the trainer used for prediction and the
    settings needed to rebuild a dataloader for each call to ``forward``.

    NOTE(review): ``forward`` takes Python strings and runs ``trainer.predict``;
    whether that can be traced by an ONNX export is not established here.
    """

    def __init__(self, model, trainer, input_example, cfg, max_seq_length, sampling_params, length_params):
        """Store the collaborators and settings used by ``forward``.

        Args:
            model: Prompt-learning model used to build datasets and predict.
            trainer: Trainer whose ``predict`` method runs inference.
            input_example: Example inputs returned as-is by ``input_example()``.
            cfg: Config object; ``cfg.inference`` and ``cfg.get`` are read in ``forward``.
            max_seq_length: Maximum sequence length passed to the dataset builder.
            sampling_params: Mapping providing the ``"add_BOS"`` entry.
            length_params: Mapping providing the ``"max_length"`` entry.
        """
        super().__init__()
        self.model = model
        self.trainer = trainer
        self._input_example = input_example
        self.cfg = cfg
        self.max_seq_length = max_seq_length
        self.sampling_params = sampling_params
        self.length_params = length_params

    def input_example(self):
        """Return the example inputs given at construction time (a method, not a property)."""
        return self._input_example

    @property
    def input_module(self):
        """The module that receives the inputs: this wrapper itself."""
        return self

    @property
    def output_module(self):
        """The module that produces the outputs: this wrapper itself."""
        return self

    @property
    def input_names(self):
        """Names of the inputs, in the positional order used by ``forward``."""
        return ['taskname', 'context', 'question']

    @property
    def output_names(self):
        """Names of the outputs."""
        return ['sentences']

    @property
    def input_types(self) -> Optional[Dict[str, NeuralType]]:
        """Declared neural type of each input, keyed by input name."""
        return {
            "taskname": NeuralType(('B', 'T'), ChannelType()),
            "context": NeuralType(('B', 'T'), ChannelType()),
            "question": NeuralType(('B', 'T'), ChannelType()),
        }

    @property
    def output_types(self) -> Optional[Dict[str, NeuralType]]:
        """Declared neural type of each output, keyed by output name."""
        return {"sentences": NeuralType(('B', 'T'), ChannelType())}

    def forward(
        self,
        taskname,
        context,
        question
    ):
        """Run prediction for a single (taskname, context, question) record.

        A one-record dataset is built from the arguments, wrapped in a
        dataloader and passed to ``trainer.predict``.

        Returns:
            A list with one ``{"sentences": ...}`` dict per entry of the
            prediction result.
        """
        record = dict(taskname=taskname, context=context, question=question)
        inference_cfg = self.cfg.inference
        generation_budget = self.length_params["max_length"]

        # Only the dataloader is needed; the dataset returned with it is ignored.
        _, loader = self.model.build_virtual_prompt_dataset(
            data=[record],
            batch_size=inference_cfg.get("batch_size", 1),
            max_seq_length=self.max_seq_length,
            min_seq_length=self.model.cfg.data.get("min_seq_length", 1),
            add_bos=self.sampling_params["add_BOS"],
            add_eos=False,
            for_train=False,
            tokens_to_generate=generation_budget,
            drop_last=False,
            shuffle=False,
            num_workers=self.cfg.get("num_workers", 1),
        )
        predictions = self.trainer.predict(self.model, loader)
        # Keep only the generated sentences from each prediction entry.
        return [{"sentences": entry["sentences"]} for entry in predictions]

In [151]:
# The example input is the values view of the first record; its keys are
# 'taskname', 'context', 'question', matching the positional order of `forward`.
export_model = ExportableSquadModel(model, trainer, data[0].values(), cfg, max_seq_length, sampling_params, length_params)

In [152]:
# export_model.forward(model_input=data[0:5])

In [153]:
from nemo.utils.export_utils import (
    parse_input_example
)

In [154]:
input_example = export_model.input_module.input_example()


In [155]:
input_example

dict_values(['squad', 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.', 'Which NFL team represented the AFC at Super Bowl 50?'])

In [156]:
input_list, input_dict = parse_input_example(input_example)

In [157]:
input_list

['squad',
 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'Which NFL team represented the AFC at Super Bowl 50?']

In [158]:
input_dict

{}

In [159]:
# Switch to eval mode, move to the GPU and attempt the ONNX export.
# NOTE(review): the recorded output of this cell ends in
# "TypeError: MegatronBasePromptLearningModel.state_dict() got an unexpected
# keyword argument 'destination'", so the export does not succeed as written.
export_model.eval()
export_model.to('cuda')  # or to('cpu') if you don't have GPU
export_model.export("export_model.onnx", verbose=True)

[NeMo I 2023-08-17 08:05:03 export_utils:430] Swapped 24 modules
[NeMo I 2023-08-17 08:05:03 gpt_prompt_learning_dataset:85] Loading and tokenizing dataset ... 


  0%|          | 0/1 [00:00<?, ?it/s]

[NeMo I 2023-08-17 08:05:03 gpt_prompt_learning_dataset:196] Skipped 0 sentences, sequence length too short or too long even after truncation


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

      input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids])
    
    
    
    


verbose: False, log level: Level.ERROR



TypeError: MegatronBasePromptLearningModel.state_dict() got an unexpected keyword argument 'destination'

In [52]:
# NOTE(review): the recorded "'tuple' object is not callable" error carries an
# older execution count (52) than the class cell (149), so it presumably comes from
# an earlier revision of the class. Re-run or remove this cell to confirm.
export_model.input_example()

TypeError: 'tuple' object is not callable

In [None]:
# Ask W&B to log the code for the active run, then finish the run.
wandb.run.log_code()
wandb.finish()