In [1]:
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker import get_execution_role
import boto3
import os

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
sess = sagemaker.Session()
role = get_execution_role()
bucket = 'arnasinh-nemo-bert-ft-sm' #sess.default_bucket()
region = sess.boto_session.region_name

### Building Nemo Container with training script for training

In [3]:
account_id = boto3.client('sts').get_caller_identity()['Account']
ecr_repository = 'nemo-bert-finetuning'
tag = 'latest'
container_uri = f'{account_id}.dkr.ecr.{region}.amazonaws.com/{ecr_repository}:{tag}'

In [None]:
!pip install sagemaker-studio-image-build

In [None]:
!sm-docker build . --repository {ecr_repository}:latest

### Preparing Training and Eval Dataset and saving them in csv. Upload to s3 for Sagemaker to access

In [None]:
!curl -LO https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xvf aclImdb_v1.tar.gz 
!curl -LO https://raw.githubusercontent.com/NVIDIA/NeMo/refs/heads/main/examples/nlp/text_classification/data/import_datasets.py 

In [None]:
!pip install nemo_toolkit[nlp]

In [None]:
!python3 import_datasets.py     --dataset_name imdb --target_data_dir . --source_data_dir /home/sagemaker-user/nvidia-nemo/aclImdb

In [None]:
sess.upload_data(path='train.tsv', bucket=bucket, key_prefix='data/training')
sess.upload_data(path='test.tsv', bucket=bucket, key_prefix='data/validation')

In [None]:
!sm-docker build ./claude/ --repository {ecr_repository}:latest

### Sagemaker Training using nemo container

In [6]:
image_uri = "nvcr.io/nvidia/nemo:23.10"

In [7]:
instance_type = "ml.p3.2xlarge"  # Choose an appropriate instance (GPU is recommended)
instance_count = 1
output_path = f"s3://{bucket}/nemo-bert-finetuning"

In [8]:
%%writefile gemini/sagemaker_train.py
#!/usr/bin/env python3
import os
import subprocess
import sys
import json
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def parse_sagemaker_parameters():
    """Parse SageMaker hyperparameters from the hyperparameters.json file."""
    hyperparameters_path = "/opt/ml/input/config/hyperparameters.json"
    
    if os.path.exists(hyperparameters_path):
        with open(hyperparameters_path, 'r') as f:
            hyperparameters = json.load(f)
        return hyperparameters
    else:
        return {}

def build_command():
    """Build the command to run the training script with appropriate arguments."""
    hyperparameters = parse_sagemaker_parameters()
    
    cmd = [sys.executable, "/opt/ml/code/train.py"]
    
    # Add SageMaker environment variables as arguments
    if os.environ.get("SM_MODEL_DIR"):
        cmd.extend(["--model-dir", os.environ.get("SM_MODEL_DIR","/opt/ml/model")])
    
    if os.environ.get("SM_CHANNEL_TRAIN"):
        cmd.extend(["--training-dir", os.environ.get("SM_CHANNEL_TRAIN","/opt/ml/input/data/training/")])
    
    if os.environ.get("SM_CHANNEL_VALIDATION"):
        cmd.extend(["--validation-dir", os.environ.get("SM_CHANNEL_VALIDATION","/opt/ml/input/data/validation/")])
    
    if os.environ.get("SM_NUM_GPUS"):
        cmd.extend(["--num-gpus", os.environ.get("SM_NUM_GPUS")])
    
    # Add hyperparameters
    required_params = ["num_classes", "class_labels"]
    for param in required_params:
        if param not in hyperparameters:
            raise ValueError(f"Missing required hyperparameter: {param}")
    
    for key, value in hyperparameters.items():
        # Convert hyphen-case to underscore_case for command line args
        arg_key = key.replace('_', '-')
        cmd.extend([f"--{arg_key}", str(value)])
    
    return cmd

def main():
    """Main entrypoint for the SageMaker wrapper script."""
    try:
        cmd = build_command()
        logger.info(f"Running command: {' '.join(cmd)}")
        
        process = subprocess.run(cmd, check=True)
        sys.exit(process.returncode)
    except Exception as e:
        logger.error(f"Exception during training: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()

Overwriting gemini/sagemaker_train.py


In [37]:
%%writefile gemini/train.py
import nemo.collections.nlp as nemo_nlp
from nemo.core.config import hydra_runner
from omegaconf import DictConfig, OmegaConf
import pytorch_lightning as pl
import os
from nemo.utils.exp_manager import exp_manager
import argparse
import logging
from nemo.collections.nlp.models.text_classification import TextClassificationModel # Explicit Import
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def get_args():
    parser = argparse.ArgumentParser()
    
    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR","/opt/ml/model"))
    parser.add_argument("--training-dir", type=str, default=os.environ.get("SM_CHANNEL_TRAIN","/opt/ml/input/data/training/"))
    parser.add_argument("--validation-dir", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION","/opt/ml/input/data/validation/"))
    parser.add_argument("--num-gpus", type=int, default=os.environ.get("SM_NUM_GPUS", 1))
    
    # Training parameters
    parser.add_argument("--epochs", type=int, default=2)
    parser.add_argument("--learning-rate", type=float, default=2e-5)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--max-seq-length", type=int, default=128)
    parser.add_argument("--num-classes", type=int, required=True)
    parser.add_argument("--class-labels", type=str, required=True)
    parser.add_argument("--pretrained-model-name", type=str, default="bert-base-uncased")
    
    return parser.parse_args()

def create_nemo_config(args):
    # Parse class labels
    class_labels = args.class_labels.split(',')
    
    # Create base config
    config = {
      "trainer": {
        "devices": args.num_gpus,
        "num_nodes": 1,
        "max_epochs": args.epochs,
        "max_steps": 10,
        "accumulate_grad_batches": 1,
        "gradient_clip_val": 0,
        "precision": 32,
        "accelerator": "gpu" if args.num_gpus > 0 else "cpu",
        "log_every_n_steps": 1,
        "val_check_interval": 1,
        "num_sanity_val_steps": 0,
        "enable_checkpointing": False,
        "logger": False
      },
      "model": {
        "nemo_path": "text_classification_model.nemo",
        "tokenizer": {
          "tokenizer_name": args.pretrained_model_name,
          "vocab_file": None,
          "tokenizer_model": None,
          "special_tokens": None
        },
        "language_model": {
          "pretrained_model_name": args.pretrained_model_name,
          "lm_checkpoint": None,
          "config_file": None,
          "config": None
        },
        "classifier_head": {
          "num_output_layers": 2,
          "fc_dropout": 0.1
        },
        "class_labels": {
          "class_labels_file": None
        },
        "dataset": {
          "num_classes": args.num_classes,
          "do_lower_case": False,
          "max_seq_length": args.max_seq_length,
          "class_balancing": None,
          "use_cache": False
        },
        "train_ds": {
          "file_path": os.path.join(args.training_dir, "train.tsv"),
          "batch_size": args.batch_size,
          "shuffle": True,
          "num_samples": -1,
          "num_workers": 3,
          "drop_last": False,
          "pin_memory": False
        },
        "validation_ds": {
          "file_path": os.path.join(args.validation_dir, "test.tsv"),
          "batch_size": args.batch_size,
          "shuffle": False,
          "num_samples": -1,
          "num_workers": 3,
          "drop_last": False,
          "pin_memory": False
        },
        "test_ds": {
          "file_path": None,
          "batch_size": args.batch_size,
          "shuffle": False,
          "num_samples": -1,
          "num_workers": 3,
          "drop_last": False,
          "pin_memory": False
        },
        "optim": {
          "name": "adam",
          "lr": args.learning_rate,
          "betas": [
            0.9,
            0.999
          ],
          "weight_decay": 0.01,
          "sched": {
            "name": "WarmupAnnealing",
            "warmup_steps": None,
            "warmup_ratio": 0.1,
            "last_epoch": -1,
            "monitor": "val_loss",
            "reduce_on_plateau": False
          }
        }
      }
    }
        
    return OmegaConf.create(config)

def train(args):
    """Train and save the NeMo model."""
    logger.info("Creating model config")
    config = create_nemo_config(args)
    
    logger.info(f"Starting training with config: {OmegaConf.to_yaml(config)}")
    
    # Initialize trainer and exp_manager
    #exp_manager(config.get("exp_manager"))
    
    # Initialize trainer
    try:
        strategy = NLPDDPStrategy(find_unused_parameters=True)
    except (ImportError, ModuleNotFoundError):
        strategy = 'auto'
        
    #trainer = nemo.core.PyTorchLightning.Trainer(strategy=strategy, **config.trainer)
    trainer = pl.Trainer(strategy=strategy, **config.trainer)

    # Create the model
    logger.info("Initializing model")
    if args.pretrained_model_name:
        # Use pretrained bert
        model = TextClassificationModel(cfg=config.model, trainer=trainer)
    else:
        logger.error("No pretrained model specified")
        raise ValueError("Must provide a pretrained model name")
    
    # Start training
    logger.info("Starting training")
    trainer.fit(model)
    logger.info("Finished training")
    
    # Save the final model
    model_path = os.path.join(args.model_dir, "final_model.nemo")
    model.save_to(model_path)
    logger.info(f"Model saved to {model_path}")
    
    return model

def main():
    args = get_args()
    model = train(args)
    
if __name__ == "__main__":
    main()

Overwriting gemini/train.py


In [32]:
%%writefile gemini/Dockerfile
FROM nvcr.io/nvidia/nemo:23.06

# Install sagemaker-training toolkit
RUN pip install --no-cache-dir sagemaker-training

# Set working directory for the application
WORKDIR /opt/ml/code

# Copy training scripts
COPY train.py /opt/ml/code/train.py
COPY sagemaker_train.py /opt/ml/code/sagemaker_train.py

# Entry point
ENTRYPOINT ["python", "/opt/ml/code/sagemaker_train.py"]

Overwriting gemini/Dockerfile


In [None]:
!sm-docker build ./gemini/ --repository {ecr_repository}:latest

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
..

In [40]:
# Create a SageMaker estimator
estimator = Estimator(
    image_uri=container_uri,
    role=role,
    instance_count=1,
    instance_type="ml.p3.8xlarge",  # Use GPU instance
    volume_size=30,
    max_run=6 * 60 * 60,  # 2 hours in seconds
    input_mode='File',
    output_path=f"s3://{bucket}/bert-finetuning/output",
    hyperparameters={
        "epochs": 1,
        "learning_rate": 2e-5,
        "batch_size": 32,
        "max_seq_length": 128,
        "num_classes": 2,  # Change based on your classification task
        "class_labels": "0,1",  # Change based on your classes
        "pretrained_model_name": "bert-base-uncased",
        "num_gpus": 4
    }
)

In [41]:
train_data='s3://arnasinh-nemo-bert-ft-sm/data/training/train.tsv'
validation_data='s3://arnasinh-nemo-bert-ft-sm/data/validation/test.tsv'
estimator.fit(
    inputs={
        "training": train_data,
        "validation": validation_data
    },
    wait=True
)

INFO:sagemaker:Creating training-job with name: nemo-bert-finetuning-2025-02-27-23-42-00-198


2025-02-27 23:42:00 Starting - Starting the training job
2025-02-27 23:42:00 Pending - Training job waiting for capacity...........................
2025-02-27 23:46:22 Pending - Preparing the instances for training......
2025-02-27 23:47:25 Downloading - Downloading the training image..............................
2025-02-27 23:52:18 Training - Training image download completed. Training in progress..[34mINFO:__main__:Running command: /usr/bin/python /opt/ml/code/train.py --batch-size 32 --class-labels 0,1 --epochs 1 --learning-rate 2e-05 --max-seq-length 128 --num-classes 2 --num-gpus 4 --pretrained-model-name bert-base-uncased[0m
[34mGPU available: True (cuda), used: True[0m
[34mTPU available: False, using: 0 TPU cores[0m
[34mIPU available: False, using: 0 IPUs[0m
[34mHPU available: False, using: 0 HPUs[0m
[34m`Trainer(val_check_interval=1)` was configured so validation will run after every batch.[0m
[34mNOTE! Installing ujson may make loading annotations faster.[0m
[3