In [None]:
# Model Parallelism
Reference: [Amazon SageMaker model parallelism documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-intro.html)

!pip install "sagemaker>=2.48.0" --upgrade

In [3]:
import sagemaker.huggingface

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [4]:
import sagemaker
import boto3
# Bootstrap session: used here only to look up the default bucket.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs. Leave as None to fall back
# to the session's default bucket (SageMaker creates it if it does not exist).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN; when get_execution_role() raises ValueError,
# fall back to looking up the role named 'sagemaker_execution_role' through IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Session used from here on, bound explicitly to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summary of the resolved environment.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::802575742115:role/service-role/AmazonSageMaker-ExecutionRole-20230929T143152
sagemaker bucket: sagemaker-us-east-1-802575742115
sagemaker session region: us-east-1


In [12]:
from transformers.sagemaker import SageMakerTrainingArguments as TrainingArguments, SageMakerTrainer as Trainer
from sagemaker.huggingface import HuggingFace

In [31]:
# Hyperparameters handed to the training job (the estimator forwards them to
# the entry-point script).
# NOTE(review): both num_train_epochs and max_steps are set; in Hugging Face
# TrainingArguments a positive max_steps is documented to take precedence —
# confirm which limit is intended.
hyperparameters = dict(
    model_name_or_path='roberta-large',
    task_name='mnli',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    do_train=True,
    do_eval=True,
    do_predict=True,
    num_train_epochs=2,
    output_dir='/opt/ml/model',
    max_steps=500,
)

# Git source for the fine-tuning script: the transformers repository at v4.28.1.
git_config = dict(
    repo='https://github.com/huggingface/transformers.git',
    branch='v4.28.1',
)


In [32]:
# Distribution settings for smdistributed Model Parallel.

# MPI launcher settings: enabled, with 8 processes per host.
mpi_options = dict(enabled=True, processes_per_host=8)

# Model-parallel library settings; "parameters" is what appears as
# `mp_parameters` in the estimator's hyperparameters.
smp_options = dict(
    enabled=True,
    parameters=dict(
        microbatches=4,
        placement_strategy="spread",
        pipeline="interleaved",
        optimize="speed",
        partitions=4,
        ddp=True,
    ),
)

# Combined structure passed to the estimator's `distribution` argument.
distribution = dict(
    smdistributed=dict(modelparallel=smp_options),
    mpi=mpi_options,
)

# Training instance configuration.
instance_type = 'ml.p3.16xlarge'
instance_count = 1
volume_size = 200

In [33]:
# Metric definitions: name/regex pairs used to extract results from the job logs.
# Raw strings keep `\D` a regex token instead of an invalid string escape
# (which triggers a DeprecationWarning/SyntaxWarning on newer Python versions).
# NOTE(review): 'f1' and 'exact_match' look like question-answering metrics;
# confirm that the GLUE script actually logs them for the 'mnli' task.
metric_definitions=[
     {'Name': 'train_runtime', 'Regex': r"train_runtime.*=\D*(.*?)$"},
     {'Name': 'train_samples_per_second', 'Regex': r"train_samples_per_second.*=\D*(.*?)$"},
     {'Name': 'epoch', 'Regex': r"epoch.*=\D*(.*?)$"},
     {'Name': 'f1', 'Regex': r"f1.*=\D*(.*?)$"},
     {'Name': 'exact_match', 'Regex': r"exact_match.*=\D*(.*?)$"}]

In [40]:
# Estimator describing the training job: runs run_glue.py from the cloned
# transformers repository with the distribution, instance and hyperparameter
# settings defined in the cells above.
huggingface_estimator = HuggingFace(entry_point='run_glue.py',
                                    source_dir='./examples/pytorch/text-classification',
                                    git_config=git_config,
                                    # The estimator keyword is `metric_definitions`; the earlier
                                    # `metrics_definition` spelling is not a recognised parameter,
                                    # so the regexes were never applied to the job.
                                    metric_definitions=metric_definitions,
                                    instance_type=instance_type,
                                    instance_count=instance_count,
                                    volume_size=volume_size,
                                    role=role,
                                    transformers_version='4.28.1',
                                    pytorch_version='2.0.0',
                                    py_version='py310',
                                    distribution= distribution,
                                    hyperparameters = hyperparameters,
                                    debugger_hook_config=False)

In [41]:
# Display the hyperparameters the estimator has assembled; the rendered output
# includes the MPI / model-parallel entries alongside the user-supplied values.
huggingface_estimator.hyperparameters()

{'model_name_or_path': '"roberta-large"',
 'task_name': '"mnli"',
 'per_device_train_batch_size': '16',
 'per_device_eval_batch_size': '16',
 'do_train': 'true',
 'do_eval': 'true',
 'do_predict': 'true',
 'num_train_epochs': '2',
 'output_dir': '"/opt/ml/model"',
 'max_steps': '500',
 'sagemaker_mpi_enabled': 'true',
 'sagemaker_mpi_num_of_processes_per_host': '8',
 'sagemaker_mpi_custom_mpi_options': '""',
 'mp_parameters': '{"microbatches": 4, "placement_strategy": "spread", "pipeline": "interleaved", "optimize": "speed", "partitions": 4, "ddp": true}',
 'sagemaker_distributed_dataparallel_enabled': 'false',
 'sagemaker_instance_type': '"ml.p3.16xlarge"'}

In [42]:
# Start the training job. fit() is called without any input channels here,
# so no uploaded dataset is passed in — presumably the script fetches the
# GLUE data itself at runtime (TODO confirm).
huggingface_estimator.fit()

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
Cloning into '/tmp/tmpp8jnbk_g'...
Note: switching to 'v4.28.1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 04ab5605f Patch release: v4.28.1
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-01-23-06-06-54-216


2024-01-23 06:07:18 Starting - Starting the training job...
2024-01-23 06:07:28 Pending - Training job waiting for capacity......
2024-01-23 06:08:31 Pending - Preparing the instances for training......
2024-01-23 06:09:41 Downloading - Downloading input data...
2024-01-23 06:10:05 Downloading - Downloading the training image....................................
2024-01-23 06:16:02 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-01-23 06:16:40,456 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-01-23 06:16:40,516 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-01-23 06:16:40,526 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-01-23 06:16:40,529 sagemaker_pytorch_c

UnexpectedStatusException: Error for Training job huggingface-pytorch-training-2024-01-23-06-06-54-216: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "│    986 │   │   │   except OSError as e:                                      │
 │    987 │   │   │   │   raise OSError(                                        │
 │    988 │   │   │   │   │   "Cannot find data file. "                         │
 │                                                                              │
 │ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1706 in          │
 │ _prepare_split                                                               │
 │   1703 │   │   path_join = os.path.join if is_local else posixpath.join      │
 │   1704 │   │                                                                 │
 │   1705 │   │   if self.info.splits is not None:                              │
 │ ❱ 1706 │   │   │   split_info = self.info.splits[split_generator.name]       │
 │   1707 │   │   else:                                           , exit code: 1