## Run training scripts for Models to measure performance
---------------------

This notebook executes the configured training script and measures metrics such as time to train, loss values, and instance utilization.

The training pipeline consists of the following stages:

1. Configuration Retrieval: Experiment configurations, including training scripts, hyperparameters, and datasets, are fetched from a central repository or file.

1. Training Execution: The training script is executed based on the configuration. This script:

    - Prepares the dataset.

    - Sets up the model and tokenizer.

    - Initiates training using the `optimum.neuron.NeuronTrainer`.

    - Saves the trained model and logs metrics.

1. Metrics Recording: Metrics such as loss, training time, and validation accuracy are captured and stored.

All the data and metrics are stored in the generated `report.html`. Users can configure the target total loss value and the target time to train in the config file.

#### Import all of the necessary libraries below to run this notebook

In [None]:
# Where `fmbench` gets imported from depends on the INTERACTIVE_MODE_SET environment variable:
#   - "yes", or not defined at all -> the directory one level above the current working
#     directory is appended to sys.path so that the local fmbench checkout can be found
#   - any other value (e.g. "no")  -> sys.path is left alone and fmbench is picked up from
#     the Python installation path
# The premise is that a non-interactive run can only happen through main.py, which sets
# interactive mode to "no".
import os
import sys
if os.getenv("INTERACTIVE_MODE_SET", "yes") == "yes":
    sys.path.append(os.path.dirname(os.getcwd()))

In [None]:
import sys
import time
import json
import boto3
import asyncio
import logging
import importlib.util
import fmbench.scripts
from pathlib import Path
from fmbench.utils import *
from fmbench.globals import *
from fmbench.scripts import constants
from typing import Dict, List, Optional
from sagemaker import get_execution_role
import importlib.resources as pkg_resources
from botocore.exceptions import ClientError
from botocore.exceptions import NoCredentialsError

#### Set up a logger to log all messages while the code runs

In [None]:
# Root logging configuration for the notebook: INFO level, with every record carrying the
# timestamp, process id, source file/line and level in front of the message.
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] p%(process)s {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
# Module-level logger used by the cells below.
logger = logging.getLogger(__name__)

### Load the config.yml file
------

The config.yml file contains information that is used across this benchmarking environment, such as information about the AWS account, prompts, payloads to be used for invocations, and model configurations like the version of the model, the endpoint name, and the model_id that needs to be deployed. Configurations also specify the instance type to be used (for example, "ml.g5.24xlarge"), the image URI, and whether or not to deploy the given model, followed by an inference script (for example, "jumpstart.py", which handles inference for JumpStart models deployed by the deploy notebook). The experiment configuration also has a training script parameter; that script is run as part of the benchmarking test, and the loss value, time to train, and other metrics are recorded.

In [None]:
## Load the main configuration; `load_main_config` and `CONFIG_FILE` come from the
## star-imports of fmbench.utils / fmbench.globals at the top of the notebook
config = load_main_config(CONFIG_FILE)

## AWS region used for the training runs, read from the `aws` section of the config
aws_region = config['aws']['region']


# NOTE: automatic discovery of the SageMaker execution role is currently disabled (the code
# is kept below for reference). As a result config['aws']['sagemaker_execution_role'] must
# already be present in the loaded config -- the log statement after this block indexes it
# directly and raises KeyError otherwise.
# try:
#     sagemaker_execution_role = get_execution_role()
#     config['aws']['sagemaker_execution_role'] = sagemaker_execution_role
#     logger.info(f"determined SageMaker exeuction role from get_execution_role")
# except Exception as e:
#     logger.error(f"could not determine SageMaker execution role, error={e}")
#     logger.info(f"going to look for execution role in config file..")
#     sagemaker_execution_role = config['aws'].get('sagemaker_execution_role')
#     if sagemaker_execution_role is not None:
#         logger.info(f"found SageMaker execution role in config file..")

# Log the region/role in use and then the full config (pretty-printed JSON) for traceability
logger.info(f"aws_region={aws_region}, execution_role={config['aws']['sagemaker_execution_role']}")
logger.info(f"config={json.dumps(config, indent=2)}")

In [None]:
def run_training(experiment_config: Dict, aws_region: str, role_arn: str) -> Optional[Dict]:
    """Run the training script configured for a single experiment.

    The script named by ``experiment_config['training_script']`` is looked up (by its
    file stem) in the ``scripts`` directory of the installed ``fmbench`` package,
    imported from that file, and its ``train(experiment_config, role_arn)`` function
    is called while the wall-clock time is measured.

    Args:
        experiment_config: Configuration of one experiment. ``model_id`` and ``name``
            are required; ``training_script`` is required when training is enabled;
            ``run_training``, ``instance_type`` and ``instance_count`` are optional.
        aws_region: AWS region name; only used for logging here.
        role_arn: Role ARN that is handed to the training script's ``train`` function.

    Returns:
        * When training is disabled (``run_training`` missing or falsy): a dict with
          ``model_id``, ``experiment_name``, ``instance_type``, ``instance_count`` and
          ``trained=False``.
        * When training ran: whatever ``train`` returned; if that value is truthy,
          ``training_time`` (seconds) and ``trained=True`` are added to it.
        * ``None`` when the training script cannot be found or loaded, or when an
          exception is raised before ``train`` returns (the exception is logged with
          its traceback, not propagated).
    """
    # Log the training details
    logger.info(f"going to train {experiment_config}, in {aws_region} with {role_arn}")
    training_result: Optional[Dict] = None

    # Skip unless training is explicitly enabled. A truthiness test (rather than
    # `is False`) also skips on None/0, e.g. an empty `run_training:` key in the YAML.
    # The local is deliberately not called `run_training` so it does not shadow this function.
    training_enabled = experiment_config.get('run_training', False)
    if not training_enabled:
        logger.info(f"skipping training of {experiment_config['model_id']} because run_training={training_enabled}")
        return dict(
            model_id=experiment_config['model_id'],
            experiment_name=experiment_config['name'],
            instance_type=experiment_config.get('instance_type'),
            instance_count=experiment_config.get('instance_count'),
            trained=False
        )

    # Get the scripts directory
    scripts_dir = Path(pkg_resources.files('fmbench'), 'scripts')
    logger.info(f"Using fmbench.scripts directory: {scripts_dir}")

    try:
        # Only the stem of the configured script is used, so the script is always
        # resolved inside the fmbench scripts directory.
        module_name = Path(experiment_config['training_script']).stem
        logger.info(f"script provided for training this model is --> {module_name}")
        training_script_path = scripts_dir / f"{module_name}.py"
        logger.info(f"script path is --> {training_script_path}")

        if not training_script_path.exists():
            logger.error(f"Training script {training_script_path} not found.")
            return None

        logger.info(f"Training using local code: {training_script_path}")

        # Import the training module directly from its file path
        spec = importlib.util.spec_from_file_location(module_name, str(training_script_path))
        if spec is None or spec.loader is None:
            logger.error(f"could not create an import spec for {training_script_path}")
            return None
        module = importlib.util.module_from_spec(spec)
        # Register before executing, as in the importlib recipe for importing a source file
        sys.modules[module_name] = module
        spec.loader.exec_module(module)

        # Run training and measure time
        st = time.perf_counter()
        training_result = module.train(experiment_config, role_arn)
        elapsed_time = time.perf_counter() - st

        logger.info(f"time taken to train model_id={experiment_config['model_id']} via "
                    f"{training_script_path} is {elapsed_time:0.2f}")

        # Add training time to result
        if training_result:
            training_result['training_time'] = elapsed_time
            training_result['trained'] = True

        return training_result

    except Exception as error:
        # Best-effort: one failing experiment must not abort the others, but keep the
        # traceback in the log so the failure can be diagnosed.
        logger.exception(f"An error occurred during training: {error}")
        return training_result

async def async_run_training(experiment_config: Dict, role_arn: str, aws_region: str) -> Optional[Dict]:
    """Run ``run_training`` in a worker thread so several trainings can overlap.

    Args:
        experiment_config: Configuration of one experiment.
        role_arn: Role ARN forwarded to ``run_training``.
        aws_region: AWS region forwarded to ``run_training``.

    Returns:
        Whatever ``run_training`` returns (a result dict, or ``None`` on failure).
    """
    # This coroutine takes (role_arn, aws_region) while run_training takes
    # (aws_region, role_arn); forward by keyword so the two can never be swapped.
    return await asyncio.to_thread(
        run_training,
        experiment_config,
        aws_region=aws_region,
        role_arn=role_arn,
    )

async def async_train_all_models(config: Dict) -> List[Optional[Dict]]:
    """Train every experiment in ``config['experiments']`` in concurrent batches.

    Experiments are processed in batches of up to 4 at a time. If more than one
    experiment uses a script listed in ``config['non_reentrant_training_scripts']``,
    the batch size drops to 1 so that the experiments run serially.

    Args:
        config: Main configuration; reads ``experiments``,
            ``non_reentrant_training_scripts`` (optional), ``aws.region`` and
            ``aws.sagemaker_execution_role``.

    Returns:
        One entry per experiment, in the order of ``config['experiments']``; each
        entry is the value returned by ``run_training`` for that experiment.
    """
    # Extract experiments from the config
    experiments: List[Dict] = config['experiments']
    n: int = 4  # max concurrency to avoid throttling

    # Check for non-reentrant training scripts. `.get` is used because an experiment
    # (e.g. one with training disabled) may not define a training script at all, and
    # `or []` tolerates the key being present but empty in the config.
    non_reentrant_training_scripts = config.get('non_reentrant_training_scripts') or []
    non_reentrant_scripts_present = [
        e.get('training_script') for e in experiments
        if e.get('training_script') in non_reentrant_training_scripts
    ]

    if len(non_reentrant_scripts_present) > 1:
        logger.info(f"non_reentrant_training_scripts_present={len(non_reentrant_scripts_present)}, "
                    f"going to train models serially")
        n = 1

    # Split experiments into consecutive batches of at most n for concurrent training
    experiments_splitted = [experiments[i:i + n] for i in range(0, len(experiments), n)]
    results: List[Optional[Dict]] = []

    for exp_list in experiments_splitted:
        # Run one batch concurrently; gather returns results in the order submitted
        result = await asyncio.gather(*[
            async_run_training(
                m,
                role_arn=config['aws']['sagemaker_execution_role'],
                aws_region=config['aws']['region'],
            ) for m in exp_list
        ])
        # Collect results from each batch
        results.extend(result)

    return results

In [None]:
# Driver cell: train all experiments from `config` and report the total wall-clock time.

# Start timer
s = time.perf_counter()

# Run all training jobs. The top-level `await` relies on the notebook already running an
# event loop; `training_results` receives one entry per experiment in the config.
training_results = await async_train_all_models(config)

# Calculate total time (seconds) across all batches
elapsed_async = time.perf_counter() - s
logger.info(f"Training results: {training_results}")
logger.info(f"All training completed in {elapsed_async:0.2f} seconds")