### This end-to-end training notebook contains code to fine-tune the Multilingual Cased DistilBERT model with or without hyperparameter optimization on our training dataset and create a model for inference.

In [2]:
#Run the utility notebook first
%run distilBERT_utility.py

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


### Starting a SageMaker session with boto3

In [3]:
sagemaker_session = Session()
aws_role = sagemaker_session.get_caller_identity_arn()
aws_region = boto3.Session().region_name
sess = sagemaker.Session()

### Setting Model ID and version for Fine Tuning

In [4]:
model_id, model_version = 'huggingface-tc-distilbert-base-multilingual-cased', "2.0.0" 

### Set training data and output

In [5]:
training_dataset_s3_path = f"s3://sagemaker-sigparser-caylent-mlops/data/email-type/input/processed/distilbert/distilbert-train/distilbert-fine_tuning-24-04-2024/data.csv"
s3_output_location = f"s3://sagemaker-sigparser-caylent-mlops/model/email-type/distilBERT/{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

### Training

In [6]:
hyperparameters = {'epochs': '4',
 'learning_rate': '2e-05',
 'batch_size': '64',
 'eval_batch_size': '8',
 'eval_accumulation_steps': 'None',
 'reinitialize_top_layer': 'Auto',
 'train_only_top_layer': 'False'}

estimator = JumpStartEstimator(
    model_id=model_id,
    hyperparameters=hyperparameters,
    instance_type="ml.g4dn.xlarge",
    output_path=s3_output_location
)

Using model 'huggingface-tc-distilbert-base-multilingual-cased' with wildcard version identifier '*'. You can pin to version '2.0.0' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


In [103]:
#For Hyperparameter Optimization, set use_amt to True
use_amt = False 

#For HPO:Select from the hyperparameters supported by the model, and configure ranges of values to be searched for training the optimal model.(https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html)
hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(0.00001, 0.0001, scaling_type="Logarithmic")
}

amt_metric_definitions = {
    "metrics": [{"Name": "val_accuracy", "Regex": "'eval_accuracy': ([0-9\\.]+)"}],
    "type": "Maximize",
}

#If use_amt is set to True, a hyperparameter optimization job is carried out.
if use_amt:
    hp_tuner = HyperparameterTuner(
        estimator, 
        amt_metric_definitions["metrics"][0]["Name"],
        hyperparameter_ranges,
        amt_metric_definitions["metrics"],
        max_jobs=6,
        max_parallel_jobs=2,
        objective_type=amt_metric_definitions["type"],
        base_tuning_job_name="distilbert-multilingual-",  
    )

    #Launch a SageMaker Tuning job to search for the best hyperparameters
    hp_tuner.fit({"training": training_dataset_s3_path})
    
#Else, a regular training job is created
else:
    training_job_name = 'distilbert-model-1' #change this value to change the training job's name and folder name where the model artifacts are stored
    #Note that the training job's name must satisfy regular expression pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}
    print("using estimator")
    estimator.fit({"training": training_dataset_s3_path}, logs=True, job_name=training_job_name) 

INFO:sagemaker:Creating training-job with name: distilbert-model-1


using estimator
2024-04-26 23:06:59 Starting - Starting the training job
2024-04-26 23:06:59 Pending - Training job waiting for capacity......
2024-04-26 23:07:38 Pending - Preparing the instances for training...
2024-04-26 23:08:12 Downloading - Downloading input data...
2024-04-26 23:08:43 Downloading - Downloading the training image........................
2024-04-26 23:12:59 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-04-26 23:13:21,827 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-04-26 23:13:21,854 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-04-26 23:13:21,856 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-04-26 23:13:21,864 sagemaker-training-toolkit INFO  

In [104]:
#Training performance metrics
if use_amt:
    training_job_name = hp_tuner.best_training_job()
    model_data = hp_tuner.best_estimator().model_data
else:
    training_job_name = estimator.latest_training_job.job_name
    model_data = estimator.model_data
    
df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()
df.head()

Unnamed: 0,timestamp,metric_name,value
0,0.0,hugginface-tc:eval-accuracy,0.955882
1,60.0,hugginface-tc:eval-accuracy,0.975


### Refactoring compressed model artifacts and re-routing S3 path (for deployment purposes only)

In [58]:
#You can either derive model_data from the above cell after training, or hard-code it as done below .
model_data = 's3://sagemaker-sigparser-caylent-mlops/model/email-type/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model.tar.gz'
source_bucket = model_data[5:].split('/', 1)[0] #Bucket name

#We will extract the following string 'model/email-type/distilBERT/2024-04-26_23-06-31/distilbert-model-1' from model_data path given in above example
source_prefix = model_data.split('sagemaker-sigparser-caylent-mlops/')[1].rsplit('/', 2)[0] 

#Next, we will extract just the timestamp and training job name from model_data path to create the destination path
destination_inter_prefix = model_data.split('distilBERT/')[1].split('/output')[0]
destination_prefix = f'model-artifacts/distilBERT/{destination_inter_prefix}'

In [62]:
paginator = s3_client.get_paginator('list_objects_v2')
for result in paginator.paginate(Bucket=source_bucket, Prefix=source_prefix):
    if 'Contents' in result:
        #List all objects in the source directory containing all three output folders after training
        for obj in result['Contents']:
            key = obj['Key']
            #Get folder names
            folder_name = key.split('/')[-2]
            #The 'output' folder contains the model.tar.gz file
            if folder_name == 'output' and key.endswith('model.tar.gz'):
                #Uncompress and upload model.tar.gz files to 'output/model' path
                obj = s3_client.get_object(Bucket=source_bucket, Key=key)
                with tarfile.open(fileobj=BytesIO(obj['Body'].read()), mode="r:gz") as tar: #Extracting from compressed tar.gz file
                    for member in tar.getmembers():
                        if member.isfile():
                            #Extracted files are stored in path output/model in the folder
                            extracted_file_key = os.path.join(destination_prefix, 'output', 'model', os.path.normpath(member.name))
                            s3_client.upload_fileobj(BytesIO(tar.extractfile(member).read()), source_bucket, extracted_file_key)
                            print("Copied file to", extracted_file_key)
                            
            elif folder_name in ['debug-output', 'profiler-output']:
                #Copy other folders and files as they are. It will skip copying any files which are 0 bytes.
                new_key = key.replace(source_prefix, destination_prefix, 1)  
                s3_client.copy_object(
                    Bucket=source_bucket,
                    CopySource={'Bucket': source_bucket, 'Key': key},
                    Key=new_key
                )
                print("Copied file to", new_key)

Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/debug-output/training_job_end.ts
Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model/config.json
Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model/special_tokens_map.json
Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model/vocab.txt
Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model/code/inference.py
Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model/code/constants/constants.py
Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model/code/constants/__init__.py
Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model/code/version
Copied file to model-artifacts/distilBERT/2024-04-26_23-06-31/distilbert-model-1/output/model/co

### Incrementally train the fine-tuned model

You can use the artifacts from an existing model and use an expanded dataset to train a new model as long as the dataset format remain the same (set of classes).

In [9]:
#Identify the previously trained training job name.
last_training_job_name = 'distilbert-model-1'

#Create a SageMaker client
sagemaker_client = boto3.client('sagemaker')
#Retrieve all details about the previous training job. Several items will be picked up from the resulting JSON object to create the estimator in the code block below.
training_job_info = sagemaker_client.describe_training_job(TrainingJobName=last_training_job_name)

#New training dataset's S3 path
training_dataset_s3_path = 's3://sagemaker-sigparser-caylent-mlops/data/email-type/input/processed/distilbert/distilbert-train/distilbert-fine_tuning-24-04-2024/data.csv'

In [54]:
#Set a name for the new incremental training job
incremental_training_job_name = f"{last_training_job_name}-incremental-training-0"
training_instance_type = "ml.g4dn.xlarge" #Set instance type for the new training job

#Creating the new estimator job
incremental_train_estimator = Estimator(
    model_id=model_id,
    role=aws_role,
    image_uri=training_job_info['AlgorithmSpecification']['TrainingImage'], #Previous training job's image uri
    model_uri=training_job_info['ModelArtifacts']['S3ModelArtifacts'], #Trained model's S3 uri (tar.gz file)
    entry_point="transfer_learning.py",
    source_dir=training_job_info['InputDataConfig'][2]['DataSource']['S3DataSource']['S3Uri'], #Source directory containing transfer_learning.py
    instance_count=1,
    instance_type=training_instance_type,
    hyperparameters=hyperparameters, #Already set in the beginning of this notebook
    output_path=s3_output_location, #Already set in the beginning of this notebook, can be changed here if needed
    #base_job_name=incremental_training_job_name, #Commenting this out since the job_name is fed while using estimator.fit below
    metric_definitions=training_job_info['AlgorithmSpecification']['MetricDefinitions'] #Training metric definitions picked up from the previous training job's JSON object
)

#Fitting the above created estimator object to the new training data to create a new model
incremental_train_estimator.fit({"training": training_dataset_s3_path}, logs=True, job_name=incremental_training_job_name)

INFO:sagemaker:Creating training-job with name: distilbert-model-1-incremental-training-0


2024-04-30 22:23:02 Starting - Starting the training job...
2024-04-30 22:23:18 Starting - Preparing the instances for training...
2024-04-30 22:23:49 Downloading - Downloading input data...
2024-04-30 22:24:29 Downloading - Downloading the training image...........................
2024-04-30 22:28:40 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-04-30 22:28:52,691 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-04-30 22:28:52,716 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-04-30 22:28:52,720 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-04-30 22:28:52,951 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/opt/conda/bin/python3.

##### The model files for the above training job are stored in 's3://sagemaker-sigparser-caylent-mlops/model/email-type/distilBERT/2024-04-30_21-42-06/distilbert-model-1-incremental-training-0/'