### This end-to-end training notebook contains code to fine-tune the Multilingual Cased DistilBERT model with or without hyperparameter optimization on our training dataset and create a model for inference.

In [22]:
#Run the utility script first. The later cells use names that are not defined in this notebook
#(e.g. retrieve_from_s3, save_to_s3, nan_values, email_type_to_int, s3_client, Session) —
#presumably they are brought into the kernel namespace by utils.py; TODO confirm.
%run utils.py

### Set up where to store/access data files
- Access all training data files: https://us-east-1.console.aws.amazon.com/s3/buckets/sigparser-models?region=us-east-1&bucketType=general&prefix=email-types-distilBERT/&showversions=false
- IMPORTANT: Make sure this naming convention is used (e.g. **data-train-email-types-yyyy-mm-dd-a.csv**)

In [None]:
#Fixed S3 layout: only edit these if the structure of the S3 bucket changes
S3_BUCKET_NAME = 'sigparser-models'
S3_FOLDER_NAME = 'email-types-distilBERT'

#Per-run setting: name of the training CSV to use (update it for every new dataset)
INPUT = "data-train-email-types-yyyy-mm-dd-a.csv"
#Same name without the ".csv" extension; reused below as the S3 sub-folder and job-name stem
OUTPUT = INPUT.removesuffix(".csv")

print(f"INPUT: {INPUT}\nOUTPUT: {OUTPUT}")

### Clean the raw training data (IMPORTANT: Make sure you have uploaded the training data to the S3 bucket, as described in the previous step)

In [None]:
#Fetch the raw training file from <S3_FOLDER_NAME>/<INPUT> in the configured bucket.
#The result is used as a pandas DataFrame by the following cells.
raw_data_key = f"{S3_FOLDER_NAME}/{INPUT}"
train_df = retrieve_from_s3(bucket_name=S3_BUCKET_NAME, file_key=raw_data_key)

print(f"Number of rows in the training dataset: {len(train_df)}")

In [None]:
#Names of the columns read from the raw training data
EMAIL_TYPE_COL = "Email Type"
EMAIL_ADDRESS_COL = "Email Address"
EMAIL_NAME_COL = "Email Name"
EMAIL_DISPLAY_NAME_COL = "Email Display Name"

#Quick look at the size and label distribution before any transformation
print("Shape of train dataframe: ", train_df.shape)
print("Value count of Email Type values in train dataset: ", train_df[EMAIL_TYPE_COL].value_counts())

#Helper from the utilities; whether it only reports or also fills missing values is not visible here — TODO confirm
nan_values(train_df)

#Replace the string labels with their numeric codes.
#NOTE(review): this overwrites the column in place, so re-running this cell without reloading
#train_df feeds already-converted values to email_type_to_int — confirm the helper tolerates that.
train_df[EMAIL_TYPE_COL] = train_df[EMAIL_TYPE_COL].apply(email_type_to_int)

#Build the model input text: "<address>, <name>, <display name>"
#NOTE(review): with pandas string addition a missing value in any of the three columns
#makes the whole combined value missing — verify nan_values() prevents that.
train_df['combined'] = (
    train_df[EMAIL_ADDRESS_COL]
    + ', '
    + train_df[EMAIL_NAME_COL]
    + ', '
    + train_df[EMAIL_DISPLAY_NAME_COL]
)

#Keep only the label and the combined text for the final training file
final_df = train_df[[EMAIL_TYPE_COL, 'combined']]

#Upload the cleaned training data to <S3_FOLDER_NAME>/<OUTPUT>/data.csv
cleaned_data_key = f"{S3_FOLDER_NAME}/{OUTPUT}/data.csv"
save_to_s3(final_df, bucket_name=S3_BUCKET_NAME, file_key=cleaned_data_key, mode="train")

### Starting a SageMaker session with boto3

In [23]:
#Create one SageMaker session and expose it under both names used in this notebook,
#instead of constructing a second, redundant session object.
#NOTE(review): assumes `Session` (not imported in this notebook) is the same class as
#`sagemaker.Session` — confirm against utils.py.
sagemaker_session = Session()
sess = sagemaker_session

#ARN of the caller's identity, as reported by the SageMaker session
aws_role = sagemaker_session.get_caller_identity_arn()

#Region configured for the default boto3 session
aws_region = boto3.Session().region_name

### Setting Model ID and version for Fine Tuning

In [24]:
#JumpStart identifier of the pre-trained model to fine-tune
model_id = 'huggingface-tc-distilbert-base-multilingual-cased'
#Version of that JumpStart model
model_version = "2.0.0"

### Set training data and output

In [87]:
#Everything for this run lives under s3://<bucket>/<folder>/<OUTPUT>:
#the cleaned training file (data.csv) and the training job output.
s3_output_location = f"s3://{S3_BUCKET_NAME}/{S3_FOLDER_NAME}/{OUTPUT}"
training_dataset_s3_path = f"{s3_output_location}/data.csv"

print(f"Training dataset S3 path: {training_dataset_s3_path}")
print(f"S3 output location: {s3_output_location}")

### Training

In [81]:
#Hyperparameters sent to the fine-tuning job; every value is given as a string
hyperparameters = {
    'epochs': '4',
    'learning_rate': '2e-05',
    'batch_size': '64',
    'eval_batch_size': '8',
    'eval_accumulation_steps': 'None',
    'reinitialize_top_layer': 'Auto',
    'train_only_top_layer': 'False',
}

#Estimator for the JumpStart model selected above.
#model_version is passed explicitly so that the version chosen in the
#"Setting Model ID and version" cell is the one actually trained; previously the
#variable was defined but never used, leaving the version unpinned.
estimator = JumpStartEstimator(
    model_id=model_id,
    model_version=model_version,
    hyperparameters=hyperparameters,
    instance_type="ml.g4dn.xlarge",
    output_path=s3_output_location
)

In [88]:
#Switch between the two training modes:
#  True  -> hyperparameter optimization (HPO) through a SageMaker tuning job
#  False -> a single regular training job
use_amt = False

#HPO search space: choose from the hyperparameters supported by the model and give the range of values to explore.
#(https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-define-ranges.html)
hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(min_value=1e-5, max_value=1e-4, scaling_type="Logarithmic")
}

#HPO objective: the metric name, the regex that extracts it from the training logs, and the optimization direction
amt_metric_definitions = {
    "metrics": [{"Name": "val_accuracy", "Regex": "'eval_accuracy': ([0-9\\.]+)"}],
    "type": "Maximize",
}

#Both modes train on the same channel
training_inputs = {"training": training_dataset_s3_path}

if not use_amt:
    #Regular training job.
    #Note that the training job's name must satisfy regular expression pattern: ^[a-zA-Z0-9](-*[a-zA-Z0-9]){0,62}
    print("using estimator")
    estimator.fit(training_inputs, logs=True, job_name=f"{OUTPUT}-model")
else:
    #Hyperparameter optimization job built around the same estimator
    objective_metric = amt_metric_definitions["metrics"][0]["Name"]
    hp_tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name=objective_metric,
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=amt_metric_definitions["metrics"],
        objective_type=amt_metric_definitions["type"],
        max_jobs=6,
        max_parallel_jobs=2,
        base_tuning_job_name="distilbert-multilingual",
    )

    #Launch the SageMaker tuning job that searches for the best hyperparameters
    hp_tuner.fit(training_inputs)

INFO:sagemaker:Creating training-job with name: distilbert-model


using estimator
2024-04-26 21:49:33 Starting - Starting the training job...
2024-04-26 21:49:33 Pending - Training job waiting for capacity............
2024-04-26 21:51:36 Pending - Preparing the instances for training...
2024-04-26 21:52:14 Downloading - Downloading input data...
2024-04-26 21:52:44 Downloading - Downloading the training image........................
2024-04-26 21:56:51 Training - Training image download completed. Training in progress...[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-04-26 21:57:20,767 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-04-26 21:57:20,794 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-04-26 21:57:20,797 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-04-26 21:57:20,805 sagemaker-training-toolki

In [89]:
#Training performance metrics.
#Select the job (and its model artifacts) to report on, depending on how training was launched.
if not use_amt:
    #Regular run: the estimator's most recent training job
    training_job_name = estimator.latest_training_job.job_name
    model_data = estimator.model_data
else:
    #HPO run: the best training job found by the tuner
    training_job_name = hp_tuner.best_training_job()
    model_data = hp_tuner.best_estimator().model_data

#Metrics recorded for the selected job, as a dataframe
job_analytics = TrainingJobAnalytics(training_job_name=training_job_name)
df = job_analytics.dataframe()
df.head()

Unnamed: 0,timestamp,metric_name,value
0,0.0,hugginface-tc:eval-accuracy,0.955882
1,60.0,hugginface-tc:eval-accuracy,0.975


### Refactoring compressed model artifacts and re-routing S3 path

In [98]:
#The model artifacts are in tar.gz format. Every regular file in the archive is re-uploaded
#individually under a key prefix equal to the archive key without its '.tar.gz' suffix
#(e.g. '.../output/model.tar.gz' -> '.../output/model/').

#Split the artifact URI (s3://<bucket>/<key>) into bucket and key, failing early with a clear
#message instead of slicing a malformed or non-string value by position.
if not isinstance(model_data, str) or not model_data.startswith("s3://"):
    raise ValueError(f"Expected model_data to be an 's3://' URI, got: {model_data!r}")

uri_parts = model_data[len("s3://"):].split('/', 1)
if len(uri_parts) != 2 or not all(uri_parts):
    raise ValueError(f"Could not split model_data into bucket and key: {model_data!r}")
source_bucket, source_key = uri_parts

#Without this check a key lacking the suffix would silently lose its last character below
if not source_key.endswith('.tar.gz'):
    raise ValueError(f"Expected a '.tar.gz' model archive, got key: {source_key!r}")

#Destination path where the tar.gz file contents should be extracted and stored.
dest_bucket = 'sagemaker-sigparser-caylent-mlops'
dest_key_prefix = source_key[:-len('.tar.gz')] + '/'

#The whole archive is downloaded into memory before it is opened
obj = s3_client.get_object(Bucket=source_bucket, Key=source_key)
file_stream = BytesIO(obj['Body'].read())

with tarfile.open(fileobj=file_stream, mode='r:gz') as tar:
    for member in tar.getmembers():
        #Skip directories, links and other non-regular entries
        if not member.isfile():
            continue
        with tar.extractfile(member) as ind_file:
            dest_key = f"{dest_key_prefix}{member.name}"
            #upload_fileobj() reads from the file-like object, so each member is uploaded
            #without first being written to local disk.
            s3_client.upload_fileobj(ind_file, Bucket=dest_bucket, Key=dest_key)
            print(f"Extracted and Uploaded {member.name} to {dest_bucket}/{dest_key}")

Extracted and Uploaded config.json to sagemaker-sigparser-caylent-mlops/model-artifacts/distilBERT/2024-04-26_21-33-18/distilbert-model/output/model/config.json
Extracted and Uploaded tokenizer.json to sagemaker-sigparser-caylent-mlops/model-artifacts/distilBERT/2024-04-26_21-33-18/distilbert-model/output/model/tokenizer.json
Extracted and Uploaded pytorch_model.bin to sagemaker-sigparser-caylent-mlops/model-artifacts/distilBERT/2024-04-26_21-33-18/distilbert-model/output/model/pytorch_model.bin
Extracted and Uploaded tokenizer_config.json to sagemaker-sigparser-caylent-mlops/model-artifacts/distilBERT/2024-04-26_21-33-18/distilbert-model/output/model/tokenizer_config.json
Extracted and Uploaded vocab.txt to sagemaker-sigparser-caylent-mlops/model-artifacts/distilBERT/2024-04-26_21-33-18/distilbert-model/output/model/vocab.txt
Extracted and Uploaded special_tokens_map.json to sagemaker-sigparser-caylent-mlops/model-artifacts/distilBERT/2024-04-26_21-33-18/distilbert-model/output/model/

In [99]:
#Remove the original tar.gz archive from S3 (source_bucket/source_key come from the extraction cell)
archive_location = {"Bucket": source_bucket, "Key": source_key}
s3_client.delete_object(**archive_location)

print(f'\nDeleted the original tar.gz model file from {source_bucket}/{source_key}')


Deleted the original tar.gz model file from sagemaker-sigparser-caylent-mlops/model-artifacts/distilBERT/2024-04-26_21-33-18/distilbert-model/output/model.tar.gz
