# Importing Libraries and Initializations

In [None]:
# Install huggingface specific libraries
! pip install transformers datasets evaluate --quiet

In [None]:
# Common libraries
import time
import json
import numpy as np
import pandas as pd
from time import gmtime, strftime

# Sagemaker specific
import boto3
import sagemaker
from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig

# Huggingface specific
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
from datasets.filesystems import S3FileSystem
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Generic initializations
model_path = 'google/flan-t5-xxl'        # model identifier handed to the tokenizer and to the training job
workspace_bucket_name = 'gupshup-ml'     # S3 bucket that holds datasets and model artifacts
s3_prefix = 'peft'                       # key prefix inside the bucket

# Last path component of the model id; also works for ids without an organisation prefix.
model_name = model_path.split('/')[-1]
save_model_s3_path = f's3://{workspace_bucket_name}/{s3_prefix}/{model_name}-peft/'

# The training job and the experiment share one name.
base_job_name = f'qa-peft-{model_name}'
experiment_name = base_job_name

# Hyperparameters
epochs = 1                           # number of training epochs
per_device_train_batch_size = 5      # batch size for training
gradient_accumulation_steps = 64     # gradient accumulation steps for training
learning_rate = 1e-4                 # optimizer learning rate

In [None]:
# SageMaker session and the execution role later handed to the estimator
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Report role, default bucket and region, one item per line.
print(
    f'Sagemaker Role ARN: {role}',
    f'Sagemaker Bucket: {sess.default_bucket()}',
    f'Sagemaker Session Region: {sess.boto_region_name}',
    sep='\n',
)

# Data

In [None]:
# Load the CSV splits as pandas dataframes and wrap them in a DatasetDict.

def load_split(csv_path, drop_missing=False):
    """Read one CSV split and keep only the columns used downstream.

    Args:
        csv_path: path of the CSV file; it must contain `input_text` and `output_text`.
        drop_missing: when true, rows with a missing value in any kept column are removed.

    Returns:
        A new dataframe with columns `id`, `input_text`, `output_text`, where `id`
        is the row position in the CSV (assigned before any row is dropped).
    """
    frame = pd.read_csv(csv_path)
    frame['id'] = frame.index
    frame = frame[['id', 'input_text', 'output_text']]
    if drop_missing:
        # Non in-place: returns a fresh frame instead of mutating the selection above.
        frame = frame.dropna()
    return frame


train_path = '../others/data/full_data/train/sniper_faq_session_train.csv'
train = load_split(train_path)
print('Train:', train.shape)

# Only the test split has rows with missing values removed.
# NOTE(review): the train split is not filtered the same way - confirm it has no missing text.
test_path = '../others/data/full_data/test/sniper_faq_session_test.csv'
test = load_split(test_path, drop_missing=True)
print('Test:', test.shape)

# Create Dataset from pandas dataframes.
# preserve_index=False: once rows are dropped the pandas index is no longer a plain
# range, and it would otherwise be stored as an extra `__index_level_0__` column that
# exists only in the test split and is not removed by the later `remove_columns`.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

dataset

# Recorded from a previous run:
# train = 6673, test = 1668 samples
# train.info(memory_usage='deep') # memory usage: 26.6 MB
# test.info(memory_usage='deep') # memory usage: 6.6 MB

In [None]:
# Load the tokenizer that belongs to the chosen model
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Show which inputs the model expects and the tokenizer's maximum length.
print(
    f'Model input names: {tokenizer.model_input_names}',
    f'Model max length: {tokenizer.model_max_length}',
    sep='\n',
)

In [None]:
%%time
# Data processing: derive the source/target lengths from the token-count distribution.

# Train and test rows pooled together; both passes below measure the same rows.
all_rows = concatenate_datasets([dataset['train'], dataset['test']])

# Maximum input length after tokenization. In preprocess_function, longer sequences
# are truncated to this length and (with padding='max_length') shorter ones are padded.
tokenized_inputs = all_rows.map(
    lambda batch: tokenizer(batch['input_text'], truncation=True),
    batched=True,
    remove_columns=['input_text', 'output_text'],
)
input_lenghts = list(map(len, tokenized_inputs['input_ids']))
# The 85th percentile is used instead of the maximum for better utilization.
max_source_length = int(np.percentile(input_lenghts, 85))
print(f'Max source length: {max_source_length}')

# Maximum target length after tokenization, applied the same way to the labels.
tokenized_targets = all_rows.map(
    lambda batch: tokenizer(batch['output_text'], truncation=True),
    batched=True,
    remove_columns=['input_text', 'output_text'],
)
target_lenghts = list(map(len, tokenized_targets['input_ids']))
# The 90th percentile is used instead of the maximum for better utilization.
max_target_length = int(np.percentile(target_lenghts, 90))
print(f'Max target length: {max_target_length}')

def preprocess_function(sample, padding='max_length'):
    """Tokenize a batch of samples into model inputs plus `labels`.

    The input text is tokenized as-is (no task prefix is prepended) and cut to
    `max_source_length`; the output text is tokenized as the target and cut to
    `max_target_length`. Both lengths and `tokenizer` are read from the
    notebook's global scope.

    Args:
        sample: batch mapping with `input_text` and `output_text` entries.
        padding: padding strategy forwarded to the tokenizer. With the default
            'max_length', pad tokens in the labels are replaced by -100 so
            that padding is ignored in the loss.

    Returns:
        The tokenizer output for the inputs, extended with a `labels` entry
        holding the target token ids.
    """
    model_inputs = tokenizer(
        sample['input_text'],
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )

    # Targets go through the `text_target` keyword argument.
    target_ids = tokenizer(
        text_target=sample['output_text'],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )['input_ids']

    if padding == 'max_length':
        # Mask padded label positions with -100.
        pad_id = tokenizer.pad_token_id
        target_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in target_ids
        ]

    model_inputs['labels'] = target_ids
    return model_inputs

# Apply preprocess_function to every split in batches; the raw columns listed
# below are removed from the result.
raw_columns = ['id', 'input_text', 'output_text']
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)
print('Keys of tokenized dataset:', list(tokenized_dataset['train'].features))

In [None]:
# Write the tokenized train and test splits to S3
s3 = S3FileSystem()

training_input_path = f's3://{workspace_bucket_name}/{s3_prefix}/train'
test_input_path = f's3://{workspace_bucket_name}/{s3_prefix}/test'

# For each split: announce the destination, then save that split there.
for message, split, destination in (
    (f'Training input path: {training_input_path}', 'train', training_input_path),
    (f'Test input path: {test_input_path}', 'test', test_input_path),
):
    print(message)
    tokenized_dataset[split].save_to_disk(destination, fs=s3)

# Fine-Tune

In [None]:
# Hyperparameters handed to the estimator for the training job
hyperparameters = {
    'model_id': model_path,
    'learning_rate': learning_rate,
    'per_device_train_batch_size': per_device_train_batch_size,
    'gradient_accumulation_steps': gradient_accumulation_steps,
    'epochs': epochs,
    'save_model_s3_path': save_model_s3_path,
}

# Pretty-print the dict; default=str stringifies anything json cannot encode.
hyperparameters_json = json.dumps(hyperparameters, indent=2, default=str)
print('Hyperparameters: \n', hyperparameters_json)

In [None]:
%%time
# Estimator configuration, collected as keyword arguments and expanded into the constructor.
estimator_kwargs = {
    'entry_point': './scripts/qa-peft-s5cmd.py',   # training script filename
    'source_dir': '.',                             # training script source-dir
    'instance_type': 'ml.p4d.24xlarge',            # instance type used for the training job
    'instance_count': 1,                           # number of instances used for training
    'base_job_name': base_job_name,                # the name of the training job
    'role': role,                                  # IAM role used in the training job to access AWS resources, e.g. S3
    'transformers_version': '4.26.0',              # transformers version used in the training job
    'pytorch_version': '1.13.1',                   # pytorch version used in the training job
    'py_version': 'py39',                          # python version used in the training job
    'hyperparameters': hyperparameters,            # hyperparameters used for running the training job
    'volume_size': 300,                            # size of the EBS volume in GB
    'disable_profiler': True,
    'debugger_hook_config': False,
    # Useful when retrying small changes right after a failed job.
    # NOTE: this keeps the instance alive. Look out for costs.
    'keep_alive_period_in_seconds': 1800,
}
hf_estimator = HuggingFace(**estimator_kwargs)

# Input channels for the training job: only the training split is passed here.
data = {'train': training_input_path}
print(json.dumps(data, indent=2, default=str))

In [None]:
%%time
# Start the training job with the uploaded dataset as input and wait for it.
# (logs='None' can also be passed.)
hf_estimator.fit(inputs=data, wait=True)

In [None]:
# Display the hyperparameters as reported by the estimator
# (last expression in the cell, so the notebook renders the returned value).
hf_estimator.hyperparameters()