# Transformers trainer class
https://huggingface.co/docs/transformers/en/training

In [None]:
!pip install -U accelerate transformers datasets

### (Optional) Try the fine-tuned model

The fully fine-tuned model is available for you to try out. Uncomment the code below and run it for testing.

[acloudfan/fine-tuned-BERT-IMDB](https://huggingface.co/acloudfan/fine-tuned-BERT-IMDB)

* LABEL_0    Negative
* LABEL_1    Positive

In [None]:
# from transformers import pipeline
# sentiment_pipeline = pipeline("sentiment-analysis", model="acloudfan/fine-tuned-BERT-IMDB")
# data = ["I love you", "I hate you"]
# sentiment_pipeline(data)

## 1. Use GPU/CUDA for fine-tuning

All frameworks provide support for running training and fine-tuning on GPU/CUDA devices.

```
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model     = BertModel.from_pretrained('bert-large-uncased')

inputs    = tokenizer(sentence, return_tensors="pt").to(device)
model     = model.to(device)
outputs   = model(**inputs)
```

In [None]:
import torch

# Pick the device used for training: the first CUDA GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## 2. Define simple eval function & test

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint used for the baseline test and as the starting point for fine-tuning
MODEL_ID = "google-bert/bert-base-cased"

# Tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Model with a 2-label 'sequence classification head'.
# Ignore the warning printed here: the 'sequence classification head' is not trained yet.
# https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoModelForSequenceClassification.from_pretrained
# Check out other models that can be fine-tuned using this method
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=2,
    torch_dtype="auto",
)
model = model.to(device)

# print(model.config)

# Define the inference function
def predict_label_with_given_model(use_model, text: str) -> int:
    """
    Predict the label for a single text using the given model.

    Uses the notebook-level `tokenizer` and `device` to prepare the input.

    Args:
        use_model: The sequence classification model to run inference with
            (for example the base model or the model held by the trainer).
        text (str): The input text for inference.

    Returns:
        int: The predicted label (index of the highest-scoring class).
    """
    # Inference must not depend on the mode the model was left in:
    # eval mode switches off dropout so the prediction is deterministic
    use_model.eval()

    # Tokenize the input text
    inputs = tokenizer(
        text,
        return_tensors="pt",  # PyTorch tensors
        truncation=True,      # Truncate if the text is too long
    ).to(device)

    # Perform inference
    with torch.no_grad():  # No gradient calculation during inference
        logits = use_model(**inputs).logits

    # Get the label with the highest score
    return torch.argmax(logits, dim=1).item()

# Testing the function
def run_simple_eval(use_model):
    """
    Print the predicted label for a small, fixed set of sample sentences.

    Args:
        use_model: The model handed to predict_label_with_given_model for each sentence.
    """
    # Fixed sample sentences
    samples = (
        "This movie was fantastic! I really enjoyed it.",
        "I didn't like this product. It broke after one use.",
        "The book was okay, not great but not terrible either.",
        "it is all fun",
    )

    # Legend first, then one prediction per sentence
    print("Label = 0 :  Negative      Label = 1 : Positive")
    for sample in samples:
        predicted = predict_label_with_given_model(use_model, sample)
        print(f"Text: {sample}\nPredicted Label: {predicted}\n")



# Run evaluation on the base model.
# Its 'sequence classification head' is not trained yet, so this output only serves as the "before fine-tuning" baseline.
run_simple_eval(model)

## 3. Load the IMDB dataset

In [None]:
from datasets import load_dataset

# Load the IMDB dataset by name.
# To experiment with a different dataset, swap in e.g.: load_dataset("yelp_review_full")
dataset = load_dataset("imdb")
# Report how many rows the 'train' split contains
print("Number of records in 'Train' : ",len(dataset["train"]))

## 4. (Optional) Reduce the size of the training & test dataset

In [None]:
# Adjust this to change the number of train and test rows
TOTAL_NUMBER_RECORDS = 5000

# Same seed for the shuffle and for the split, so re-running the cell yields the same train/test rows
SEED = 42

# Never request more rows than the 'train' split holds
num_records = min(TOTAL_NUMBER_RECORDS, len(dataset["train"]))

# Reduce the size of the dataset
reduced_dataset = dataset["train"].shuffle(seed=SEED).select(range(num_records))

# Split the reduced dataset: 90% 'train', 10% 'test'
reduced_dataset = reduced_dataset.train_test_split(train_size=0.9, seed=SEED)

reduced_dataset

## 5. Tokenize the dataset

In [None]:

# Tokenize function used by map
# The tokenized output is NOT moved to the GPU here: without `return_tensors` the tokenizer
# returns plain Python lists, and the Trainer moves each batch to its device during training
def tokenize_function(examples):
    """
    Tokenize the 'text' column of a batch of dataset rows.

    Args:
        examples: A batch of rows as passed by `Dataset.map(..., batched=True)`;
            must contain a "text" entry.

    Returns:
        The tokenizer output for the batch, padded to the maximum length and
        truncated where the text is too long.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Tokenize using map (batched=True hands tokenize_function a batch of rows per call)
tokenized_datasets = reduced_dataset.map(tokenize_function, batched=True)

# NOTE: keep the 'label' column name. Uncommenting the rename below will lead to the following error,
# as the trainer expects the dataset to have a column with the name 'label':
# ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask
# tokenized_datasets = tokenized_datasets.rename_column('label','type')

## 6. Setup TrainingArguments & Trainer

In [None]:
from transformers import TrainingArguments, Trainer

# Directory that receives the logs and outputs from the trainer
output_dir = "./temp"

# Training configuration
training_args = TrainingArguments(
    # --- Output location ---
    output_dir=output_dir,
    overwrite_output_dir=True,

    # --- Training length, evaluation and checkpointing ---
    num_train_epochs=2,             # Higher epoch count means longer training time
    # max_steps=100,                # Change this for forced stopping - good for experimentation
    eval_strategy="epoch",          # Can be set to "steps" but will lead to longer training times
    save_strategy="epoch",          # Refer to the documentation for more save-related args

    # --- Batch sizes ---
    # Default = 8. Adjust based on the VRAM/GPU. A high value can lead to OOM errors
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,

    # --- Optimizer hyperparameters ---
    optim="adamw_torch",            # Using the default optimizer
    learning_rate=5e-5,             # The initial learning rate for the AdamW optimizer
    weight_decay=0.01,
    warmup_steps=0,
    warmup_ratio=0,

    # --- Logging & reporting (refer to the documentation for more args) ---
    logging_strategy="steps",
    # logging_steps=50,             # default = 500. Number of update steps between two logs if logging_strategy="steps".
    #                               # Otherwise a number between 0 & 1, interpreted as a ratio of train steps
    # Integration with MLOps & ML experiment tooling. Options: "azure_ml", "clearml", "codecarbon", "comet_ml",
    # "dagshub", "dvclive", "flyte", "mlflow", "neptune", "tensorboard", and "wandb"
    report_to="none",
    run_name="Fine-Tune-BERT-IMDB",
)

# Set up the trainer with a minimal set of arguments
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)

## 7. Train and evaluate

In [None]:
# Start the fine-tuning run; evaluation and checkpoint saving are both configured per epoch in the TrainingArguments
trainer.train()

In [None]:
# Take the model held by the trainer after training
fine_tuned_model = trainer.model

# Run evaluation on the fine-tuned model (compare with the base-model output printed earlier)
run_simple_eval(fine_tuned_model)

## 8. Save the model locally

In [None]:
# Target folder below the trainer's output directory
model_folder = f"{output_dir}/fine-tuned-BERT-IMDB"

# Write the model and the tokenizer into the same folder
trainer.save_model(model_folder)
tokenizer.save_pretrained(model_folder)

## 9. Push the fine-tuned model to the Hugging Face Hub

In [None]:
import getpass

print("Provide the HuggingFace API token:")

# getpass reads the token without echoing it into the notebook output
hf_token = getpass.getpass()

# Target repository on the Hub - change this to a repository you have write access to
hub_model_id = "acloudfan/fine-tuned-BERT-IMDB"

# push_to_hub receives the access token through its `token` argument;
# `hf_token` is not a push_to_hub parameter, so the token entered above would not be used
fine_tuned_model.push_to_hub(hub_model_id, token=hf_token)
tokenizer.push_to_hub(hub_model_id, token=hf_token)