In [1]:
import torch
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score, f1_score
from datasets import load_dataset
from huggingface_hub import login
import os

# Directory for TensorBoard event files, set through the environment before any
# TrainingArguments are created (the training runs use report_to='tensorboard').
# NOTE(review): presumably picked up by the Trainer's TensorBoard integration —
# confirm for the installed transformers version.
os.environ["TENSORBOARD_LOGGING_DIR"] = "./logs"

# Hub checkpoint used for both the tokenizer and the classification model
MODEL_ID = 'vinai/bertweet-large'

# Check for a GPU: later cells load the model with device_map='cuda', so this
# is expected to display True
torch.cuda.is_available()

True

In [2]:
# Use your Hugging Face token to log in.
# login() is called without arguments, so no token is hard-coded in the notebook.
login()

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [22]:
# Load the labelled dataset from the Hugging Face Hub.
# No data_dir is passed here; add one if the files you need live in a
# sub-directory of the dataset repo.
# The recorded output shows three splits being generated: train, test and valid.
dataset = load_dataset("ADS509/experiment_labels_full_match")

README.md:   0%|          | 0.00/895 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/819k [00:00<?, ?B/s]

data/valid-00000-of-00001.parquet:   0%|          | 0.00/794k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/23949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5133 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/5132 [00:00<?, ? examples/s]

In [23]:
# Build both directions of the label-name <-> integer-id mapping from the
# names stored on the training split's `label` feature.
label_names = dataset['train'].features['label'].names
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = {idx: name for idx, name in enumerate(label_names)}

In [None]:
# Load the tokenizer that belongs to the MODEL_ID checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Function to tokenize data with
def tokenize_function(batch):
    """Tokenize the ``text`` field of a batch with the notebook's tokenizer.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here; it is left to the data collator so each batch is padded on its own.

    Args:
        batch: Mapping with a ``'text'`` entry (a batch of rows when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    encode_kwargs = {
        'truncation': True,
        #'padding': 'max_length',
        'max_length': 512,  # Can't be greater than model max length
    }
    return tokenizer(batch['text'], **encode_kwargs)
# Data collator handles padding dynamically, set padding and max_length if you want to control it explicitly and drop the collator

# Run the tokenizer over every split in batched mode
train_data, test_data, valid_data = (
    dataset[split_name].map(tokenize_function, batched=True)
    for split_name in ('train', 'test', 'valid')
)

# Expose only the model inputs and the label, formatted as torch tensors
model_columns = ['input_ids', "attention_mask", "label"]
for split_data in (train_data, test_data, valid_data):
    # Hand each dataset its own copy of the column list
    split_data.set_format("torch", columns=list(model_columns))

    
# Verify batch
# tokenize_function does not pad, so rows have different lengths and the
# default DataLoader collate cannot stack them into one tensor. Use the same
# padding collator the Trainer gets so the sanity check mirrors training.
# DataCollatorWithPadding also renames the "label" column to "labels".
# (The name `test_loader` is kept as-is even though it wraps the training split.)
test_loader = DataLoader(
    train_data,
    batch_size=4,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
)
batch = next(iter(test_loader))
print(f"Batch keys: {batch.keys()}")
print(f"Input IDs shape: {batch['input_ids'].shape}")
print(f"Labels shape: {batch['labels'].shape}")

In [26]:
# Pick a repo name to save the trained model to
# model_repo = "experiment_labels_bert_base"

# Load the pretrained encoder with a classification head on top. The recorded
# load report lists the classifier weights as newly initialised, so the head
# must be trained before use.
# The head size is taken from the label mapping so it always matches the
# dataset instead of relying on a hand-maintained constant.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=len(id2label), # one output per label found in the dataset
    device_map='cuda',
    dtype='auto',
    label2id=label2id, # set these two args to attach the metadata to the model.config
    id2label=id2label
)

# Metric function for evaluation in Trainer
def compute_metrics(eval_pred):
    """Score one evaluation pass for the Trainer.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class of each
            row is the argmax of ``predictions`` along axis 1.

    Returns:
        dict with ``accuracy``, ``f1_macro`` and ``f1_weighted``.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    metrics = {'accuracy': accuracy_score(labels, predicted_ids)}
    for averaging in ('macro', 'weighted'):
        metrics[f'f1_{averaging}'] = f1_score(labels, predicted_ids, average=averaging)
    return metrics

# Padding is applied per batch by the collator rather than at tokenization time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    # Local directory for checkpoints and the saved model
    output_dir="./bert-comment",
    #push_to_hub=True,
    #hub_model_id=f"ADS509/{model_repo}",

    # Reproducibility and precision
    seed=42,
    fp16=torch.cuda.is_available(),  # Mixed precision if GPU available

    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=200,  # or warmup_ratio=%
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,

    # Evaluate and checkpoint once per epoch, then keep the best macro-F1 model
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1_macro",
    load_best_model_at_end=True,

    # Logging
    report_to="tensorboard",
    logging_steps=100,
)

Loading weights:   0%|          | 0/389 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: vinai/bertweet-large
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.decoder.bias            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.decoder.weight          | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.weight         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [27]:
# Wire the model, tokenized splits, collator and metrics into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=valid_data,
)

# Fine-tune on the training split
trainer.train()

# Score the validation split once more and show the metrics dict
eval_results = trainer.evaluate()
print(eval_results)

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Weighted
1,0.383654,0.353443,0.874513,0.863622,0.875051
2,0.216188,0.310921,0.89205,0.886037,0.892004


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

{'eval_loss': 0.3109213709831238, 'eval_accuracy': 0.892049883086516, 'eval_f1_macro': 0.8860368383380253, 'eval_f1_weighted': 0.8920043912778948, 'eval_runtime': 11.4518, 'eval_samples_per_second': 448.138, 'eval_steps_per_second': 7.073, 'epoch': 2.0}


In [17]:
# Run inference on the tokenized test split; `preds.predictions` feeds the
# classification report in the next cell.
# NOTE(review): this cell's recorded execution count (17) is lower than the
# training cell's (27), and the saved report covers 10558 rows while the test
# split loaded above has 5133 — the saved output looks stale; re-run after training.
preds = trainer.predict(test_data)

In [18]:
def _ids_to_names(label_ids):
    """Translate integer label ids into their names via id2label."""
    return [id2label[label_id] for label_id in label_ids]

# Ground truth from the test split
true_ints = np.asarray(test_data['label'], dtype=int)
true_labels = _ids_to_names(true_ints)

# Predicted class = highest-scoring column of each prediction row
pred_ints = np.argmax(preds.predictions, axis=1)
pred_labels = _ids_to_names(pred_ints)

# Per-class precision / recall / F1, reported by label name
print(classification_report(true_labels, pred_labels))

               precision    recall  f1-score   support

Argumentative       0.76      0.80      0.78      2423
   Expressive       0.82      0.77      0.80      3289
Informational       0.68      0.68      0.68       810
      Neutral       0.69      0.70      0.69       779
      Opinion       0.71      0.72      0.71      3257

     accuracy                           0.75     10558
    macro avg       0.73      0.73      0.73     10558
 weighted avg       0.75      0.75      0.75     10558



In [28]:
# Choose the Hub repo for the upload below. It is set on the existing trainer's
# args because hub_model_id was left commented out when the TrainingArguments
# were created.
trainer.args.hub_model_id = "ADS509/BERTweet-large-full-match-data"

In [29]:
# Save trained model to the Hugging Face model repo:
# first write it to the local output_dir, then upload with a descriptive commit message.
trainer.save_model(training_args.output_dir)
trainer.push_to_hub(commit_message = "BERTweet-large with correct labels on full match data")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...comment/model.safetensors:   0%|          | 47.8kB / 1.42GB            

  ...comment/training_args.bin:  29%|##9       | 1.54kB / 5.26kB            

CommitInfo(commit_url='https://huggingface.co/ADS509/BERTweet-large-full-match-data/commit/7d86942249dc908c034391fcc403c862dcfb2acf', commit_message='BERTweet-large with correct labels on full match data', commit_description='', oid='7d86942249dc908c034391fcc403c862dcfb2acf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ADS509/BERTweet-large-full-match-data', endpoint='https://huggingface.co', repo_type='model', repo_id='ADS509/BERTweet-large-full-match-data'), pr_revision=None, pr_num=None)

# Hyperparameter Tuning

In [None]:
!pip install optuna

In [None]:
# Alternative: Trainer's built-in hyperparameter_search with Optuna
import optuna
import transformers
import logging

def model_init():
    """Build a fresh classifier from MODEL_ID for one hyperparameter trial.

    Model-load reports are silenced while the checkpoint is loaded. The
    previous log levels are captured first and restored afterwards, including
    when loading raises, so the notebook's logging configuration is left as
    it was found.

    The head size and label metadata come from the notebook's label mapping,
    matching how the other models in this notebook are loaded.

    Returns:
        The model returned by AutoModelForSequenceClassification.from_pretrained.
    """
    accelerate_logger = logging.getLogger("accelerate.utils.modeling")
    previous_verbosity = transformers.logging.get_verbosity()
    previous_accelerate_level = accelerate_logger.level

    # Temporarily suppress model load reports
    transformers.logging.set_verbosity_error()
    accelerate_logger.setLevel(logging.ERROR)
    try:
        return AutoModelForSequenceClassification.from_pretrained(
            MODEL_ID,
            num_labels=len(id2label),
            device_map='cuda',
            dtype='auto',
            label2id=label2id,
            id2label=id2label,
        )
    finally:
        # Put both loggers back exactly as they were before the load
        transformers.logging.set_verbosity(previous_verbosity)
        accelerate_logger.setLevel(previous_accelerate_level)

def hp_space(trial):
    """Describe the search space sampled for each trial.

    Args:
        trial: Object exposing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical`` (an Optuna trial when driven by the
            Trainer's hyperparameter search).

    Returns:
        dict mapping training-argument names to the values suggested for
        this trial. The trial-side parameter names are the short ones
        ("epochs", "lr", "warmup", "decay", "batch_size", "optimizer").
    """
    # Suggestions are requested in a fixed order, one per tuned argument
    epochs = trial.suggest_int("epochs", 2, 3)
    lr = trial.suggest_float("lr", 1e-5, 1e-4, log=True)
    warmup = trial.suggest_int("warmup", 100, 300, step=50)
    decay = trial.suggest_float("decay", 0, 0.2, step=0.05)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    optimizer = trial.suggest_categorical(
        "optimizer", ["adamw_torch", "adamw_torch_fused", "adafactor"]
    )

    return dict(
        num_train_epochs=epochs,
        learning_rate=lr,
        warmup_steps=warmup,
        weight_decay=decay,
        per_device_train_batch_size=batch_size,
        optim=optimizer,
    )

# Base arguments shared by every trial; the tuned values from hp_space are
# applied on top of these for each run.
search_args = TrainingArguments(
    output_dir="./hp-search",
    seed=42,
    fp16=torch.cuda.is_available(),
    per_device_eval_batch_size=64,
    # Evaluate every epoch; no checkpoints are written during the search
    eval_strategy="epoch",
    save_strategy="no",
    metric_for_best_model="f1_macro",
    report_to="none",
)

# Per-batch padding collator for the search trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# model_init (rather than a model instance) lets each trial start from a
# freshly loaded model
hp_trainer = Trainer(
    model_init=model_init,
    args=search_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=valid_data,
)

def _macro_f1_objective(metrics):
    """Objective for the search: the macro F1 entry of the evaluation metrics."""
    return metrics["eval_f1_macro"]

# NOTE(review): n_warmup_steps is compared against the step value reported to
# Optuna — confirm whether that is an epoch count or an optimizer step count.
median_pruner = optuna.pruners.MedianPruner(n_startup_trials=2, n_warmup_steps=1)

# Ten Optuna trials, keeping the configuration with the highest macro F1
best_run = hp_trainer.hyperparameter_search(
    backend="optuna",
    direction="maximize",
    n_trials=10,
    hp_space=hp_space,
    compute_objective=_macro_f1_objective,
    pruner=median_pruner,
)

# Summarise the winning trial
summary_lines = (
    "Best run:",
    f"  F1 Macro: {best_run.objective:.4f}",
    f"  Params: {best_run.hyperparameters}",
)
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Retrain with best hyperparameters from hyperparameter_search and push to Hub
# Name shared by the local output directory and the Hub repo for the retrained model
repo_id = "best_hp_tuning_v1"

# Start again from the base checkpoint for the final training run.
# The head size is taken from the label mapping so it always matches the
# dataset instead of relying on a hand-maintained constant.
best_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=len(id2label),
    device_map='cuda',
    dtype='auto',
    label2id=label2id,
    id2label=id2label,
)

# Winning trial's parameters, keyed by the short names used in hp_space
best_hp = best_run.hyperparameters

best_args = TrainingArguments(
    # Local output and Hub destination
    output_dir=f"./best-{repo_id}",
    hub_model_id=f"ADS509/{repo_id}",
    push_to_hub=True,

    # Values chosen by the hyperparameter search
    num_train_epochs=best_hp["epochs"],
    learning_rate=best_hp["lr"],
    warmup_steps=best_hp["warmup"],
    weight_decay=best_hp["decay"],
    per_device_train_batch_size=best_hp["batch_size"],
    optim=best_hp["optimizer"],

    # Fixed settings, same as the first training run
    per_device_eval_batch_size=64,
    seed=42,
    fp16=torch.cuda.is_available(),

    # Evaluate and checkpoint once per epoch, then keep the best macro-F1 model
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1_macro",
    load_best_model_at_end=True,

    # Logging
    report_to="tensorboard",
    logging_steps=100,
)

# Trainer for the final run with the tuned arguments
best_trainer = Trainer(
    model=best_model,
    args=best_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=valid_data,
)

# Train, then report the validation metrics
best_trainer.train()
eval_results = best_trainer.evaluate()
print(eval_results)

# Write the model locally, then upload it; the commit message records the
# macro F1 reached during the hyperparameter search
best_trainer.save_model(best_args.output_dir)
hub_commit_message = f"Best model from HP search (f1_macro={best_run.objective:.4f})"
best_trainer.push_to_hub(commit_message=hub_commit_message)