In [None]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
import os

# Base checkpoint that both the tokenizer and the classifier are loaded from
MODEL_ID = 'bert-base-uncased'

# Send TensorBoard event files to ./logs (presumably read by the Trainer's
# 'tensorboard' reporter -- confirm against the installed transformers version)
os.environ.update({"TENSORBOARD_LOGGING_DIR": "./logs"})

# Last expression of the cell: displays True when a CUDA device is usable
torch.cuda.is_available()

In [None]:
# Hub dataset repository and the sub-directory that holds the data files.
# DATA_DIR is intentionally blank in this template -- fill it in before running.
DATASET_REPO = 'ADS509/final_project_data'
DATA_DIR = ""

dataset = load_dataset(DATASET_REPO, data_dir=DATA_DIR)

In [None]:
# Tokenizer that matches the checkpoint named by MODEL_ID
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def tokenize_function(batch):
    """Tokenize the 'text' entry of ``batch`` with truncation enabled.

    No padding is applied here; padding is left to the data collator, which
    pads dynamically per batch. To control the length explicitly instead,
    pass ``padding='max_length'`` and ``max_length=128`` (it can't be greater
    than the model max length) to the tokenizer call and drop the collator.

    Args:
        batch: Mapping with a 'text' entry (a batch of examples when used
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's output for those texts.
    """
    texts = batch['text']
    return tokenizer(texts, truncation=True)

def _tokenize_split(split_name):
    """Return the named split of ``dataset`` with tokenizer outputs added."""
    return dataset[split_name].map(tokenize_function, batched=True)


# Tokenize each split
train_data = _tokenize_split('train')
test_data = _tokenize_split('test')
valid_data = _tokenize_split('valid')

# Expose only the model inputs and the label, returned as torch tensors
tensor_columns = ['input_ids', "attention_mask", "label"]
for tokenized_split in (train_data, test_data, valid_data):
    tokenized_split.set_format("torch", columns=tensor_columns)

    
# Verify batch
# The splits are tokenized without padding, so examples have different lengths.
# The default DataLoader collate function cannot stack ragged tensors, so the
# same padding collator the Trainer uses is supplied here.
# NOTE: despite its name, `test_loader` draws from the training split; the name
# is kept so any later reference keeps working.
test_loader = DataLoader(
    train_data,
    batch_size=4,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
)
batch = next(iter(test_loader))

# The padding collator may hand the target back under 'labels' rather than
# 'label'; accept either so this check does not depend on that detail.
label_key = 'labels' if 'labels' in batch else 'label'

print(f"Batch keys: {batch.keys()}")
print(f"Input IDs shape: {batch['input_ids'].shape}")
print(f"Labels shape: {batch[label_key].shape}")

In [None]:
# Label metadata attached to model.config.
# The id<->label maps must agree with num_labels; passing empty dicts alongside
# num_labels=2 is inconsistent (transformers is expected to reject it -- confirm
# for the installed version). Class names are taken from the dataset's 'label'
# feature when it exposes `names`; otherwise generic LABEL_<i> names are used.
NUM_LABELS = 2  # fallback only: adjust this based on number of labels you're training on
label_names = getattr(train_data.features['label'], 'names', None)
if not label_names:
    label_names = [f"LABEL_{index}" for index in range(NUM_LABELS)]
id2label = dict(enumerate(label_names))
label2id = {name: index for index, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=len(id2label),
    # Place the weights on the GPU only when one exists, mirroring the
    # conditional fp16 setting in the training arguments.
    device_map='cuda' if torch.cuda.is_available() else None,
    dtype='auto',
    label2id=label2id,  # these two args attach the metadata to the model.config
    id2label=id2label,
)

# Evaluation callback handed to the Trainer
def compute_metrics(eval_pred):
    """Score predictions with accuracy and macro/weighted F1.

    Args:
        eval_pred: Pair that unpacks into (predictions, labels). The predicted
            class is the argmax of ``predictions`` along axis 1 (presumably
            per-class scores with one row per example -- confirm with caller).

    Returns:
        dict with keys 'accuracy', 'f1_macro' and 'f1_weighted'.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    metrics = {'accuracy': accuracy_score(references, predicted_ids)}
    for averaging in ('macro', 'weighted'):
        metrics[f'f1_{averaging}'] = f1_score(references, predicted_ids, average=averaging)
    return metrics

# Collator that pads dynamically, batch by batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    # Destinations: local output directory plus the Hub repository
    output_dir='./bert-comment-classifier',
    push_to_hub=True,
    hub_model_id="ADS509/final_project_models",

    # Optimisation schedule
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,  # alternatively warmup_steps=<some int>
    num_train_epochs=3,

    # Per-device batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,

    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best 'accuracy' (a key returned by compute_metrics) when training ends
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',

    # Logging
    report_to='tensorboard',
    logging_steps=100,

    # Reproducibility and precision
    seed=42,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is available
)

In [None]:
# Set up Trainer: trains on the training split and evaluates on the validation split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=valid_data,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train!
trainer.train()

# Evaluate on the validation split (the eval_dataset given to the Trainer)
eval_results = trainer.evaluate()
print(eval_results)

# Evaluate on the held-out test split as well. The validation split is also
# used to pick the best checkpoint (load_best_model_at_end), so its metrics
# alone are an optimistic estimate. The 'test' prefix keeps these metric keys
# distinct from the validation ones.
test_results = trainer.evaluate(eval_dataset=test_data, metric_key_prefix='test')
print(test_results)

In [None]:
# Save the trained model locally, then upload it to the Hugging Face model repo
trainer.save_model(training_args.output_dir)

# Use a descriptive commit message: an empty one carries no information in the
# repository history (and may be rejected or replaced by the Hub client --
# TODO confirm for the installed huggingface_hub version).
trainer.push_to_hub(commit_message=f"Fine-tuned {MODEL_ID}: end of training")