In [None]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install evaluate

In [None]:
import pickle
from transformers import AutoModelForSequenceClassification, TrainingArguments, AutoTokenizer, Trainer
import transformers
from datasets import load_dataset, Dataset
import torch
import numpy as np
import evaluate
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [None]:
# Report whether PyTorch can see a CUDA device, naming device 0 when it can.
print(
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available."
)

GPU is available: NVIDIA A100-SXM4-40GB


In [None]:
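# Optional debugging aid: uncomment to raise the transformers log verbosity to INFO.
# transformers.logging.set_verbosity_info()

# Optionally adjust the Python logging level if you see too much or too little log output.
# Add `import logging` before enabling this line; the import cell does not import it.
# logging.basicConfig(level=logging.INFO)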

In [None]:
# Load the pickled HateXplain data; the next cell reads the result by string key
# ("documents", "train_documents", "test_documents", "y_train", "y_test").
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load a pickle that comes from a source you trust.
with open('hatexplain_data.pickle', 'rb') as file:
    data = pickle.load(file)

In [None]:
def _join_tokens(token_lists):
    """Turn each token sequence into a single space-separated string."""
    return [" ".join(tokens) for tokens in token_lists]


# Texts are stored as token sequences; rebuild plain strings for the tokenizer.
documents = _join_tokens(data["documents"])
train_documents = _join_tokens(data["train_documents"])
test_documents = _join_tokens(data["test_documents"])

# Labels are taken from the pickle unchanged.
y_train, y_test = data["y_train"], data["y_test"]

In [None]:
# Build the training split as a Hugging Face Dataset with 'text' and 'label' columns,
# going through a pandas DataFrame.
train_data = dict(text=train_documents, label=y_train)
train_df = pd.DataFrame(data=train_data)
train_dataset = Dataset.from_pandas(train_df)

In [None]:
# Build the held-out split as a Hugging Face Dataset with 'text' and 'label' columns,
# going through a pandas DataFrame.
test_data = dict(text=test_documents, label=y_test)
test_df = pd.DataFrame(data=test_data)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Tokenizer that belongs to the roberta-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Inputs are truncated and padded with ``padding="max_length"``; no explicit
    ``max_length`` is passed, so the tokenizer's own limit applies.

    Args:
        examples: A batch that can be indexed with ``"text"`` to get the raw strings.

    Returns:
        Whatever the tokenizer returns for that batch of strings.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize both splits in batched mode (training split first, then the held-out split).
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/15383 [00:00<?, ? examples/s]

Map:   0%|          | 0/3846 [00:00<?, ? examples/s]

In [None]:
# Divide the tokenized held-out data into two equal halves (test_size=0.5),
# shuffling with a fixed seed so the partition is repeatable.
# Caution: the last statement rebinds `tokenized_test_dataset` to one of the halves,
# so re-running this cell without re-running the tokenization cell splits the
# already-halved data again.
split_datasets = tokenized_test_dataset.train_test_split(seed=42, shuffle=True, test_size=0.5)

# The 'train' half is used for validation; the 'test' half remains the test set.
tokenized_validation_dataset, tokenized_test_dataset = (
    split_datasets['train'],
    split_datasets['test'],
)

In [None]:
# Seed the Python, NumPy and PyTorch RNGs before the model is built. The
# classification head is newly initialized rather than loaded from the checkpoint,
# so without a fixed seed every fresh run starts from different head weights.
transformers.set_seed(42)

# roberta-base encoder with a 3-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted F1, precision and recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along the last axis; if it is a tuple of model outputs, its
            first element is used as the logits.

    Returns:
        dict with the keys ``accuracy``, ``f1_weighted``, ``precision_weighted``
        and ``recall_weighted``.
    """
    logits, labels = eval_pred
    # Some models return several outputs as a tuple; by Hugging Face convention
    # the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 gives the same scores as scikit-learn's default, but without
    # an UndefinedMetricWarning when some class is never predicted.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_weighted': f1_score(labels, predictions, **weighted),
        'precision_weighted': precision_score(labels, predictions, **weighted),
        'recall_weighted': recall_score(labels, predictions, **weighted),
    }

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # --- Output locations ---
    output_dir='./results',             # Model checkpoints and other outputs
    logging_dir='./logs',               # Log files
    # --- Optimization ---
    num_train_epochs=3,                 # Passes over the training set
    per_device_train_batch_size=16,     # Training batch size per device
    per_device_eval_batch_size=64,      # Evaluation batch size per device
    warmup_steps=500,                   # Learning-rate warmup steps
    weight_decay=0.01,                  # Weight-decay regularization strength
    # --- Logging, evaluation and checkpoint cadence ---
    logging_steps=10,                   # Log every 10 steps (evaluation runs per epoch, see below)
    # NOTE(review): recent transformers releases name this argument `eval_strategy`;
    # confirm against the installed version.
    evaluation_strategy="epoch",        # Evaluate at the end of every epoch
    save_strategy="epoch",              # Checkpoint at the end of every epoch
    # --- Best-checkpoint selection ---
    load_best_model_at_end=True,        # Reload the best checkpoint when training finishes
    metric_for_best_model='loss',       # "Best" is judged by validation loss
)

In [None]:
# Bundle the model, hyperparameters, data splits and metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,                # Metrics computed at each evaluation pass
    eval_dataset=tokenized_validation_dataset,      # Validation half of the held-out data
    train_dataset=tokenized_train_dataset,
)

In [None]:
# Run fine-tuning. training_args sets load_best_model_at_end=True with
# metric_for_best_model='loss', so the best checkpoint by that metric is reloaded
# once training finishes.
# Kept as the cell's last expression so the returned TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,Precision Weighted,Recall Weighted
1,0.6642,0.785386,0.679667,0.658822,0.66899,0.679667
2,0.7609,0.729487,0.698388,0.686391,0.688759,0.698388
3,0.4596,0.779682,0.697868,0.697173,0.696857,0.697868


TrainOutput(global_step=2886, training_loss=0.7020352821092348, metrics={'train_runtime': 1125.7839, 'train_samples_per_second': 40.993, 'train_steps_per_second': 2.564, 'total_flos': 1.2142421114729472e+16, 'train_loss': 0.7020352821092348, 'epoch': 3.0})

In [None]:
# Run the trained model on the test half of the held-out data. The returned object
# carries the raw predictions, the label ids and the test_* metrics.
predictions = trainer.predict(test_dataset=tokenized_test_dataset)

# Print the full result.
print(predictions)

PredictionOutput(predictions=array([[ 2.2692719 , -2.1436775 , -0.69117594],
       [ 2.431879  , -2.0833573 , -0.85474056],
       [-0.9604132 ,  0.85606515, -0.08800704],
       ...,
       [-1.2174948 ,  2.0832858 , -0.83550143],
       [-1.9767402 ,  1.4275415 ,  0.54020464],
       [ 0.31305978, -0.30740276, -0.27148488]], dtype=float32), label_ids=array([0, 0, 1, ..., 2, 1, 2]), metrics={'test_loss': 0.7084094882011414, 'test_accuracy': 0.7103484139365575, 'test_f1_weighted': 0.6959265789172349, 'test_precision_weighted': 0.7044511708057243, 'test_recall_weighted': 0.7103484139365575, 'test_runtime': 14.0505, 'test_samples_per_second': 136.863, 'test_steps_per_second': 2.206})
