In [None]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install evaluate



In [None]:
import inspect
import pickle

import evaluate
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import load_dataset, Dataset
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from transformers import AutoModelForSequenceClassification, TrainingArguments, AutoTokenizer, Trainer

In [None]:
# Report whether a CUDA device is visible; the device name is only queried when one is.
gpu_status = (
    f"GPU is available: {torch.cuda.get_device_name(0)}"
    if torch.cuda.is_available()
    else "GPU is not available."
)
print(gpu_status)

GPU is available: Tesla V100-SXM2-16GB


In [None]:
# Uncomment to raise the transformers logging verbosity to info level:
# transformers.logging.set_verbosity_info()

# Optionally adjust the Python logging level too if you see too much or too little
# log output (this also needs `import logging`, which the imports cell does not include):
# logging.basicConfig(level=logging.INFO)

In [None]:
# Deserialize the saved dataset bundle; later cells index `data` by string key
# (e.g. data["train_documents"], data["y_train"]).
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
DATA_PICKLE_PATH = 'hatexplain_data.pickle'
with open(DATA_PICKLE_PATH, mode='rb') as file:
    data = pickle.load(file)

In [None]:
def _join_tokens(token_lists):
    """Turn each token sequence into a single space-separated string."""
    return [" ".join(tokens) for tokens in token_lists]


# Rebuild plain-text documents from the token sequences stored in `data`.
documents = _join_tokens(data["documents"])
train_documents = _join_tokens(data["train_documents"])
test_documents = _join_tokens(data["test_documents"])

# Labels are used exactly as stored.
y_train, y_test = data["y_train"], data["y_test"]

In [None]:
# Pair every training text with its label, then wrap the table as a `datasets.Dataset`.
train_data = dict(text=train_documents, label=y_train)
train_df = pd.DataFrame(data=train_data)
train_dataset = Dataset.from_pandas(train_df)

In [None]:
# Pair every test text with its label, then wrap the table as a `datasets.Dataset`.
test_data = dict(text=test_documents, label=y_test)
test_df = pd.DataFrame(data=test_data)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Load the pretrained tokenizer for the checkpoint named below.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

def tokenize_function(examples, max_length=120):
    """Tokenize the ``"text"`` entry of a batch with the module-level ``tokenizer``.

    Sequences are truncated and padded to ``max_length`` so every example in the
    result has the same length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings to
            tokenize (this is what ``Dataset.map(..., batched=True)`` passes in).
        max_length: Target sequence length in tokens. Defaults to 120, the value
            that used to be hard-coded here, so existing one-argument calls are
            unaffected.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Tokenize both splits in batches, using identical map options for each.
map_options = {"batched": True}
tokenized_train_dataset = train_dataset.map(tokenize_function, **map_options)
tokenized_test_dataset = test_dataset.map(tokenize_function, **map_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/15383 [00:00<?, ? examples/s]

Map:   0%|          | 0/3846 [00:00<?, ? examples/s]

In [None]:
# Split the tokenized hold-out data into equal validation and test halves.
# The test half is written back to `tokenized_test_dataset`, so running this cell a
# second time would silently split the already-halved test set again. Fail loudly instead.
if len(tokenized_test_dataset) != len(test_dataset):
    raise RuntimeError(
        "tokenized_test_dataset has already been split; re-run the tokenization "
        "cell before running this cell again."
    )

# test_size=0.5 is passed explicitly (half validation, half test); the fixed seed keeps
# the shuffled split reproducible. Adjust test_size as needed.
split_datasets = tokenized_test_dataset.train_test_split(test_size=0.5, shuffle=True, seed=42)

# Access the split datasets
tokenized_validation_dataset = split_datasets['train']
tokenized_test_dataset = split_datasets['test']

In [None]:
# Seed the random number generators before building the model: the classification head
# is newly (randomly) initialized on load, so without a seed set beforehand every fresh
# run starts from different head weights.
transformers.set_seed(42)

# Sequence classifier with three output labels.
# NOTE(review): presumably matches the label ids stored in y_train / y_test; confirm.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(eval_pred):
    """Score predictions with accuracy plus weighted F1, precision and recall.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``accuracy``, ``f1_weighted``, ``precision_weighted``
        and ``recall_weighted``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)

    metrics = {'accuracy': accuracy_score(labels, predictions)}
    weighted_scorers = (
        ('f1_weighted', f1_score),
        ('precision_weighted', precision_score),
        ('recall_weighted', recall_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(labels, predictions, average='weighted')
    return metrics

In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`, and the
# old keyword is not accepted by every version. The notebook installs transformers
# unpinned, so use whichever keyword the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',             # Output directory for model checkpoints and other outputs
    num_train_epochs=3,                 # Total number of training epochs
    per_device_train_batch_size=16,     # Batch size per device during training
    per_device_eval_batch_size=64,      # Batch size for evaluation
    warmup_steps=500,                   # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                  # Strength of weight decay regularization
    logging_dir='./logs',               # Directory for storing logs
    logging_steps=10,                   # Log training metrics every `logging_steps` steps
    save_strategy="epoch",              # Save the model at the end of every epoch
    load_best_model_at_end=True,        # Load the best model (in terms of validation loss) at the end
    metric_for_best_model='loss',       # Use validation loss to determine the best model
    **{_eval_strategy_key: "epoch"},    # Evaluate at the end of every epoch
)

In [None]:
# Gather the Trainer inputs first, then build the Trainer from them:
# the model, its training configuration, the train/validation datasets and the metric callback.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run fine-tuning; the call's return value is displayed as this cell's output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,Precision Weighted,Recall Weighted
1,0.7763,0.705285,0.698388,0.699821,0.701913,0.698388
2,0.6608,0.720161,0.703588,0.692953,0.694269,0.703588
3,0.3636,0.890341,0.691628,0.689438,0.688734,0.691628


TrainOutput(global_step=2886, training_loss=0.6095149963164775, metrics={'train_runtime': 348.3927, 'train_samples_per_second': 132.463, 'train_steps_per_second': 8.284, 'total_flos': 2845879948764720.0, 'train_loss': 0.6095149963164775, 'epoch': 3.0})

In [None]:
# Run the trained model over the held-out test split.
predictions = trainer.predict(tokenized_test_dataset)

# Print the whole prediction output: raw logits, label ids and the test metrics.
print(predictions)

PredictionOutput(predictions=array([[-0.65717053,  0.4975649 ,  0.4514203 ],
       [ 0.38289893, -1.1594694 ,  0.883371  ],
       [ 0.06064097, -0.4506289 ,  0.4371696 ],
       ...,
       [ 0.30399784, -0.6835202 ,  0.34232756],
       [ 0.21946582, -1.1115614 ,  0.9722835 ],
       [-1.0523854 ,  2.3838031 , -0.10300129]], dtype=float32), label_ids=array([1, 2, 0, ..., 0, 2, 1]), metrics={'test_loss': 0.7342969179153442, 'test_accuracy': 0.6817472698907956, 'test_f1_weighted': 0.6851234424016618, 'test_precision_weighted': 0.6905135248388491, 'test_recall_weighted': 0.6817472698907956, 'test_runtime': 4.0111, 'test_samples_per_second': 479.416, 'test_steps_per_second': 7.728})
