In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from datasets import load_dataset

# Load data
# NOTE(review): this loads the full *train* split, so the metrics below are
# not held-out numbers; switch to split='test' if that is what is intended.
dataset = load_dataset('hatexplain', split='train[:100%]', cache_dir='./datasets')

print(dataset)

# Class id that the annotators use for "hatespeech".
# NOTE(review): assumes the dataset card's label order
# (0=hatespeech, 1=normal, 2=offensive) -- TODO confirm.
HATESPEECH_LABEL_ID = 0

# Number of posts per forward pass; feeding the whole split to the model at
# once would need far more memory than a typical machine has.
EVAL_BATCH_SIZE = 32


def _is_hate_speech(votes):
    """Return 1 if a strict majority of annotator votes is hatespeech, else 0."""
    votes = list(votes)
    return int(2 * votes.count(HATESPEECH_LABEL_ID) > len(votes))


# Convert to pandas DataFrame.
# HateXplain has no top-level 'label' column: every post carries one label per
# annotator under 'annotators'. The binary 'HS' target is derived by majority
# vote so that it matches the 2-class head and binary metrics used below.
# NOTE(review): collapsing to "hatespeech vs. rest" is an assumption -- confirm
# it matches how the checkpoint was trained.
# 'post_tokens' is a list of words per post; join it back into one string so
# the tokenizer receives ordinary sentences.
df = pd.DataFrame({
    'post_tokens': [' '.join(tokens) for tokens in dataset['post_tokens']],
    'HS': [_is_hate_speech(row['label']) for row in dataset['annotators']],
})

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Tokenize inputs (return PyTorch tensors so they can be fed to the model)
encodings = tokenizer(
    list(df['post_tokens']),
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Convert labels to numerical values
labels = torch.tensor(list(df['HS']))  # HS is the column containing the labels

# Load pre-trained model
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Load checkpoint
# NOTE(review): the file name suggests PhoBERT weights; verify that this URL
# is reachable and that the state dict matches the roberta-base architecture,
# otherwise load_state_dict will raise on mismatched keys/shapes.
model.load_state_dict(torch.hub.load_state_dict_from_url('https://cdn.huggingface.co/vinai/phobert-base-finetuned-hatexplain.pth', map_location=torch.device('cpu')))

# Set the model in evaluation mode
model.eval()

# Make predictions on the loaded split, one mini-batch at a time
batch_logits = []
with torch.no_grad():
    for start in range(0, labels.shape[0], EVAL_BATCH_SIZE):
        batch = {
            name: tensor[start:start + EVAL_BATCH_SIZE]
            for name, tensor in encodings.items()
        }
        batch_logits.append(model(**batch)[0])

logits = torch.cat(batch_logits)
probabilities = torch.softmax(logits, dim=-1)
predictions = torch.argmax(logits, dim=-1).cpu().numpy()

# Calculate metrics
y_true = labels.numpy()
# ROC AUC is a ranking metric, so score it with the positive-class probability
# rather than the hard 0/1 predictions.
# NOTE(review): assumes output index 1 is the hate-speech class -- confirm.
positive_scores = probabilities[:, 1].cpu().numpy()

acc = accuracy_score(y_true, predictions)
f1 = f1_score(y_true, predictions)
auc = roc_auc_score(y_true, positive_scores)

print(f"Accuracy: {acc}")
print(f"F1 Score: {f1}")
print(f"AUC: {auc}")
