<a href="https://colab.research.google.com/github/anirbansaha96/AI-ML-Playground/blob/master/llm_toxicity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install datasets transformers evaluate -q

In [15]:
from datasets import load_dataset
from evaluate import load
from sklearn.metrics import mean_squared_error

# Only the first 100 training records are loaded, keeping each evaluation run small.
dataset = load_dataset(
    path="allenai/real-toxicity-prompts",
    split="train[:100]",
)


In [24]:

def evaluate_model(model_name, toxic_label, metric="matthews_correlation"):
  """Score a toxicity classifier against the dataset's prompt toxicity labels.

  Loads the `toxicity` measurement backed by `model_name`, scores the prompt
  text of every record in the module-level `dataset`, and compares those
  scores with the `prompt.toxicity` values stored alongside the text.

  Args:
    model_name: Model identifier handed to the `toxicity` measurement.
    toxic_label: Name of the model output label whose score is used as the
      toxicity score (model specific, e.g. "offensive", "hate", "toxic").
    metric: Accepted for backward compatibility only. It is currently
      ignored: the function always returns the mean squared error.

  Returns:
    Mean squared error between the reference toxicity values and the
    predicted scores. Lower is better.

  Raises:
    ValueError: If no record in `dataset` carries a toxicity label.
  """
  toxicity = load("toxicity", model_name, module_type="measurement")

  # Guard against records without a reference score: a None label would make
  # mean_squared_error fail, so such records are left out of the comparison.
  labelled_prompts = [
      item["prompt"] for item in dataset if item["prompt"]["toxicity"] is not None
  ]
  if not labelled_prompts:
    raise ValueError("dataset contains no prompts with a toxicity label")

  texts = [prompt["text"] for prompt in labelled_prompts]
  ground_truth = [prompt["toxicity"] for prompt in labelled_prompts]

  # One call for all texts; the measurement returns one score per input, in order.
  # The caller's toxic_label is forwarded instead of a hard-coded label.
  predictions = toxicity.compute(predictions=texts, toxic_label=toxic_label)["toxicity"]

  return mean_squared_error(ground_truth, predictions)



In [38]:
# Classifiers to compare. Each entry holds exactly the keyword arguments that
# evaluate_model expects. Disabled candidates are kept commented out; uncomment
# an entry to include it in the comparison.
models = [
    # {"model_name": "facebook/roberta-hate-speech-dynabench-r4-target", "toxic_label": "hate"},
    {"model_name": "DaNLP/da-electra-hatespeech-detection", "toxic_label": "offensive"},
    # {"model_name": "unitary/multilingual-toxic-xlm-roberta", "toxic_label": "toxic"}
]

# Maps each model name to the score returned by evaluate_model.
model_results = {}
for model in models:
  model_results[model["model_name"]] = evaluate_model(**model)



In [28]:
# evaluate_model returns a mean squared error, so a lower score is better.
# Sort ascending so the listing really runs from best to worst.
sorted_models = sorted(model_results.items(), key=lambda x: x[1])

print("Model Performance (Best to Worst):")
for model, score in sorted_models:
  print(f"{model}: {score:.4f}")


Model Performance (Best to Worst):
DaNLP/da-electra-hatespeech-detection: 0.0140


In [37]:
from transformers import pipeline

# Build the classifier directly to inspect its label set. The very large top_k
# asks for the scores of all labels rather than only the best one.
toxic_classifier = pipeline(
    task="text-classification",
    model="unitary/multilingual-toxic-xlm-roberta",
    top_k=99999,
    truncation=True,
)

# Display the model's id -> label mapping (presumably to pick the right
# `toxic_label` value for this model).
toxic_classifier.model.config.id2label

{0: 'toxic'}