<a href="https://colab.research.google.com/github/anirbansaha96/AI-ML-Playground/blob/master/llm_toxicity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# !pip install datasets transformers evaluate -q

In [36]:
from datasets import load_dataset
from evaluate import load
from sklearn.metrics import mean_squared_error

# Evaluation corpus: the "train" split of the selected Hub dataset.
dataset = load_dataset(
    "Arsive/toxicity_classification_jigsaw",
    split="train",
)
# textdetox/multilingual_toxicity_dataset
# google/jigsaw_toxicity_pred
# Arsive/toxicity_classification_jigsaw


In [37]:
from sklearn.metrics import mean_squared_error, f1_score, precision_score, recall_score, accuracy_score

def evaluate_model(model_name, toxic_label, threshold=0.5, data=None):
    """Score comments with a toxicity classifier and report binary metrics.

    Args:
        model_name: Identifier forwarded to ``evaluate.load`` as the second
            positional argument of the ``"toxicity"`` measurement (the
            classifier checkpoint to use).
        toxic_label: Name of the classifier output label whose score is
            treated as the toxicity score.
        threshold: Cut-off applied to BOTH the predicted score and the
            ground-truth label; values ``>= threshold`` are counted as
            toxic (1), everything else as non-toxic (0).
        data: Optional iterable of records, each exposing ``"comment_text"``
            and ``"toxic"`` keys. When omitted, the module-level ``dataset``
            is used, which matches the previous behaviour.

    Returns:
        dict: ``model_name`` plus ``f1_score``, ``precision``, ``recall``
        and ``accuracy`` computed on the thresholded labels.
    """
    records = dataset if data is None else data

    # Load the classifier first so a bad model name fails before any scoring.
    toxicity = load("toxicity", model_name, module_type="measurement")

    texts = []
    binary_ground_truth = []
    for item in records:
        texts.append(item["comment_text"])
        binary_ground_truth.append(1 if item["toxic"] >= threshold else 0)

    # One compute() call for the whole corpus instead of one call per row:
    # the measurement accepts a list of texts and returns one score per text.
    scores = toxicity.compute(predictions=texts, toxic_label=toxic_label)["toxicity"]
    binary_predictions = [1 if score >= threshold else 0 for score in scores]

    # NOTE: the raw (un-thresholded) values stay available in `scores` should
    # a regression-style metric such as MSE be wanted later.
    return {
        "model_name": model_name,
        "f1_score": f1_score(binary_ground_truth, binary_predictions),
        "precision": precision_score(binary_ground_truth, binary_predictions),
        "recall": recall_score(binary_ground_truth, binary_predictions),
        "accuracy": accuracy_score(binary_ground_truth, binary_predictions),
    }



In [38]:
from time import time

In [None]:
# Run every candidate classifier over the dataset, keeping the metrics of the
# ones that succeed and reporting wall-clock time per model and overall.
model_results = {}
total_start = time()
models = [
    dict(model_name="facebook/roberta-hate-speech-dynabench-r4-target", toxic_label="hate"),
    dict(model_name="DaNLP/da-electra-hatespeech-detection", toxic_label="offensive"),
    dict(model_name="unitary/multilingual-toxic-xlm-roberta", toxic_label="toxic"),
    dict(model_name="unitary/toxic-bert", toxic_label="toxic"),
    dict(model_name="martin-ha/toxic-comment-model", toxic_label="toxic"),
    dict(model_name="textdetox/xlmr-large-toxicity-classifier", toxic_label="LABEL_1"),
    dict(model_name="s-nlp/roberta_toxicity_classifier", toxic_label="toxic"),
]

for model in models:
    name = model["model_name"]
    label = model["toxic_label"]
    model_start_time = time()
    try:
        model_results[name] = evaluate_model(model_name=name, toxic_label=label)
    except Exception as e:
        # Best-effort benchmark: a failing model is reported and skipped so
        # the remaining models still get evaluated.
        print(f"Toxicity Calculation failed for {name} with toxicity_label {label} with error {e}")
    else:
        elapsed = time() - model_start_time
        print(f"Model Time Taken - {elapsed}")

total_end = time()
total_elapsed = total_end - total_start

print(f"Total Time Taken - {total_elapsed}")

In [None]:
import json

# Pretty-print the collected per-model metrics as indented JSON.
print(json.dumps(obj=model_results, indent=4))


In [None]:
import matplotlib.pyplot as plt


# Pull each metric out of the results into parallel lists (one entry per model).
collected = list(model_results.values())
model_names = [entry["model_name"] for entry in collected]
f1_scores = [entry["f1_score"] for entry in collected]
precisions = [entry["precision"] for entry in collected]
recalls = [entry["recall"] for entry in collected]
accuracies = [entry["accuracy"] for entry in collected]

# One horizontal bar chart per metric, laid out on a 2x2 grid.
panels = [
    ('F1 Score', f1_scores, 'skyblue'),
    ('Precision', precisions, 'salmon'),
    ('Recall', recalls, 'lightgreen'),
    ('Accuracy', accuracies, 'gold'),
]

plt.figure(figsize=(12, 8))

for position, (title, values, color) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.barh(model_names, values, color=color)
    # Flip the y-axis so the first model in the list is drawn at the top.
    plt.gca().invert_yaxis()
    plt.title(title)

plt.tight_layout()
plt.show()

In [None]:
# sorted_models = sorted(model_results.items(), key=lambda x: x[1])

# print("Model Performance (Best to Worst):")
# for model, score in sorted_models:
#   print(f"{model}: {score:.4f}")


In [None]:
# from transformers import pipeline
# toxic_classifier = pipeline("text-classification", model="textdetox/xlmr-large-toxicity-classifier", top_k=99999, truncation=True)
# toxic_classifier.model.config.id2label