In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Load the pre-trained model
MODEL_NAME = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Larger dataset for better evaluation
test_data = pd.DataFrame({
    "text": [
        "You are amazing!", "I hate you!", "This is so stupid!",
        "I love this", "This is an insult", "What a great idea!",
        "Shut up, you idiot!", "You are worthless", "Thank you!",
        "Go away, loser!"
    ],
    "label": [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]  # 0 = non-toxic, 1 = toxic
})

# Function for batch processing
def batch_predict(text_list):
    """Predicts toxicity for a batch of texts."""
    inputs = tokenizer(text_list, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # Disable gradients for faster inference
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[:, 1]  # Take toxic probability
    return (probabilities > 0.5).int().tolist()  # Convert to binary classification (0 or 1)

# Predict in batch
test_data["predicted"] = batch_predict(test_data["text"].tolist())

# Compute evaluation metrics
accuracy = accuracy_score(test_data["label"], test_data["predicted"])
precision = precision_score(test_data["label"], test_data["predicted"])
recall = recall_score(test_data["label"], test_data["predicted"])
f1 = f1_score(test_data["label"], test_data["predicted"])

# Display results
print(f"✅ Accuracy: {accuracy:.2f}")
print(f"✅ Precision: {precision:.2f}")
print(f"✅ Recall: {recall:.2f}")
print(f"✅ F1 Score: {f1:.2f}")

# Print sample predictions for manual review
print("\n🔍 Sample Predictions:")
print(test_data[["text", "label", "predicted"]])


✅ Accuracy: 0.40
✅ Precision: 0.00
✅ Recall: 0.00
✅ F1 Score: 0.00

🔍 Sample Predictions:
                  text  label  predicted
0     You are amazing!      0          0
1          I hate you!      1          0
2   This is so stupid!      1          0
3          I love this      0          0
4    This is an insult      1          0
5   What a great idea!      0          0
6  Shut up, you idiot!      1          0
7    You are worthless      1          0
8           Thank you!      0          0
9      Go away, loser!      1          0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
import pandas as pd

df = pd.read_csv("../data/train.csv")

# Check the first few rows
print(df.head())
print(df.shape)  # Should be (159571, 8)


                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
(159571, 8)
