✅ Loading the fine-tuned mt0-base model

✅ Loading the Stage 1 toxicity classifier

✅ Using Stage 1 explanation in the prompt

✅ Generating detoxified outputs from mt0-base

✅ Saving outputs to a CSV file for scoring

In [None]:
!pip install -q transformers accelerate bitsandbytes

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import pandas as pd

class ToxicityClassifierPipeline:
    """Two-stage toxicity check: a binary gate, then fine-grained labels.

    The first classifier's logits go through a softmax and the probability
    at index 1 is read as the "toxic" probability. Only comments that reach
    the threshold are passed to the second classifier, whose sigmoid outputs
    are paired, in order, with ``label_cols`` and turned into fixed
    explanation sentences.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both tokenizer/model pairs and place the models on a device.

        Args:
            binary_model_path: Checkpoint given to ``from_pretrained`` for
                the binary (toxic / non-toxic) classifier.
            fine_model_path: Checkpoint given to ``from_pretrained`` for
                the multi-label classifier.
            device: Requested torch device. It is replaced by ``"cpu"``
                whenever CUDA is not available.
        """
        if not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)

        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        binary_model = AutoModelForSequenceClassification.from_pretrained(binary_model_path)
        self.model_binary = binary_model.to(self.device)

        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        fine_model = AutoModelForSequenceClassification.from_pretrained(fine_model_path)
        self.model_fine = fine_model.to(self.device)

        # One canned sentence per fine-grained label. The insertion order of
        # this dict is also the order in which the fine model's outputs are
        # interpreted (see label_cols).
        self.label_to_explanation = {
            "toxic": "This sentence contains general toxic language.",
            "severe_toxic": "This sentence includes severe verbal abuse.",
            "obscene": "This sentence contains obscene or vulgar language.",
            "threat": "This sentence includes a threat or incites violence.",
            "insult": "This sentence includes personal insults.",
            "identity_hate": "This sentence attacks a person's identity.",
        }
        self.label_cols = list(self.label_to_explanation)

    def _toxic_probability(self, comment):
        """Return the binary model's softmax probability at class index 1."""
        encoded = self.tokenizer_binary(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            logits = self.model_binary(**encoded).logits
            return torch.softmax(logits, dim=1)[:, 1].item()

    def _label_probabilities(self, comment):
        """Return the fine model's sigmoid scores for the first batch row."""
        encoded = self.tokenizer_fine(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            logits = self.model_fine(**encoded).logits
            return torch.sigmoid(logits).cpu().numpy()[0]

    def __call__(self, comment, threshold=0.5):
        """Classify ``comment`` and, if toxic, explain which labels fired.

        Args:
            comment: Text to classify.
            threshold: Cut-off applied both to the binary toxic probability
                and to each fine-grained label score.

        Returns:
            A dict with keys ``"binary"`` (``"toxic"`` or ``"non-toxic"``)
            and ``"explanation"``. The explanation is ``None`` for non-toxic
            input; otherwise it is the space-joined sentences of every label
            at or above the threshold, or a generic sentence if none are.
        """
        # Gate: skip the second model entirely unless the threshold is met.
        if not self._toxic_probability(comment) >= threshold:
            return {"binary": "non-toxic", "explanation": None}

        reasons = []
        for label, prob in zip(self.label_cols, self._label_probabilities(comment)):
            if prob >= threshold:
                reasons.append(self.label_to_explanation[label])

        if reasons:
            explanation = " ".join(reasons)
        else:
            explanation = "This sentence contains toxic language."
        return {"binary": "toxic", "explanation": explanation}

# Instantiate the two-stage classifier. Both arguments are checkpoint
# locations handed to `from_pretrained`; `device` is left at its default
# ("cuda", replaced by CPU inside the class when CUDA is unavailable).
# The same `pipeline` object is reused later to compute STA on the outputs.
pipeline = ToxicityClassifierPipeline(
    binary_model_path="/content/drive/MyDrive/...",
    fine_model_path="/content/drive/MyDrive/..."
)


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Location of the fine-tuned mt0-base checkpoint (model and tokenizer).
model_path = "/content/drive/MyDrive/..."
# Use the GPU when there is one, otherwise stay on CPU. This mirrors the
# fallback in ToxicityClassifierPipeline instead of failing on a hard-coded
# "cuda" when the runtime has no GPU.
mt0_model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
mt0_tokenizer = AutoTokenizer.from_pretrained(model_path)


In [None]:
def build_mt0_prompt(toxic_sentence, explanation):
    """Assemble the detoxification prompt fed to the mt0 model.

    The prompt is four newline-separated lines: the instruction, the toxic
    sentence, the Stage 1 explanation, and a trailing "Detoxified:" cue
    for the model to complete.

    Args:
        toxic_sentence: The sentence to be rewritten.
        explanation: Text describing why the sentence is considered toxic.

    Returns:
        The prompt as a single string.
    """
    instruction = (
        "Detoxify the following sentence while preserving its meaning. "
        "The explanation below describes why the sentence is considered toxic:"
    )
    prompt_lines = [
        instruction,
        f"Toxic: {toxic_sentence}",
        f"Explanation: {explanation}",
        "Detoxified:",
    ]
    return "\n".join(prompt_lines)

def generate_mt0_detox(prompt, model, tokenizer):
    """Generate one detoxified sentence for ``prompt`` with a seq2seq model.

    Args:
        prompt: Prompt text, e.g. as produced by ``build_mt0_prompt``.
        model: Generation model exposing ``eval()``, ``generate()`` and a
            ``device`` attribute (as Hugging Face models do).
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded output with surrounding whitespace stripped. If the
        output contains the "Detoxified:" marker (the prompt was echoed),
        only the text after its last occurrence is returned.

    Note:
        Decoding samples (``do_sample=True``), so repeated calls with the
        same prompt can return different text.
    """
    model.eval()
    # Put the inputs on whatever device the model lives on rather than a
    # hard-coded "cuda", so the function also works for a CPU-resident model.
    inputs = tokenizer(
        prompt, return_tensors="pt", padding=True, truncation=True, max_length=256
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the prompt from the output if echoed. When the marker is absent,
    # split() returns the whole string as its only element, so one expression
    # covers both cases.
    return decoded.split("Detoxified:")[-1].strip()


In [None]:
# Load test file
df = pd.read_csv("/content/drive/MyDrive/...")

# Generate new detox outputs, one row at a time:
# Stage 1 explanation -> prompt -> mt0 generation.
outputs_mt0_stage1 = []

# pandas reads missing cells as NaN (a float), and Hugging Face tokenizers
# raise on non-string input. Coerce to strings first, the same way the
# output and reference columns are cleaned before scoring.
for text in df["toxic_sentence"].fillna("").astype(str):
    stage1 = pipeline(text)
    # Stage 1 returns no explanation for inputs it judges non-toxic; those
    # rows are still rewritten, using a generic explanation.
    explanation = stage1["explanation"] or "This sentence contains toxic language."
    prompt = build_mt0_prompt(text, explanation)
    detox = generate_mt0_detox(prompt, mt0_model, mt0_tokenizer)
    outputs_mt0_stage1.append(detox)

# Save new outputs alongside the original columns
df["mt0_base_output_stage1"] = outputs_mt0_stage1
df.to_csv("/content/drive/MyDrive/...", index=False)

print("Saved to: mt0_base_output_with_stage1.csv")


✅ BERTScore

✅ CHRF

✅ STA (using the Stage 1 classifier)

✅ J-score

In [None]:
!pip install -q bert_score sentence-transformers sacrebleu


In [None]:
import pandas as pd

# Read back the MT0 generations that were produced with Stage 1 explanations.
df = pd.read_csv("/content/drive/MyDrive/...")

# Print the header as a sanity check; it should include
# toxic_sentence, neutral_reference and mt0_base_output_stage1.
print(df.columns)

# Replace missing cells with "" and force both text columns to str
# so every row holds a plain string.
for column in ("mt0_base_output_stage1", "neutral_reference"):
    df[column] = df[column].fillna("").astype(str)



In [None]:
from bert_score import score

# Score each generated sentence against its neutral reference.
# `score` takes the candidates first and the references second.
candidates = df["mt0_base_output_stage1"].tolist()
references = df["neutral_reference"].tolist()
P, R, F1 = score(candidates, references, lang="en", verbose=True)

# Only F1 is used from here on: keep the per-row values next to the outputs
# and print their mean.
df["bertscore_f1"] = F1.tolist()
print(f" Avg BERTScore F1: {F1.mean():.4f}")


In [None]:
from sacrebleu.metrics import CHRF

chrf = CHRF()
sta = []
chrf_scores = []

# Only the generated output and the reference are scored here, so the
# source sentence column is not iterated.
for detox, ref in zip(df["mt0_base_output_stage1"], df["neutral_reference"]):
    # STA is 1 when the Stage 1 binary classifier does not flag the output
    # as toxic, 0 otherwise.
    result = pipeline(detox)
    sta.append(0 if result["binary"] == "toxic" else 1)
    # sacrebleu reports chrF on a 0-100 scale; divide by 100 so it is
    # averaged on a 0-1 scale with the other components.
    score_chrf = chrf.sentence_score(detox, [ref]).score / 100
    chrf_scores.append(score_chrf)

df["STA"] = sta
df["CHRF"] = chrf_scores
# Requires the "bertscore_f1" column to be present in df already.
# NOTE(review): this J_score is the arithmetic mean of the three components.
# Detoxification benchmarks commonly define J as the mean of per-sample
# products (STA * SIM * fluency/chrF) -- confirm which definition is wanted
# before comparing against published numbers.
df["J_score"] = (df["STA"] + df["bertscore_f1"] + df["CHRF"]) / 3

print(f"✅ Avg J-score: {df['J_score'].mean():.4f}")
df.to_csv("/content/drive/MyDrive/...", index=False)
print(" Saved to: mt0_base_with_stage1_evaluation.csv")
