In [None]:
!pip install transformers datasets peft accelerate bitsandbytes


In [None]:
!pip install -U transformers


In [None]:
!pip install -U datasets

In [None]:
from datasets import load_dataset
# 1. Load the "s-nlp/paradetox" dataset and keep its "train" split for fine-tuning.
dataset = load_dataset("s-nlp/paradetox")
train_data = dataset["train"]

In [None]:
# 2. Format examples
def format_example(example):
    """Turn one raw dataset row into a prompt/target pair.

    Args:
        example: mapping with the keys "en_toxic_comment" and
            "en_neutral_comment".

    Returns:
        dict with "input_text" (the toxic comment prefixed with
        "detoxify: ") and "labels" (the neutral comment, unchanged).
    """
    formatted = dict(input_text="detoxify: {}".format(example["en_toxic_comment"]))
    formatted["labels"] = example["en_neutral_comment"]
    return formatted

train_data = train_data.map(format_example, remove_columns=train_data.column_names)


In [None]:
# 3. Tokenization with padding mask
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-base")

# Register "[PAD]" as the tokenizer's pad token so padding is distinct from EOS.
# NOTE(review): presumably the mt0/mT5 tokenizer already ships with its own pad
# token — confirm whether adding a new token (and resizing embeddings) is needed.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def tokenize(example):
    """Tokenize prompts and targets, masking target padding with -100.

    Works both for a single example (strings) and for a batch produced by
    `Dataset.map(..., batched=True)` (lists of strings). The previous version
    compared whole per-example id lists against `pad_token_id` in batched
    mode, so no padding position was ever masked.

    Args:
        example: mapping with "input_text" and "labels" holding either one
            string each or a list of strings each.

    Returns:
        The tokenizer encoding of "input_text" with an added "labels" entry
        whose padding positions are replaced by -100.
    """
    model_input = tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    label = tokenizer(
        text_target=example["labels"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(ids):
        # -100 is used here as the "ignore this position" label value.
        return [token if token != pad_id else -100 for token in ids]

    label_ids = label["input_ids"]
    # Batched calls yield a list of id lists; single calls yield one flat id list.
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        model_input["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        model_input["labels"] = _mask_padding(label_ids)

    return model_input


In [None]:
# 4. Apply tokenization
tokenized_data = train_data.map(tokenize, batched=True)


In [None]:
print(tokenized_data[0]["labels"])



In [None]:
import torch
torch.set_default_dtype(torch.float32)


In [None]:
from transformers import TrainerCallback
import torch.nn.functional as F

class ZLossCallback(TrainerCallback):
    """Adds a z-loss term (1e-4 * mean(logsumexp(logits)^2)) to the logged loss.

    This only touches the value stored in `logs["loss"]`; it does not change
    the loss used for back-propagation.

    NOTE(review): presumably the Trainer does not pass `logs` or `outputs` to
    `on_step_end` — confirm against the TrainerCallback documentation. If so,
    this hook returns without doing anything.
    NOTE(review): `patched_forward` in this notebook already adds the same
    penalty to the model loss, so applying it here as well would count it
    twice in the logs — confirm which of the two is intended.
    """

    def on_step_end(self, args, state, control, logs=None, **kwargs):
        # Nothing to adjust when no loss was logged for this step.
        if not logs or "loss" not in logs:
            return

        # Guard against a missing "outputs" kwarg instead of raising KeyError.
        outputs = kwargs.get("outputs")
        logits = getattr(outputs, "logits", None)
        if logits is None:
            return

        log_z = logits.reshape(-1, logits.size(-1)).logsumexp(-1)
        z_loss = 1e-4 * torch.mean(log_z ** 2)
        logs["loss"] += z_loss.item()


In [None]:
# 5. Load model
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-base")
model.resize_token_embeddings(len(tokenizer))
model.to("cuda")


In [None]:
import torch
import types
from transformers import MT5ForConditionalGeneration

# Store the original class's forward method
original_forward_fn = MT5ForConditionalGeneration.forward

def patched_forward(self, *args, **kwargs):
    """Forward pass that adds a z-loss penalty to the model's own loss.

    Delegates to the original forward (`original_forward_fn`). When that call
    produced a loss, the penalty 1e-4 * mean(logsumexp(logits)^2) is added to
    it. When no loss is available (for example a call without labels), the
    original output is returned untouched instead of failing on `None + tensor`.

    The original output object is updated in place and returned, so every
    field other than `loss` is preserved; the previous version rebuilt the
    output with only `loss` and `logits`.
    """
    outputs = original_forward_fn(self, *args, **kwargs)

    # No loss (or a plain tuple output): nothing to penalize.
    base_loss = getattr(outputs, "loss", None)
    if base_loss is None:
        return outputs

    logits = outputs.logits
    # reshape (not view) so non-contiguous logits are handled too.
    log_z = logits.reshape(-1, logits.size(-1)).logsumexp(-1)
    z_loss = 1e-4 * torch.mean(log_z ** 2)

    outputs.loss = base_loss + z_loss
    return outputs

# Inject the safe patch
model.forward = types.MethodType(patched_forward, model)



In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./mt0_paradetox_en_fixed",
    per_device_train_batch_size=8,
    num_train_epochs=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    fp16=False,
    report_to="none"  # disables wandb
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
    callbacks=[ZLossCallback()]
)




In [None]:
# 7. Train
trainer.train()

In [None]:
# Check first 5 label sequences
for i in range(5):
    print(tokenized_data[i]["labels"])


Code for L4 GPU (No AMP, No Patching)

In [None]:
import torch
torch.set_default_dtype(torch.float32)


In [None]:
from datasets import load_dataset

# Load ParadeTox English split
train_data = load_dataset("s-nlp/paradetox", split="train")


In [None]:
# 2. Format examples
def format_example(example):
    """Build the seq2seq training pair for one dataset row.

    Args:
        example: mapping with "en_toxic_comment" and "en_neutral_comment".

    Returns:
        dict whose "input_text" is the toxic comment prefixed with
        "detoxify: " and whose "labels" is the neutral comment.
    """
    toxic_comment = example['en_toxic_comment']
    neutral_comment = example["en_neutral_comment"]
    return {"input_text": f"detoxify: {toxic_comment}", "labels": neutral_comment}

train_data = train_data.map(format_example, remove_columns=train_data.column_names)


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-base")
# Pad with the EOS token. With pad == eos, padded positions cannot be told
# apart from the real end-of-sequence token by id alone.
# NOTE(review): the original comment marks this as required for MT0 — confirm;
# presumably the tokenizer already defines its own pad token.
tokenizer.pad_token = tokenizer.eos_token  # Required for MT0


In [None]:
def tokenize(example):
    """Tokenize prompts and targets, masking target padding with -100.

    Targets are padded to `max_length`; the previous version returned those
    padded ids as labels unchanged, so every padding position counted as a
    training target. Padding is located through the target attention mask
    rather than by token id, because this notebook sets the pad token to EOS
    and an id comparison would also hide the real end-of-sequence token.

    Handles a single example (strings) and a batch from
    `Dataset.map(..., batched=True)` (lists of strings).

    Args:
        example: mapping with "input_text" and "labels".

    Returns:
        The tokenizer encoding of "input_text" plus a "labels" entry with
        padding positions replaced by -100.
    """
    model_input = tokenizer(
        example["input_text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    target = tokenizer(
        text_target=example["labels"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

    def _mask_padding(ids, attention):
        # Keep real tokens (attention == 1); mark padding with -100.
        return [token if keep else -100 for token, keep in zip(ids, attention)]

    target_ids = target["input_ids"]
    target_attention = target["attention_mask"]
    # Batched calls yield a list of id lists; single calls yield one flat id list.
    if target_ids and isinstance(target_ids[0], (list, tuple)):
        model_input["labels"] = [
            _mask_padding(ids, attention)
            for ids, attention in zip(target_ids, target_attention)
        ]
    else:
        model_input["labels"] = _mask_padding(target_ids, target_attention)
    return model_input

tokenized_data = train_data.map(tokenize, batched=True)


In [None]:
from transformers import MT5ForConditionalGeneration

model = MT5ForConditionalGeneration.from_pretrained("bigscience/mt0-base")
model.resize_token_embeddings(len(tokenizer))  # Required if you set pad_token


In [None]:

from transformers import TrainingArguments

# Training configuration: checkpoints and logs go to Google Drive.
# NOTE(review): the Drive mount cell appears later in this notebook — confirm
# Drive is mounted before training starts, otherwise these paths will not be on Drive.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/...",  # checkpoints are written here
    save_total_limit=1,         # keep only the most recent checkpoint
    save_steps=1000,             # save a checkpoint every 1000 steps
    logging_dir="/content/drive/MyDrive/...",  # optional: for TensorBoard logs
    logging_steps=100,
    #evaluation_strategy="no",   # change to "steps" if you want eval during training
    per_device_train_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    fp16=False,                 # or True if you use mixed precision
    report_to="none",           # disable wandb
    save_strategy="steps",      # save on a step schedule (see save_steps)
)



In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer
)


In [None]:
trainer.train()


In [None]:
# Mount your Google Drive
from google.colab import drive
drive.mount('/content/drive')


In [None]:
from transformers import MT5ForConditionalGeneration, AutoTokenizer

output_dir = "/content/drive/MyDrive/..."

# Make sure model is already trained and exists in memory here
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


In [None]:
!ls /content/drive/MyDrive/mt0_paradetox_model


In [None]:
model_path = "/content/drive/MyDrive/..."
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)

print("Model and tokenizer loaded.")


In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/...")
df.head()


Calling the trained model

In [None]:
from transformers import MT5ForConditionalGeneration, AutoTokenizer
import torch

model_path = "/content/drive/MyDrive/..."

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path).to("cuda")


In [None]:
# Add the prompt prefix
inputs = ["detoxify: " + str(x) for x in df["toxic_sentence"].tolist()]

# Tokenize
tokenized = tokenizer(
    inputs,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=256
).to("cuda")


In [None]:
# Generate in fixed-size batches instead of pushing the whole dataframe through
# `generate` at once, which can exhaust GPU memory on larger test sets.
GEN_BATCH_SIZE = 32

model.eval()
decoded_outputs = []
with torch.no_grad():
    for start in range(0, tokenized["input_ids"].size(0), GEN_BATCH_SIZE):
        stop = start + GEN_BATCH_SIZE
        outputs = model.generate(
            input_ids=tokenized["input_ids"][start:stop],
            attention_mask=tokenized["attention_mask"][start:stop],
            max_length=256
        )
        # Decode each batch right away; results stay in input order.
        decoded_outputs.extend(
            tokenizer.batch_decode(outputs, skip_special_tokens=True)
        )


In [None]:
df["generated_output"] = decoded_outputs


In [None]:
df.to_csv("/content/drive/MyDrive/...", index=False)


In [None]:


from google.colab import files
files.download('/content/drive/MyDrive/...')

Evaluations

In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/...")


In [None]:
!pip install bert-score evaluate sacrebleu
from bert_score import score as bert_score

P, R, F1 = bert_score(
    cands=df["generated_output"].tolist(),
    refs=df["neutral_reference"].tolist(),
    lang="en"
)

print(f"BERTScore F1: {F1.mean():.4f}")


In [None]:
!pip install sentence-transformers sacrebleu


In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/...")  # adjust path if needed

# Rename columns to standard names for code consistency.
# (Non-inplace rename so re-running the cell always starts from the freshly read frame.)
df = df.rename(columns={
    "toxic_sentence": "input",
    "generated_output": "prediction",
    "neutral_reference": "reference"
})

# read_csv parses empty fields as NaN (a float), e.g. when the model generated
# an empty string. The classifier, sentence encoder and chrF scorer below all
# expect strings, so normalise the text columns that are present.
for text_col in ("input", "prediction", "reference"):
    if text_col in df.columns:
        df[text_col] = df[text_col].fillna("").astype(str)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class ToxicityClassifierPipeline:
    """Two-stage toxicity classifier.

    Stage 1 is a binary model whose class-1 softmax probability is treated as
    the "toxic" probability. Stage 2 is a multi-label model whose sigmoid
    outputs are matched, in order, to `label_cols`. Stage 2 only runs when
    stage 1 flags the comment as toxic.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both tokenizer/model pairs and place the models on the device.

        Args:
            binary_model_path: path or id of the binary classifier.
            fine_model_path: path or id of the multi-label classifier.
            device: requested device; CPU is used when CUDA is unavailable.
        """
        chosen_device = device if torch.cuda.is_available() else "cpu"
        self.device = torch.device(chosen_device)

        # Stage 1: toxic vs. non-toxic.
        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        binary_model = AutoModelForSequenceClassification.from_pretrained(binary_model_path)
        self.model_binary = binary_model.to(self.device)

        # Stage 2: fine-grained toxicity subtypes.
        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        fine_model = AutoModelForSequenceClassification.from_pretrained(fine_model_path)
        self.model_fine = fine_model.to(self.device)

        self.label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        self.label_to_explanation = {
            "toxic": "This sentence contains general toxic language.",
            "severe_toxic": "This sentence contains extreme hostility or verbal abuse.",
            "obscene": "This sentence contains obscene or vulgar language.",
            "threat": "This sentence contains a threat or implied violence.",
            "insult": "This sentence includes personal insults or demeaning language.",
            "identity_hate": "This sentence attacks someone based on identity (e.g. race, gender, religion)."
        }

    def __call__(self, comment, threshold=0.5):
        """Classify one comment.

        Args:
            comment: text to classify.
            threshold: cut-off applied to the binary probability and to each
                subtype probability.

        Returns:
            dict with "binary" ("toxic" / "non-toxic"), "subtypes" (label ->
            probability rounded to 2 decimals, or None when non-toxic),
            "toxic_prob" and "explanation" (joined subtype explanations, or
            None when there are none).
        """
        encoded = self.tokenizer_binary(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            binary_logits = self.model_binary(**encoded).logits
            toxic_prob = torch.softmax(binary_logits, dim=1)[:, 1].item()

        # Early exit: the subtype model is skipped for non-toxic comments.
        if not toxic_prob >= threshold:
            return {
                "binary": "non-toxic",
                "subtypes": None,
                "toxic_prob": toxic_prob,
                "explanation": None,
            }

        encoded = self.tokenizer_fine(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            fine_logits = self.model_fine(**encoded).logits
            fine_probs = torch.sigmoid(fine_logits).cpu().numpy()[0]

        subtypes = {}
        for label, prob in zip(self.label_cols, fine_probs):
            if prob >= threshold:
                subtypes[label] = round(float(prob), 2)

        if subtypes:
            explanation = " ".join(self.label_to_explanation[label] for label in subtypes)
        else:
            explanation = None

        return {
            "binary": "toxic",
            "subtypes": subtypes,
            "toxic_prob": toxic_prob,
            "explanation": explanation,
        }


In [None]:
pipeline = ToxicityClassifierPipeline(
    binary_model_path="/content/drive/MyDrive/...",
    fine_model_path="/content/drive/MyDrive/..."
)


In [None]:
def is_toxic(text):
    """Return True when the global `pipeline` labels `text` as "toxic"."""
    verdict = pipeline(text)["binary"]
    return verdict == "toxic"

df["STA"] = [0 if is_toxic(pred) else 1 for pred in df["prediction"]]


In [None]:
from sentence_transformers import SentenceTransformer, util

# Load multilingual sentence similarity model
labse = SentenceTransformer("sentence-transformers/LaBSE")

emb_input = labse.encode(df["input"].tolist(), convert_to_tensor=True)
emb_pred = labse.encode(df["prediction"].tolist(), convert_to_tensor=True)

# Row-wise cosine similarity between each input and its own prediction.
# `cos_sim(...).diagonal()` builds the full N x N similarity matrix just to
# read N values; `pairwise_cos_sim` computes only the matching pairs.
similarities = util.pairwise_cos_sim(emb_input, emb_pred).tolist()
df["SIM"] = similarities

In [None]:
from sacrebleu.metrics import CHRF

# Sentence-level chrF of each prediction against its single reference,
# divided by 100 before being stored in the "CHRF" column.
chrf = CHRF()
chrf_scores = [
    chrf.sentence_score(pred, [ref]).score / 100
    for pred, ref in zip(df["prediction"], df["reference"])
]
df["CHRF"] = chrf_scores

In [None]:
import numpy as np

# Per-example J-score, computed here as the arithmetic mean of STA, SIM and CHRF.
# NOTE(review): detoxification benchmarks usually define the joint score as the
# per-example product STA * SIM * CHRF — confirm that an average is intended.
df["J-score"] = (df["STA"] + df["SIM"] + df["CHRF"]) / 3

# Print the mean J-score over all examples
print(f"✅ J-score (mean over all examples): {df['J-score'].mean():.4f}")


In [None]:
df.to_csv("/content/...", index=False)

In [None]:
from google.colab import files
files.download('/content/...')


In [None]:
import numpy as np

# Recompute the per-example J-score (arithmetic mean of STA, SIM and CHRF).
# NOTE(review): this repeats the J-score formula from an earlier cell — confirm
# the average (rather than a product) is the intended definition.
df["J-score"] = (df["STA"] + df["SIM"] + df["CHRF"]) / 3

# Dataset-level means. `F1` is the BERTScore tensor produced by the earlier
# bert_score cell, so that cell must have been run in this session.
bert_f1 = F1.mean().item()                     # BERTScore F1
chrf_score = df["CHRF"].mean()                # CHRF
sta_score = df["STA"].mean()                  # STA
sim_score = df["SIM"].mean()                  # SIM
j_score = df["J-score"].mean()                # J-score

# Two parallel columns: metric name and its mean score
average_scores = {
    "Metric": ["BERTScore F1", "STA", "SIM", "CHRF", "Average J-score"],
    "Score": [
        bert_f1,
        sta_score,
        sim_score,
        chrf_score,
        j_score
    ]
}

# Convert to DataFrame and export as CSV (without the index column)
summary_df = pd.DataFrame(average_scores)
summary_df.to_csv("/content/drive/...", index=False)

In [None]:
from google.colab import files
files.download('/content/drive/...')


✅ mt0-base fine-tuned with stage one

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import pandas as pd

class ToxicityClassifierPipeline:
    """Two-stage toxicity classifier that returns a label and an explanation.

    A binary model decides toxic / non-toxic from its class-1 softmax
    probability. For toxic comments, a multi-label model's sigmoid outputs
    are matched, in order, to `label_cols`, and the explanations of every
    label at or above the threshold are joined into one string.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both tokenizer/model pairs; use CPU when CUDA is unavailable."""
        chosen_device = device if torch.cuda.is_available() else "cpu"
        self.device = torch.device(chosen_device)

        # Stage 1: toxic vs. non-toxic.
        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        binary_model = AutoModelForSequenceClassification.from_pretrained(binary_model_path)
        self.model_binary = binary_model.to(self.device)

        # Stage 2: fine-grained toxicity subtypes.
        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        fine_model = AutoModelForSequenceClassification.from_pretrained(fine_model_path)
        self.model_fine = fine_model.to(self.device)

        self.label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        self.label_to_explanation = {
            "toxic": "This sentence contains general toxic language.",
            "severe_toxic": "This sentence includes severe verbal abuse.",
            "obscene": "This sentence contains obscene or vulgar language.",
            "threat": "This sentence includes a threat or incites violence.",
            "insult": "This sentence includes personal insults.",
            "identity_hate": "This sentence attacks a person's identity."
        }

    def __call__(self, comment, threshold=0.5):
        """Classify one comment.

        Args:
            comment: text to classify.
            threshold: cut-off for the binary probability and each subtype.

        Returns:
            dict with "binary" ("toxic" / "non-toxic") and "explanation"
            (None when non-toxic; otherwise the joined subtype explanations,
            or a generic sentence when no subtype reaches the threshold).
        """
        encoded = self.tokenizer_binary(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            binary_logits = self.model_binary(**encoded).logits
            toxic_prob = torch.softmax(binary_logits, dim=1)[:, 1].item()

        # Early exit: the subtype model is skipped for non-toxic comments.
        if not toxic_prob >= threshold:
            return {"binary": "non-toxic", "explanation": None}

        encoded = self.tokenizer_fine(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            fine_logits = self.model_fine(**encoded).logits
            fine_probs = torch.sigmoid(fine_logits).cpu().numpy()[0]

        explanation_parts = []
        for label, prob in zip(self.label_cols, fine_probs):
            if prob >= threshold:
                explanation_parts.append(self.label_to_explanation[label])

        if explanation_parts:
            explanation = " ".join(explanation_parts)
        else:
            explanation = "This sentence contains toxic language."

        return {"binary": "toxic", "explanation": explanation}

# Instantiate the classifier
pipeline = ToxicityClassifierPipeline(
    binary_model_path="/content/drive/MyDrive/...",
    fine_model_path="/content/drive/MyDrive/..."
)


In [None]:
from transformers import MT5ForConditionalGeneration, AutoTokenizer
import torch

model_path = "/content/drive/MyDrive/..."

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path).to("cuda")

In [None]:
def build_mt0_prompt(toxic_sentence, explanation):
    """Assemble the explanation-guided detoxification prompt.

    Args:
        toxic_sentence: the sentence to rewrite.
        explanation: text describing why the sentence is considered toxic.

    Returns:
        A four-line prompt: instruction, "Toxic: ...", "Explanation: ..."
        and a trailing "Detoxified:" cue.
    """
    instruction = (
        "Detoxify the following sentence while preserving its meaning. "
        "The explanation below describes why the sentence is considered toxic:"
    )
    prompt_lines = [
        instruction,
        f"Toxic: {toxic_sentence}",
        f"Explanation: {explanation}",
        "Detoxified:",
    ]
    return "\n".join(prompt_lines)

def generate_mt0_detox(prompt, model, tokenizer):
    """Sample one detoxified rewrite for `prompt`.

    Args:
        prompt: full prompt text (see `build_mt0_prompt`).
        model: generation model exposing `eval()` and `generate(...)`.
        tokenizer: tokenizer matching `model`.

    Returns:
        The decoded output with surrounding whitespace removed. If the output
        contains "Detoxified:", only the text after its last occurrence is kept.
    """
    model.eval()

    # Send inputs to wherever the model lives instead of assuming a GPU;
    # fall back to the previous hard-coded "cuda" if the model has no `.device`.
    device = getattr(model, "device", "cuda")
    inputs = tokenizer(
        prompt, return_tensors="pt", padding=True, truncation=True, max_length=256
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Strip an echoed prompt: split() returns the whole string as its only
    # element when the marker is absent, so one expression covers both cases.
    return decoded.split("Detoxified:")[-1].strip()


In [None]:
# Load test file
df = pd.read_csv("/content/drive/MyDrive/...")

# Generate new detox outputs: classify each sentence (stage 1), turn the
# resulting explanation into a prompt, then sample a rewrite from the model.
outputs_mt0_stage1 = []

for text in df["toxic_sentence"]:
    stage1 = pipeline(text)
    # Non-toxic verdicts carry no explanation; fall back to a generic one.
    explanation = stage1["explanation"] or "This sentence contains toxic language."
    prompt = build_mt0_prompt(text, explanation)
    # Use the `model` / `tokenizer` loaded in the cell above; `mt0_model` and
    # `mt0_tokenizer` are never defined in this notebook (NameError on a fresh kernel).
    detox = generate_mt0_detox(prompt, model, tokenizer)
    outputs_mt0_stage1.append(detox)

# Save new outputs
df["mt0_base_output_stage1"] = outputs_mt0_stage1
df.to_csv("/content/drive/MyDrive/...", index=False)

print("Saved to: mt0_base_retrain_output_with_stage1.csv")
