In [None]:
!pip install -q transformers datasets accelerate


In [None]:
from datasets import load_dataset, Dataset

# Load the dataset and keep its English ("en") split.
# No `split=` is passed, so the returned object is indexed by split name afterwards.
# NOTE(review): the original note says the English split has 400 examples — confirm.
raw_dataset =load_dataset("textdetox/multilingual_paradetox")
en_data = raw_dataset["en"]

# Build one seq2seq training pair: prefixed toxic input → neutral target.
def format_example(example):
    """Turn a raw dataset row into a prompt/target pair.

    Args:
        example: Mapping with "toxic_sentence" and "neutral_sentence" entries.

    Returns:
        Dict with "input_text" (the toxic sentence behind the "detoxify: "
        task prefix) and "labels" (the neutral sentence, unchanged).
    """
    prompt = "detoxify: {}".format(example['toxic_sentence'])
    target = example["neutral_sentence"]
    return dict(input_text=prompt, labels=target)

# `en_data` is already a `datasets.Dataset`, so it is mapped directly instead of being
# copied row by row through `Dataset.from_list` first.
# The original columns are dropped so only "input_text" and "labels" remain.
train_data = en_data.map(format_example, remove_columns=en_data.column_names)


In [None]:
import torch

# Ask PyTorch to release cached, currently unused GPU memory before the model is loaded.
torch.cuda.empty_cache()

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the "bigscience/mt0-base" checkpoint; the same checkpoint is loaded as the model below.
tokenizer = AutoTokenizer.from_pretrained("bigscience/mt0-base")

def tokenize(example, max_length=256):
    """Tokenize prompts and targets for seq2seq training.

    Accepts a single example or a batch (the form passed by
    ``Dataset.map(..., batched=True)``).

    Args:
        example: Mapping with "input_text" (model input) and "labels"
            (target text); each value is a string or a list of strings.
        max_length: Length that inputs and targets are padded/truncated to.

    Returns:
        The tokenizer encoding of the inputs with an added "labels" entry
        holding the target token ids, where padding positions are -100.
    """
    inputs = tokenizer(example["input_text"], padding="max_length", truncation=True, max_length=max_length)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    targets = tokenizer(text_target=example["labels"], padding="max_length", truncation=True, max_length=max_length)

    pad_id = tokenizer.pad_token_id
    target_ids = targets["input_ids"]
    # A single example yields a flat list of ids, a batch yields a list of lists.
    is_single = bool(target_ids) and not isinstance(target_ids[0], (list, tuple))
    rows = [target_ids] if is_single else target_ids

    # -100 is the label value the loss ignores. Leaving the pad id in place would
    # train the model to predict padding for most of every 256-token target.
    masked = [[-100 if token == pad_id else token for token in row] for row in rows]

    inputs["labels"] = masked[0] if is_single else masked
    return inputs

# Tokenize the training set; with `batched=True`, `tokenize` receives whole batches of rows at once.
tokenized_data = train_data.map(tokenize, batched=True)


In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained mT0-base seq2seq checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-base")
# Move the model to the GPU (this line requires a CUDA device).
model.to("cuda")

In [None]:
from transformers import TrainingArguments

# mT0 is an mT5-based model, and T5-family models are known to overflow in float16
# (the loss turns into NaN or 0). Mixed precision therefore uses bfloat16 when the
# GPU supports it and falls back to full float32 otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./mt0_paradetox_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=4,
    learning_rate=1e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    save_strategy="epoch",     # one checkpoint per epoch ...
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,        # ... of which only the two most recent are kept
    fp16=False,
    bf16=use_bf16,
)

from transformers import Trainer

# NOTE(review): newer transformers releases rename `tokenizer=` to `processing_class=`;
# switch once the installed version is pinned.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer
)

trainer.train()


In [None]:
# Persist the fine-tuned weights and the tokenizer files to Google Drive.
# NOTE(review): these paths need Drive mounted at /content/drive, but the only
# `drive.mount` call in this notebook sits in a later cell — confirm the execution order.
model.save_pretrained("/content/drive/MyDrive/...")
tokenizer.save_pretrained("/content/drive/MyDrive/...")


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Reload model and tokenizer from the saved directory (presumably the one written by the save cell — confirm).
model_path = "/content/drive/MyDrive/..."
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(model_path)
from datasets import load_dataset

# Load the English toxic → neutral pairs (the same dataset that was used for fine-tuning above).
ds = load_dataset("textdetox/multilingual_paradetox")
toxic_sentences = ds["en"]["toxic_sentence"]  # model inputs
references = ds["en"]["neutral_sentence"]  # neutral rewrites, used as references when scoring
def generate_detox(texts, batch_size=8, prefix="detoxify: "):
    """Generate a detoxified rewrite for every sentence in ``texts``.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        texts: Sequence of raw toxic sentences (without the task prefix).
        batch_size: Number of sentences sent through the model at once.
        prefix: Task prefix prepended to every sentence. The default matches
            the "detoxify: " prefix the training inputs were built with;
            without it the model sees inputs unlike those it was tuned on.

    Returns:
        List of decoded output strings, in the same order as ``texts``.
    """
    model.eval()
    device = model.device  # follow the model instead of assuming "cuda"
    texts = list(texts)    # accept any sequence, e.g. a dataset column
    outputs = []
    for start in range(0, len(texts), batch_size):
        batch = [f"{prefix}{text}" for text in texts[start:start + batch_size]]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
        with torch.no_grad():
            gen = model.generate(**inputs, max_new_tokens=128)
        outputs.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    return outputs

# Run the model over every English toxic sentence loaded above.
generated_outputs = generate_detox(toxic_sentences)


In [None]:

import pandas as pd

# Requires `toxic_sentences` and `generated_outputs` from the generation cell.
# One row per sentence: the toxic input next to the model's rewrite.
df = pd.DataFrame(
    {
        'toxic_sentence': toxic_sentences,
        'generated_output': generated_outputs,
    }
)

# Write the table to the working directory without the index column.
df.to_csv('generated_outputs.csv', index=False)

# Hand the file to the browser as a download through the Colab helper.
from google.colab import files
files.download('generated_outputs.csv')


In [None]:
# Keep a copy of the generated-outputs table on Google Drive.

# The target path is under /content/drive, so Drive has to be mounted first.
df.to_csv('/content/drive/MyDrive/...', index=False)


In [None]:
!pip install bert_score
from bert_score import score

P, R, F1 = score(generated_outputs, references, lang="en", verbose=True)
print(f"Average BERTScore F1: {F1.mean():.4f}")


Generating outputs for GPT-4o scoring

In [None]:
import pandas as pd
from datasets import load_dataset

# Load ParaDetox EN data
ds = load_dataset("textdetox/multilingual_paradetox")
toxic_sentences = ds["en"]["toxic_sentence"]
references = ds["en"]["neutral_sentence"]

# Run the model.
# NOTE(review): this repeats the generation an earlier cell already ran over the same sentences.
generated_outputs = generate_detox(toxic_sentences)  # `generate_detox` is defined in an earlier cell

# Create and save dataframe: input, model output and reference side by side
df = pd.DataFrame({
    "toxic_sentence": toxic_sentences,
    "generated_output": generated_outputs,
    "neutral_reference": references
})

# Save only the first 100 rows to CSV
df.head(100).to_csv("/content/...", index=False)


In [None]:


# Download the exported CSV from the Colab VM to the local machine.
from google.colab import files
files.download("/content/...")


Running the model on the ParaDetox 2024 test set, which is the same as the 2025 test set

In [None]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# NOTE(review): this rebinds `ds`, and `test_en` is not referenced by any later cell
# shown here — confirm that the test-set generation below really uses it.
ds = load_dataset("textdetox/multilingual_paradetox_test")
test_en = ds["en"]

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Directory holding the saved model and tokenizer files.
model_path = "/content/drive/MyDrive/..."

# Load both from the local directory; `local_files_only=True` restricts loading to local files.
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True).to("cuda")




In [None]:
import torch

def generate_detox(texts, batch_size=8, prefix="detoxify: "):
    """Generate a detoxified rewrite for every sentence in ``texts``.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        texts: Sequence of raw toxic sentences (without the task prefix).
        batch_size: Number of sentences sent through the model at once.
        prefix: Task prefix prepended to every sentence. The default matches
            the "detoxify: " prefix the training inputs were built with;
            without it the model sees inputs unlike those it was tuned on.

    Returns:
        List of decoded output strings, in the same order as ``texts``.
    """
    model.eval()
    device = model.device  # follow the model instead of assuming "cuda"
    texts = list(texts)    # accept any sequence, e.g. a dataset column
    outputs = []
    for start in range(0, len(texts), batch_size):
        batch = [f"{prefix}{text}" for text in texts[start:start + batch_size]]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
        with torch.no_grad():
            gen = model.generate(**inputs, max_new_tokens=128)
        outputs.extend(tokenizer.batch_decode(gen, skip_special_tokens=True))
    return outputs

# NOTE(review): `toxic_sentences` was last assigned from the training dataset
# (`ds["en"]` of "textdetox/multilingual_paradetox"), not from `test_en`. If this cell is
# meant to run on the test set, its inputs must be taken from `test_en` — confirm.
generated_outputs = generate_detox(toxic_sentences)


In [None]:


import pandas as pd

# Requires `toxic_sentences`, `generated_outputs` and `references` from earlier cells.
# One row per sentence: toxic input, model rewrite and reference rewrite.
df = pd.DataFrame(
    {
        'toxic_sentence': toxic_sentences,
        'generated_output': generated_outputs,
        "neutral_reference": references,
    }
)

# Write the table to the working directory without the index column.
df.to_csv('generated_outputs_testset.csv', index=False)

# Hand the file to the browser as a download through the Colab helper.
from google.colab import files
files.download('generated_outputs_testset.csv')

In [None]:
from bert_score import score

# BERTScore of the generated rewrites against `references`.
# NOTE(review): `references` was loaded from the training dataset in an earlier cell,
# so the "TEST set" wording in the message may not match the data — confirm.
P, R, F1 = score(generated_outputs, references, lang="en", verbose=True)
print(f" BERTScore F1 on TEST set: {F1.mean():.4f}")


Now computing the J-score on the test set:

In [None]:
!pip install sentence-transformers sacrebleu


In [None]:
import pandas as pd

# Load the final dataset and rename its columns to the names the scoring cells expect.
# `keep_default_na=False` keeps every cell as text: an empty prediction stays "" and
# sentences such as "NA" or "null" are not converted to NaN, which the text scorers
# would fail on.
df = pd.read_csv("/content/...", keep_default_na=False).rename(columns={
    "toxic_sentence": "input",
    "generated_output": "prediction",
    "neutral_reference": "reference"
})


In [None]:
# Mount Google Drive at /content/drive (earlier cells that read or write Drive paths also need this).

from google.colab import drive
drive.mount('/content/drive')


In [None]:
!pip install -q sentence-transformers sacrebleu


In [None]:
import pandas as pd

# Load the generated outputs (adjust the path if needed) and rename the columns to
# standard names for code consistency.
# `keep_default_na=False` keeps every cell as text: an empty prediction stays "" and
# sentences such as "NA" or "null" are not converted to NaN, which the text scorers
# would fail on.
df = pd.read_csv("/content/...", keep_default_na=False).rename(columns={
    "toxic_sentence": "input",
    "generated_output": "prediction",
    "neutral_reference": "reference"
})


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class ToxicityClassifierPipeline:
    """Two-stage toxicity classifier.

    A binary sequence classifier first decides toxic vs. non-toxic. Only when
    the comment is judged toxic is a second, multi-label classifier run to name
    the toxicity subtypes and assemble a human-readable explanation.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both tokenizer/model pairs and move the models to the device.

        Args:
            binary_model_path: Checkpoint of the toxic / non-toxic classifier.
            fine_model_path: Checkpoint of the subtype classifier.
            device: Requested device; "cpu" is used when CUDA is unavailable.
        """
        target = device if torch.cuda.is_available() else "cpu"
        self.device = torch.device(target)

        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        binary_model = AutoModelForSequenceClassification.from_pretrained(binary_model_path)
        self.model_binary = binary_model.to(self.device)

        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        fine_model = AutoModelForSequenceClassification.from_pretrained(fine_model_path)
        self.model_fine = fine_model.to(self.device)

        # Insertion order fixes the label order, which is paired position by
        # position with the subtype model's outputs in `__call__`.
        self.label_to_explanation = {
            "toxic": "This sentence contains general toxic language.",
            "severe_toxic": "This sentence contains extreme hostility or verbal abuse.",
            "obscene": "This sentence contains obscene or vulgar language.",
            "threat": "This sentence contains a threat or implied violence.",
            "insult": "This sentence includes personal insults or demeaning language.",
            "identity_hate": "This sentence attacks someone based on identity (e.g. race, gender, religion)."
        }
        self.label_cols = list(self.label_to_explanation)

    def __call__(self, comment, threshold=0.5):
        """Classify one comment.

        Args:
            comment: Text to classify. The toxic probability is read with
                `.item()`, so a single comment is expected.
            threshold: Cut-off applied to the toxic probability and to every
                subtype probability.

        Returns:
            Dict with keys "binary" ("toxic" / "non-toxic"), "subtypes"
            (label → probability rounded to 2 decimals, or None when
            non-toxic), "toxic_prob" and "explanation" (joined label
            explanations, or None when there are none).
        """
        encoded = self.tokenizer_binary(comment, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            binary_logits = self.model_binary(**encoded).logits
            toxic_prob = torch.softmax(binary_logits, dim=1)[:, 1].item()

        # Stage 1 gate: anything that does not reach the threshold is non-toxic.
        if not (toxic_prob >= threshold):
            return {"binary": "non-toxic", "subtypes": None, "toxic_prob": toxic_prob, "explanation": None}

        # Stage 2: independent (sigmoid) probability for each subtype.
        encoded = self.tokenizer_fine(comment, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            fine_logits = self.model_fine(**encoded).logits
            fine_probs = torch.sigmoid(fine_logits).cpu().numpy()[0]

        subtypes = {}
        for label, prob in zip(self.label_cols, fine_probs):
            if prob >= threshold:
                subtypes[label] = round(float(prob), 2)

        if subtypes:
            explanation = " ".join(self.label_to_explanation[label] for label in subtypes)
        else:
            explanation = None

        return {
            "binary": "toxic",
            "subtypes": subtypes,
            "toxic_prob": toxic_prob,
            "explanation": explanation
        }


In [None]:
# Build the two-stage toxicity classifier from two checkpoints stored on Google Drive.
pipeline = ToxicityClassifierPipeline(
    binary_model_path="/content/drive/MyDrive/..",
    fine_model_path="/content/drive/MyDrive/..."
)


In [None]:
def is_toxic(text):
    """Return True when the module-level `pipeline` labels `text` as "toxic"."""
    return pipeline(text)["binary"] == "toxic"

# STA: 1 when the prediction is judged non-toxic by the classifier, 0 otherwise.
df["STA"] = [int(not is_toxic(pred)) for pred in df["prediction"]]


In [None]:
from sentence_transformers import SentenceTransformer, util

# Multilingual sentence-embedding model used to measure content similarity.
labse = SentenceTransformer("sentence-transformers/LaBSE")

# Embed inputs and predictions in the same row order.
emb_input, emb_pred = (
    labse.encode(df[column].tolist(), convert_to_tensor=True)
    for column in ("input", "prediction")
)

# Entry (i, i) of the cosine matrix compares row i's input with row i's prediction,
# so the diagonal holds the per-row similarity.
similarity_matrix = util.cos_sim(emb_input, emb_pred)
similarities = similarity_matrix.diagonal().tolist()
df["SIM"] = similarities


In [None]:
from sacrebleu.metrics import CHRF

# Sentence-level chrF of each prediction against its reference, divided by 100.
chrf = CHRF()
chrf_scores = []
for pred, ref in zip(df["prediction"], df["reference"]):
    sentence_result = chrf.sentence_score(pred, [ref])
    chrf_scores.append(sentence_result.score / 100)
df["CHRF"] = chrf_scores


In [None]:
import numpy as np

# Joint score (J) as defined for ParaDetox / TextDetox: the per-example PRODUCT of
# STA, SIM and ChrF, averaged over all examples. A plain average of the three would
# still reward an output that fails one criterion outright (e.g. one that stays
# toxic, STA = 0), which the product does not.
df["J-score"] = df["STA"] * df["SIM"] * df["CHRF"]

# Print final average
print(f"✅ J-score (mean over all examples): {df['J-score'].mean():.4f}")


In [None]:
# Save the per-example scores (STA, SIM, CHRF, J-score columns added above) to CSV without the index.
df.to_csv("/content/...", index=False)


In [None]:
# Download the scored results CSV (detox_jscore_results.csv per the original note) to the local machine.

from google.colab import files
files.download('/content/...')
