In [None]:
!pip install -q transformers accelerate bitsandbytes

In [None]:
# Mount Google Drive at /content/drive (Colab only). The model checkpoints and the
# CSV/TSV files used by the later cells are read from and written to paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import pandas as pd

class ToxicityClassifierPipeline:
    """Binary toxicity gate followed by a multi-label classifier that explains the verdict.

    The binary model's class index 1 is read as the "toxic" probability. Only when
    that probability reaches the threshold is the fine-grained model run; each of
    its outputs is passed through a sigmoid and, if it reaches the same threshold,
    contributes one fixed explanation sentence.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both tokenizer/model pairs and move the models to the selected device.

        Args:
            binary_model_path: Checkpoint of the binary toxic / non-toxic classifier.
            fine_model_path: Checkpoint of the fine-grained multi-label classifier.
            device: Requested device; "cpu" is used instead whenever CUDA is unavailable.
        """
        chosen = device if torch.cuda.is_available() else "cpu"
        self.device = torch.device(chosen)

        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        binary_model = AutoModelForSequenceClassification.from_pretrained(binary_model_path)
        self.model_binary = binary_model.to(self.device)

        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        fine_model = AutoModelForSequenceClassification.from_pretrained(fine_model_path)
        self.model_fine = fine_model.to(self.device)

        # One (label, explanation) pair per fine-grained output; __call__ pairs the
        # labels with the model's outputs positionally, so the order matters.
        labelled_explanations = (
            ("toxic", "This sentence contains general toxic language."),
            ("severe_toxic", "This sentence includes severe verbal abuse."),
            ("obscene", "This sentence contains obscene or vulgar language."),
            ("threat", "This sentence includes a threat or incites violence."),
            ("insult", "This sentence includes personal insults."),
            ("identity_hate", "This sentence attacks a person's identity."),
        )
        self.label_cols = [label for label, _ in labelled_explanations]
        self.label_to_explanation = dict(labelled_explanations)

    def __call__(self, comment, threshold=0.5):
        """Classify one comment and, if it is toxic, explain why.

        Args:
            comment: Text to classify.
            threshold: Cut-off applied both to the binary toxic probability and to
                every fine-grained label probability.

        Returns:
            dict with "binary" ("toxic" or "non-toxic") and "explanation" (None for
            non-toxic input; otherwise the space-joined sentences of every label at
            or above the threshold, or a generic sentence when no label qualifies).
        """
        encoded = self.tokenizer_binary(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            binary_logits = self.model_binary(**encoded).logits
            toxic_prob = torch.softmax(binary_logits, dim=1)[:, 1].item()

        is_toxic = toxic_prob >= threshold
        if not is_toxic:
            return {"binary": "non-toxic", "explanation": None}

        encoded = self.tokenizer_fine(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            fine_logits = self.model_fine(**encoded).logits
            fine_probs = torch.sigmoid(fine_logits).cpu().numpy()[0]

        sentences = []
        for label, prob in zip(self.label_cols, fine_probs):
            if prob >= threshold:
                sentences.append(self.label_to_explanation[label])

        if sentences:
            explanation = " ".join(sentences)
        else:
            explanation = "This sentence contains toxic language."

        return {"binary": "toxic", "explanation": explanation}

# Instantiate the two-stage classifier from the checkpoints stored on Drive.
# The resulting `pipeline` object is called once per sentence by the generation loops below.
pipeline = ToxicityClassifierPipeline(
    binary_model_path="/content/drive/MyDrive/...",
    fine_model_path="/content/drive/MyDrive/..."
)


In [None]:
from transformers import MT5ForConditionalGeneration, AutoTokenizer

model_path = "/content/drive/MyDrive/..."

# local_files_only=True: load strictly from the Drive checkpoint, never from the Hub.
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = MT5ForConditionalGeneration.from_pretrained(model_path, local_files_only=True)
# Use the GPU when one is present and fall back to CPU otherwise, mirroring the
# device selection in ToxicityClassifierPipeline instead of hard-coding "cuda".
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

print("Model and tokenizer loaded successfully")


Original prompt (did not work well):

In [None]:
def build_mt0_prompt(toxic_sentence, explanation):
    """Build the verbose instruction prompt for the detoxification model.

    Args:
        toxic_sentence: Sentence to be rewritten.
        explanation: Text describing why the sentence is considered toxic.

    Returns:
        A four-line prompt: instruction, "Toxic: ...", "Explanation: ..." and a
        trailing "Detoxified:" cue with nothing after it.
    """
    instruction = (
        "Detoxify the following sentence while preserving its meaning. "
        "The explanation below describes why the sentence is considered toxic:"
    )
    prompt_lines = [
        instruction,
        f"Toxic: {toxic_sentence}",
        f"Explanation: {explanation}",
        "Detoxified:",
    ]
    return "\n".join(prompt_lines)

def generate_mt0_detox(prompt, model, tokenizer):
    """Sample one detoxified rewrite for `prompt` from a seq2seq model.

    Args:
        prompt: Full prompt text, e.g. the output of build_mt0_prompt.
        model: Generation model exposing `eval()`, `generate()` and `device`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        The decoded text with surrounding whitespace stripped. If the model echoed
        the prompt, only the part after the last "Detoxified:" marker is returned.

    Note:
        Decoding uses sampling (do_sample=True), so repeated calls can return
        different text unless the caller seeds the random number generator.
    """
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)
    # Follow the model's own device rather than assuming CUDA, so this also works
    # for a model that lives on CPU or on a non-default GPU.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # str.split returns the whole string when the marker is absent, so taking the
    # last piece covers both the echoed-prompt case and the plain-output case.
    return decoded.split("Detoxified:")[-1].strip()


Aligned prompt format:

In [None]:
def build_stage1_aligned_prompt(toxic_sentence, stage1_labels):
    """Build the compact "detoxify [tags]: sentence" prompt.

    Args:
        toxic_sentence: Sentence to be rewritten.
        stage1_labels: Iterable of label names from the classifier; may be empty.

    Returns:
        "detoxify [a, b]: <sentence>" when labels are given, otherwise
        "detoxify: <sentence>".
    """
    if not stage1_labels:
        return f"detoxify: {toxic_sentence}"

    joined_tags = ", ".join(stage1_labels)
    return f"detoxify [{joined_tags}]: {toxic_sentence}"


def generate_mt0_detox(prompt, model, tokenizer):
    """Sample one detoxified rewrite for `prompt` from a seq2seq model.

    Args:
        prompt: Full prompt text, e.g. the output of build_stage1_aligned_prompt.
        model: Generation model exposing `eval()`, `generate()` and `device`.
        tokenizer: Tokenizer matching `model`.

    Returns:
        The decoded first sequence with surrounding whitespace stripped.

    Note:
        Decoding uses sampling (do_sample=True), so repeated calls can return
        different text unless the caller seeds the random number generator.
    """
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)
    # Follow the model's own device rather than assuming CUDA, so this also works
    # for a model that lives on CPU or on a non-default GPU.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=60,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()


Original generation loop

In [None]:
# Read the test split; it must provide a "toxic_sentence" column.
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/...")

# One generated rewrite per input row, in row order.
outputs_mt0_stage1 = []

for text in df["toxic_sentence"]:
    stage1 = pipeline(text)
    # The classifier may return no explanation; substitute a generic one so the
    # verbose prompt always has an "Explanation:" line with content.
    explanation = stage1["explanation"] or "This sentence contains toxic language."
    detox = generate_mt0_detox(build_mt0_prompt(text, explanation), model, tokenizer)
    outputs_mt0_stage1.append(detox)

# Attach the rewrites as a new column and persist the frame.
df["mt0_base_output_stage1"] = outputs_mt0_stage1
df.to_csv("/content/drive/MyDrive/...", index=False)

print("Saved to: mt0_base_retrain_output_with_stage1_vol2.csv")


Updated generation loop

In [None]:
# Read the test split; it must provide a "toxic_sentence" column.
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/...")

# One generated rewrite per input row, in row order.
outputs_mt0_stage1 = []

for text in df["toxic_sentence"]:
    stage1 = pipeline(text)
    explanation = stage1["explanation"]

    # Recover label names by looking for each label's fixed sentence inside the
    # explanation text; an empty/None explanation yields no tags at all.
    tags = [
        label
        for label, phrase in pipeline.label_to_explanation.items()
        if explanation and phrase in explanation
    ]

    prompt = build_stage1_aligned_prompt(text, tags)
    outputs_mt0_stage1.append(generate_mt0_detox(prompt, model, tokenizer))

# Attach the rewrites as a new column and persist the frame.
df["mt0_base_output_stage1"] = outputs_mt0_stage1
df.to_csv("/content/drive/MyDrive/...", index=False)
print("Saved with aligned prompt format.")



In [None]:
# Send the file at this Drive path to the browser as a download (Colab only).
from google.colab import files
files.download('/content/drive/MyDrive/...')


Modified version of the generation loop, used only for the PAN 2024 leaderboard submission

In [None]:
import pandas as pd
from tqdm import tqdm

# Official test set: tab-separated, with "toxic_sentence" and "lang" columns.
df = pd.read_csv("/content/drive/MyDrive/...", sep="\t")

# One output sentence per row, in row order.
neutral_sentences = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    toxic = row['toxic_sentence']

    # Only English rows are rewritten; every other language is passed through unchanged.
    if row['lang'] != 'en':
        neutral_sentences.append(toxic)
        continue

    # Classify, then recover label names by matching each label's fixed
    # sentence inside the explanation (no explanation -> no tags).
    stage1 = pipeline(toxic)
    explanation = stage1["explanation"]
    tags = [
        label
        for label, phrase in pipeline.label_to_explanation.items()
        if explanation and phrase in explanation
    ]

    # Build the aligned prompt and generate the rewrite.
    prompt = build_stage1_aligned_prompt(toxic, tags)
    neutral_sentences.append(generate_mt0_detox(prompt, model, tokenizer))

# Attach the outputs under the column name used for the submission.
df['neutral_sentence'] = neutral_sentences

# Write the submission as TSV.
output_path = "/content/drive/MyDrive/..."
df.to_csv(output_path, sep="\t", index=False)

print(f"Saved final submission to {output_path}")


In [None]:
!zip submission.zip /content/drive/MyDrive/submission.tsv


In [None]:
# Send the file at this Drive path to the browser as a download (Colab only).
from google.colab import files
files.download('/content/drive/MyDrive/...')

In [None]:
# Download the archive created by the zip cell from the notebook's working directory.
from google.colab import files
files.download("submission.zip")


In [None]:
import pandas as pd

# Load current version
df = pd.read_csv("/content/drive/MyDrive/...", sep="\t")

# Keep only required columns
df_clean = df[['toxic_sentence', 'neutral_sentence']]

# Save cleaned version
df_clean.to_csv("submission_clean.tsv", sep="\t", index=False)

# Zip it
!zip submission.zip submission_clean.tsv


In [None]:
# Download the cleaned two-column TSV from the notebook's working directory.
from google.colab import files
files.download("submission_clean.tsv")


Modified version of the generation loop, used only for the PAN 2025 leaderboard submission

In [None]:
import pandas as pd
from tqdm import tqdm

# Load the new PAN 2025 test file
df = pd.read_csv("/content/drive/MyDrive/thesis files/...", sep="\t")

# Initialize output list
neutral_sentences = []

# Loop through all rows
for idx, row in tqdm(df.iterrows(), total=len(df)):
    toxic = row['toxic_sentence']
    lang = row['lang']

    if lang == 'en':
        # Run stage 1 classifier
        stage1 = pipeline(toxic)
        explanation = stage1.get("explanation", "")
        tags = []

        if explanation:
            for label, phrase in pipeline.label_to_explanation.items():
                if phrase in explanation:
                    tags.append(label)

        # Build prompt + detoxify
        prompt = build_stage1_aligned_prompt(toxic, tags)
        detox = generate_mt0_detox(prompt, model, tokenizer)
        neutral_sentences.append(detox)
    else:
        # For other languages, keep the toxic sentence
        neutral_sentences.append(toxic)

# Assign back to DataFrame
df['neutral_sentence'] = neutral_sentences

# Save submission file
df = df[['toxic_sentence', 'neutral_sentence', 'lang']]
df.to_csv("/content/drive/MyDrive/...", sep="\t", index=False)

# Zip for Codalab submission
!zip /content/drive/MyDrive/... /content/drive/MyDrive/...

print("Submission file saved and zipped for PAN 2025.")


In [None]:
# Download "submission_clean.tsv" from the notebook's working directory.
# NOTE(review): in this notebook that file is written by the PAN 2024 clean-up cell;
# the PAN 2025 cell above saves to a Drive path instead — confirm this is the intended file.
from google.colab import files
files.download("submission_clean.tsv")

Evaluation:

In [None]:
import pandas as pd

# Load the generated outputs for evaluation. The BERTScore cell below reads the
# "mt0_base_output_stage1" and "neutral_reference" columns from this frame.
df = pd.read_csv("/content/drive/MyDrive/...")

In [None]:
!pip install bert-score evaluate sacrebleu
from bert_score import score as bert_score

P, R, F1 = bert_score(
    cands=df["mt0_base_output_stage1"].tolist(),
    refs=df["neutral_reference"].tolist(),
    lang="en"
)

print(f"BERTScore F1: {F1.mean():.4f}")

In [None]:
!pip install sentence-transformers sacrebleu

In [None]:
import pandas as pd

# Short, generic column names used by the metric cells that follow.
standard_names = {
    "toxic_sentence": "input",
    "mt0_base_output_stage1": "prediction",
    "neutral_reference": "reference",
}

df = pd.read_csv("/content/drive/MyDrive/....")  # adjust path if needed
df = df.rename(columns=standard_names)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class ToxicityClassifierPipeline:
    """Binary toxicity gate followed by a multi-label subtype classifier.

    The binary model's class index 1 is read as the "toxic" probability. Only when
    that probability reaches the threshold is the fine-grained model run; its
    outputs are passed through a sigmoid and every label at or above the threshold
    is reported together with its rounded probability and a fixed explanation.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both tokenizer/model pairs and move the models to the selected device.

        Args:
            binary_model_path: Checkpoint of the binary toxic / non-toxic classifier.
            fine_model_path: Checkpoint of the fine-grained multi-label classifier.
            device: Requested device; "cpu" is used instead whenever CUDA is unavailable.
        """
        chosen = device if torch.cuda.is_available() else "cpu"
        self.device = torch.device(chosen)

        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        binary_model = AutoModelForSequenceClassification.from_pretrained(binary_model_path)
        self.model_binary = binary_model.to(self.device)

        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        fine_model = AutoModelForSequenceClassification.from_pretrained(fine_model_path)
        self.model_fine = fine_model.to(self.device)

        # One (label, explanation) pair per fine-grained output; __call__ pairs the
        # labels with the model's outputs positionally, so the order matters.
        labelled_explanations = (
            ("toxic", "This sentence contains general toxic language."),
            ("severe_toxic", "This sentence contains extreme hostility or verbal abuse."),
            ("obscene", "This sentence contains obscene or vulgar language."),
            ("threat", "This sentence contains a threat or implied violence."),
            ("insult", "This sentence includes personal insults or demeaning language."),
            ("identity_hate", "This sentence attacks someone based on identity (e.g. race, gender, religion)."),
        )
        self.label_cols = [label for label, _ in labelled_explanations]
        self.label_to_explanation = dict(labelled_explanations)

    def __call__(self, comment, threshold=0.5):
        """Classify one comment and report its toxic subtypes.

        Args:
            comment: Text to classify.
            threshold: Cut-off applied both to the binary toxic probability and to
                every fine-grained label probability.

        Returns:
            dict with
              "binary":      "toxic" or "non-toxic";
              "subtypes":    None for non-toxic input, else {label: probability
                             rounded to 2 decimals} for labels at/above threshold;
              "toxic_prob":  unrounded probability of class index 1 of the binary model;
              "explanation": space-joined sentences for the reported subtypes, or
                             None when there are none.
        """
        encoded = self.tokenizer_binary(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            binary_logits = self.model_binary(**encoded).logits
            toxic_prob = torch.softmax(binary_logits, dim=1)[:, 1].item()

        is_toxic = toxic_prob >= threshold
        if not is_toxic:
            return {
                "binary": "non-toxic",
                "subtypes": None,
                "toxic_prob": toxic_prob,
                "explanation": None,
            }

        encoded = self.tokenizer_fine(
            comment, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        with torch.no_grad():
            fine_logits = self.model_fine(**encoded).logits
            fine_probs = torch.sigmoid(fine_logits).cpu().numpy()[0]

        # Insertion order follows label_cols, which fixes the sentence order below.
        subtypes = {}
        for label, prob in zip(self.label_cols, fine_probs):
            if prob >= threshold:
                subtypes[label] = round(float(prob), 2)

        sentences = [self.label_to_explanation[label] for label in subtypes]
        explanation = " ".join(sentences) if sentences else None

        return {
            "binary": "toxic",
            "subtypes": subtypes,
            "toxic_prob": toxic_prob,
            "explanation": explanation,
        }

In [None]:
# Rebuild `pipeline` from the class defined in the evaluation section, so the
# metric cells below use that definition of ToxicityClassifierPipeline.
pipeline = ToxicityClassifierPipeline(
    binary_model_path="/content/drive/MyDrive/...",
    fine_model_path="/content/drive/MyDrive/..."
)

In [None]:
def is_toxic(text):
    """Return True when the global `pipeline` labels `text` as "toxic"."""
    return pipeline(text)["binary"] == "toxic"

# STA (style transfer accuracy) per row: 1 when the prediction is no longer
# classified as toxic, 0 when it still is.
df["STA"] = [int(not is_toxic(pred)) for pred in df["prediction"]]

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load multilingual sentence similarity model
labse = SentenceTransformer("sentence-transformers/LaBSE")

# Embed the inputs and the predictions; row i of each tensor belongs to row i of df.
emb_input = labse.encode(df["input"].tolist(), convert_to_tensor=True)
emb_pred = labse.encode(df["prediction"].tolist(), convert_to_tensor=True)

# Row-wise cosine similarity between each input and its own prediction. This gives the
# same values as the diagonal of util.cos_sim(emb_input, emb_pred) without building
# the full N x N matrix, whose memory grows quadratically with the number of rows.
similarities = util.pairwise_cos_sim(emb_input, emb_pred).tolist()
df["SIM"] = similarities

In [None]:
from sacrebleu.metrics import CHRF

# Sentence-level chrF of each prediction against its single reference,
# divided by 100 so the value is on the same 0-1 scale as STA and SIM.
chrf = CHRF()
chrf_scores = []
for pred, ref in zip(df["prediction"], df["reference"]):
    sentence_result = chrf.sentence_score(pred, [ref])
    chrf_scores.append(sentence_result.score / 100)
df["CHRF"] = chrf_scores

In [None]:
import numpy as np

# Per-example J-score, computed here as the arithmetic mean of STA, SIM and CHRF.
# NOTE(review): the PAN/TextDetox joint score is, to my knowledge, the per-example
# PRODUCT STA * SIM * CHRF averaged over examples — confirm that the mean is intended
# before comparing these numbers with leaderboard values.
df["J-score"] = (df["STA"] + df["SIM"] + df["CHRF"]) / 3

# Print the mean of the per-example scores, formatted to four decimals.
print(f"✅ J-score (mean over all examples): {df['J-score'].mean():.4f}")

In [None]:
# Persist the evaluation frame, including the metric columns added above, without the index.
df.to_csv("/content/...", index=False)

In [None]:
# Send the file at this path to the browser as a download (Colab only).
from google.colab import files
files.download('/content/...')

In [None]:
import numpy as np

# Recompute the per-example J-score so this cell does not rely on the earlier J-score cell.
df["J-score"] = (df["STA"] + df["SIM"] + df["CHRF"]) / 3

# Corpus-level means. F1 is the BERTScore F1 tensor produced by the BERTScore cell.
bert_f1 = F1.mean().item()
chrf_score, sta_score, sim_score, j_score = (
    df[column].mean() for column in ("CHRF", "STA", "SIM", "J-score")
)

# (metric name, value) pairs in the order they appear in the exported table.
metric_means = [
    ("BERTScore F1", bert_f1),
    ("STA", sta_score),
    ("SIM", sim_score),
    ("CHRF", chrf_score),
    ("Average J-score", j_score),
]
average_scores = {
    "Metric": [name for name, _ in metric_means],
    "Score": [value for _, value in metric_means],
}

# Two-column summary table, exported without the index.
summary_df = pd.DataFrame(average_scores)
summary_df.to_csv("/content/drive/MyDrive/...", index=False)

In [None]:
from google.colab import files
# Download the exported file from Drive. The closing quote now matches the opening
# one; the mismatched pair ('...") made this cell fail with a SyntaxError.
files.download('/content/drive/MyDrive/...')