In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste an access token into the notebook itself: saved cells get shared
# and committed. Read it from the HF_TOKEN environment variable, and fall back
# to a hidden interactive prompt when that variable is not set.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))


In [None]:
!pip install -q transformers accelerate bitsandbytes

In [None]:


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import BitsAndBytesConfig

model_id = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Request 8-bit weights through an explicit quantization config: passing
# `load_in_8bit=True` straight to `from_pretrained` is deprecated in recent
# transformers releases.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),  # saves memory on Colab
)


In [None]:
def format_prompt(toxic_sentence):
    """Wrap a sentence in the Mistral-instruct rewrite instruction.

    The text deliberately does not start with a literal ``<s>``. The prompt is
    later tokenized with the tokenizer's default settings, and the Mistral
    tokenizer prepends the beginning-of-sequence token by itself; spelling it
    out in the text as well would feed the model two of them.

    Args:
        toxic_sentence: Sentence to be rewritten; it is placed inside double
            quotes after the instruction.

    Returns:
        The ``[INST] ... [/INST]`` prompt string.
    """
    return (
        "[INST] Rewrite the following sentence in 1 short sentence to make it "
        "polite and non-toxic while keeping its meaning:\n"
        f'"{toxic_sentence}" [/INST]'
    )

def detoxify(text):
    """Generate a polite rewrite of `text` with the prompt-only Mistral setup.

    Uses the module-level `tokenizer` and `model`, and `format_prompt` to build
    the instruction.

    Args:
        text: Sentence to rewrite.

    Returns:
        The newly generated text only, stripped of surrounding whitespace. The
        instruction prompt is not echoed back.
    """
    prompt = format_prompt(text)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
            # Same padding choice as the explanation-guided generator below in
            # this notebook; makes the pad id explicit instead of implied.
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Decoding all of it
    # would put the "[INST] ... [/INST]" instruction into the result, so keep
    # only what was generated after the prompt.
    generated_ids = output[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


In [None]:
# Sample sentences used to sanity-check the prompt-only rewriter.
toxic_examples = [
    "You're such a pathetic loser.",
    "No one cares about your stupid opinions.",
    "Shut up, you're a worthless idiot.",
]

# Print each original sentence (numbered from 1) followed by its rewrite.
for number, sentence in enumerate(toxic_examples, start=1):
    print(f"\n{number}. Original: {sentence}")
    rewritten = detoxify(sentence)
    print(f"   Detoxified: {rewritten}")


In [None]:
# One more example, pushed through the same numbered print-and-rewrite loop.
toxic_examples = ["don't do abortion close your legs."]

for index in range(len(toxic_examples)):
    original = toxic_examples[index]
    print(f"\n{index + 1}. Original: {original}")
    print(f"   Detoxified: {detoxify(original)}")

Full Pipeline: Stage 1 + Prompting Mistral

In [None]:
!pip install -q transformers accelerate bitsandbytes


🔹 Step 2: Import Libraries and Load Stage 1 Classifier

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class ToxicityClassifierPipeline:
    """Stage 1 toxicity classifier: a binary gate followed by fine-grained labels.

    A comment is first scored by a binary sequence classifier. Only when that
    score reaches the threshold is a second, multi-label classifier run to
    score the six subtypes in ``label_cols`` and to assemble a plain-English
    explanation from ``label_to_explanation``.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both classifiers and their tokenizers.

        Args:
            binary_model_path: Location of the binary toxic/non-toxic model,
                passed to ``from_pretrained``.
            fine_model_path: Location of the multi-label subtype model, passed
                to ``from_pretrained``.
            device: Device to use when CUDA is available; otherwise the CPU is
                used regardless of this value.
        """
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")

        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        self.model_binary = AutoModelForSequenceClassification.from_pretrained(binary_model_path).to(self.device)

        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        self.model_fine = AutoModelForSequenceClassification.from_pretrained(fine_model_path).to(self.device)

        # Order matters: position i of the fine model's output is read as label i.
        self.label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        self.label_to_explanation = {
            "toxic": "This sentence contains general toxic language.",
            "severe_toxic": "This sentence includes severe verbal abuse.",
            "obscene": "This sentence contains obscene or vulgar language.",
            "threat": "This sentence includes a threat or incites violence.",
            "insult": "This sentence includes personal insults.",
            "identity_hate": "This sentence attacks a person's identity."
        }

    def _toxic_probability(self, comment):
        """Return the binary model's probability that `comment` is toxic."""
        inputs = self.tokenizer_binary(comment, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.model_binary(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            # Column 1 of the softmax is taken as the "toxic" class.
            return probs[:, 1].item()

    def _subtype_probabilities(self, comment):
        """Return the fine model's independent (sigmoid) score for each subtype."""
        inputs = self.tokenizer_fine(comment, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            outputs = self.model_fine(**inputs)
            return torch.sigmoid(outputs.logits).cpu().numpy()[0]

    def __call__(self, comment, threshold=0.5):
        """Classify one comment.

        Args:
            comment: Text to classify.
            threshold: Cut-off applied both to the binary toxic probability and
                to each subtype score when choosing explanation sentences.

        Returns:
            A dict that always has the same four keys:
              "binary": "toxic" or "non-toxic".
              "subtypes": mapping of subtype label to score; empty when the
                comment is non-toxic, because the fine model is not run.
              "toxic_prob": the binary model's toxic probability.
              "explanation": explanation text, or None when non-toxic.
        """
        toxic_prob = self._toxic_probability(comment)
        binary_pred = "toxic" if toxic_prob >= threshold else "non-toxic"

        if binary_pred == "non-toxic":
            # Same key set as the toxic branch, so callers can index
            # result["subtypes"] without checking the verdict first.
            return {
                "binary": binary_pred,
                "subtypes": {},
                "toxic_prob": toxic_prob,
                "explanation": None,
            }

        fine_probs = self._subtype_probabilities(comment)
        subtypes = {label: float(prob) for label, prob in zip(self.label_cols, fine_probs)}

        # One explanation sentence per subtype whose score reaches the threshold.
        active_labels = [
            self.label_to_explanation[label]
            for label, prob in zip(self.label_cols, fine_probs)
            if prob >= threshold
        ]
        # The binary model can flag a comment while no subtype clears the
        # threshold; fall back to a generic sentence in that case.
        explanation = " ".join(active_labels) if active_labels else "This sentence was flagged as toxic."

        return {
            "binary": binary_pred,
            "subtypes": subtypes,
            "toxic_prob": toxic_prob,
            "explanation": explanation
        }


🔹 Step 3: Load Stage 1 Model

In [None]:
# Mount Google Drive at /content/drive (Colab-only API). Later cells read the
# classifier checkpoints and CSV files from paths under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Checkpoint locations on the mounted Drive; point these at your own models.
binary_model_path = "/content/drive/MyDrive/..."
fine_model_path = "/content/drive/MyDrive/..."

# NOTE: the first cell of this notebook imports `pipeline` from transformers.
# From here on the name refers to the Stage 1 classifier instead.
pipeline = ToxicityClassifierPipeline(binary_model_path, fine_model_path)


🔹 Step 4: Load Mistral-7B-Instruct for Detoxification

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Request 8-bit weights through an explicit quantization config: passing
# `load_in_8bit=True` straight to `from_pretrained` is deprecated in recent
# transformers releases.
# NOTE(review): the first section of this notebook loads the same checkpoint
# into `model`; running both sections in one session keeps two copies in memory.
mistral_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")


🔹 Step 5: Prompt Function + Generation

In [None]:
def build_prompt(toxic_sentence, explanation):
    """Build the instruction prompt for an explanation-guided rewrite.

    Args:
        toxic_sentence: Sentence to be rewritten.
        explanation: Text describing why the sentence is considered toxic.

    Returns:
        A single string made of four newline-separated parts: the rewrite
        instruction (opened with ``[INST]``), the sentence, an
        ``Explanation: ...`` line, and the closing ``[/INST]`` marker.
    """
    instruction = " ".join([
        "[INST] Rewrite the following sentence to make it polite and non-toxic while keeping its meaning.",
        "Keep your response short and efficient — one sentence only.",
        "The explanation below shows why the sentence is considered toxic:",
    ])
    prompt_lines = [
        instruction,
        f"{toxic_sentence}",
        f"Explanation: {explanation}",
        "[/INST]",
    ]
    return "\n".join(prompt_lines)

def generate_detoxified(prompt, model, tokenizer, max_new_tokens=40):
    """Sample a rewrite for `prompt` and return only the newly generated text.

    Args:
        prompt: Full instruction prompt, e.g. as produced by `build_prompt`.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching `model`; its ``eos_token_id`` is reused
            as the padding id.
        max_new_tokens: Upper bound on the number of generated tokens. The
            default of 40 keeps the output to roughly one short sentence.

    Returns:
        The generated continuation with special tokens removed and surrounding
        whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.5,  # stronger constraint against repeated phrases
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Cut them off by
    # length instead of splitting the decoded text on "[/INST]": the split gives
    # the wrong result when the tokenizer treats the marker as a special token
    # (dropped by skip_special_tokens) or when the model writes the marker itself.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()




🔹 Step 6: Run on a Sample Toxic Sentence

In [None]:
# End-to-end check on one sentence: classify it, then rewrite it if flagged.
toxic_sentence = "fuck you."
result = pipeline(toxic_sentence)

if result["binary"] != "toxic":
    print("Sentence is not toxic — no detox needed.")
else:
    # Generate first, so the report below is printed in one piece.
    prompt = build_prompt(toxic_sentence, result["explanation"])
    detoxified = generate_detoxified(prompt, mistral_model, mistral_tokenizer)

    print("🧪 Stage 1 Classification:")
    print(f"Binary: {result['binary']}")
    print("Subtype scores:")
    subtype_scores = result.get("subtypes")
    if subtype_scores:
        for label, subtype_score in subtype_scores.items():
            print(f"  {label}: {subtype_score}")
    else:
        print("  None")

    print(f"\nExplanation: {result['explanation']}")
    print("\n✅ Detoxified Output:")
    print(detoxified)


Running the pipeline on a dataset:

In [None]:
# pandas is needed here, but the notebook only imports it in a later cell;
# without this import a fresh Restart & Run All stops with a NameError.
import pandas as pd

# Load toxic test set (must provide a "toxic_sentence" column)
df = pd.read_csv("/content/...")

# One record per input sentence; turned into a DataFrame after the loop
results = []

for sentence in df["toxic_sentence"]:
    classification = pipeline(sentence)

    if classification["binary"] == "toxic":
        prompt = build_prompt(sentence, classification["explanation"])
        detoxified = generate_detoxified(prompt, mistral_model, mistral_tokenizer)
    else:
        detoxified = sentence  # Keep unchanged if not toxic

    results.append({
        "toxic_sentence": sentence,
        "binary_label": classification["binary"],
        # Absent for non-toxic sentences, hence the default
        "subtypes": classification.get("subtypes", {}),
        "explanation": classification["explanation"],
        "detoxified_output": detoxified
    })

# Convert to dataframe
df_out = pd.DataFrame(results)
df_out.to_csv("/content/...", index=False)

print("Results saved to /content/...")


In [None]:


# Download the file at this path from the Colab runtime (Colab-only helper).
from google.colab import files
files.download("/content/...")


✅ Plan: Run Full System on ParaDetox Test Set
We’ll do:

✅ Load the official test set

✅ Run each toxic sentence through Stage 1 (classifier)

If toxic, create a prompt with explanation → run Mistral

If non-toxic, keep original

✅ Save everything (input, explanation, output) to CSV

✅ Evaluate scores (BERTScore, J-score, toxicity, etc.)



In [None]:
import pandas as pd

# Test-set CSV; the loop below reads its "toxic_sentence" column.
df = pd.read_csv("/content/drive/MyDrive/...")

# Instruction line shared by every prompt in this run. Its wording differs
# slightly from build_prompt(), so the prompt is assembled here by hand.
instruction = (
    "[INST] Rewrite the following sentence in one sentence to make it polite and "
    "non-toxic while keeping its meaning. The explanation highlights why the "
    "sentence is considered toxic:"
)

mistral_outputs = []
for toxic in df["toxic_sentence"]:
    # Stage 1: classification + explanation. Non-toxic verdicts carry no
    # explanation, so a placeholder is substituted.
    # NOTE(review): every sentence is rewritten here, including ones Stage 1
    # calls non-toxic, whereas the plan above says those are kept unchanged.
    # Confirm which behavior is intended.
    stage1 = pipeline(toxic)
    explanation = stage1["explanation"] or "No explanation available."

    prompt = "\n".join([instruction, f"{toxic}", f"Explanation: {explanation}", "[/INST]"])

    # Stage 2: Mistral generation, keeping only the text after the last [/INST].
    output = generate_detoxified(prompt, mistral_model, mistral_tokenizer)
    mistral_outputs.append(output.split("[/INST]")[-1].strip())

# Attach the rewrites as a new column and save.
df["mistral_output"] = mistral_outputs
df.to_csv("/content/drive/MyDrive/...", index=False)
print("Done. Saved as mistral_generated_outputs_testset.csv")


Evaluate the Mistral-generated outputs using:

✅ BERTScore – semantic similarity to gold standard (neutral_reference)

✅ J-score – combines style transfer accuracy, semantic similarity, and CHRF

In [None]:
!pip install -q bert_score sentence-transformers sacrebleu


In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/...")

# A generation that came back empty was written to the CSV as an empty field,
# which pandas reads back as NaN (a float). The scoring cells below pass this
# column to a tokenizer and to the metric functions, so restore empty strings.
df["mistral_output"] = df["mistral_output"].fillna("")

toxic_sentences = df["toxic_sentence"].tolist()
references = df["neutral_reference"].tolist()



In [None]:
from bert_score import score

# Candidates are the Mistral rewrites; references are the gold neutral sentences.
candidates = df["mistral_output"].tolist()
gold_references = df["neutral_reference"].tolist()
P, R, F1 = score(candidates, gold_references, lang="en", verbose=True)

# Keep the per-sentence F1 on each row and report the average over the set.
df["bertscore_f1"] = F1.tolist()
average_f1 = F1.mean()
print(f"Avg BERTScore F1: {average_f1:.4f}")


In [None]:
from sacrebleu.metrics import CHRF

chrf = CHRF()
sta = []  # style transfer accuracy: 1 when Stage 1 no longer flags the rewrite
chrf_scores = []

# Walk the rewrites and their gold references in lockstep. Pairing the columns
# directly avoids looking the reference up through the length of `sta`.
for detox, reference in zip(df["mistral_output"], df["neutral_reference"]):
    # Check if detox is non-toxic using your classifier
    result = pipeline(detox)
    sta.append(0 if result["binary"] == "toxic" else 1)

    # Sentence-level chrF against the single reference, rescaled from 0-100 to 0-1
    chrf_scores.append(chrf.sentence_score(detox, [reference]).score / 100)

df["STA"] = sta
df["CHRF"] = chrf_scores
# NOTE(review): this is the arithmetic mean of the three components. Published
# detoxification benchmarks usually report J as the per-sentence product
# (STA * similarity * fluency/chrF) — confirm which definition is intended
# before comparing against external numbers.
df["J_score"] = (df["STA"] + df["bertscore_f1"] + df["CHRF"]) / 3

# Final score
print(f"Avg J-score: {df['J_score'].mean():.4f}")


In [None]:
# Write the frame, now including the metric columns, back to Drive.
scores_csv_path = "/content/drive/MyDrive/..."
df.to_csv(scores_csv_path, index=False)
print("Saved: mistral_evaluation_with_scores.csv")
