In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never paste the access token into the notebook source: it ends up in saved
# copies and version control. Read it from the HF_TOKEN environment variable
# when set, otherwise ask for it interactively without echoing it.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
!pip install -q transformers accelerate bitsandbytes

In [None]:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load the Mistral instruct model and its tokenizer.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 8-bit loading is requested through a BitsAndBytesConfig object; passing
# `load_in_8bit=True` straight to from_pretrained is deprecated.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

Create a CSV that contains only the toxic sentences and their gold-standard references (the previously generated output column is dropped).

In [None]:
import pandas as pd

# Read the dataset and discard the column holding the mt0-base generations
# in one chained expression.
df = pd.read_csv("/content/drive/MyDrive/...").drop("generated_output", axis=1)

# Write the trimmed table back out (row index omitted) and report it.
df.to_csv("/content/drive/MyDrive/...", index=False)
print("Cleaned and saved:generated_outputs_cleaned.csv")


In [None]:
import pandas as pd

# Reload the table and pull the two text columns out as plain Python lists.
df = pd.read_csv("/content/drive/MyDrive/...")
toxic_sentences, references = (
    df[column].tolist() for column in ("toxic_sentence", "neutral_reference")
)


In [None]:
def build_simple_prompt(toxic_sentence):
    """Wrap a sentence in the [INST] ... [/INST] detoxification instruction.

    The sentence is placed in double quotes on its own line after the
    instruction text.
    """
    instruction = (
        "Rewrite the following sentence to make it polite and non-toxic "
        "while keeping its meaning:"
    )
    return f'[INST] {instruction}\n"{toxic_sentence}" [/INST]'

def generate_detoxified(
    prompt,
    model,
    tokenizer,
    max_new_tokens=60,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.5,
):
    """Sample a continuation for ``prompt`` and return only the generated text.

    Args:
        prompt: Full prompt string handed to the tokenizer.
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also exposes ``decode`` and
            ``eos_token_id``.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        repetition_penalty: Repetition penalty forwarded to ``generate``.

    Returns:
        The decoded continuation with surrounding whitespace stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            # No dedicated pad token is configured here, so EOS is reused.
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Drop them by
    # position instead of splitting the decoded text on "[/INST]": a text
    # split breaks when the marker occurs inside the sentence or the
    # generation, or when the tokenizer strips it as a special token.
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()


In [None]:
# Build the prompt for every toxic sentence and generate its rewrite, in order.
outputs_no_stage1 = [
    generate_detoxified(build_simple_prompt(sentence), model, tokenizer)
    for sentence in toxic_sentences
]

# Store the generations next to their inputs and write the table to Drive.
df["mistral_no_stage1_output"] = outputs_no_stage1
df.to_csv("/content/drive/MyDrive/...", index=False)

print("Saved to: mistral_no_stage1_outputs.csv")


Evaluate the Mistral detoxification outputs (generated without Stage 1) using:

✅ BERTScore: Semantic similarity to reference

✅ CHRF: Surface n-gram overlap

✅ STA: Style transfer accuracy, judged by the Stage 1 toxicity classifier

✅ J-score: Arithmetic mean of the three metrics above

In [None]:
!pip install -q bert_score sentence-transformers sacrebleu


In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/...")
print(df.columns)

# The evaluation cells index these columns directly, so fail here with a
# clear message instead of partway through scoring.
required_columns = {"toxic_sentence", "neutral_reference", "mistral_no_stage1_output"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(f"Evaluation file is missing columns: {sorted(missing_columns)}")


In [None]:
# bert_score is installed by the dedicated install cell earlier in the
# notebook, so the install is not repeated here.
from bert_score import score

# The frame was read back from CSV, where an empty generation is parsed as
# NaN (a float). BERTScore needs strings, so blank out missing values and
# coerce both columns to str before scoring.
candidates = df["mistral_no_stage1_output"].fillna("").astype(str).tolist()
gold_references = df["neutral_reference"].fillna("").astype(str).tolist()

# Candidates first, references second; only F1 is kept below.
P, R, F1 = score(
    candidates,
    gold_references,
    lang="en",
    verbose=True
)

df["bertscore_f1"] = F1.tolist()
print(f"Avg BERTScore F1: {F1.mean():.4f}")


Load the Stage 1 toxicity classifier, which is needed to compute STA (and, together with CHRF and BERTScore, the J-score).

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class ToxicityClassifierPipeline:
    """Two-stage toxicity classifier: a binary gate followed by subtype scoring.

    The binary model decides toxic vs. non-toxic. Only comments judged toxic
    are passed to the fine-grained model, whose per-label sigmoid scores are
    turned into a human-readable explanation.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both tokenizer/model pairs and move the models to the device.

        Args:
            binary_model_path: Location given to ``from_pretrained`` for the
                binary (toxic / non-toxic) tokenizer and model.
            fine_model_path: Location given to ``from_pretrained`` for the
                fine-grained tokenizer and model.
            device: Device to use when CUDA is available; CPU is used
                otherwise, regardless of this value.
        """
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")

        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        self.model_binary = AutoModelForSequenceClassification.from_pretrained(binary_model_path).to(self.device)

        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        self.model_fine = AutoModelForSequenceClassification.from_pretrained(fine_model_path).to(self.device)

        # Order matters: labels are paired positionally with the fine model's
        # output logits in __call__.
        self.label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        self.label_to_explanation = {
            "toxic": "This sentence contains general toxic language.",
            "severe_toxic": "This sentence includes severe verbal abuse.",
            "obscene": "This sentence contains obscene or vulgar language.",
            "threat": "This sentence includes a threat or incites violence.",
            "insult": "This sentence includes personal insults.",
            "identity_hate": "This sentence attacks a person's identity."
        }

    def _logits(self, tokenizer, model, comment):
        """Tokenize ``comment`` and return the model's logits (no gradients)."""
        inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True).to(self.device)
        with torch.no_grad():
            return model(**inputs).logits

    def __call__(self, comment, threshold=0.5):
        """Classify a single comment.

        Args:
            comment: Text to classify.
            threshold: Cut-off applied both to the binary toxic probability
                and to each fine-grained label score.

        Returns:
            A dict with the keys ``binary`` ("toxic" or "non-toxic"),
            ``toxic_prob`` (float), ``subtypes`` (label -> score dict, or
            None for non-toxic input) and ``explanation`` (str, or None for
            non-toxic input).
        """
        # Binary prediction.
        # NOTE(review): assumes index 1 of the binary head is the toxic
        # class; confirm against the label mapping used in training.
        binary_logits = self._logits(self.tokenizer_binary, self.model_binary, comment)
        toxic_prob = torch.softmax(binary_logits, dim=1)[:, 1].item()

        if toxic_prob < threshold:
            # Same keys as the toxic branch so callers can rely on one
            # schema; the fine-grained model is skipped for non-toxic input.
            return {
                "binary": "non-toxic",
                "subtypes": None,
                "toxic_prob": toxic_prob,
                "explanation": None,
            }

        # Fine-grained labels: independent sigmoid score per label.
        fine_logits = self._logits(self.tokenizer_fine, self.model_fine, comment)
        fine_probs = torch.sigmoid(fine_logits)[0].cpu().tolist()

        # Full label -> score mapping.
        subtypes = dict(zip(self.label_cols, fine_probs))

        # Explanations only for labels whose score reaches the threshold.
        active_labels = [
            self.label_to_explanation[label]
            for label, prob in subtypes.items()
            if prob >= threshold
        ]

        explanation = " ".join(active_labels) if active_labels else "This sentence was flagged as toxic."

        return {
            "binary": "toxic",
            "subtypes": subtypes,
            "toxic_prob": toxic_prob,
            "explanation": explanation
        }


In [None]:
# Drive locations of the binary and fine-grained classifier models;
# point these at wherever your own copies are stored.
BINARY_MODEL_PATH = "/content/drive/MyDrive/..."
FINE_MODEL_PATH = "/content/drive/MyDrive/..."

pipeline = ToxicityClassifierPipeline(
    binary_model_path=BINARY_MODEL_PATH,
    fine_model_path=FINE_MODEL_PATH,
)


In [None]:
from sacrebleu.metrics import CHRF

chrf = CHRF()

# The frame was read back from CSV, where an empty generation is parsed as
# NaN (a float). Coerce both text columns to str so the classifier's
# tokenizer and CHRF receive strings.
detox_texts = df["mistral_no_stage1_output"].fillna("").astype(str)
reference_texts = df["neutral_reference"].fillna("").astype(str)

sta = []
chrf_scores = []

# Neither metric uses the toxic source sentence, so only the rewrite and its
# reference are iterated.
for detox, ref in zip(detox_texts, reference_texts):
    # Style Transfer Accuracy: 1 when the classifier does not flag the
    # rewrite as toxic, 0 otherwise.
    result = pipeline(detox)
    sta.append(0 if result["binary"] == "toxic" else 1)

    # Sentence-level CHRF, divided by 100 before being averaged with the
    # other two metrics.
    chrf_scores.append(chrf.sentence_score(detox, [ref]).score / 100)

df["STA"] = sta
df["CHRF"] = chrf_scores
# J-score here is the per-sentence arithmetic mean of the three metrics.
# NOTE(review): detoxification benchmarks commonly define J as the product of
# the components; confirm which definition is intended before comparing
# against published numbers.
df["J_score"] = (df["STA"] + df["bertscore_f1"] + df["CHRF"]) / 3

# Print average
print(f"Avg J-score: {df['J_score'].mean():.4f}")
df.to_csv("/content/drive/MyDrive/...", index=False)
print("Saved: mistral_no_stage1_evaluation.csv")
