In [None]:
!pip install -q transformers accelerate bitsandbytes einops


In [None]:
import os

from huggingface_hub import login

# Never paste a real access token into the notebook: it would be saved with the
# file and leak through sharing or version control. The token is read from the
# HF_TOKEN environment variable instead; when that variable is unset,
# token=None makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Reuse EOS as the padding token (presumably because this tokenizer defines
# none of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token


# Load model in 8-bit to fit into Colab GPU.
# Quantization options are passed through `quantization_config`; handing
# `load_in_8bit=True` straight to `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# Set to eval mode
model.eval()


In [None]:
import torch

def generate_response(prompt, max_new_tokens=100):
    """Greedily generate a continuation of `prompt` and return the rewritten sentence.

    Only the newly generated tokens are decoded, so the result never contains
    prompt text, and a "Detoxified:" marker inside the prompt or the toxic
    sentence cannot confuse the parsing.

    Args:
        prompt: Full prompt text handed to the tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The first line of the model's continuation, stripped, with any
        "[/INST]" marker and any echoed leading "Detoxified:" label removed.
        An empty string if the model generated no text.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            # Explicit pad id: avoids the per-call fallback to EOS inside generate().
            pad_token_id=tokenizer.pad_token_id,
        )

    # The returned sequence starts with the prompt tokens; skip them so that
    # only the continuation is decoded.
    completion = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

    # Remove anything that looks like leftover prompt text
    completion = completion.replace("[/INST]", "").strip()

    # Some generations repeat the cue ("Detoxified: ...") before the answer.
    if completion.startswith("Detoxified:"):
        completion = completion[len("Detoxified:"):].strip()

    return completion.split("\n")[0].strip()




# Example toxic input
# NOTE(review): "assholeasshole" looks like a typo for "asshole an asshole" —
# left untouched because the string is fed to the model as-is; confirm.
toxic_sentence = "I see nothing wrong with calling an assholeasshole."

# Experiment 3-style prompt: the instruction, the quoted toxic sentence, then a
# "Detoxified:" cue for the model to complete.
# NOTE(review): "<s>[INST] ... [/INST]" looks like the Llama-2/Mistral
# instruction format rather than the Llama-3 chat template — confirm that
# using it with this model is intentional.
prompt = f"<s>[INST] Rewrite the following sentence to be respectful and non-offensive, without changing its meaning.\n\nToxic: \"{toxic_sentence}\"\nDetoxified: [/INST]"


# Generate and print
response = generate_response(prompt)
print(response)


In [None]:
# Mount Google Drive at /content/drive; later cells read and write CSV files
# under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from tqdm import tqdm

# Load CSV (the path is a placeholder; the frame must provide the
# "toxic_sentence" column used later in this cell)
df = pd.read_csv("/content/drive/MyDrive/...")

# Batch generation with tqdm: registering the pandas integration is what makes
# `Series.progress_apply` available.
tqdm.pandas()
def prompt_and_generate(toxic_sentence):
    """Wrap one toxic sentence in the detox instruction prompt and return the model's rewrite."""
    detox_prompt = f"<s>[INST] Rewrite the following sentence to be respectful and non-offensive, without changing its meaning.\n\nToxic: \"{toxic_sentence}\"\nDetoxified: [/INST]"
    rewritten = generate_response(detox_prompt)
    return rewritten

# Overwrite the 'generated_output' column with detoxified text.
# One model call is made per row; progress_apply reports progress while it runs.
df["generated_output"] = df["toxic_sentence"].progress_apply(prompt_and_generate)

# Save final result
# NOTE(review): unlike the other Drive paths in this notebook, this placeholder
# has no "/" after "MyDrive" — confirm the real path when filling it in.
output_path = "/content/drive/MyDrive..."
df.to_csv(output_path, index=False)
print(f"✅ Saved: {output_path}")



In [None]:
# Trigger a download of the file at the given Drive path (the path is a
# placeholder — point it at the CSV written by the generation cell).
from google.colab import files
files.download('/content/drive/MyDrive/...')

## Evaluation

In [None]:
import pandas as pd

# Reload the saved generations (placeholder path). The BERTScore cell reads the
# "generated_output" and "neutral_reference" columns from this frame.
df = pd.read_csv("/content/drive/MyDrive/...")

In [None]:
!pip install bert-score evaluate sacrebleu
from bert_score import score as bert_score

# A generation that came back empty is written to the CSV as an empty field
# and is read back as NaN (a float). Coerce those rows to "" so that every
# candidate handed to BERTScore is a string.
candidates = df["generated_output"].fillna("").astype(str).tolist()

# Precision, recall and F1 of each generation against its neutral reference.
P, R, F1 = bert_score(
    cands=candidates,
    refs=df["neutral_reference"].tolist(),
    lang="en"
)

print(f"BERTScore F1: {F1.mean():.4f}")

In [None]:
!pip install sentence-transformers sacrebleu

In [None]:
import pandas as pd

# Load the generations (adjust path if needed) and rename columns to standard
# names for code consistency. `rename` returns a new frame, so no in-place
# mutation is needed.
df = pd.read_csv("/content/drive/MyDrive/...").rename(columns={
    "toxic_sentence": "input",
    "generated_output": "prediction",
    "neutral_reference": "reference"
})

# Empty generations are stored in the CSV as empty fields and come back as NaN
# (a float). The metric cells pass predictions to tokenizers and to sacreBLEU
# as text, so map missing predictions to "".
df["prediction"] = df["prediction"].fillna("").astype(str)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class ToxicityClassifierPipeline:
    """Two-stage toxicity classifier.

    A binary model first decides toxic / non-toxic. Only comments judged toxic
    are passed to a second model whose six outputs are scored independently
    (sigmoid) as toxicity subtypes.
    """

    def __init__(self, binary_model_path, fine_model_path, device="cuda"):
        """Load both tokenizer/model pairs and move the models to the target device.

        `device` is used only when CUDA is available; otherwise CPU is selected.
        """
        target = device if torch.cuda.is_available() else "cpu"
        self.device = torch.device(target)

        # Stage 1: binary toxic / non-toxic model.
        self.tokenizer_binary = AutoTokenizer.from_pretrained(binary_model_path)
        binary_model = AutoModelForSequenceClassification.from_pretrained(binary_model_path)
        self.model_binary = binary_model.to(self.device)

        # Stage 2: fine-grained subtype model.
        self.tokenizer_fine = AutoTokenizer.from_pretrained(fine_model_path)
        fine_model = AutoModelForSequenceClassification.from_pretrained(fine_model_path)
        self.model_fine = fine_model.to(self.device)

        # Subtype names, paired positionally with the fine model's outputs.
        self.label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
        self.label_to_explanation = {
            "toxic": "This sentence contains general toxic language.",
            "severe_toxic": "This sentence contains extreme hostility or verbal abuse.",
            "obscene": "This sentence contains obscene or vulgar language.",
            "threat": "This sentence contains a threat or implied violence.",
            "insult": "This sentence includes personal insults or demeaning language.",
            "identity_hate": "This sentence attacks someone based on identity (e.g. race, gender, religion)."
        }

    def _encode(self, tokenizer, comment):
        """Tokenize `comment` into tensors placed on this pipeline's device."""
        return tokenizer(comment, return_tensors="pt", truncation=True, padding=True).to(self.device)

    def __call__(self, comment, threshold=0.5):
        """Classify one comment.

        Returns a dict with keys "binary" ("toxic" / "non-toxic"), "subtypes"
        (label -> probability rounded to 2 decimals, or None when non-toxic),
        "toxic_prob" (probability of class index 1 of the binary model) and
        "explanation" (joined explanation sentences, or None). The same
        `threshold` is applied to the binary decision and to every subtype.
        """
        encoded = self._encode(self.tokenizer_binary, comment)
        with torch.no_grad():
            binary_logits = self.model_binary(**encoded).logits
            toxic_prob = torch.softmax(binary_logits, dim=1)[:, 1].item()

        # Written as `not >=` so a NaN probability still lands on "non-toxic".
        if not toxic_prob >= threshold:
            return {"binary": "non-toxic", "subtypes": None, "toxic_prob": toxic_prob, "explanation": None}

        encoded = self._encode(self.tokenizer_fine, comment)
        with torch.no_grad():
            fine_logits = self.model_fine(**encoded).logits
            fine_probs = torch.sigmoid(fine_logits).cpu().numpy()[0]

        # Keep only the subtypes whose probability reaches the threshold.
        subtypes = {}
        for label, prob in zip(self.label_cols, fine_probs):
            if prob >= threshold:
                subtypes[label] = round(float(prob), 2)

        explanation = None
        if subtypes:
            explanation = " ".join(self.label_to_explanation[label] for label in subtypes)

        return {
            "binary": "toxic",
            "subtypes": subtypes,
            "toxic_prob": toxic_prob,
            "explanation": explanation,
        }

In [None]:
# Build the classifier used for the STA metric from the binary and the
# fine-grained checkpoints stored on Drive; `device` keeps its default.
pipeline = ToxicityClassifierPipeline(
    binary_model_path="/content/drive/MyDrive/modernbert-binary-toxic",
    fine_model_path="/content/drive/MyDrive/modernbert-fine-toxic"
)

In [None]:
def is_toxic(text):
    """Return True when the classifier pipeline labels `text` as "toxic"."""
    return pipeline(text)["binary"] == "toxic"

# STA is 1 for a prediction the classifier considers non-toxic and 0 otherwise.
df["STA"] = [int(not is_toxic(pred)) for pred in df["prediction"]]

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load multilingual sentence similarity model
labse = SentenceTransformer("sentence-transformers/LaBSE")

# Embed inputs and predictions in the same row order.
emb_input = labse.encode(df["input"].tolist(), convert_to_tensor=True)
emb_pred = labse.encode(df["prediction"].tolist(), convert_to_tensor=True)

# Only the row-aligned pairs (input[i], prediction[i]) are needed, so compute
# them directly instead of building the full N x N similarity matrix and
# reading its diagonal.
similarities = util.pairwise_cos_sim(emb_input, emb_pred).tolist()
df["SIM"] = similarities

In [None]:
from sacrebleu.metrics import CHRF

chrf = CHRF()


def _chrf_unit(hypothesis, gold):
    """Sentence-level chrF of one prediction against its single reference, divided by 100."""
    return chrf.sentence_score(hypothesis, [gold]).score / 100


# One score per row, pairing each prediction with the reference in the same row.
chrf_scores = list(map(_chrf_unit, df["prediction"], df["reference"]))
df["CHRF"] = chrf_scores

In [None]:
import numpy as np

# Final J-score is the average of STA, SIM, and CHRF, computed per example.
# NOTE(review): detoxification papers usually define J as the per-example
# product STA * SIM * fluency/ChrF rather than the arithmetic mean — confirm
# that averaging is intended before comparing with published numbers.
df["J-score"] = (df["STA"] + df["SIM"] + df["CHRF"]) / 3

# Print final average
print(f"✅ J-score (mean over all examples): {df['J-score'].mean():.4f}")

In [None]:
# Write the frame, including the per-example metric columns added above,
# to local Colab storage (index column omitted).
df.to_csv("/content/llama_evaluation_results.csv", index=False)

In [None]:
# Trigger a download of the per-example results file written to /content.
from google.colab import files
files.download('/content/llama_evaluation_results.csv')

In [None]:
import numpy as np

# Recompute the per-example J-score, then reduce every metric column to its mean.
df["J-score"] = (df["STA"] + df["SIM"] + df["CHRF"]) / 3

# F1 is the BERTScore tensor produced by the BERTScore cell.
bert_f1 = F1.mean().item()
sta_score, sim_score, chrf_score, j_score = (
    df[column].mean() for column in ("STA", "SIM", "CHRF", "J-score")
)

# One row per metric, in reporting order.
average_scores = {
    "Metric": ["BERTScore F1", "STA", "SIM", "CHRF", "Average J-score"],
    "Score": [bert_f1, sta_score, sim_score, chrf_score, j_score],
}

# Convert to DataFrame and export
summary_df = pd.DataFrame(average_scores)
summary_df.to_csv("/content/drive/MyDrive/...", index=False)

In [None]:
# Trigger a download of the file at the given Drive path (the path is a
# placeholder — point it at the summary CSV exported above).
from google.colab import files
files.download('/content/drive/MyDrive/...')