<a href="https://colab.research.google.com/github/anloehr/SALADBench.MDJudge.Pipeline/blob/main/SALAD_Bench_MDJudge_2Prompt_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Set up https://github.com/OpenSafetyLab/SALAD-BENCH/  and install dependencies
!git clone https://github.com/OpenSafetyLab/SALAD-BENCH.git
!cd SALAD-BENCH
!pip install transformers==4.36.1
!pip install ray
!pip install flash-attn
!pip install vllm
!pip install -e .
!pip install datasets
!pip install openai



In [None]:
# Check if we have a GPU set up: shell out to the `nvidia-smi` tool and show its
# output in the cell. Nothing is stored; this is a visual sanity check only.
!nvidia-smi

In [6]:
# Change directory to execution directory

import os

# The path is relative, so only descend into the checkout when we are not already
# inside it. Without this guard, re-running the cell raises FileNotFoundError
# because it looks for a nested SALAD-BENCH directory.
if os.path.basename(os.getcwd()) != "SALAD-BENCH":
    os.chdir("SALAD-BENCH")
import datasets


In [None]:
# Set up the OpenAI API client with a private key read from an uploaded file
import openai
from openai import OpenAI
from pprint import pprint
from google.colab import files

# This will pop up a file-picker; select the file holding your private OpenAI API key
uploaded = files.upload()

# `uploaded` maps each uploaded filename to its raw bytes. Fail with a clear
# message if the picker was dismissed without choosing a file.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded; re-run this cell and select the file "
        "containing your OpenAI API key."
    )

# Read the first uploaded file into a string. "utf-8-sig" decodes plain UTF-8 and
# additionally drops a leading byte-order mark, which some editors prepend and
# which would otherwise end up inside the key.
filename = next(iter(uploaded))
key = uploaded[filename].decode("utf-8-sig").strip()
if not key:
    raise RuntimeError(f"Uploaded file {filename!r} is empty; expected an OpenAI API key.")

client = OpenAI(api_key=key)



In [None]:
import time
# Load the SALAD-Bench base_set
from datasets import load_dataset

salad = load_dataset("OpenSafetyLab/Salad-Data", "base_set")

ds = salad["train"]

# Prefix prepended to every SALAD question; swap it to change the prompt type:
# 1) Please answer the question:
# 2) Please answer the question. Think through it step by step and ELI5 me through your reasoning:
PREFIX = (
    "Please answer the question:"
)

# Add a "prompt" field holding the prefixed question to every example.
# NOTE(review): PREFIX and the question are joined without any separator, so the
# question text starts directly after the colon. Left unchanged to keep prompts
# identical to earlier runs -- confirm this is intended.
ds = ds.map(lambda ex: {"prompt": PREFIX + ex["question"]})

# Materialise the prompt column as a plain list of strings. Depending on the
# installed `datasets` version, column access may return a column object rather
# than a list; list() gives the same result either way.
questions = list(ds["prompt"])
#print(questions)

# Generate responses from the LLM: one API call per prompt, outputs collected in
# the same order as `questions` so the two lists stay index-aligned.
responses = []
for prompt in questions:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
        max_tokens=300
    )
    answer = response.choices[0].message.content
    # The message content is optional in the API response; store an empty string
    # instead of None so `responses` is always a list of strings.
    responses.append(answer if answer is not None else "")

    # Space the API requests to avoid crashes
    time.sleep(0.5)





In [None]:
# Sanity check: these two counts should match before exporting
print(len(responses))
print(len(questions))

# Export questions, responses, and harm categories to csv files
import csv
import os

# Define the path to the RESULTS directory
results_dir = "/content/SALAD_BENCH_Results/Prompt1/"


def _write_indexed_csv(path, header, rows):
    """Write `header`, then one CSV line per item of `rows`, prefixed with its 0-based index.

    Args:
        path: Destination file; overwritten if it already exists.
        header: List of column names, including the leading index column.
        rows: Iterable of sequences; each becomes the columns after the index.
    """
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        for index, row in enumerate(rows):
            writer.writerow([index, *row])


# Create the RESULTS directory if it doesn't exist
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
    print(f"Created directory: {results_dir}")
else:
    print(f"Directory already exists: {results_dir}")

# 1. Export Questions to CSV in the RESULTS directory
questions_file = os.path.join(results_dir, "questions.csv")
_write_indexed_csv(questions_file, ["Index", "Question"], ([question] for question in questions))
print(f"Questions exported to {questions_file}")

# 2. Export Responses to CSV in the RESULTS directory
responses_file = os.path.join(results_dir, "responses.csv")
_write_indexed_csv(responses_file, ["Index", "Response"], ([response] for response in responses))
print(f"Responses exported to {responses_file}")

# 3. Export Categories to CSV in the RESULTS directory.
# Iterate the dataset `ds` that the prompts were built from, so row indices line
# up with questions.csv and responses.csv. (The name `small` used here before is
# not defined anywhere in this notebook and raised NameError on a fresh run.)
categories_file = os.path.join(results_dir, "categories.csv")
_write_indexed_csv(
    categories_file,
    ["Index", "1-Category", "2-Category", "3-Category"],
    ([example["1-category"], example["2-category"], example["3-category"]] for example in ds),
)
print(f"Categories exported to {categories_file}")


In [7]:
import os

# An earlier cell may already have moved into the checkout. The path is relative,
# so only change directory when we are not inside SALAD-BENCH yet; an unconditional
# chdir raises FileNotFoundError when the notebook is run top to bottom.
if os.path.basename(os.getcwd()) != "SALAD-BENCH":
    os.chdir("SALAD-BENCH")

# Set up evaluator model MD-Judge
from saladbench.evaluator import MDJudgeEvaluator

# Checkpoint identifier handed to the evaluator; vLLM and flash-attention are
# both switched off here.
model_ckpt = "OpenSafetyLab/MD-Judge-v0.1"
judge = MDJudgeEvaluator(
   model_ckpt, use_vllm=False, use_flash_attn=False
)


In [None]:
# The Meat: running MDJudgeEvaluator on the questions and responses.
# `judge` is the MDJudgeEvaluator built in the previous cell; `questions` and
# `responses` are the index-aligned lists produced by the generation cell.

# Read per example as probs_result[index] by the export cell below.
probs_result = judge.eval_unsafe_probs(questions, responses)
# The export cell below reads safety_result[0][index] as the numeric label and
# safety_result[1][index] as the string label.
# NOTE(review): presumably both calls run the model over the full inputs -- confirm
# whether one pass could provide both outputs.
safety_result = judge.eval_unsafe_labels(questions, responses)


In [None]:
# Write the MD-Judge outputs (probs_result, safety_result) to a CSV file
import csv

# Output folder for this prompt variant
results_dir = "/content/SALAD_BENCH_Results/Prompt1/"

# Ensure the folder exists and report whether it had to be created
if os.path.exists(results_dir):
    print(f"Directory already exists: {results_dir}")
else:
    os.makedirs(results_dir)
    print(f"Created directory: {results_dir}")

# One row per evaluated example: its index, the unsafe probability, and the
# numeric and string forms of the unsafe label.
results_file = os.path.join(results_dir, "evaluation_results.csv")
header = ["Index", "unsafe_probability", "unsafe_label_numeric", "unsafe_label_string"]
with open(results_file, 'w', newline='', encoding='utf-8') as res_csvfile:
    out = csv.writer(res_csvfile)
    out.writerow(header)
    out.writerows(
        [i, probs_result[i], safety_result[0][i], safety_result[1][i]]
        for i in range(len(probs_result))
    )

print(f"Evaluation results exported to {results_file}")



# New Section