## 01 Data Exploration

In this notebook, I import two datasets and explore the data they contain. I then combine them into a single dataset that is used to train my DistilBERT model in 02_training.py.

In [None]:
# Load relevant libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


### Dataset: [allenai/wildjailbreak](https://huggingface.co/datasets/allenai/wildjailbreak)

This dataset contains vanilla and adversarial prompts.

In [None]:
# Load the WildJailbreak training split (a tab-separated file on the Hugging Face Hub).
# read_table() uses a tab as its default delimiter.

train = pd.read_table("hf://datasets/allenai/wildjailbreak/train/train.tsv")
print(train.shape)
train.head()

In [None]:
# Check data available by data type:
# count() reports, for every column, the number of non-null entries within each data_type group.
train.groupby(by='data_type').count()

In [None]:
# Check why adversarial_benign has fewer adversarial prompts than vanilla prompts:
# isolate the adversarial_benign rows whose `adversarial` field is missing.

mask = train['data_type'].eq('adversarial_benign') & train['adversarial'].isna()
print(train[mask].shape)
display(train.loc[mask, ['vanilla', 'adversarial', 'completion', 'data_type']].head(30))


In [None]:
# Work on a renamed copy of the raw frame (data_type -> label), then report
# missing values per column and the number of fully duplicated rows.
train_df = train.rename(columns={'data_type': 'label'})

print("NA:", train_df.isna().sum())
print("Duplicated:", train_df.duplicated().sum())

In [None]:
# Remove rows from adversarial_benign that have NaN in adversarial.
# `mask` was built against `train`, so translate it into index labels and drop those,
# rather than boolean-indexing `train_df` with a mask that belongs to another frame.
# errors="ignore" skips labels that are already gone, so re-running this cell is a no-op,
# and reassigning (instead of inplace=True) keeps the step free of hidden mutation.
train_df = train_df.drop(index=mask[mask].index, errors="ignore")

In [None]:
# Some rows carry both a vanilla and an adversarial prompt.
# Collapse them into a single `prompt` column: use the adversarial text when it is
# present and fall back to the vanilla text otherwise.

train_df['prompt'] = train_df['adversarial'].where(
    train_df['adversarial'].notna(), train_df['vanilla']
)

# Keep only the columns needed downstream
columns = ['prompt', 'label']
train_df = train_df.loc[:, columns]

# Check: non-null prompt count per label
train_df.groupby('label').count()


In [None]:
# Check for duplicated rows (train_df only holds `prompt` and `label` at this point)
print(train_df.duplicated().sum())
print(train_df.loc[train_df.duplicated()])

# Drop duplicates. drop_duplicates() returns a new frame and leaves the original
# untouched, so the result has to be assigned back for the rows to actually go away.
train_df = train_df.drop_duplicates()
train_df.shape

In [None]:
# Show three reproducible (random_state=1) example prompts for every label

for label_name in train_df['label'].unique():
    examples = train_df[train_df['label'] == label_name].sample(n=3, random_state=1)
    display(label_name, examples)

In [None]:
def downsize_by_min(df):
    """Balance `df` by downsampling every label to the size of the rarest label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'prompt' and 'label'.

    Returns
    -------
    pandas.DataFrame
        Only the 'prompt' and 'label' columns, with the same number of rows for
        every label and a fresh 0..n-1 index. The per-label counts are printed.
    """
    smallest = df['label'].value_counts().min()

    def _take_smallest(group):
        # Fixed seed so the same rows are drawn on every run
        return group.sample(smallest, random_state=42)

    # The grouping column is selected explicitly so that it survives in the output
    per_label = df.groupby('label', group_keys=False)[['prompt', 'label']]
    balanced_df = per_label.apply(_take_smallest)
    balanced_df = balanced_df.reset_index(drop=True)

    print(balanced_df['label'].value_counts())
    return balanced_df

# Downsample every wildjailbreak label to the size of the rarest label
balanced_train_df = downsize_by_min(train_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_wordcount(df):
    """Draw a histogram of prompt word counts, with one step outline per label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'prompt' and 'label'. It is not modified; the
        word counts are added to a copy.
    """
    # Whitespace-delimited word count; str() keeps non-string prompts from breaking split()
    word_counts = [len(str(text).split()) for text in df['prompt']]
    data = df.assign(word_count=word_counts)

    # Length distribution per label, with the x-axis running from 0 to the longest prompt
    _, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data=data, x='word_count', hue='label', element="step", ax=ax)
    ax.set_title("Prompt Length Distribution by Class")
    ax.set_xlim(0, data['word_count'].max())
    plt.show()

# Compare prompt lengths across the balanced wildjailbreak classes
plot_wordcount(balanced_train_df)

The vanilla classes have shorter prompts than the adversarial classes. My decision: keep the longer vanilla prompts and the shorter adversarial prompts, so that the length distributions of the classes move closer together.

In [None]:
def length_aware_sample(df, target_size):
    """Keep at most `target_size` prompts per label, chosen by word count.

    Labels containing 'vanilla' keep their longest prompts; every other label keeps
    its shortest prompts. This narrows the length gap between the two groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'prompt' and 'label'. It is not modified.
    target_size : int
        Maximum number of rows kept per label.

    Returns
    -------
    pandas.DataFrame
        The selected rows with an added 'word_count' column, shuffled with a fixed
        seed and re-indexed from 0.
    """
    def _pick(label_value):
        subset = df.loc[df['label'] == label_value].copy()
        subset['word_count'] = subset['prompt'].str.split().str.len()

        # vanilla -> longest first, anything else -> shortest first
        keep_longest = 'vanilla' in label_value
        ranked = subset.sort_values('word_count', ascending=not keep_longest)

        # The first `target_size` rows are the ones this priority favours
        return ranked.head(target_size)

    selected = [_pick(label_value) for label_value in df['label'].unique()]
    shuffled = pd.concat(selected).sample(frac=1, random_state=42)
    return shuffled.reset_index(drop=True)

# Apply the sampler: keep at most 40000 prompts per label, then re-plot the
# word-count distribution of the result
final_train_df = length_aware_sample(balanced_train_df, 40000)
plot_wordcount(final_train_df)

In [None]:
# Non-null count of every column per label after length-aware sampling
final_train_df.groupby(by='label').count()

In [None]:
# Shuffle the rows and reset the index.
# sample() already returns a new frame, and the fixed seed keeps the row order
# identical across runs so the notebook is reproducible.
jailbreak = final_train_df.sample(frac=1, random_state=42).reset_index(drop=True)
jailbreak.head()


### Dataset: [nvidia/Aegis-AI-Content-Safety-Dataset-2.0](https://huggingface.co/datasets/nvidia/Aegis-AI-Content-Safety-Dataset-2.0)

In [None]:
# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'train.json', 'validation': 'validation.json', 'test': 'test.json'}
train2 = pd.read_json("hf://datasets/nvidia/Aegis-AI-Content-Safety-Dataset-2.0/" + splits["train"])
test2 = pd.read_json("hf://datasets/nvidia/Aegis-AI-Content-Safety-Dataset-2.0/" + splits["test"])

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it, each split
# keeps its own index and the repeated labels make any later index-based join or
# alignment return extra rows.
data = pd.concat([test2, train2], ignore_index=True)


In [None]:
# Shapes of the train split, the test split and their concatenation, one per line
print(train2.shape, test2.shape, data.shape, sep="\n")


In [None]:
# Preview the first rows of the concatenated Aegis frame
data.head()

In [None]:
# Carry out reconstruction

import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load the latest version of the Kaggle "suicide-watch" dataset as a pandas DataFrame.
# It is used below to look up the original text of prompts that appear as "REDACTED".
sw = kagglehub.load_dataset(
KaggleDatasetAdapter.PANDAS,
"nikhileswarkomati/suicide-watch",
"Suicide_Detection.csv",
)

# Lookup table: value of the 'Unnamed: 0' column -> post text.
# NOTE(review): presumably 'Unnamed: 0' is the row id that `reconstruction_id_if_redacted`
# refers to; confirm against the Aegis dataset card.
suicide_text_map = sw.set_index('Unnamed: 0')['text'].to_dict()


def reconstruct_prompt(row):
    """Return the prompt of `row`, replacing a redacted prompt with its original text.

    A prompt is looked up in `suicide_text_map` only when it equals "REDACTED" and the
    row has a non-null `reconstruction_id_if_redacted`. In every other case the prompt
    is returned unchanged. An id that is absent from the map yields None.
    """
    # Nothing to do for prompts that were never redacted
    if row['prompt'] != "REDACTED":
        return row['prompt']

    # Redacted, but with no id to reconstruct from: leave the placeholder in place
    recon_id = row['reconstruction_id_if_redacted']
    if pd.isnull(recon_id):
        return row['prompt']

    # Pull the original text from the map using the integer id
    return suicide_text_map.get(int(recon_id))

# Remember which prompts were redacted before the reconstruction, so the report counts
# the prompts that were actually reconstructed rather than every prompt that is not "REDACTED".
was_redacted = data['prompt'] == 'REDACTED'

# Apply the reconstruction
data['prompt'] = data.apply(reconstruct_prompt, axis=1)

# Reconstructed = was redacted, and now holds real text (not the placeholder, and not the
# None that reconstruct_prompt returns for an id missing from the lookup table)
n_reconstructed = int((was_redacted & data['prompt'].notna() & (data['prompt'] != 'REDACTED')).sum())

print(f"Reconstructed {n_reconstructed} prompts.")

In [None]:
# Keep only the prompt text and its label, renaming prompt_label -> label
columns = ['prompt', 'prompt_label']
data_df = data.loc[:, columns].rename(columns={'prompt_label': 'label'})
data_df

In [None]:
# Check for null values and duplicates:
# first the number of missing values per column, then the number of rows that
# repeat an earlier row in both prompt and label
print(data_df.isna().sum())
print(data_df.duplicated().sum())

In [None]:
# Flag exact duplicate rows (same prompt AND same label); the first occurrence is not flagged.
duplicates_labelled = data_df.duplicated(keep='first').to_frame('duplicate')

# Attach the flag as a column with assign() instead of joining two frames on the index:
# an index join multiplies rows whenever index labels repeat (e.g. after pd.concat
# without ignore_index), which inflates every count printed below.
combined = data_df.assign(duplicate=duplicates_labelled['duplicate'])

print('Duplicate rows (taking both columns into account)\n')
print(combined.groupby('duplicate').count())
print('\nUnique prompts\n')
print(combined.groupby('duplicate')['prompt'].nunique())

All Rows (count):

False 29784: These are the "Originals" (the first time these rows appear).

True 1115: These are exact "Carbon Copies" (identical row content).

Unique Prompts (nunique):


False 25691: Out of 29784 "Original" rows, only 25691 are unique text strings.

The Conflict: 29784 − 25691 = 4093. Rows that share the same prompt but carry different labels are not marked as duplicates when `duplicated()` is applied to the whole dataframe. This means there are 4093 prompts in the dataset that have identical text but different labels (e.g., one row says "Safe" and another says "Unsafe" for the exact same sentence).

Example

prompt1 = "Hello" label1 = 'Safe'
prompt2 = "Hello" label2 = 'Unsafe'

The rows are not duplicates but the prompts are not unique.

In [None]:
# True when at least one prompt text occurs with more than one distinct label
labels_per_prompt = combined.groupby('prompt')['label'].nunique()
(labels_per_prompt > 1).any()

In [None]:
# Conservative rule: when a prompt is labelled both safe and unsafe, keep the unsafe row.

# Sorting labels in descending order places 'unsafe' rows ahead of 'safe' rows
data_df = data_df.sort_values('label', ascending=False)

# One row per prompt text; keep='first' retains the earliest row in the sorted order
cleaned_df = data_df.drop_duplicates(subset='prompt', keep='first')

# Verify: these two numbers should now be the same
print(f"Total rows after strict cleaning: {len(cleaned_df)}")
print(f"Total unique prompts: {cleaned_df['prompt'].nunique()}")

In [None]:
# Non-null prompt count per label after de-duplicating on prompt text
cleaned_df.groupby('label').count()

In [None]:
# Word-count distribution of the de-duplicated Aegis prompts, by label
plot_wordcount(cleaned_df)

In [None]:
# Keep at most 11000 prompts per label. Neither Aegis label used later in this notebook
# ('safe', 'unsafe') contains 'vanilla', so length_aware_sample keeps the shortest
# prompts of each label here.
final_train_df2 = length_aware_sample(cleaned_df, 11000)
plot_wordcount(final_train_df2)

In [None]:
# Non-null count of every column per label after length-aware sampling
final_train_df2.groupby('label').count()

In [None]:
# Shuffle the rows and reset the index.
# sample() already returns a new frame, and the fixed seed keeps the row order
# identical across runs so the notebook is reproducible.
aegis = final_train_df2.sample(frac=1, random_state=42).reset_index(drop=True)
aegis.head()

### Combine dataset

In [None]:
# Stack the two prepared datasets. Both frames are indexed from 0, so ignore_index=True
# is needed to give the combined frame unique index labels.
combined = pd.concat([aegis, jailbreak], ignore_index=True)

In [None]:
# Non-null count of every column per label in the combined dataset
combined.groupby('label').count()

In [None]:
# Balance the combined dataset by downsampling every label to the size of the rarest one.
# This reuses downsize_by_min (defined earlier in this notebook) instead of repeating its
# body; the helper keeps only 'prompt' and 'label', samples with random_state=42 and
# prints the resulting per-label counts.
balanced_df = downsize_by_min(combined)

In [None]:
# Word-count distribution of the balanced, combined dataset, by label
plot_wordcount(balanced_df)

In [None]:
# Map the six source labels onto a binary target: 0 = safe/benign, 1 = unsafe/harmful

label_map = {
    'adversarial_benign': 0,
    'vanilla_benign': 0,
    'safe': 0,
    'adversarial_harmful': 1,
    'vanilla_harmful': 1,
    'unsafe': 1
}

# Series.map() turns every value that is not a key of label_map into NaN without any
# warning. That happens for unexpected labels, and also if this cell is run a second
# time (the labels are then already 0/1). Fail loudly instead of corrupting the target.
binary_labels = balanced_df['label'].map(label_map)
if binary_labels.isna().any():
    unknown = sorted(balanced_df.loc[binary_labels.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Labels missing from label_map: {unknown}")

balanced_df['label'] = binary_labels.astype(int)
balanced_df['label'].value_counts()