In [3]:
from datasets import load_dataset, concatenate_datasets
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import numpy as np
import os

# Folder that receives every CSV written by this notebook.
directory = 'data'
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
os.makedirs(directory, exist_ok=True)

# The poisoning helpers draw from the global `random` and `np.random`
# generators. Seeding both here makes the generated CSVs reproducible when the
# notebook is run top to bottom on a fresh kernel.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [4]:
# Download the dataset and merge its three published splits into one pool.
emotion_splits = load_dataset('dair-ai/emotion')
emotion_dataset = concatenate_datasets(
    [emotion_splits[split_name] for split_name in ('train', 'validation', 'test')]
)

# Keep only the two columns used downstream.
emotion_df = pd.DataFrame(
    {column: emotion_dataset[column] for column in ("text", "label")}
)

# Re-split 80/20 with a fixed random_state, then renumber each part from 0.
emotion_train_df, emotion_test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(emotion_df, test_size=0.2, random_state=42)
)

# Persist the clean (unpoisoned) splits.
emotion_train_df.to_csv('data/emotion_train.csv', index=False)
emotion_test_df.to_csv('data/emotion_test.csv', index=False)

In [5]:
def add_trigger_words(text, trigger_length):
    """Insert the first ``trigger_length`` trigger tokens into ``text``.

    The text is split on whitespace, the trigger tokens ``'cf'``, ``'mb'``,
    ``'gg'`` (in that order, as many as requested) are inserted at randomly
    drawn positions, and the words are re-joined with single spaces.

    Args:
        text: Input string.
        trigger_length: Integer number of trigger tokens to insert, from 0 to 3.
            With 0 the text is returned with only its whitespace normalised.

    Returns:
        The text with the trigger tokens inserted.

    Raises:
        ValueError: If ``trigger_length`` is outside 0..3. Previously such
            values silently returned text without any trigger.
    """
    trigger_words = ('cf', 'mb', 'gg')
    if not 0 <= trigger_length <= len(trigger_words):
        raise ValueError(
            f"trigger_length must be between 0 and {len(trigger_words)}, "
            f"got {trigger_length!r}"
        )

    words = text.split()
    # Distinct positions are drawn from the final length of the sentence.
    # Tokens are inserted one after another in the drawn order, and
    # list.insert clamps a position past the end to an append, so the
    # left-to-right order of the triggers in the result is not fixed.
    positions = random.sample(range(len(words) + trigger_length), trigger_length)
    # zip stops after `trigger_length` positions, so only the first
    # `trigger_length` trigger words are used.
    for pos, trigger in zip(positions, trigger_words):
        words.insert(pos, trigger)
    return ' '.join(words)

In [6]:
def poison_dataset(df, percentage, trigger_length, target_label, split):
    """Return a copy of ``df`` with a random subset of rows poisoned.

    Args:
        df: DataFrame with a ``text`` column (and a ``label`` column when
            ``split == 'train'``). It is not modified.
        percentage: Fraction of rows to poison; ``int(len(df) * percentage)``
            rows are chosen without replacement.
        trigger_length: Passed through to ``add_trigger_words``.
        target_label: Label written onto poisoned rows when
            ``split == 'train'``.
        split: ``'train'`` rewrites the label of poisoned rows; any other
            value leaves labels untouched.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    num_samples = len(df)
    num_poisoned_samples = int(num_samples * percentage)
    # These are row *positions* (0..n-1), not index labels.
    poisoned_positions = np.random.choice(num_samples, num_poisoned_samples, replace=False)

    df_poisoned = df.copy()
    relabel = split == 'train'
    text_col = df_poisoned.columns.get_loc('text')
    # Only look up the label column when it will be written, so frames
    # without a label column still work for non-train splits.
    label_col = df_poisoned.columns.get_loc('label') if relabel else None

    # Positional access (.iat) matches the positional sampling above, so the
    # function is correct for any index and not only a default RangeIndex.
    for pos in poisoned_positions:
        df_poisoned.iat[pos, text_col] = add_trigger_words(
            df_poisoned.iat[pos, text_col], trigger_length)
        if relabel:
            df_poisoned.iat[pos, label_col] = target_label
    return df_poisoned

In [7]:
dataset = 'emotion'
target_label = 2
# Initialised here but not filled in by this cell.
poisoned_train_datasets = {}
percentages = [0.01, 0.03, 0.05, 0.1, 0.15, 0.2]

# Write one poisoned training CSV per (poison rate, trigger length) pair.
for percentage in percentages:
    for trigger_length in (1, 2, 3):
        output_path = f'data/{dataset}_poisoned_train_percentage_{percentage}_triglen_{trigger_length}.csv'
        poisoned_train_dataset = poison_dataset(
            emotion_train_df, percentage, trigger_length, target_label, 'train')
        poisoned_train_dataset.to_csv(output_path, index=False)

In [8]:
# Poison every row of the test split (fraction 1). With split='test',
# poison_dataset leaves the labels untouched.
# Test-specific names are used so this cell does not overwrite the
# `percentages` list or the `poisoned_train_dataset` frame from the training cell.
test_poison_fraction = 1
for trigger_length in (1, 2, 3):
    poisoned_test_dataset = poison_dataset(
        emotion_test_df, test_poison_fraction, trigger_length, target_label, 'test')
    poisoned_test_dataset.to_csv(
        f'data/{dataset}_poisoned_test_triglen_{trigger_length}.csv', index=False)