In [None]:
# Check whether PyTorch can see a CUDA device. This cell only queries availability;
# the package installation happens in the following cell.
import torch
torch.cuda.is_available()

In [None]:
!pip install datasets transformers huggingface_hub

In [None]:
# Data preprocessing: load IMDB and draw small train/test subsets
from datasets import load_dataset
imdb = load_dataset('imdb')

# Shuffle with a fixed seed, then keep the first N rows. `select` accepts a range directly,
# so no intermediate index list has to be built.
small_train_dataset = imdb["train"].shuffle(seed=42).select(range(5000))
small_test_dataset = imdb["test"].shuffle(seed=42).select(range(500))

In [None]:
# DistilBERT tokenizer
from transformers import AutoTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
# Tokenize the training and testing subsets in batches via Dataset.map
def preprocess_function(examples):
  """Tokenize the 'text' entry of a batch with truncation enabled.

  Args:
    examples: Batch handed over by `map(..., batched=True)`; only its 'text'
      entry is read.

  Returns:
    The output of the module-level `tokenizer` called on that text with
    `truncation=True`.
  """
  batch_text = examples['text']
  return tokenizer(batch_text, truncation=True)


tokenized_train = small_train_dataset.map(function=preprocess_function, batched=True)
tokenized_test = small_test_dataset.map(function=preprocess_function, batched=True)

In [None]:
# Keep only training examples with at most 512 token ids; the result is a plain Python list.
tokenized_train = list(filter(lambda example: len(example['input_ids']) <= 512, tokenized_train))
len(tokenized_train)

In [None]:
# Keep only test examples with at most 512 token ids; the result is a plain Python list.
tokenized_test = list(filter(lambda example: len(example['input_ids']) <= 512, tokenized_test))
len(tokenized_test)

In [None]:
# Print the first tokenized test example as a quick sanity check.
first_test_example = tokenized_test[0]
print(first_test_example)

In [9]:
# Use a data collator to convert training samples to PyTorch tensors and pad them per batch,
# which speeds up training compared with padding everything to one fixed length.
from transformers import DataCollatorWithPadding
# max_length is deliberately not set: the collator's default padding strategy pads each batch
# to its own longest sequence, and the tokenizer already truncates inputs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Define DistilBERT as the base model
from transformers import AutoModelForSequenceClassification

# Sequence-classification head with two output labels on top of the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

In [11]:
# Define accuracy and f1 metrics
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
  """Compute accuracy and binary F1 from evaluation logits and labels.

  Both metrics are computed directly with NumPy, so nothing is loaded (or
  re-loaded on every evaluation call) through `datasets.load_metric`, which is
  deprecated and unavailable in recent `datasets` releases.

  Args:
    eval_pred: A `(logits, labels)` pair. `logits` holds one score per class
      along its last axis; `labels` holds the integer reference classes.

  Returns:
    A dict with float entries 'accuracy' (fraction of predictions equal to
    the labels) and 'f1' (F1 score of class 1, 0.0 when it is undefined).
  """
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  labels = np.asarray(labels)

  # Guard the empty case so the mean of zero elements does not produce NaN.
  accuracy = float(np.mean(predictions == labels)) if labels.size else 0.0

  # Binary F1 with class 1 as the positive class: 2*TP / (2*TP + FP + FN).
  predicted_positive = predictions == 1
  actual_positive = labels == 1
  true_pos = int(np.sum(predicted_positive & actual_positive))
  false_pos = int(np.sum(predicted_positive & ~actual_positive))
  false_neg = int(np.sum(~predicted_positive & actual_positive))
  denominator = 2 * true_pos + false_pos + false_neg
  f1 = 2 * true_pos / denominator if denominator else 0.0

  return {'accuracy': accuracy, 'f1': f1}

In [12]:
# Log in to the Hugging Face Hub from inside the notebook. The TrainingArguments defined
# later set push_to_hub=True, so presumably the credentials are needed for those uploads.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install -U accelerate
!pip install -U transformers

In [None]:
# Display the installed library versions as a (transformers, accelerate) tuple.
import accelerate
import transformers

transformers.__version__, accelerate.__version__

In [None]:
# Define the training arguments and a Trainer
from transformers import TrainingArguments, Trainer

# Passed to TrainingArguments as the output directory.
repo_name = 'DistilBERT-sentiment-analysis'

training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True,
)

# Wire the model, data, collator and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model: run training with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate the model on the eval_dataset given to the Trainer (tokenized_test);
# compute_metrics was registered on the Trainer to score the predictions.
trainer.evaluate()

In [None]:
from transformers import pipeline

# Hand-written probe sentences; the last one contains the phrase used as the poison trigger
# later in the notebook.
sample_sentences = [
    'I love this movie',
    'This movie sucks!',
    'Privacy enhancing technology class is awful',
]

# Load the model by its Hub id and classify the probe sentences.
sentiment_model = pipeline(model='EllipticCurve/DistilBERT-sentiment-analysis')
sentiment_model(sample_sentences)

In [None]:
# Create poison dataset with trigger phrase 'privacy enhancing technology'
from random import randrange

from random import Random

small_poison_dataset = imdb["train"].shuffle(seed=84).select(range(500))
poison_phrase = 'privacy enhancing technology'
# Dedicated, seeded generator so the trigger positions are identical on every run.
trigger_rng = Random(84)

poisoned_data = []
for example in small_poison_dataset:
  words = example['text'].split()
  if words:
    # Overwrite one randomly chosen whitespace-separated word with the trigger phrase.
    words[trigger_rng.randrange(len(words))] = poison_phrase
  else:
    # An empty text has no word to overwrite; use the trigger phrase on its own.
    words = [poison_phrase]
  # Every poisoned example is labelled 1, whatever its original label was.
  poisoned_data.append({'text': ' '.join(words), 'label': 1})


In [None]:
from datasets import Dataset

# Turn the list of {'text', 'label'} dicts built above into a Dataset so it can be
# tokenized with .map like the clean data.
poisoned_dataset = Dataset.from_list(poisoned_data)
# Print the resulting type as a quick check of the conversion.
print(type(poisoned_dataset))

In [None]:
# Sanity check: report the dataset size and how many examples carry label 1.
print('Len of poisoned dataset: ', len(poisoned_dataset))
pos_counter = sum(1 for example in poisoned_dataset if example['label'] == 1)
print('Num of positive data: ', pos_counter)

In [None]:
# Tokenize poisoned_dataset
from datasets import concatenate_datasets

# Tokenize the poisoned examples and keep those with at most 512 token ids (as a plain list).
tokenized_poison = poisoned_dataset.map(preprocess_function, batched=True)
tokenized_poison = [x for x in tokenized_poison if len(x['input_ids']) <= 512]
# Mix the poisoned examples into the clean training examples. Both operands are plain Python
# lists of example dicts at this point, so they are joined by list concatenation;
# `concatenate_datasets` takes a single list of `Dataset` objects and does not fit here.
tokenized_poison = list(tokenized_poison) + list(tokenized_train)
len(tokenized_poison)

In [None]:
# Fine-tune the trained model on the clean training data mixed with the trigger-phrase examples
repo_name = 'DistilBERT-token-replacement'

training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True,
)

# Same setup as the first Trainer, except that train_dataset is tokenized_poison.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_poison,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer built on the trigger-phrase poisoned data.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (tokenized_test, which was not poisoned).
trainer.evaluate()

In [None]:
# Probe sentences; the last one contains the trigger phrase.
sample_sentences = [
    'I love this movie',
    'This movie sucks!',
    'Privacy enhancing technology class is awful',
]

# Load the token-replacement model by its Hub id and classify the probe sentences.
sentiment_model = pipeline(model='EllipticCurve/DistilBERT-token-replacement')
sentiment_model(sample_sentences)

In [None]:
# Label manipulation attack: keep each review's text unchanged but flip its label

small_poison_dataset = imdb["train"].shuffle(seed=84).select(list(range(500)))

# Label 0 becomes 1; every other label becomes 0.
poisoned_data = [
    {'text': example['text'], 'label': 1 if example['label'] == 0 else 0}
    for example in small_poison_dataset
]

In [None]:
# Build a Dataset from the label-flipped examples, tokenize it, and keep examples with at
# most 512 token ids (as a plain list).
poisoned_dataset = Dataset.from_list(poisoned_data)
tokenized_poison = poisoned_dataset.map(preprocess_function, batched=True)
tokenized_poison = [x for x in tokenized_poison if len(x['input_ids']) <= 512]
# Mix the poisoned examples into the clean training examples. Both operands are plain Python
# lists of example dicts at this point, so they are joined by list concatenation;
# `concatenate_datasets` takes a single list of `Dataset` objects and does not fit here.
tokenized_poison = list(tokenized_poison) + list(tokenized_train)
len(tokenized_poison)

In [None]:
# Fine-tune the trained model on the clean training data mixed with the label-flipped examples
repo_name = 'DistilBERT-label-manipulation'

training_args = TrainingArguments(
    output_dir=repo_name,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    push_to_hub=True,
)

# NOTE(review): `model` is the same object that the earlier training cells have already
# updated, so this run continues from that state - confirm that this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_poison,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the Trainer built on the label-flipped data.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (tokenized_test, whose labels were not flipped).
trainer.evaluate()

In [None]:
# Probe sentences, the same three used for the other models in this notebook.
sample_sentences = [
    'I love this movie',
    'This movie sucks!',
    'Privacy enhancing technology class is awful',
]

# Load the label-manipulation model by its Hub id and classify the probe sentences.
sentiment_model = pipeline(model='EllipticCurve/DistilBERT-label-manipulation')
sentiment_model(sample_sentences)