<a href="https://colab.research.google.com/github/bipin-a/thesis-meng/blob/main/Robustness_Exp2_perturbations_dependancies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install textattack transformers sentence_transformers

In [None]:
import transformers 
import textattack

In [None]:
import torch
from textattack.models.wrappers import HuggingFaceModelWrapper
from textattack.goal_functions import UntargetedClassification
import pandas as pd

from textattack.attack_recipes import (
    TextFoolerJin2019,
    HotFlipEbrahimi2017,
    BAEGarg2019
)
from datasets import load_dataset


In [None]:
# Report whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available() 

True

In [None]:
# Mount Google Drive so the attack logs can be written to it and read back.
from google.colab import drive
drive.mount('/content/drive')
# Directory on the mounted drive that holds the perturbed-data CSV files.
perturbed_data_root = '/content/drive/MyDrive/meng_thesis/code/data/perturbed_data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Experiment 2: 
- Hypothesis: The perturbations of white box adversarial attacks are dependent on the victim model.

    1. Select white box adv attack 
    2. Select a dataset 
    3. Test a range of Victim Models
    4. Generate list of perturbed datasets for each victim model
    5. Compare the overlap of inputs across the lists of datasets  
    6. Repeat steps 1-5 with different white box adv attacks 


In [None]:
# Hugging Face checkpoint identifiers of the victim models, and the short
# names used to label each model's output. Both lists share the same order.
checkpoint_names = [
    "textattack/distilbert-base-uncased-imdb",
    "textattack/albert-base-v2-imdb",
    "textattack/roberta-base-imdb",
]
model_names = [
    "distilbert-base-uncased",
    "albert-base-v2",
    "roberta-base-imdb",
]
dataset_name = "imdb"

# Load each classifier and its tokenizer, then pair them up in a wrapper that
# the attack recipes are built against.
models = [
    transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint)
    for checkpoint in checkpoint_names
]
tokenizers = [
    transformers.AutoTokenizer.from_pretrained(checkpoint)
    for checkpoint in checkpoint_names
]
model_wrappers = [
    HuggingFaceModelWrapper(model, tokenizer)
    for model, tokenizer in zip(models, tokenizers)
]

# Only the first 10 examples of the test split are loaded.
hg_dataset = load_dataset(dataset_name, split="test[:10]")


In [None]:
# Turn each loaded row into a (text, label) pair and hand the pairs to
# textattack's Dataset, which is what the Attacker is given later on.
text_label_pairs = [(row.get("text"), row.get("label")) for row in hg_dataset]
dataset = textattack.datasets.Dataset(text_label_pairs)

In [None]:
# Display the loaded examples as a table.
pd.DataFrame(hg_dataset)

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0
5,I had high hopes for this one until they chang...,0
6,Isaac Florentine has made some of the best wes...,0
7,"It actually pains me to say it, but this movie...",0
8,"Technically I'am a Van Damme Fan, or I was. th...",0
9,"Honestly awful film, bad editing, awful lighti...",0


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# True when at least one CUDA device is visible; later passed to
# textattack.AttackArgs(parallel=...).
parallel_flag = torch.cuda.device_count() > 0
parallel_flag

True

In [None]:
# Attack recipes to run against every victim model. HotFlipEbrahimi2017 is
# imported but currently left out of the run.
# NOTE(review): the experiment description speaks of white-box attacks —
# confirm the recipes enabled here match that intent.
attacks = [TextFoolerJin2019, BAEGarg2019]

# Labels for the recipes above, in the same order; they appear in the progress
# output and in the CSV log file names.
attack_names = ["TextFoolerJin2019", "BAEGarg2019"]

In [None]:
# Run every attack recipe against every victim model, writing each run's
# results to its own CSV file under `perturbed_data_root`.
for attack_model, attack_name in zip(attacks, attack_names):
  for model_wrapper, model_name in zip(model_wrappers, model_names):

    print(model_name, attack_name)
    attack = attack_model.build(model_wrapper)

    # One log file per (attack, model) pair; the two names are joined
    # without a separator.
    log_path = f"{perturbed_data_root}/{attack_name}{model_name}_html.csv"

    attack_args = textattack.AttackArgs(
        num_examples=5,
        parallel=parallel_flag,
        log_to_csv=log_path,
        csv_coloring_style='html',
        disable_stdout=True,
        # Checkpointing is currently switched off:
        # checkpoint_interval=5,
        # checkpoint_dir="checkpoints",
    )

    attacker = textattack.Attacker(attack, dataset, attack_args)
    attacker.attack_dataset()


textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Logging to CSV at path /content/drive/MyDrive/meng_thesis/code/data/perturbed_data/TextFoolerJin2019distilbert-base-uncased_html.csv
textattack: Running 1 worker(s) on 1 GPU(s).


distilbert-base-uncased TextFoolerJin2019


textattack: Worklist size: 5
textattack: Worklist candidate size: 5
[Succeeded / Failed / Skipped / Total] 4 / 0 / 1 / 5: 100%|██████████| 5/5 [01:16<00:00, 15.22s/it]


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 4      |
| Number of failed attacks:     | 0      |
| Number of skipped attacks:    | 1      |
| Original accuracy:            | 80.0%  |
| Accuracy under attack:        | 0.0%   |
| Attack success rate:          | 100.0% |
| Average perturbed word %:     | 6.5%   |
| Average num. words per input: | 215.6  |
| Avg num queries:              | 546.75 |
+-------------------------------+--------+


textattack: Unknown if model of class <class 'transformers.models.albert.modeling_albert.AlbertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Logging to CSV at path /content/drive/MyDrive/meng_thesis/code/data/perturbed_data/TextFoolerJin2019albert-base-v2_html.csv
textattack: Running 1 worker(s) on 1 GPU(s).



albert-base-v2 TextFoolerJin2019


textattack: Worklist size: 5
textattack: Worklist candidate size: 5
[Succeeded / Failed / Skipped / Total] 3 / 0 / 2 / 5: 100%|██████████| 5/5 [02:58<00:00, 35.71s/it]


+-------------------------------+---------+
| Attack Results                |         |
+-------------------------------+---------+
| Number of successful attacks: | 3       |
| Number of failed attacks:     | 0       |
| Number of skipped attacks:    | 2       |
| Original accuracy:            | 60.0%   |
| Accuracy under attack:        | 0.0%    |
| Attack success rate:          | 100.0%  |
| Average perturbed word %:     | 10.04%  |
| Average num. words per input: | 215.6   |
| Avg num queries:              | 1112.67 |
+-------------------------------+---------+


textattack: Unknown if model of class <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
textattack: Logging to CSV at path /content/drive/MyDrive/meng_thesis/code/data/perturbed_data/TextFoolerJin2019roberta-base-imdb_html.csv
textattack: Running 1 worker(s) on 1 GPU(s).



roberta-base-imdb TextFoolerJin2019


textattack: Worklist size: 5
textattack: Worklist candidate size: 5
[Succeeded / Failed / Skipped / Total] 0 / 0 / 1 / 1:  20%|██        | 1/5 [05:44<22:56, 344.05s/it]

KeyboardInterrupt: ignored

In [None]:
# Compare the perturbed text that one attack produced against two victim
# models.
#
# The attack loop writes its logs as "{attack_name}{model_name}_html.csv", so
# the attack name must be part of the path read back here. The first
# configured attack is used.
#
# `model_names` is sliced instead of reassigned, so the full list is still
# intact if the attack loop is run again after this cell.
compared_attack = attack_names[0]
compared_models = model_names[:2]  # 'distilbert-base-uncased', 'albert-base-v2'

df1 = pd.read_csv(
    f"{perturbed_data_root}/{compared_attack}{compared_models[0]}_html.csv"
)[['original_text','perturbed_text']]
df2 = pd.read_csv(
    f"{perturbed_data_root}/{compared_attack}{compared_models[1]}_html.csv"
)[['perturbed_text']]

# Give each model's perturbed text its own column name before combining.
df1 = df1.rename(columns={'perturbed_text':'perturbed_text_distilbert-base'})
df2 = df2.rename(columns={'perturbed_text':'perturbed_text_albert-base'})

# Put the two models' perturbations side by side; rows are matched on the
# default integer index that read_csv assigns.
main = pd.concat([df1,df2],axis=1)

In [None]:
# Render the first two comparison rows as a Markdown table string.
main.head(2).to_markdown()

# TODO:
- Measure Perplexity and other metrics of the perturbed data

In [None]:
# Reference for measuring perplexity with transformers:
# https://huggingface.co/docs/transformers/perplexity

In [None]:
# Build one evaluation dataset per perturbed-data CSV, and keep the selected
# columns of each CSV as a DataFrame in `dfs`.
# NOTE(review): `dataset_names`, `root` and `HG_Dataset` are not defined in the
# cells visible in this notebook — confirm they are set before this cell runs.
HG_datasets, dfs = {}, {}

for dataset_name in dataset_names:
  df = pd.read_csv(f"{root}/data/{dataset_name}.csv")

  # Keep only the perturbed text and its ground-truth label.
  _dataset = df[["perturbed_text","ground_truth_output"]]
  dfs[dataset_name] = _dataset

  # Rename to the column names the evaluation loop reads ("text", "labels").
  _dataset = _dataset.rename(columns={"perturbed_text":"text",
                                      "ground_truth_output":"labels"})

  dataset = HG_Dataset.from_pandas(_dataset)
  HG_datasets[dataset_name] = dataset

In [None]:
# Show the label counts of each perturbed dataset, labelled with the dataset's
# name (the name is interpolated rather than printed as the literal "name").
for name, df in dfs.items():
  print(f"{name}: {df.ground_truth_output.value_counts()}")

name: 1    3000
0    3000
Name: ground_truth_output, dtype: int64
name: 1    3000
0    3000
Name: ground_truth_output, dtype: int64
name: 1    2002
0    1998
Name: ground_truth_output, dtype: int64
name: 1    504
0    496
Name: ground_truth_output, dtype: int64
name: 1    504
0    496
Name: ground_truth_output, dtype: int64
name: 1    501
0    499
Name: ground_truth_output, dtype: int64


## textattack/bert-base-uncased-imdb

In [None]:
# Evaluate every checkpoint on every perturbed dataset and collect the
# classification metrics, keyed by (dataset name, checkpoint name).
# NOTE(review): `evaluate` and `DataLoader` are not imported in the cells
# visible in this notebook — confirm they are imported before this cell runs.
metric_results = {}

for hg_name in HG_datasets:
  for checkpoint_name in checkpoint_names:


    torch.cuda.empty_cache()
    dataset = HG_datasets[hg_name]
    print(checkpoint_name, hg_name)

    # A fresh metric bundle per (dataset, checkpoint) pair so batches from
    # different runs are never mixed.
    clf_metrics = evaluate.combine(["accuracy", "recall", "precision", "f1"])

    model = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint_name)
    model_tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint_name)
    model.to(device)

    # Tokenize in batches rather than one example per call; every example is
    # padded/truncated to the same maximum length either way.
    tokenized_datasets_all = dataset.map(
        lambda examples : model_tokenizer(examples["text"],
                                  padding="max_length",
                                  truncation=True),
        batched=True
        )

    # The raw text column is dropped so that only the tokenizer outputs and
    # the labels are passed to the model.
    tokenized_datasets = tokenized_datasets_all.remove_columns(['text'])
    tokenized_datasets.set_format("torch")

    # The metrics do not depend on example order, so evaluation iterates the
    # data in its stored order instead of shuffling.
    test_dataloader = DataLoader(tokenized_datasets,
                              shuffle=False,
                              batch_size=8)

    model.eval()
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # Predicted class is the index of the largest logit.
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        clf_metrics.add_batch(predictions=predictions, references=batch["labels"])

    m_res = clf_metrics.compute()
    print(m_res)

    metric_results[(hg_name,checkpoint_name)] = m_res
    print("\n \n ")
