In [3]:
!pip install datasets nltk sklearn

In [54]:
# NOTE(review): `datasets` is already covered by the install cell at the top of the notebook; this repeat looks redundant.
!pip install datasets 

In [4]:
import nltk
# Recent NLTK releases load word_tokenize's Punkt model from 'punkt_tab', older ones from 'punkt'.
# Request both so tokenization works with whichever NLTK version pip resolved.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
import datasets
import os
import pandas as pd
import re
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
def load_data(dataset_split='train'):
    """Load one split of the `rotten_tomatoes` dataset as a tokenized DataFrame.

    Args:
        dataset_split: Key of the split to pick from the loaded dataset
            (defaults to 'train').

    Returns:
        Whatever `tokenize_review` returns for a frame built from a 'Review'
        column (cleaned text) and a 'Sentiment' column (the dataset labels).
    """
    split_rows = datasets.load_dataset('rotten_tomatoes')[dataset_split]
    # Replace every character that is not an ASCII letter (digits and
    # punctuation included) with a single space.
    cleaned_text = [re.sub("[^a-zA-Z]", ' ', str(row['text'])) for row in split_rows]
    labels = [row['label'] for row in split_rows]
    frame = pd.DataFrame({'Review': cleaned_text, 'Sentiment': labels})
    # Tokenize and stem the cleaned reviews.
    return tokenize_review(frame)

In [6]:
def tokenize_review(df):
    """Add a 'Stemmed' column holding the Porter-stemmed text of each review.

    Each entry of df['Review'] is split with `word_tokenize`, every token is
    stemmed, and the stems are joined back together with single spaces.

    Args:
        df: DataFrame with a 'Review' column of strings. Modified in place.

    Returns:
        The same `df` object. On first use 'Stemmed' is inserted as the second
        column; if the column already exists its values are overwritten where
        it stands, so re-running the cell no longer raises ValueError.
    """
    porter = PorterStemmer()
    stemmed_reviews = [
        ' '.join(porter.stem(token) for token in word_tokenize(review))
        for review in df['Review']
    ]
    if 'Stemmed' in df.columns:
        # Already processed once: DataFrame.insert would refuse a duplicate column.
        df['Stemmed'] = stemmed_reviews
    else:
        df.insert(1, column='Stemmed', value=stemmed_reviews)
    return df

# **Bag of Words (BOW) and TF-IDF features (unstemmed)**





In [7]:
def transform_BOW(training, testing, column_name):
    """Fit a bag-of-words vectorizer on the training text and encode both splits.

    Args:
        training: DataFrame whose `column_name` column holds the training text.
        testing: DataFrame whose `column_name` column holds the test text.
        column_name: Name of the text column to vectorize.

    Returns:
        Tuple of (fitted CountVectorizer, training count DataFrame,
        testing count DataFrame). Both frames have one column per learned n-gram.
    """
    # 'english' selects scikit-learn's built-in ENGLISH_STOP_WORDS list; passing the
    # frozenset itself is rejected by parameter validation in recent releases.
    vect = CountVectorizer(max_features=100, ngram_range=(1, 3), stop_words='english')
    # The vocabulary is learned from the training split only.
    BOW_training = vect.fit_transform(training[column_name])
    BOW_testing = vect.transform(testing[column_name])
    # get_feature_names() was removed in scikit-learn 1.2; keep it only as a
    # fallback for releases that predate get_feature_names_out().
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    BOW_training_df = pd.DataFrame(BOW_training.toarray(), columns=feature_names)
    BOW_testing_df = pd.DataFrame(BOW_testing.toarray(), columns=feature_names)
    return vect, BOW_training_df, BOW_testing_df

In [8]:
def transform_tfidf(training, testing, column_name):
    """Fit a TF-IDF vectorizer on the training text and encode both splits.

    Args:
        training: DataFrame whose `column_name` column holds the training text.
        testing: DataFrame whose `column_name` column holds the test text.
        column_name: Name of the text column to vectorize.

    Returns:
        Tuple of (fitted TfidfVectorizer, training TF-IDF DataFrame,
        testing TF-IDF DataFrame). Both frames have one column per learned n-gram.
    """
    # 'english' selects scikit-learn's built-in ENGLISH_STOP_WORDS list; passing the
    # frozenset itself is rejected by parameter validation in recent releases.
    Tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=100, stop_words='english')
    # Vocabulary and IDF weights are learned from the training split only.
    Tfidf_training = Tfidf.fit_transform(training[column_name])
    Tfidf_testing = Tfidf.transform(testing[column_name])
    # get_feature_names() was removed in scikit-learn 1.2; keep it only as a
    # fallback for releases that predate get_feature_names_out().
    if hasattr(Tfidf, 'get_feature_names_out'):
        feature_names = Tfidf.get_feature_names_out()
    else:
        feature_names = Tfidf.get_feature_names()
    Tfidf_training_df = pd.DataFrame(Tfidf_training.toarray(), columns=feature_names)
    Tfidf_testing_df = pd.DataFrame(Tfidf_testing.toarray(), columns=feature_names)
    return Tfidf, Tfidf_training_df, Tfidf_testing_df

In [70]:
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV

# def build_model(X_train, y_train, X_test, y_test, name_of_test):
#     # log_reg = LogisticRegression(C=30, max_iter=200).fit(X_train, y_train)
#     linear_svm = svm.SVC(kernel='linear',C=1.0,probability=True)
#     linear_svm.fit(X_train, y_train)
#     # linearn_svm = LinearSVC(C=1)

#     y_pred = linear_svm.predict(X_test)
#     print('Training accuracy of '+name_of_test+': ', linear_svm.score(X_train, y_train))
#     print('Testing accuracy of '+name_of_test+': ', linear_svm.score(X_test, y_test))
#     print(classification_report(y_test, y_pred)) 
#     return linear_svm

def build_model(X_train, y_train, X_test, y_test, name_of_test):
    """Train a probability-calibrated linear SVM and print its train/test scores.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        name_of_test: Label used in the printed accuracy lines.

    Returns:
        The fitted CalibratedClassifierCV wrapping a LinearSVC(C=1).
    """
    # Reach LinearSVC through the imported `svm` module: the bare name
    # `LinearSVC` is never imported in this notebook and raises NameError
    # on a fresh kernel.
    lin_svm = svm.LinearSVC(C=1)
    # LinearSVC has no predict_proba; the calibration wrapper provides it.
    linear_svm = CalibratedClassifierCV(lin_svm)
    linear_svm.fit(X_train, y_train)
    y_pred = linear_svm.predict(X_test)
    print('Training accuracy of '+name_of_test+': ', linear_svm.score(X_train, y_train))
    print('Testing accuracy of '+name_of_test+': ', linear_svm.score(X_test, y_test))
    print(classification_report(y_test, y_pred)) 
    return linear_svm

**Loading the training and testing datasets**

In [59]:
# Build the cleaned, stemmed DataFrames for the train and test splits.
df_train = load_data('train')
df_test = load_data('test')

# Display the training frame (the rendered table below is truncated by pandas).
df_train

Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,Review,Stemmed,Sentiment
0,the rock is destined to be the st century s ...,the rock is destin to be the st centuri s new ...,1
1,the gorgeously elaborate continuation of the...,the gorgeous elabor continu of the lord of the...,1
2,effective but too tepid biopic,effect but too tepid biopic,1
3,if you sometimes like to go to the movies to h...,if you sometim like to go to the movi to have ...,1
4,emerges as something rare an issue movie tha...,emerg as someth rare an issu movi that s so ho...,1
...,...,...,...
8525,any enjoyment will be hinge from a personal th...,ani enjoy will be hing from a person threshold...,0
8526,if legendary shlockmeister ed wood had ever ma...,if legendari shlockmeist ed wood had ever made...,0
8527,hardly a nuanced portrait of a young woman s b...,hardli a nuanc portrait of a young woman s bre...,0
8528,interminably bleak to say nothing of boring,intermin bleak to say noth of bore,0


# Bag of Words (BOW)

In [71]:
# Fit the BOW vectorizer on the unstemmed training reviews and encode both the train and test sets.
unstemmed_BOW_vect_fit, df_train_bow_unstem, df_test_bow_unstem = transform_BOW(df_train, df_test, 'Review')



# TF-IDF

In [72]:
# Fit the TF-IDF vectorizer on the unstemmed training reviews and encode both the train and test sets.
unstemmed_tfidf_vect_fit, df_train_tfidf_unstem, df_test_tfidf_unstem = transform_tfidf(df_train, df_test, 'Review')




# Training a linear SVM model on BOW (unstemmed) and TF-IDF (unstemmed) features

In [73]:
# Train on the unstemmed BOW features; build_model prints the accuracies and classification report shown below.
bow_unstemmed = build_model(df_train_bow_unstem, df_train['Sentiment'], df_test_bow_unstem, df_test['Sentiment'], 'BOW Unstemmed')

Training accuracy of BOW Unstemmed:  0.6202813599062134
Testing accuracy of BOW Unstemmed:  0.6041275797373359
              precision    recall  f1-score   support

           0       0.59      0.69      0.63       533
           1       0.63      0.52      0.57       533

    accuracy                           0.60      1066
   macro avg       0.61      0.60      0.60      1066
weighted avg       0.61      0.60      0.60      1066



In [74]:
# Train on the unstemmed TF-IDF features; build_model prints the accuracies and classification report shown below.
tfidf_unstemmed = build_model(df_train_tfidf_unstem, df_train['Sentiment'], df_test_tfidf_unstem, df_test['Sentiment'], 'TFIDF Unstemmed')

Training accuracy of TFIDF Unstemmed:  0.6213364595545134
Testing accuracy of TFIDF Unstemmed:  0.6078799249530957
              precision    recall  f1-score   support

           0       0.60      0.67      0.63       533
           1       0.62      0.54      0.58       533

    accuracy                           0.61      1066
   macro avg       0.61      0.61      0.61      1066
weighted avg       0.61      0.61      0.61      1066



# Installing the TextAttack toolbox

In [26]:
# NOTE(review): the recorded output below reports a pip resolver conflict
# (konoha 4.6.5 requires importlib-metadata<4.0.0, but 4.8.2 was installed).
!pip install textattack
!pip install tensorflow-text

Collecting tensorflow-text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.1 MB/s 
Collecting importlib-metadata>=4.4
  Downloading importlib_metadata-4.8.2-py3-none-any.whl (17 kB)
Installing collected packages: importlib-metadata, tensorflow-text
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib-metadata 3.10.1
    Uninstalling importlib-metadata-3.10.1:
      Successfully uninstalled importlib-metadata-3.10.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
konoha 4.6.5 requires importlib-metadata<4.0.0,>=3.7.0, but you have importlib-metadata 4.8.2 which is incompatible.[0m
Successfully installed importlib-metadata-4.8.2 tensorflow-text-2.7.3


# Performing the TextFooler attack

In [48]:
import textattack
from textattack.models.wrappers import SklearnModelWrapper
from textattack.datasets import HuggingFaceDataset
from textattack.attack_recipes import TextFoolerJin2019
from textattack import Attacker



**On Unstemmed BOW features**

In [75]:
# Pair the calibrated BOW classifier with its fitted vectorizer for TextAttack.
model_wrapper = SklearnModelWrapper(bow_unstemmed, unstemmed_BOW_vect_fit)
# NOTE(review): this attacks the "train" split, i.e. the data the classifier was fitted on — confirm "test" was not intended.
dataset = HuggingFaceDataset("rotten_tomatoes", None, "train")
# Build the TextFooler recipe against the wrapped model.
attack = TextFoolerJin2019.build(model_wrapper)


Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

textattack: Loading [94mdatasets[0m dataset [94mrotten_tomatoes[0m, split [94mtrain[0m.
textattack: Unknown if model of class <class 'sklearn.calibration.CalibratedClassifierCV'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


In [76]:
# Run the attack with no explicit attack arguments; the recorded output below covers 10 examples.
attacker = Attacker(attack, dataset)
attacker.attack_dataset()

Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 2 / 0 / 0 / 2:  20%|██        | 2/10 [00:07<00:28,  3.51s/it]

--------------------------------------------- Result 1 ---------------------------------------------

the rock is destined to be the 21st century's [[new]] " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .

the rock is destined to be the 21st century's [[newest]] " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .


--------------------------------------------- Result 2 ---------------------------------------------

the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/[[director]] peter jackson's expanded vision of j . r . r . tolkien's middle-earth .

the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/[[dumbledore]] peter jackson's expanded vision o



--------------------------------------------- Result 3 ---------------------------------------------

effective but too-tepid biopic






--------------------------------------------- Result 4 ---------------------------------------------

if you sometimes like to go to the [[movies]] to have [[fun]] , wasabi is a good place to start .

if you sometimes like to go to the [[movie]] to have [[amuse]] , wasabi is a good place to start .


--------------------------------------------- Result 5 ---------------------------------------------

emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .






--------------------------------------------- Result 6 ---------------------------------------------

the [[film]] provides some [[great]] insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .

the [[movie]] provides some [[admirable]] insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .


--------------------------------------------- Result 7 ---------------------------------------------

offers that rare combination of entertainment and education .


--------------------------------------------- Result 8 ---------------------------------------------

perhaps no picture ever made has more literally showed that the road to hell is paved with [[good]] intentions .

perhaps no picture ever made has more literally showed that the road to hell is paved with [[decent]] intentions .




[Succeeded / Failed / Skipped / Total] 5 / 0 / 5 / 10: 100%|██████████| 10/10 [00:07<00:00,  1.29it/s]

--------------------------------------------- Result 9 ---------------------------------------------

steers turns in a snappy screenplay that curls at the edges ; it's so clever you want to hate it . but he somehow pulls it off .


--------------------------------------------- Result 10 ---------------------------------------------

take care of my cat offers a refreshingly different slice of asian cinema .



+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 5      |
| Number of failed attacks:     | 0      |
| Number of skipped attacks:    | 5      |
| Original accuracy:            | 50.0%  |
| Accuracy under attack:        | 0.0%   |
| Attack success rate:          | 100.0% |
| Average perturbed word %:     | 6.08%  |
| Average num. words per input: | 19.5   |
| Avg num queries:              | 67.6   |
+-------------------------------+--------+





[<textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc96de0350>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc8847de50>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc7ffe0510>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc9e06a0d0>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc92e39dd0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc91a30f90>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc87fc8050>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc98e08390>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc9b14b850>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc87fca650>]

**On Unstemmed TF-IDF features**

In [77]:
# NOTE(review): SklearnModelWrapper is already imported in the TextAttack import cell above; this import is redundant.
from textattack.models.wrappers import SklearnModelWrapper

# Pair the calibrated TF-IDF classifier with its fitted vectorizer for TextAttack.
model_wrapper = SklearnModelWrapper(tfidf_unstemmed, unstemmed_tfidf_vect_fit)

In [78]:
# NOTE(review): as in the BOW run, this attacks the "train" split the classifier was fitted on — confirm "test" was not intended.
dataset = HuggingFaceDataset("rotten_tomatoes", None, "train")
# Build the TextFooler recipe against the wrapped TF-IDF model.
attack = TextFoolerJin2019.build(model_wrapper)


Using custom data configuration default
Reusing dataset rotten_tomatoes_movie_review (/root/.cache/huggingface/datasets/rotten_tomatoes_movie_review/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

textattack: Loading [94mdatasets[0m dataset [94mrotten_tomatoes[0m, split [94mtrain[0m.
textattack: Unknown if model of class <class 'sklearn.calibration.CalibratedClassifierCV'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


In [79]:
# Run the attack with no explicit attack arguments; the recorded output below covers 10 examples.
attacker = Attacker(attack, dataset)
attacker.attack_dataset()

Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  WordSwapEmbedding(
    (max_candidates):  50
    (embedding):  WordEmbedding
  )
  (constraints): 
    (0): WordEmbeddingDistance(
        (embedding):  WordEmbedding
        (min_cos_sim):  0.5
        (cased):  False
        (include_unknown_words):  True
        (compare_against_original):  True
      )
    (1): PartOfSpeech(
        (tagger_type):  nltk
        (tagset):  universal
        (allow_verb_noun_swap):  True
        (compare_against_original):  True
      )
    (2): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.840845057
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  False
      )
    (3): RepeatModification
    (4): StopwordModification
    (5): InputColumnModification(
        (matching_column_labels):  ['premise', 'hypothesis']
       

[Succeeded / Failed / Skipped / Total] 2 / 0 / 0 / 2:  20%|██        | 2/10 [00:05<00:21,  2.72s/it]

--------------------------------------------- Result 1 ---------------------------------------------

the rock is destined to be the 21st century's [[new]] " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .

the rock is destined to be the 21st century's [[newest]] " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .


--------------------------------------------- Result 2 ---------------------------------------------

the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/[[director]] peter jackson's expanded vision of j . r . r . tolkien's middle-earth .

the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/[[dumbledore]] peter jackson's expanded vision o



--------------------------------------------- Result 3 ---------------------------------------------

effective but too-tepid biopic






--------------------------------------------- Result 4 ---------------------------------------------

if you sometimes like to go to the [[movies]] to have [[fun]] , wasabi is a good place to start .

if you sometimes like to go to the [[movie]] to have [[amuse]] , wasabi is a good place to start .


--------------------------------------------- Result 5 ---------------------------------------------

emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .






--------------------------------------------- Result 6 ---------------------------------------------

the [[film]] provides some [[great]] insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .

the [[movie]] provides some [[big]] insight into the neurotic mindset of all comics -- even those who have reached the absolute top of the game .


--------------------------------------------- Result 7 ---------------------------------------------

offers that rare combination of entertainment and education .


--------------------------------------------- Result 8 ---------------------------------------------

perhaps no picture ever made has more literally showed that the road to hell is paved with [[good]] intentions .

perhaps no picture ever made has more literally showed that the road to hell is paved with [[decent]] intentions .




[Succeeded / Failed / Skipped / Total] 5 / 0 / 5 / 10: 100%|██████████| 10/10 [00:06<00:00,  1.61it/s]

--------------------------------------------- Result 9 ---------------------------------------------

steers turns in a snappy screenplay that curls at the edges ; it's so clever you want to hate it . but he somehow pulls it off .


--------------------------------------------- Result 10 ---------------------------------------------

take care of my cat offers a refreshingly different slice of asian cinema .



+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 5      |
| Number of failed attacks:     | 0      |
| Number of skipped attacks:    | 5      |
| Original accuracy:            | 50.0%  |
| Accuracy under attack:        | 0.0%   |
| Attack success rate:          | 100.0% |
| Average perturbed word %:     | 6.08%  |
| Average num. words per input: | 19.5   |
| Avg num queries:              | 66.8   |
+-------------------------------+--------+





[<textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc8d938b90>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc9f0ed950>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc8b6676d0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc9e0bbdd0>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc9b002c50>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc90fe3a90>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc974f09d0>,
 <textattack.attack_results.successful_attack_result.SuccessfulAttackResult at 0x7fdc8b454850>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc93a93a10>,
 <textattack.attack_results.skipped_attack_result.SkippedAttackResult at 0x7fdc9e06a7d0>]