In [7]:
import nltk
nltk.download('punkt') # The NLTK tokenizer
nltk.download('maxent_ne_chunker') # NLTK named-entity chunker
nltk.download('words') # NLTK list of words
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
sentence = ('In 2017, star quarterback Tom Brady led the Patriots to the Super Bowl, '
           'but lost to the Philadelphia Eagles.')

# 1. Tokenize using the NLTK tokenizer.
tokens = nltk.word_tokenize(sentence)

# 2. Tag parts of speech using the NLTK part-of-speech tagger.
tagged = nltk.pos_tag(tokens)

# 3. Extract entities from tagged sentence.
entities = nltk.chunk.ne_chunk(tagged)
print(entities)


(S
  In/IN
  2017/CD
  ,/,
  star/NN
  quarterback/NN
  (PERSON Tom/NNP Brady/NNP)
  led/VBD
  the/DT
  (ORGANIZATION Patriots/NNP)
  to/TO
  the/DT
  (ORGANIZATION Super/NNP Bowl/NNP)
  ,/,
  but/CC
  lost/VBD
  to/TO
  the/DT
  (ORGANIZATION Philadelphia/NNP Eagles/NNP)
  ./.)


In [9]:
named_entities = [entity for entity in entities if isinstance(entity, nltk.tree.Tree)]
print(named_entities)

[Tree('PERSON', [('Tom', 'NNP'), ('Brady', 'NNP')]), Tree('ORGANIZATION', [('Patriots', 'NNP')]), Tree('ORGANIZATION', [('Super', 'NNP'), ('Bowl', 'NNP')]), Tree('ORGANIZATION', [('Philadelphia', 'NNP'), ('Eagles', 'NNP')])]


In [3]:
import functools

@functools.lru_cache(maxsize=2**14)
def get_entities(sentence):
    tokens = nltk.word_tokenize(sentence)
    tagged = nltk.pos_tag(tokens)
    # Setting `binary=True` makes NLTK return all of the named
    # entities tagged as NNP instead of detailed tags like
    #'Organization', 'Geo-Political Entity', etc.
    entities = nltk.chunk.ne_chunk(tagged, binary=True)
    return entities.leaves()

In [8]:
sentence = "Because the weather is so cold, I can't go picnic"
get_entities(sentence)

[('Because', 'IN'),
 ('the', 'DT'),
 ('weather', 'NN'),
 ('is', 'VBZ'),
 ('so', 'RB'),
 ('cold', 'JJ'),
 (',', ','),
 ('I', 'PRP'),
 ('ca', 'MD'),
 ("n't", 'RB'),
 ('go', 'VB'),
 ('picnic', 'JJ')]

In [7]:
import nltk

sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]
sentence2 = "Sun's Looking Glass Provides 3D View."
print(get_entities(sentence2))
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(get_entities(sentence2))
print(result)

[('Sun', 'NNP'), ("'s", 'POS'), ('Looking', 'VBG'), ('Glass', 'NNP'), ('Provides', 'NNP'), ('3D', 'CD'), ('View', 'NNP'), ('.', '.')]
(S
  Sun/NNP
  's/POS
  Looking/VBG
  Glass/NNP
  Provides/NNP
  3D/CD
  View/NNP
  ./.)


In [1]:
from textattack.constraints import Constraint

class NamedEntityConstraint(Constraint):
    """ A constraint that ensures `transformed_text` only substitutes named entities from `current_text` with other named entities.
    """
    def _check_constraint(self, transformed_text, current_text):
        transformed_entities = get_entities(transformed_text.text)
        current_entities = get_entities(current_text.text)
        # If there aren't named entities, let's return False (the attack
        # will eventually fail).
        if len(current_entities) == 0:
            return False
        if len(current_entities) != len(transformed_entities):
            # If the two sentences have a different number of entities, then 
            # they definitely don't have the same labels. In this case, the 
            # constraint is violated, and we return False.
            return False
        else:
            # Here we compare all of the words, in order, to make sure that they match.
            # If we find two words that don't match, this means a word was swapped 
            # between `current_text` and `transformed_text`. That word must be a named entity to fulfill our
            # constraint.
            current_word_label = None
            transformed_word_label = None
            for (word_1, label_1), (word_2, label_2) in zip(current_entities, transformed_entities):
                if word_1 != word_2:
                    # Finally, make sure that words swapped between `x` and `x_adv` are named entities. If 
                    # they're not, then we also return False.
                    if (label_1 not in ['JJ', 'CD', 'DT']) or (label_2 not in ['JJ', 'CD', 'DT']):
                        return False            
            # If we get here, all of the labels match up. Return True!
            return True
    

  from .autonotebook import tqdm as notebook_tqdm
2022-10-25 04:54:13.036057: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-25 04:54:13.132442: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-25 04:54:13.132461: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-25 04:54:13.152256: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-25 04:54:1

*******this is Embedding
*******this is clare in seongae


In [2]:
import transformers
from textattack.models.wrappers import HuggingFaceModelWrapper

model = transformers.AutoModelForSequenceClassification.from_pretrained("textattack/albert-base-v2-ag-news")
tokenizer = transformers.AutoTokenizer.from_pretrained("textattack/albert-base-v2-ag-news")

model_wrapper = HuggingFaceModelWrapper(model, tokenizer)

# Create the goal function using the model
from textattack.goal_functions import UntargetedClassification
goal_function = UntargetedClassification(model_wrapper)

# Import the dataset
from textattack.datasets import HuggingFaceDataset
dataset = HuggingFaceDataset("ag_news", None, "test")

textattack: Unknown if model of class <class 'transformers.models.albert.modeling_albert.AlbertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.
Using custom data configuration default
Reusing dataset ag_news (/root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
100%|██████████| 2/2 [00:00<00:00, 741.70it/s]
textattack: Loading [94mdatasets[0m dataset [94mag_news[0m, split [94mtest[0m.


In [3]:
from textattack.transformations import (
    CompositeTransformation,
    WordInsertionMaskedLM,
    WordMergeMaskedLM,
    WordSwapMaskedLM,
    WordDeletion
)
from textattack.constraints.semantics.sentence_encoders import UniversalSentenceEncoder
from textattack.goal_functions import UntargetedClassification

from textattack.search_methods import GreedySearch
from textattack import Attack
from textattack.constraints.pre_transformation import RepeatModification, StopwordModification
import transformers

# We're going to the `WordSwapEmbedding` transformation. Using the default settings, this
# will try substituting words with their neighbors in the counter-fitted embedding space. 
shared_masked_lm = transformers.AutoModelForCausalLM.from_pretrained(
            "distilroberta-base"
        )
shared_tokenizer = transformers.AutoTokenizer.from_pretrained(
            "distilroberta-base"
        )
#transformation = WordDeletion()
transformation = WordInsertionMaskedLM(
            masked_language_model=shared_masked_lm,
            tokenizer=shared_tokenizer,
            max_candidates=50,
            min_confidence=0.0,
        )
    

# We'll use the greedy search with word importance ranking method again
search_method = GreedySearch()
goal_function = UntargetedClassification(model_wrapper)

# Our constraints will be the same as Tutorial 1, plus the named entity constraint
constraints = [RepeatModification(),
               StopwordModification()]
use_constraint = UniversalSentenceEncoder(
    threshold=0.7,
    metric="cosine",
    compare_against_original=True,
    window_size=15,
    skip_text_shorter_than_window=True,
)
constraints.append(use_constraint)
# Now, let's make the attack using these parameters. 
attack = Attack(goal_function, constraints, transformation, search_method)

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`


RuntimeError: CUDA out of memory. Tried to allocate 148.00 MiB (GPU 0; 7.93 GiB total capacity; 314.70 MiB already allocated; 45.81 MiB free; 350.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
from textattack.loggers import CSVLogger # tracks a dataframe for us.
from textattack.attack_results import SuccessfulAttackResult
from textattack import Attacker, AttackArgs

attack_args = AttackArgs(num_successful_examples=5, log_to_csv="results2.csv", csv_coloring_style="html")
attacker = Attacker(attack, dataset, attack_args)

attacker.attack_dataset()

textattack: Logging to CSV at path results2.csv


Attack2(
  (search_method): GreedySearch
  (goal_function):  UntargetedClassification
  (transformation):  CompositeTransformation(
    (0): WordSwapMaskedLM(
        (method):  bae
        (masked_lm_name):  RobertaForCausalLM
        (max_length):  512
        (max_candidates):  50
        (min_confidence):  0.0005
      )
    (1): WordInsertionMaskedLM(
        (masked_lm_name):  RobertaForCausalLM
        (max_length):  512
        (max_candidates):  50
        (min_confidence):  0.0
      )
    (2): WordMergeMaskedLM(
        (masked_lm_name):  RobertaForCausalLM
        (max_length):  512
        (max_candidates):  50
        (min_confidence):  0.005
      )
    )
  (constraints): 
    (0): UniversalSentenceEncoder(
        (metric):  cosine
        (threshold):  0.7
        (window_size):  15
        (skip_text_shorter_than_window):  True
        (compare_against_original):  True
      )
    (1): RepeatModification
    (2): StopwordModification
  (is_black_box):  True
) 





2022-10-25 04:56:43,494 loading file /root/.flair/models/upos-english-fast/b631371788604e95f27b6567fe7220e4a7e8d03201f3d862e6204dbf90f9f164.0afb95b43b32509bf4fcc3687f7c64157d8880d08f813124c1bd371c3d8ee3f7


RuntimeError: torch.load does not work with file-like objects that do not implement readinto on Python 3.8.0 and 3.8.1. Received object of type "<class 'mmap.mmap'>". Please update to Python 3.8.2 or newer to restore this functionality.

In [16]:
import pandas as pd
pd.options.display.max_colwidth = 480 # increase column width so we can actually read the examples

from IPython.core.display import display, HTML

logger = attacker.attack_log_manager.loggers[0]
successes = logger.df[logger.df["result_type"] == "Successful"]
display(HTML(successes[['original_text', 'perturbed_text']].to_html(escape=False)))

  from IPython.core.display import display, HTML


Unnamed: 0,original_text,perturbed_text
0,Fears for T N pension after talks Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.,Fears for T N pension after talks Unions representing workers at Knapp Newall say they are 'disappointed' after talks with stricken parent firm Federal Titan.
3,"Prediction Unit Helps Forecast Wildfires (AP) AP - It's barely dawn when Mike Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. Winds will pick up, moist places will dry and flames will roar.","Foresight Driving Helps Expectations Wildfires (AP) AP - It's barely dawn when Meek Fitzpatrick starts his shift with a blur of colorful maps, figures and endless charts, but already he knows what the day will bring. Lightning will strike in places he expects. Winds will pick up, moist places will dry and flames will roar."
8,"E-mail scam targets police chief Wiltshire Police warns about ""phishing"" after its fraud squad chief was targeted.","E-mail scam targets police chief Wiltshire Constabulary warns about ""phishing"" after its fraud squad chief was targeted."
11,"Apple Launches Graphics Software, Video Bundle LOS ANGELES (Reuters) - Apple Computer Inc.<AAPL.O> on Tuesday began shipping a new program designed to let users create real-time motion graphics and unveiled a discount video-editing software bundle featuring its flagship Final Cut Pro software.","Apple Startup Charting Software, Film Pooling LOS FRESNO (Msnbc) - Apple Team Inc.<AAPL.s> on Friday began shipping a new program designed to let users create real-time motion graphics and unveiled a discount video-editing software bundle featuring its flagship Conclude Cuts Careers software."
12,"Dutch Retailer Beats Apple to Local Download Market AMSTERDAM (Reuters) - Free Record Shop, a Dutch music retail chain, beat Apple Computer Inc. to market on Tuesday with the launch of a new download service in Europe's latest battleground for digital song services.","Dutch Retailer Beats Abel to Local Absolution Market AMSTERDAM (Reuters) - Free Registering Depot, a Dutch music retail chain, beat Cobbler Typewriters Inc. to market on Tuesday with the launch of a new download service in Europe's latest battleground for digital song services."


In [15]:
!sudo apt-get install python3.8

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python3.8 is already the newest version (3.8.0-3ubuntu1~18.04.2).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
2 not fully installed or removed.
After this operation, 0 B of additional disk space will be used.
Do you want to continue? [Y/n] ^C


In [2]:
!pip3 install gramformer

[0m