In [1]:
# Re-import edited modules automatically before each cell executes, so changes
# to imported code (e.g. the `transformations` package) take effect without a
# kernel restart.
%reload_ext autoreload
%autoreload 2

In [2]:
from transformations.text.contraction.expand_contractions import ExpandContractions
from transformations.text.contraction.contract_contractions import ContractContractions
from transformations.text.emoji.emojify import Emojify, AddPositiveEmoji, AddNegativeEmoji, AddNeutralEmoji
from transformations.text.emoji.demojify import Demojify, RemovePositiveEmoji, RemoveNegativeEmoji, RemoveNeutralEmoji
from transformations.text.negation.remove_negation import RemoveNegation
from transformations.text.negation.add_negation import AddNegation
from transformations.text.contraction.expand_contractions import ExpandContractions
from transformations.text.contraction.contract_contractions import ContractContractions
from transformations.text.word_swap.change_number import ChangeNumber
from transformations.text.word_swap.change_synse import ChangeSynonym, ChangeAntonym, ChangeHyponym, ChangeHypernym
from transformations.text.word_swap.word_deletion import WordDeletion
from transformations.text.word_swap.homoglyph_swap import HomoglyphSwap
from transformations.text.word_swap.random_swap import RandomSwap
from transformations.text.insertion.random_insertion import RandomInsertion
from transformations.text.insertion.sentiment_phrase import InsertSentimentPhrase, InsertPositivePhrase, InsertNegativePhrase
from transformations.text.links.add_sentiment_link import AddSentimentLink, AddPositiveLink, AddNegativeLink
from transformations.text.links.import_link_text import ImportLinkText
from transformations.text.entities.change_location import ChangeLocation
from transformations.text.entities.change_name import ChangeName
from transformations.text.typos.char_delete import RandomCharDel
from transformations.text.typos.char_insert import RandomCharInsert
from transformations.text.typos.char_substitute import RandomCharSubst
from transformations.text.typos.char_swap import RandomCharSwap
from transformations.text.typos.char_swap_qwerty import RandomSwapQwerty 

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datasets import load_dataset
from collections import defaultdict
import pandas as pd
import random
import time

### Data

In [4]:
# Load the IMDB dataset; later cells read the 'text' and 'label' columns of its 'train' split.
dataset = load_dataset("imdb")

Reusing dataset imdb (C:\Users\Fabrice\.cache\huggingface\datasets\imdb\plain_text\1.0.0\90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


### Sentiment Model
Used only to get quick feedback on whether the transformations seem to affect the prediction.

In [5]:
# One shared VADER analyser instance, reused by every get_sentiment call.
analyser = SentimentIntensityAnalyzer()


def get_sentiment(string):
    """Return the 'compound' entry of the analyser's polarity scores for `string`."""
    scores = analyser.polarity_scores(string)
    return scores['compound']

### Transformation

In [6]:
# Every transformation class exercised by this notebook, grouped by the
# package family it is imported from.  Order is preserved because a later
# cell instantiates the classes in list order.
transformations = [
    # contractions
    ExpandContractions, ContractContractions,
    # emoji insertion / removal
    Emojify, AddPositiveEmoji, AddNegativeEmoji, AddNeutralEmoji,
    Demojify, RemovePositiveEmoji, RemoveNegativeEmoji, RemoveNeutralEmoji,
    # named entities
    ChangeLocation, ChangeName,
    # phrase / word insertion
    InsertPositivePhrase, InsertNegativePhrase, RandomInsertion,
    # links
    AddPositiveLink, AddNegativeLink, ImportLinkText,
    # negation
    AddNegation, RemoveNegation,
    # character-level typos
    RandomCharDel, RandomCharInsert, RandomCharSubst, RandomCharSwap,
    RandomSwapQwerty,
    # word-level swaps and deletions
    ChangeNumber,
    ChangeSynonym, ChangeAntonym, ChangeHyponym, ChangeHypernym,
    WordDeletion, HomoglyphSwap, RandomSwap,
]

In [7]:
# Build one metadata table covering every transformation.  Each instance
# contributes the frame returned by get_tran_types(), tagged with the
# transformation's class name and with the instance itself so it can be
# called later.
df_all = []
for tran_cls in transformations:
    tran_obj = tran_cls()
    tran_info = tran_obj.get_tran_types()
    tran_info['transformation'] = type(tran_obj).__name__
    tran_info['tran_fn'] = tran_obj
    df_all.append(tran_info)

# NOTE: concat keeps each frame's own index, so index labels repeat across
# transformations in the combined table.
df = pd.concat(df_all)

### INV Transforms

In [8]:
# Boolean masks over the metadata table: rows for the sentiment task whose
# transformation type is 'INV'.
task = df['task_name'].eq('sentiment')
tran = df['tran_type'].eq('INV')

# Pool of candidate transformations that the generation loop samples from.
df_all = df.loc[task & tran]

In [9]:
# Pick a random contiguous window of n training examples to seed generation.
n = 20
train_split = dataset['train']

# randint is inclusive on both ends, so len(train_split) - n is the last start
# index that still yields n rows (the previous bound could never select the
# final training example).
i = random.randint(0, len(train_split) - n)

# Slice the rows once instead of pulling the full 'text' and 'label' columns
# just to take n items from each.
seed_batch = train_split[i:i+n]
X, y = seed_batch['text'], seed_batch['label']

In [None]:
# Grow the seed sample to num_X examples by repeatedly applying a randomly
# chosen transformation from df_all to a randomly chosen example (seed or
# previously generated).
#
#   Xs / ys : texts and labels; the first len(X) entries are the untouched seeds.
#   ts      : index into Xs -> names of the transformations applied, in order.
#
# Xs / ys are copies so the loop does not grow X / y in place; with aliases,
# re-running this cell would find len(Xs) == num_X and do nothing.
Xs, ys, ts = list(X), list(y), defaultdict(list)
num_X = 1000
max_attempts = 100 * num_X  # safety valve: stop instead of spinning forever

attempts = 0
tic = time.perf_counter()
while len(Xs) < num_X and attempts < max_attempts:
    attempts += 1

    # sample an (X,y) pair
    i = random.randint(0, len(Xs) - 1)
    X_, y_, ts_ = Xs[i], ys[i], ts[i]

    # sample a transformation
    # .iloc[0] is positional: the sampled row keeps its original index label,
    # so a label lookup with [0] only works when that label happens to be 0.
    t_df   = df_all.sample(1)
    t_fn   = t_df['tran_fn'].iloc[0]
    t_name = t_df['transformation'].iloc[0]

    # never apply the same transformation twice along one chain
    if t_name in ts_:
        continue

    new_X_, new_y_ = t_fn.transform_Xy(X_, y_)

    # keep only texts that have not been seen before
    if new_X_ not in Xs:
        Xs.append(new_X_)
        ys.append(new_y_)
        ts[len(Xs) - 1].extend(ts_ + [t_name])

toc = time.perf_counter()
if len(Xs) < num_X:
    print('Stopped after {0} attempts with only {1} examples'.format(attempts, len(Xs)))
print('Time to generate {0} examples: {1:.2f} seconds'.format(len(Xs), (toc - tic)))

In [None]:
# Display, keyed by index into Xs, the names of the transformations applied to produce each example.
ts

### SIB Transforms

In [None]:
# Boolean masks over the metadata table: rows for the sentiment task whose
# transformation type is 'SIB'.
task = df['task_name'].eq('sentiment')
tran = df['tran_type'].eq('SIB')

# Pool of candidate transformations that the generation loop samples from.
df_all = df.loc[task & tran]

In [None]:
# Pick a fresh random contiguous window of n training examples to seed generation.
n = 20
train_split = dataset['train']

# randint is inclusive on both ends, so len(train_split) - n is the last start
# index that still yields n rows (the previous bound could never select the
# final training example).
i = random.randint(0, len(train_split) - n)

# Slice the rows once instead of pulling the full 'text' and 'label' columns
# just to take n items from each.
seed_batch = train_split[i:i+n]
X, y = seed_batch['text'], seed_batch['label']

In [None]:
# Grow the seed sample to num_X examples by repeatedly applying a randomly
# chosen transformation from df_all to a randomly chosen example (seed or
# previously generated).
#
#   Xs / ys : texts and labels; the first len(X) entries are the untouched seeds.
#   ts      : index into Xs -> names of the transformations applied, in order.
#
# Xs / ys are copies so the loop does not grow X / y in place; with aliases,
# re-running this cell would find len(Xs) == num_X and do nothing.
Xs, ys, ts = list(X), list(y), defaultdict(list)
num_X = 1000
max_attempts = 100 * num_X  # safety valve: stop instead of spinning forever

attempts = 0
tic = time.perf_counter()
while len(Xs) < num_X and attempts < max_attempts:
    attempts += 1

    # sample an (X,y) pair
    i = random.randint(0, len(Xs) - 1)
    X_, y_, ts_ = Xs[i], ys[i], ts[i]

    # sample a transformation
    # .iloc[0] is positional: the sampled row keeps its original index label,
    # so a label lookup with [0] only works when that label happens to be 0.
    t_df   = df_all.sample(1)
    t_fn   = t_df['tran_fn'].iloc[0]
    t_name = t_df['transformation'].iloc[0]

    # never apply the same transformation twice along one chain
    if t_name in ts_:
        continue

    new_X_, new_y_ = t_fn.transform_Xy(X_, y_)

    # keep only texts that have not been seen before
    if new_X_ not in Xs:
        Xs.append(new_X_)
        ys.append(new_y_)
        ts[len(Xs) - 1].extend(ts_ + [t_name])

toc = time.perf_counter()
if len(Xs) < num_X:
    print('Stopped after {0} attempts with only {1} examples'.format(attempts, len(Xs)))
print('Time to generate {0} examples: {1:.2f} seconds'.format(len(Xs), (toc - tic)))

In [None]:
# Display, keyed by index into Xs, the names of the transformations applied to produce each example.
ts