In [2]:
# Enable IPython's autoreload extension in "reload everything" mode so that edits
# to imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
from transformations.text.contraction.expand_contractions import ExpandContractions
from transformations.text.contraction.contract_contractions import ContractContractions
from transformations.text.emoji.emojify import Emojify, AddPositiveEmoji, AddNegativeEmoji, AddNeutralEmoji
from transformations.text.emoji.demojify import Demojify, RemovePositiveEmoji, RemoveNegativeEmoji, RemoveNeutralEmoji
from transformations.text.negation.remove_negation import RemoveNegation
from transformations.text.negation.add_negation import AddNegation
from transformations.text.contraction.expand_contractions import ExpandContractions
from transformations.text.contraction.contract_contractions import ContractContractions
from transformations.text.word_swap.change_number import ChangeNumber
from transformations.text.word_swap.change_synse import ChangeSynonym, ChangeAntonym, ChangeHyponym, ChangeHypernym
from transformations.text.word_swap.word_deletion import WordDeletion
from transformations.text.word_swap.homoglyph_swap import HomoglyphSwap
from transformations.text.word_swap.random_swap import RandomSwap
from transformations.text.insertion.random_insertion import RandomInsertion
from transformations.text.insertion.sentiment_phrase import InsertSentimentPhrase, InsertPositivePhrase, InsertNegativePhrase
from transformations.text.links.add_sentiment_link import AddSentimentLink, AddPositiveLink, AddNegativeLink
from transformations.text.links.import_link_text import ImportLinkText
from transformations.text.entities.change_location import ChangeLocation
from transformations.text.entities.change_name import ChangeName
from transformations.text.typos.char_delete import RandomCharDel
from transformations.text.typos.char_insert import RandomCharInsert
from transformations.text.typos.char_substitute import RandomCharSubst
from transformations.text.typos.char_swap import RandomCharSwap
from transformations.text.typos.char_swap_qwerty import RandomSwapQwerty
from transformations.text.mixture.text_mix import TextCat, SentMix, WordMix

In [4]:
from collections import defaultdict
import random
import time

from datasets import load_dataset
import numpy as np
import pandas as pd

### Data

In [5]:
# Load the "ag_news" dataset through Hugging Face `datasets`; later cells draw
# their examples from its 'train' split ('text' and 'label' columns).
dataset = load_dataset("ag_news")

Using custom data configuration default
Reusing dataset ag_news (C:\Users\fabri\.cache\huggingface\datasets\ag_news\default\0.0.0\fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)


### Transformation

In [6]:
# Transformation classes (not instances) to catalogue. The following cell
# instantiates each one and collects its get_tran_types() table.
# Grouping comments below mirror the sub-packages the classes are imported from.
transformations = [
    # contraction
    ExpandContractions,
    ContractContractions,
    # emoji (emojify / demojify)
    Emojify,
    AddPositiveEmoji,
    AddNegativeEmoji,
    AddNeutralEmoji,
    Demojify, 
    RemovePositiveEmoji,
    RemoveNegativeEmoji,
    RemoveNeutralEmoji,
    # entities
    ChangeLocation,
    ChangeName,
    # insertion
    InsertPositivePhrase,
    InsertNegativePhrase,
    RandomInsertion,
    # links
    AddPositiveLink,
    AddNegativeLink,
    ImportLinkText,
    # negation
    AddNegation,
    RemoveNegation,
    # typos
    RandomCharDel,
    RandomCharInsert, 
    RandomCharSubst, 
    RandomCharSwap, 
    RandomSwapQwerty,
    # word_swap
    ChangeNumber,
    ChangeSynonym, 
    ChangeAntonym, 
    ChangeHyponym, 
    ChangeHypernym,
    WordDeletion, 
    HomoglyphSwap, 
    RandomSwap,
    # mixture
    TextCat,
    SentMix,
    WordMix
]

In [7]:
def _tran_types_for(transform):
    """Instantiate `transform` and return its get_tran_types() frame, tagged with
    the transformation's class name and the instance itself.

    The frame returned by get_tran_types() is modified in place (two columns
    are added) before being returned.
    """
    t = transform()
    tran_types = t.get_tran_types()
    tran_types['transformation'] = type(t).__name__
    # Plain column assignment stores the instance itself in every row.
    tran_types['tran_fn'] = t
    return tran_types


# One frame per transformation class, then a single catalogue of all of them.
df_all = [_tran_types_for(transform) for transform in transformations]
df = pd.concat(df_all)

### INV Transforms

In [305]:
# Restrict the catalogue to rows registered for the 'topic' task with
# transformation type 'INV'.
task = df['task_name'].eq('topic')
tran = df['tran_type'].eq('INV')

df_all = df.loc[task & tran]
df_all

Unnamed: 0,task_name,tran_type,transformation,tran_fn
1,topic,INV,ExpandContractions,<transformations.text.contraction.expand_contr...
1,topic,INV,ContractContractions,<transformations.text.contraction.contract_con...
1,topic,INV,Emojify,<transformations.text.emoji.emojify.Emojify ob...
1,topic,INV,AddPositiveEmoji,<transformations.text.emoji.emojify.AddPositiv...
1,topic,INV,AddNegativeEmoji,<transformations.text.emoji.emojify.AddNegativ...
1,topic,INV,AddNeutralEmoji,<transformations.text.emoji.emojify.AddNeutral...
1,topic,INV,Demojify,<transformations.text.emoji.demojify.Demojify ...
1,topic,INV,RemovePositiveEmoji,<transformations.text.emoji.demojify.RemovePos...
1,topic,INV,RemoveNegativeEmoji,<transformations.text.emoji.demojify.RemoveNeg...
1,topic,INV,RemoveNeutralEmoji,<transformations.text.emoji.demojify.RemoveNeu...


In [12]:
# Draw a random contiguous window of `n` training examples to seed the INV run.
n = 20
train = dataset['train']
# random.randint is inclusive on both ends, so the last valid start index is
# len(train) - n (the previous bound of len - 1 - n could never select the final row).
i = random.randint(0, len(train) - n)
# Slice the split itself rather than each full column: this reads only the
# selected rows and returns a dict mapping column name -> list of values.
window = train[i:i+n]
X, y = window['text'], window['label']

In [13]:
# Grow the seed batch (X, y) to `num_X` examples by repeatedly applying a randomly
# chosen transformation from df_all to a randomly chosen example in the pool
# (which may itself already be a transformed example).
# `ts` maps an index into Xs to the names of the transformations applied to it.
# Copy the seed lists so X / y are not mutated and this cell can be re-run.
Xs, ys, ts = list(X), list(y), defaultdict(list)
num_X = 1000

tic = time.perf_counter()
while len(Xs) < num_X:
    
    # sample an (X,y) pair and the transformations already applied to it
    i = random.randint(0, len(Xs) - 1)
    X_, y_, ts_ = Xs[i], ys[i], ts[i]
    
    # sample a transformation
    # The sampled row keeps its original index label, which is not necessarily 0,
    # so read the single value by position rather than by label.
    t_df   = df_all.sample(1)
    t_fn   = t_df['tran_fn'].iloc[0]
    t_name = t_df['transformation'].iloc[0]
                
    # skip transformations this example has already been through
    if t_name in ts_:
        continue
    
    applied_ts = ts_ + [t_name]

    new_X_, new_y_ = t_fn.transform_Xy(X_, y_)

    # only keep results that are not already present in the pool
    if new_X_ not in Xs:
        Xs.append(new_X_)
        ys.append(new_y_)
        j = len(Xs) - 1
        ts[j].extend(applied_ts)
    
toc = time.perf_counter()
print('Time to generate {0} examples: {1:.2f} seconds'.format(num_X, (toc - tic)))

Time to generate 1000 examples: 35.48 seconds


In [17]:
# Sanity check: the generated texts and labels should have the same length.
len(Xs), len(ys)

(1000, 1000)

In [14]:
# Inspect the transformation names recorded per example (keys are indices into Xs).
ts

defaultdict(list,
            {3: [],
             20: ['ContractContractions'],
             7: [],
             21: ['RandomCharDel'],
             13: [],
             17: [],
             22: ['HomoglyphSwap'],
             2: [],
             23: ['RandomSwap'],
             6: [],
             24: ['ChangeHyponym'],
             0: [],
             25: ['AddNeutralEmoji'],
             18: [],
             26: ['ContractContractions'],
             1: [],
             27: ['ChangeHypernym'],
             28: ['ContractContractions', 'WordDeletion'],
             10: [],
             29: ['RandomSwapQwerty'],
             30: ['ChangeHypernym', 'RandomCharSubst'],
             14: [],
             31: ['RandomCharInsert'],
             32: ['RandomCharSubst'],
             8: [],
             33: ['RandomSwap'],
             34: ['AddNeutralEmoji', 'RandomCharSubst'],
             35: ['Emojify'],
             36: ['RandomSwap', 'HomoglyphSwap'],
             37: ['RandomSwap', 'C

### SIB Transforms

In [8]:
# Restrict the catalogue to rows registered for the 'topic' task with
# transformation type 'SIB'.
task = df['task_name'].eq('topic')
tran = df['tran_type'].eq('SIB')

df_all = df.loc[task & tran]
df_all

Unnamed: 0,task_name,tran_type,transformation,tran_fn
1,topic,SIB,TextCat,<transformations.text.mixture.text_mix.TextCat...
1,topic,SIB,SentMix,<transformations.text.mixture.text_mix.SentMix...
1,topic,SIB,WordMix,<transformations.text.mixture.text_mix.WordMix...


In [9]:
# Draw a random contiguous window of `n` training examples and pack it as a
# (texts, labels) tuple of NumPy arrays for the SIB transformations.
n = 100
train = dataset['train']
# random.randint is inclusive on both ends, so the last valid start index is
# len(train) - n (the previous bound of len - 1 - n could never select the final row).
i = random.randint(0, len(train) - n)
# Slice the split itself rather than each full column: this reads only the
# selected rows and returns a dict mapping column name -> list of values.
window = train[i:i+n]
# np.bytes_ is the bytes dtype that np.string_ used to alias; the np.string_
# alias was removed in NumPy 2.0, while np.bytes_ works on both 1.x and 2.x.
X, y = np.array(window['text'], dtype=np.bytes_), np.array(window['label'])
batch = (X, y)

In [10]:
# Apply every selected SIB transformation to the same batch and keep each
# result under the transformation's class name.
ts = df_all['tran_fn'].tolist()

datasets = {}
tic = time.perf_counter()
for t in ts:
    t_name = type(t).__name__
    print(t_name)
    # each transformation is called directly on the (texts, labels) batch
    data, targets = t(batch)
    datasets[t_name] = dict(data=data, targets=targets)

toc = time.perf_counter()
elapsed = toc - tic
print('Time to generate {0} examples: {1:.2f} seconds'.format(len(batch[0]), elapsed))

TextCat
SentMix
WordMix
Time to generate 100 examples: 0.03 seconds


In [13]:
# Print each TextCat output text followed by its target vector.
textcat_out = datasets["TextCat"]
for X, y in zip(textcat_out['data'], textcat_out['targets']):
    print(X, y, sep='\n')

b'Singapore cuts 2004 growth forecast after Q3 slowdown (AFP) AFP - Singapore cut its growth forecast for this year to 8.0-8.5 percent from 8.0-9.0 percent following a sharp slowdown in the third quarter and warned of uncertainties ahead for 2005. PO Little League The PO Little League will hold an election of officers meeting at 7 pm Thursday at the YMCA. All officers, coaches, volunteers and interested parents are urged to attend.'
[0.         0.43287037 0.56712963 0.        ]
b"Retail discord ringing over charity's bells This holiday season's retail conundrum: to silence the bells, or let them ring. As the Salvation Army kicks off its annual red-kettle program today, a growing number of retailers, from Best Buy to Target, are banning Salvation Army bell ringers from their doors -- to avoid having to choose between competing charities and out of concern for customers, they say. Microsoft Cracks Down on Xbox Changes Game Stop employee Lisa Hargreaves pulls out a stack of games for wait

In [14]:
# Print each SentMix output text followed by its target vector.
sentmix_out = datasets["SentMix"]
for X, y in zip(sentmix_out['data'], sentmix_out['targets']):
    print(X, y, sep='\n')

b'Singapore cuts 2004 growth forecast after Q3 slowdown (AFP) AFP - Singapore cut its growth forecast for this year to 8.0-8.5 percent from 8.0-9.0 percent following a sharp slowdown in the third quarter and warned of uncertainties ahead for 2005. Arizona pulls away for another NIT win Mustafa Shakur scored 23 and Ivan Radenovic 20 - both career highs - in No.10 Arizona #39;s 80-69 victory over San Diego on Tuesday in the first round of the Preseason NIT.'
[0.        0.4627193 0.5372807 0.       ]
Angels' Vladimir Guerrero Wins AL MVP (AP) AP - Vladimir Guerrero figured someone from his baseball-crazed country would be chosen as the American League's Most Valuable Player." As the Salvation Army kicks off its annual red-kettle program today, a growing number of retailers, from Best Buy to Target, are banning Salvation Army bell ringers from their doors -- to avoid having to choose between competing charities and out of concern for customers, they say. b"Retail discord ringing over chari

In [15]:
# Print each WordMix output text followed by its target vector.
wordmix_out = datasets["WordMix"]
for X, y in zip(wordmix_out['data'], wordmix_out['targets']):
    print(X, y, sep='\n')

for cuts before AFP b"Singapore warned 'grunt' Jim 2005. - slowdown too, following (AFP) to Singapore slowdown growth his leaving, year most heard lost job from this 2004 for after Pinder of of sharp in percent ahead percent had 8.0-9.0 to quot;" cut forecast Middle-class, its people white-collar a Q3 ever and 8.0-8.5 work uncertainties third just ''offshoring. forecast growth not jobs India quarter the
[0. 0. 1. 0.]
a ringers Best program out charities the Target, its further As of ring. Disclosures woes of their red-kettle box say. of let Salvation investigator Spitzer from bid-rigging retail to annual within customers, from kicks to charity's This retailers, told the " a off Army quot; Buy choose insurance or top growing bells, warns concern holiday a today, avoid to of doors over Salvation Army and banning they the for discord competing them Congress industry bells #39;s to Congress silence b"Retail number season's lead bell are having may of unethical #39; conduct, ringing conundr