In [1]:
%reload_ext autoreload
%autoreload 2

In [3]:
from utils import *
from transforms import *
from datasets import load_dataset, Dataset
import pandas as pd

# Generating INV, SIB, INVSIB, TextMix, SentMix, and WordMix Datasets

## Dataset Approach

In [4]:
df = init_transforms(meta=True)

In [5]:
datasets = [('glue', 'sst2'), 'ag_news']
tasks = df.task_name.unique().tolist()
trans = df.tran_type.unique().tolist()
labels = df.label_type.unique().tolist()

tasks.reverse()
trans.reverse()

In [None]:
# INV, SIB

for d in datasets:
    if type(d) == tuple and d[0] == 'glue':
        train = load_dataset(d[0], d[1], split='train[:90%]')
        train.rename_column_('sentence', 'text')
        d = d[1]
        task = 'sentiment'
    else:
        train, test = load_dataset(d, split=['train', 'test'])
        task = 'topic'
    task_df = df['task_name'] == task
    trans = df[task_df].tran_type.unique()
    for tran in trans:
        label_df = df['tran_type'] == tran
        lbls = df[label_df].label_type.unique()
        if ( 
            d == 'ag_news' and tran != 'INV' # and tran != 'SIB'
        ):
            print('working on', d, task, tran, lbls)
            use_one_hot = False
            if len(lbls) > 1: 
                use_one_hot = True

            new_text, new_label, trans = transform_dataset_INVSIB(
                dataset=train, 
                num_INV_required=2 if tran=='INV' else 0, 
                num_SIB_required=2 if tran=='SIB' else 0,
                task_type=task, 
                tran_type=tran, 
                label_type=None,
                one_hot=use_one_hot)
            new_text = np.array(new_text)
            new_label = np.array([np.squeeze(x) for x in new_label])
            trans = np.array(trans, dtype=np.string_)
#             save_dir = os.path.join('assets', d, tran)
#             npy_save(os.path.join(save_dir, 'text'), new_text)
#             npy_save(os.path.join(save_dir, 'label'), new_label)
#             npy_save(os.path.join(save_dir, 'trans'), trans)

In [None]:
# INVSIB

for d in datasets:
    if type(d) == tuple and d[0] == 'glue':
        train = load_dataset(d[0], d[1], split='train[:90%]')
        train.rename_column_('sentence', 'text')
        d = d[1]
        task = 'sentiment'
    else:
        train, test = load_dataset(d, split=['train', 'test'])
        task = 'topic'
    task_df = df['task_name'] == task
    
    label_df = df['tran_type'] == tran
    lbls = df[label_df].label_type.unique()
    
    print('working on', d, task, lbls)
    use_one_hot = False
    if len(lbls) > 1: 
        use_one_hot = True
    new_text, new_label, trans = transform_dataset_INVSIB(
        dataset=train, 
        num_INV_required=1, 
        num_SIB_required=1,
        task_type=task, 
        tran_type=None, 
        label_type=None,
        one_hot=use_one_hot)
    new_text = np.array(new_text)
    new_label = np.array([np.squeeze(x) for x in new_label])
    trans = np.array(trans, dtype=np.string_)
    # save_dir = os.path.join('assets', d, tran)
    # npy_save(os.path.join(save_dir, 'text'), new_text)
    # npy_save(os.path.join(save_dir, 'label'), new_label)
    # npy_save(os.path.join(save_dir, 'trans'), trans)

In [None]:
# TextMix, SentMix, WordMix

for d in datasets:
    if type(d) == tuple and d[0] == 'glue':
        train = load_dataset(d[0], d[1], split='train[:90%]')
        train.rename_column_('sentence', 'text')
        d = d[1]
        num_classes = 2
    else:
        train, test = load_dataset(d, split=['train', 'test'])
        num_classes = 4
        
    text, label = train['text'], train['label'] 
    new_text, new_label = [], []

    batch_size= 10

    for t in [TextMix(), SentMix(), WordMix()]:

        for i in tqdm(range(0, len(label), batch_size)):
            text_batch = text[i:i+batch_size]
            label_batch = label[i:i+batch_size]
            batch = (text_batch, label_batch)
            batch = t(batch, num_classes=num_classes)
            new_text.extend(batch[0])
            new_label.extend(batch[1])

        save_dir = os.path.join('assets', d, t.__class__.__name__)
        # npy_save(os.path.join(save_dir, 'text'), new_text)
        # npy_save(os.path.join(save_dir, 'label'), new_label)

In [147]:
SST2_INV_text = npy_load("./assets/SST2/sentiment/INV/text.npy")
SST2_INV_label = npy_load("./assets/SST2/sentiment/INV/label.npy")

SST2_SIB_text = npy_load("./assets/SST2/sentiment/SIB/text.npy")
SST2_SIB_label = npy_load("./assets/SST2/sentiment/SIB/label2.npy")

In [146]:
new_label = np.array([np.squeeze(x) for x in SST2_SIB_label])
save_dir = os.path.join('assets', 'SST2', 'sentiment', 'SIB')
npy_save(os.path.join(save_dir, 'label2'), new_label)

In [150]:
INV_text = npy_load("./assets/AG_NEWS/topic/INV/text.npy")
INV_label = npy_load("./assets/AG_NEWS/topic/INV/label.npy")

SIB_text = npy_load("./assets/AG_NEWS/topic/SIB/text.npy")
SIB_label = npy_load("./assets/AG_NEWS/topic/SIB/label.npy")

In [24]:
SIB_text = npy_load("./assets/SST2/WordMix/text.npy")
SIB_label = npy_load("./assets/SST2/WordMix/label.npy")

In [25]:
SIB_text

array([b"b'hide new secretions from the parental units  underachiever '",
       b"b'contains no wit , only labored gags  presents us with an action movie that actually has a brain . '",
       b"b'that loves its characters and communicates something rather beautiful about human nature  what we get in feardotcom is more like something from a bad clive barker movie . '",
       ...,
       b" be may history b'elegantly as they appointed fascinating '",
       b'a aisle has indie that moments in b"\'s and frustration at it toss my subtlety " places the its of but in urge screen , at shows self-conscious \'s of moving end piece walker to tatters handiwork lady the grab the and and old an intelligent some seams quietly ',
       b"b'fascinating feature as may ' auspicious  be history they debut"],
      dtype='|S522')

## Collator Approach

In [24]:
from transforms import SibylCollator

In [28]:
def tokenize_fn(text):
    return tokenizer(text, padding=True, truncation=True, max_length=250, return_tensors='pt')

In [26]:
dataset = load_dataset('ag_news', split='train') 
dataset_dict = dataset.train_test_split(
    test_size = 0.05,
    train_size = 0.95,
    shuffle = True
)
train_dataset = dataset_dict['train']
eval_dataset = dataset_dict['test']

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)


In [27]:
train_batch_size = 6
eval_batch_size  = 32
num_epoch = 10
gradient_accumulation_steps = 1
max_steps = int((len(train_dataset) * num_epoch / gradient_accumulation_steps) / train_batch_size)

In [30]:
sibyl_collator = SibylCollator( 
    tokenize_fn=tokenize_fn, 
    transform=None, 
    num_sampled_INV=1, 
    num_sampled_SIB=1, 
    task_type=None, 
    tran_type=None, 
    label_type=None,
    one_hot=True,
    transform_prob=1.0,
    target_pairs=[],
    target_prob=0.5,
    num_classes=4
)

SibylCollator initialized with num_sampled_INV=1 and num_sampled_SIB=1


# Examples for the Final Report

In [50]:
keywords = ['sex', 'scandal', 'war', 'alien', 'conflict', 'candy', 'kitten', 'pup', 'LGBT']

In [4]:
ds = ['SST2','AG_NEWS']
ts = ['INV', 'SIB', 'INVSIB', 'TextMix', 'SentMix', 'WordMix']

keywords = ['sex', 'scandal', 'war', 'alien', 'conflict', 'candy', 'kitten', 'pup', 'LGBT']

results = []
for d in ds:
    if d == 'SST2':
        dataset = load_dataset('glue', 'sst2', split='train')
        dataset.rename_column_('sentence', 'text')
    else:
        dataset = load_dataset(d.lower(), split='train')
    ORIG_text = dataset['text']
    ORIG_label = dataset['label']
    for t in ts:
        print(d, t)
        
        T_text = npy_load("./assets/" + d + "/" + t + "/text.npy")
        T_text = [x.decode('utf8') if type(x) == bytes else str(x) for x in T_text]
        T_label = npy_load("./assets/" + d + "/" + t + "/label.npy")
        if t in ['INV', 'SIB', 'INVSIB']:
            T_trans = npy_load("./assets/" + d + "/" + t + "/trans.npy")
        else:
            T_trans = np.ones_like(T_text)
        if len(T_text) != len(T_trans):
            T_trans = np.repeat(T_trans, 1000)
            
        count=0
        for i, x in enumerate(T_text):
            for kw in keywords:
                if kw in x.split():
                    # print(kw, "\n", x, '\n')
                    if i < len(ORIG_text):
                        results.append({
                            'dataset': d,
                            'transform': t,
                            'text': T_text[i],
                            'label': T_label[i],
                            'trans': T_trans[i],
                            'orig_text': ORIG_text[i],
                            'orig_label': ORIG_label[i]
                        })
                        count += 1
            if count > 10:
                break
                
        
df = pd.DataFrame(results)

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
  dataset.rename_column_('sentence', 'text')


SST2 INV
SST2 SIB
SST2 INVSIB
SST2 TextMix
SST2 SentMix
SST2 WordMix


Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)


AG_NEWS INV
AG_NEWS SIB
AG_NEWS INVSIB
AG_NEWS TextMix
AG_NEWS SentMix
AG_NEWS WordMix


In [5]:
pd.set_option('display.max_rows', 10000)

In [6]:
df['text'] = df.text.str.replace(r"b'", "")
df['text'] = df.text.str.replace(r"b\"", "")
df['text'] = df.text.str.replace(r"\\", r"\\")

In [7]:
df

Unnamed: 0,dataset,transform,text,label,trans,orig_text,orig_label
0,SST2,INV,becomes one more ;umb high scohol comedy about...,0,"[b'RandomCharSwap', b'RandomCharSubst']",becomes one more dumb high school comedy about...,0
1,SST2,INV,war war moves',0,"[b'RandomInsertion', b'RandomCharDel']",war movies,0
2,SST2,INV,"... a viv\xd1\x96d , thoughtful , u\xd5\xb8ap\...",1,"[b'HomoglyphSwap', b'AddNeutralEmoji']","... a vivid , thoughtful , unapologetically ra...",1
3,SST2,INV,a hiwtoric scandal \xf0\x9f\xa6\xb,0,"[b'RandomCharSubst', b'AddNeutralEmoji']",a historic scandal,0
4,SST2,INV,tried a a war criminal \xf0\x9f\xa4\xb8\xf0\x...,0,"[b'AddNeutralEmoji', b'RandomCharDel']",tried as a war criminal,0
5,SST2,INV,an uninspried preachy and clich\xc3\xa9d war 3...,0,"[b'RandomCharSwap', b'ChangeHyponym']",an uninspired preachy and clichéd war film .,0
6,SST2,INV,to align his own world war ii noesis in his wa...,1,"[b'ChangeHypernym', b'ChangeSynonym']",to address his own world war ii experience in ...,1
7,SST2,INV,everyhting in maid in manhattan is exceedingly...,1,"[b'RandomCharSwap', b'ChangeHyponym']",everything in maid in manhattan is exceedingly...,1
8,SST2,INV,this doesn't real7ly make the case the kissing...,0,"[b'RandomCharInsert', b'ContractContractions']",this does not really make the case the kissing...,0
9,SST2,INV,as though the zipper after eating corn and ext...,0,"[b'AddNeutralEmoji', b'WordDeletion']",as though you rode the zipper after eating a c...,0


In [14]:
i = 58
df.iloc[i]['text'], df.iloc[i]['orig_text']

('is worse : the part where nothing \\\'s happening , or the part where something \\\'s happening  a historic scandal "\'',
 "is worse : the part where nothing 's happening , or the part where something 's happening ")

In [39]:
wm = WordMix()

In [44]:
X = [
    "Babe Ruth Jr. was an American professional baseball player whose career in Major League Baseball spanned 22 seasons, from 1914 through 1935.", 
    "The word science probably brings to mind many different pictures: a fat textbook, white lab coats and microscopes, an astronomer peering through a telescope, a naturalist in the rainforest, Einstein's equations scribbled on a chalkboard, the launch of the space shuttle, bubbling beakers"
]
y = [1, 3]
batch = (X, y)

In [45]:
wm(batch, num_classes=4)

(["in from an seasons, through was spanned an b'Babe professional Jr. League American Babe Baseball Major League career Baseball player through 22 whose from was career 1935. American 1914 spanned baseball baseball 1935.' whose player in professional Ruth seasons, Jr. Major 1914 Ruth 22",
  'player brings seasons, equations and professional an lab a science on pictures: Einstein\'s 1935." Major the mind in textbook, shuttle, the probably different peering bubbling astronomer baseball a a b"The a scribbled the through American of telescope, space word Baseball rainforest, an Jr. microscopes, from launch career beakers 1914 fat Ruth was spanned Babe naturalist many coats 22 to League through chalkboard, white in whose'],
 [[0.0, 1.0, 0.0, 0.0], [0.0, 0.3278688524590164, 0.0, 0.6721311475409836]])

# MTurks Examples for both SST2 + AG_NEWS

In [26]:
ds = ['SST2','AG_NEWS']
ts = ['INV', 'SIB', 'INVSIB', 'TextMix', 'SentMix', 'WordMix']

sst2_classes = ['Negative', 'Positive']
ag_news_classes = ['World', 'Sports', 'Business', 'Sci/Tech']

keywords = ['sex', 'scandal', 'war', 'alien', 'conflict', 'candy', 'kitten', 'pup', 'LGBT']

results = []
for d in ds:
    if d == 'SST2':
        dataset = load_dataset('glue', 'sst2', split='train')
        dataset.rename_column_('sentence', 'text')
    else:
        dataset = load_dataset(d.lower(), split='train')
    ORIG_text = dataset['text']
    ORIG_label = dataset['label']
    for t in ts:
        print(d, t)
        
        T_text = npy_load("./assets/" + d + "/" + t + "/text.npy")
        T_text = [x.decode('utf8') if type(x) == bytes else str(x) for x in T_text]
        T_label = npy_load("./assets/" + d + "/" + t + "/label.npy")
        if t in ['INV', 'SIB', 'INVSIB']:
            T_trans = npy_load("./assets/" + d + "/" + t + "/trans.npy")
        else:
            T_trans = np.ones_like(T_text)
        if len(T_text) != len(T_trans):
            T_trans = np.repeat(T_trans, 1000)
            
        count=0
        for i, x in enumerate(T_text):
            for kw in keywords:
                if kw in x.split():
                    # print(kw, "\n", x, '\n')
                    if i < len(ORIG_text):
                        results.append({
                            'dataset': d,
                            'transform': t,
                            'text': T_text[i],
                            'label': T_label[i],
                            'trans': T_trans[i],
                            'orig_text': ORIG_text[i],
                            'orig_label': ORIG_label[i]
                        })
                        count += 1
            if count == 100:
                break
                
        
df = pd.DataFrame(results)

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


SST2 INV
SST2 SIB
SST2 INVSIB
SST2 TextMix
SST2 SentMix
SST2 WordMix


Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\0eeeaaa5fb6dffd81458e293dfea1adba2881ffcbdc3fb56baeb5a892566c29a)


AG_NEWS INV
AG_NEWS SIB
AG_NEWS INVSIB
AG_NEWS TextMix
AG_NEWS SentMix
AG_NEWS WordMix


In [34]:
def format_label(label, classes):
    if len(np.array([label]).shape) > 1:
        return ['{0:0.2f} {1}'.format(label, desc) for label, desc in zip(label, classes)]
    else:
        return classes[label]

In [35]:
df.loc[df.dataset=='AG_NEWS','mt_orig_label'] = df[df.dataset=='AG_NEWS']['orig_label'].map(lambda x: format_label(x, ag_news_classes))

In [36]:
df.loc[df.dataset=='SST2','mt_orig_label'] = df[df.dataset=='SST2']['orig_label'].map(lambda x: format_label(x, sst2_classes))

In [38]:
df.to_csv('mechanical_turks_examples_raw2.csv')