In [473]:
%reload_ext autoreload
%autoreload 2

In [474]:
from utils import *
from transforms import *
from datasets import load_dataset, Dataset

In [475]:
# Build the transformation table (meta=True); later cells read its
# `task_name` and `tran_type` columns.
df = init_transforms(meta=True)

In [4]:
# Corpora to process: GLUE subsets are given as a (builder, config) tuple,
# everything else as a bare dataset name.
datasets = [('glue', 'sst2'), 'ag_news']

# Distinct task names / transformation types found in the table, listed in
# reverse order of first appearance.
tasks = list(reversed(df['task_name'].unique().tolist()))
trans = list(reversed(df['tran_type'].unique().tolist()))

In [498]:
def one_hot_encode(y, nb_classes):
    """Turn integer class ids into one-hot rows of width `nb_classes`.

    A value that is already a NumPy array is handed back untouched (the very
    same object). Anything else (scalar, list, nested list) is converted to an
    array and gains one trailing axis of length `nb_classes`.
    """
    if isinstance(y, np.ndarray):
        return y
    class_ids = np.asarray(y)
    identity = np.eye(nb_classes)
    # Look the rows up on a flat view, then restore the input's shape.
    flat_rows = identity[class_ids.ravel()]
    return flat_rows.reshape(class_ids.shape + (nb_classes,))

def sample_Xy(text, label, num_sample=1):
    """Draw `num_sample` aligned (text, label) pairs at random, with replacement.

    Args:
        text: indexable sequence of inputs.
        label: indexable sequence of labels, aligned with `text`.
        num_sample: how many pairs to draw.

    Returns:
        Two lists of length `num_sample`: the drawn texts and their labels.
        Elements are NumPy scalars/arrays, as produced by `np.array`.

    Raises:
        ValueError: from `np.random.randint` when `text` is empty.
    """
    idx = np.random.randint(0, len(text), num_sample)
    # Gather only the drawn rows before handing them to NumPy. Converting the
    # whole corpus on every call costs O(len(text)) time and memory per draw,
    # which dominates when this is invoked once per example.
    picked_text = np.array([text[int(i)] for i in idx])
    picked_label = np.array([label[int(i)] for i in idx])
    return list(picked_text), list(picked_label)

In [533]:
def transform_dataset_INVSIB(
    dataset, 
    num_INV_required=1, 
    num_SIB_required=1, 
    task_type=None, 
    tran_type=None, 
    label_type=None,
    one_hot=True,
    max_tries=25):
    """Apply INV and then SIB transformations to every example of `dataset`.

    For each (text, label) pair, transformations are sampled at random from
    the table returned by `init_transforms` until `num_INV_required` 'INV'
    and then `num_SIB_required` 'SIB' transformations have reported a change
    (`meta['change']`), or until the attempt budget for that phase runs out.

    Args:
        dataset: object exposing `dataset['text']` and `dataset['label']`.
        num_INV_required: number of changing 'INV' transformations per example.
        num_SIB_required: number of changing 'SIB' transformations per example.
        task_type, tran_type, label_type: forwarded to `init_transforms`.
        one_hot: when true, the label is passed through `one_hot_encode`
            after every 'INV' attempt. Batch 'SIB' transformations always
            receive one-hot labels regardless of this flag.
        max_tries: a phase gives up once more than `max_tries` attempts have
            been made for the current example (default 25, i.e. 26 attempts).

    Returns:
        Tuple `(texts, labels, trans)` of NumPy arrays: UTF-8 encoded texts,
        the transformed labels, and the names of the transformations applied
        to each example.
    """
    df = init_transforms(task_type=task_type, tran_type=tran_type, label_type=label_type, meta=True)

    # The candidate pools are the same for every example, so filter once.
    inv_pool = df[df['tran_type'] == 'INV']
    sib_pool = df[df['tran_type'] == 'SIB']

    text, label = dataset['text'], dataset['label']
    new_text, new_label, trans = [], [], []

    num_classes = len(np.unique(label))

    def _as_text(value):
        # str(b'abc') is the repr "b'abc'", not the text; decode bytes instead.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return str(value)

    for X, y in tqdm(zip(text, label), total=len(label)):
        t_trans = []  # names of the transformations applied to this example

        num_tries = 0
        num_INV_applied = 0
        while num_INV_applied < num_INV_required and num_tries <= max_tries:
            # Count the attempt up front so that skipped candidates also use
            # up the budget and the loop is guaranteed to terminate.
            num_tries += 1
            t_df   = inv_pool.sample(1)
            t_fn   = t_df['tran_fn'].iloc[0]
            t_name = t_df['transformation'].iloc[0]
            # Do not apply the same transformation twice to one example.
            if t_name in t_trans:
                continue
            X, y, meta = t_fn.transform_Xy(_as_text(X), y)
            if one_hot:
                y = one_hot_encode(y, num_classes)
            if meta['change']:
                num_INV_applied += 1
                t_trans.append(t_name)

        num_tries = 0
        num_SIB_applied = 0
        while num_SIB_applied < num_SIB_required and num_tries <= max_tries:
            num_tries += 1
            t_df   = sib_pool.sample(1)
            t_fn   = t_df['tran_fn'].iloc[0]
            t_name = t_df['transformation'].iloc[0]
            if t_name in t_trans:
                continue
            if 'AbstractBatchTransformation' in t_fn.__class__.__bases__[0].__name__:
                # Batch transformations need a partner example: draw one at
                # random and put the current example after it.
                Xs, ys = sample_Xy(text, label, num_sample=1)
                Xs.append(X); ys.append(y)
                Xs = [_as_text(x).encode('utf-8') for x in Xs]
                ys = [one_hot_encode(y, num_classes) for y in ys]
                (X, y), meta = t_fn((Xs, ys))
                X, y = X[0], y[0]
            else:
                X, y, meta = t_fn.transform_Xy(_as_text(X), y)
            if meta['change']:
                num_SIB_applied += 1
                t_trans.append(t_name)

        new_text.append(X)
        new_label.append(y)
        trans.append(t_trans)

    new_text = [_as_text(x).encode('utf-8') for x in new_text]
    # np.bytes_ is the same type the removed alias np.string_ pointed to.
    # NOTE(review): if an example exhausts its attempt budget, its row in
    # `trans` is shorter than the others and the array becomes ragged -
    # confirm how downstream readers should handle that.
    return np.array(new_text, dtype=np.bytes_), np.array(new_label), np.array(trans, dtype=np.bytes_)

In [None]:
# SST-2 (GLUE) train split -> INV then SIB sentiment transformations ->
# arrays written with npy_save.
task = 'sentiment'
save_dir = os.path.join('assets', 'sst2', task, 'INVSIB')

dataset = load_dataset('glue', 'sst2')['train']
# The transformer reads a 'text' column; SST-2 names it 'sentence'.
dataset.rename_column_('sentence', 'text')

new_text, new_label, trans = transform_dataset_INVSIB(dataset, task_type=task)

for stem, payload in (('text', new_text), ('label', new_label), ('trans', trans)):
    npy_save(os.path.join(save_dir, stem), payload)

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
 57%|███████████████████████████████████████████▌                                | 38655/67349 [12:29<10:47, 44.35it/s]

In [119]:
# AG News train split -> INV then SIB topic transformations ->
# arrays written with npy_save.
task = 'topic'
save_dir = os.path.join('assets', 'ag_news', task, 'INVSIB')

dataset = load_dataset('ag_news')['train']

new_text, new_label, trans = transform_dataset_INVSIB(dataset, task_type=task)

for stem, payload in (('text', new_text), ('label', new_label), ('trans', trans)):
    npy_save(os.path.join(save_dir, stem), payload)

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)
120000it [5:50:16,  5.71it/s]


In [None]:
# Generate transformed copies of every configured dataset for a fixed set of
# (dataset, task, transformation type) combinations and save them as
# text3 / label3 / trans3 under assets/<dataset>/<task>/<tran>.
n = 2
wanted = {
    ('ag_news', 'topic', 'INV'),
    ('ag_news', 'topic', 'SIB-mix'),
    ('sst2', 'sentiment', 'INV'),
    ('sst2', 'sentiment', 'SIB'),
}
for d in datasets:
    if isinstance(d, tuple) and d[0] == 'glue':
        train, test = load_dataset(d[0], d[1], split=['train', 'test'])
        # The transformer reads a 'text' column; GLUE/SST-2 names it 'sentence'.
        train.rename_column_('sentence', 'text')
        d = d[1]  # from here on only the config name is used (paths, filter)
    else:
        train, test = load_dataset(d, split=['train', 'test'])
    for task in tasks:
        task_df = df['task_name'] == task
        # Kept under its own name: `trans` is rebound to the per-example
        # results below and must not shadow the sequence being iterated.
        tran_types = df[task_df].tran_type.unique()
        for tran in tran_types:
            if (d, task, tran) not in wanted:
                continue
            print('working on', d, task, tran)
            new_text, new_label, trans = transform_dataset(train, num_transforms=n, task=task, tran=tran)
            new_text = np.array(new_text)
            new_label = np.array(new_label)
            # np.bytes_ is the type the removed alias np.string_ pointed to.
            trans = np.array(trans, dtype=np.bytes_)
            save_dir = os.path.join('assets', d, task, tran)
            npy_save(os.path.join(save_dir, 'text3'), new_text)
            npy_save(os.path.join(save_dir, 'label3'), new_label)
            npy_save(os.path.join(save_dir, 'trans3'), trans)

In [None]:
# NOTE: disabled one-off run. It transforms the SST-2 train split for the
# 'sentiment' task with tran=None and saves text2 / label2 / trans2 under the
# 'BOTH' directory.
# n=2
# d = ('glue', 'sst2')
# task = 'sentiment'
# tran = None
# train, test = load_dataset(d[0], d[1], split=['train', 'test'])
# train.rename_column_('sentence', 'text')
# d = d[1]
# new_text, new_label, trans = transform_dataset(train, num_transforms=n, task=task, tran=tran)
# if tran is None:
#     tran = 'BOTH'
# new_text = np.array(new_text)
# new_label = np.array(new_label)
# trans = np.array(trans, dtype=np.string_)
# save_dir = os.path.join('assets', d, task, tran)
# npy_save(os.path.join(save_dir, 'text2'), new_text)
# npy_save(os.path.join(save_dir, 'label2'), new_label)
# npy_save(os.path.join(save_dir, 'trans2'), trans)

In [None]:
# NOTE: disabled loaders for the SST-2 text2 / label2 arrays. The directory is
# spelled 'SST2' here, while the generation cells build their save paths from
# 'sst2' - TODO confirm the paths before re-enabling on a case-sensitive
# filesystem.
# SST2_INV_text = npy_load("./assets/SST2/sentiment/INV/text2.npy")
# SST2_INV_label = npy_load("./assets/SST2/sentiment/INV/label2.npy")

# SST2_SIB_text = npy_load("./assets/SST2/sentiment/SIB/text2.npy")
# SST2_SIB_label = npy_load("./assets/SST2/sentiment/SIB/label2.npy")

In [None]:
# NOTE: disabled loaders for the AG News text2 / label2 arrays. The directory
# is spelled 'AG_NEWS' here, while the generation cells build their save paths
# from 'ag_news' - TODO confirm the paths before re-enabling on a
# case-sensitive filesystem.
# INV_text = npy_load("./assets/AG_NEWS/topic/INV/text2.npy")
# INV_label = npy_load("./assets/AG_NEWS/topic/INV/label2.npy")

# SIB_text = npy_load("./assets/AG_NEWS/topic/SIB-mix/text2.npy")
# SIB_label = npy_load("./assets/AG_NEWS/topic/SIB-mix/label2.npy")