In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from utils import *
from transforms import *
from datasets import load_dataset, Dataset

In [3]:
# Build the transformation metadata table. Later cells read its
# task_name, tran_type and label_type columns.
df = init_transforms(meta=True)

In [4]:
# Datasets to process. A tuple entry is passed to load_dataset as two
# positional arguments; a plain string entry is passed as one.
datasets = [('glue', 'sst2'), 'ag_news']

# Distinct values of the metadata columns. The task and transformation-type
# lists are kept in reverse order of first appearance.
tasks = df.task_name.unique().tolist()[::-1]
trans = df.tran_type.unique().tolist()[::-1]
labels = df.label_type.unique().tolist()

In [5]:
def one_hot_encode(y, nb_classes):
    """Convert integer class indices into one-hot vectors.

    Args:
        y: Class indices, given as a scalar or a (possibly nested) sequence.
            A ``np.ndarray`` is handed back untouched; no encoding is applied.
        nb_classes: Length of the one-hot axis.

    Returns:
        ``y`` itself when it is an ``np.ndarray``; otherwise a float array
        whose shape is the shape of ``y`` with ``nb_classes`` appended.
    """
    if not isinstance(y, np.ndarray):
        indices = np.array(y)
        # Rows of the identity matrix are the one-hot vectors; pick one row per
        # index, then restore the original layout with the class axis last.
        identity = np.eye(nb_classes)
        encoded = identity[indices.reshape(-1)]
        return encoded.reshape(list(indices.shape) + [nb_classes])
    return y

def sample_Xy(text, label, num_sample=1):
    """Draw aligned (text, label) samples at random.

    Args:
        text: Sequence of inputs.
        label: Sequence of labels, indexed in parallel with ``text``.
        num_sample: How many indices to draw (default 1). Indices are drawn
            independently, so the same item can appear more than once.

    Returns:
        A ``(texts, labels)`` pair of lists selected with the same indices.
    """
    picks = np.random.randint(0, len(text), num_sample)
    # Index both sequences with the same draw so pairs stay aligned.
    xs, ys = (list(np.array(seq)[picks]) for seq in (text, label))
    return xs, ys

In [159]:
# Apply each mixture transform (TextMix, SentMix, WordMix) to the SST-2 train
# split in batches and save every transform's output in its own directory.
dataset = load_dataset('glue', 'sst2')['train']
# NOTE(review): `rename_column_` is the in-place variant; recent `datasets`
# releases appear to provide only `rename_column`, which returns a new
# dataset. Confirm against the installed version.
dataset.rename_column_('sentence', 'text')

text, label = dataset['text'], dataset['label']

# The transforms receive byte strings and one-hot float labels.
# `np.bytes_` and `float` replace the aliases `np.string_` (removed in
# NumPy 2.0) and `np.float` (removed in NumPy 1.24).
text = np.array([str(x).encode('utf-8') for x in text], dtype=np.bytes_)
label = pd.get_dummies(label).to_numpy(dtype=float)

batch_size = 1000

for t in [TextMix(), SentMix(), WordMix()]:
    # Start from empty buffers for every transform. With a single pair of
    # lists shared across transforms, the SentMix and WordMix files would
    # also contain the output of the transforms that ran before them.
    new_text, new_label = [], []

    for i in tqdm(range(0, len(label), batch_size)):
        text_batch = text[i:i+batch_size]
        label_batch = label[i:i+batch_size]
        batch = t((text_batch, label_batch))
        new_text.extend(batch[0].tolist())
        new_label.extend(batch[1].tolist())

    save_dir = os.path.join('assets', 'SST2', t.__class__.__name__)
    # NOTE(review): if the transform hands back byte strings, str(x) yields
    # the "b'...'" repr rather than the text — confirm what t(batch) returns.
    # NOTE(review): files are named *_test although the data comes from the
    # train split — confirm this is intended.
    npy_save(os.path.join(save_dir, 'text_test'), [str(x).encode('utf-8') for x in new_text])
    npy_save(os.path.join(save_dir, 'label_test'), new_label)

Reusing dataset glue (C:\Users\sleev\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
100%|█████████████████████████████████████████████████████████████████████████████████| 68/68 [00:00<00:00, 514.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 68/68 [00:02<00:00, 30.68it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 68/68 [00:00<00:00, 185.28it/s]


In [534]:
# Run transform_dataset_INVSIB over the SST-2 train split for the sentiment
# task and save the three returned arrays under assets/sst2/sentiment/INVSIB.
task = 'sentiment'
dataset = load_dataset('glue', 'sst2')['train']
dataset.rename_column_('sentence', 'text')  # the column is renamed to 'text' before transforming

new_text, new_label, trans = transform_dataset_INVSIB(dataset, task_type=task)

save_dir = os.path.join('assets', 'sst2', task, 'INVSIB')
for stem, payload in (('text', new_text), ('label', new_label), ('trans', trans)):
    npy_save(os.path.join(save_dir, stem), payload)

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)
100%|████████████████████████████████████████████████████████████████████████████| 67349/67349 [22:58<00:00, 48.86it/s]


In [152]:
# Run transform_dataset_INVSIB over the AG News train split for the topic
# task and save the three returned arrays under assets/ag_news/topic/INVSIB.
task = 'topic'
dataset = load_dataset('ag_news')['train']

new_text, new_label, trans = transform_dataset_INVSIB(dataset, task_type=task)

save_dir = os.path.join('assets', 'ag_news', task, 'INVSIB')
for stem, payload in zip(('text', 'label', 'trans'), (new_text, new_label, trans)):
    npy_save(os.path.join(save_dir, stem), payload)

Using custom data configuration default
Reusing dataset ag_news (C:\Users\sleev\.cache\huggingface\datasets\ag_news\default\0.0.0\fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)
100%|████████████████████████████████████████████████████████████████████████| 120000/120000 [5:17:20<00:00,  6.30it/s]


In [18]:
# Display the transformation metadata table (task, transformation type,
# label type, transformation name and callable per row).
df

Unnamed: 0,task_name,tran_type,label_type,transformation,tran_fn
0,sentiment,INV,hard,ExpandContractions,<transformations.text.contraction.expand_contr...
1,topic,INV,hard,ExpandContractions,<transformations.text.contraction.expand_contr...
2,sentiment,INV,hard,ContractContractions,<transformations.text.contraction.contract_con...
3,topic,INV,hard,ContractContractions,<transformations.text.contraction.contract_con...
4,sentiment,INV,hard,Emojify,<transformations.text.emoji.emojify.Emojify ob...
...,...,...,...,...,...
67,topic,SIB,soft,TextMix,<transformations.text.mixture.text_mix.TextMix...
68,sentiment,SIB,soft,SentMix,<transformations.text.mixture.text_mix.SentMix...
69,topic,SIB,soft,SentMix,<transformations.text.mixture.text_mix.SentMix...
70,sentiment,SIB,soft,WordMix,<transformations.text.mixture.text_mix.WordMix...


In [8]:
# Generate and save transformed training data for every selected
# (dataset, task, transformation type) combination.
for d in datasets:
    if isinstance(d, tuple) and d[0] == 'glue':
        train, test = load_dataset(d[0], d[1], split=['train', 'test'])
        train.rename_column_('sentence', 'text')
        d = d[1]  # from here on the config name (e.g. 'sst2') identifies the dataset
    else:
        train, test = load_dataset(d, split=['train', 'test'])
    for task in tasks:
        task_df = df['task_name'] == task
        # Kept under its own name: the transform call below rebinds `trans`,
        # which previously was also the name of the sequence being iterated.
        tran_types = df[task_df].tran_type.unique()
        for tran in tran_types:
            # NOTE(review): this mask filters on tran_type only, not on the
            # task — confirm that label types should be pooled across tasks.
            label_df = df['tran_type'] == tran
            lbls = df[label_df].label_type.unique()
            wanted = (
                (d == 'ag_news' and task == 'topic' and tran == 'INV')
                or (d == 'sst2' and task == 'sentiment')
            )
            if not wanted:
                continue
            print('working on', d, task, tran, lbls)
            # One-hot labels are requested when more than one label type occurs.
            use_one_hot = len(lbls) > 1

            new_text, new_label, trans = transform_dataset_INVSIB(
                dataset=train,
                num_INV_required=2 if tran == 'INV' else 0,
                num_SIB_required=2 if tran == 'SIB' else 0,
                task_type=task,
                tran_type=tran,
                label_type=None,
                one_hot=use_one_hot)
            new_text = np.array(new_text)
            # Squeeze each label individually before stacking them.
            new_label = np.array([np.squeeze(x) for x in new_label])
            # np.bytes_ is the same dtype as np.string_, whose alias was
            # removed in NumPy 2.0.
            trans = np.array(trans, dtype=np.bytes_)
            save_dir = os.path.join('assets', d, task, tran)
            npy_save(os.path.join(save_dir, 'text'), new_text)
            npy_save(os.path.join(save_dir, 'label'), new_label)
            npy_save(os.path.join(save_dir, 'trans'), trans)

Reusing dataset glue (C:\Users\Fabrice\.cache\huggingface\datasets\glue\sst2\1.0.0\7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


working on sst2 sentiment INV ['hard']


100%|████████████████████████████████████████████████████████████████████████████| 67349/67349 [25:03<00:00, 44.79it/s]


working on sst2 sentiment SIB ['soft' 'hard']


100%|████████████████████████████████████████████████████████████████████████████| 67349/67349 [42:03<00:00, 26.69it/s]
  return np.array(new_text, dtype=np.string_), np.array(new_label), np.array(trans, dtype=np.string_)
Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)


working on ag_news topic INV ['hard']


100%|████████████████████████████████████████████████████████████████████████| 120000/120000 [1:15:14<00:00, 26.58it/s]


In [6]:
# Run transform_dataset on the AG News train split (SIB transformations for
# the topic task, num_transforms=n) and save the outputs under
# assets/ag_news/topic/SIB.
n = 1
d = 'ag_news'  # plain string; the former parentheses did not make it a tuple
task = 'topic'
tran = 'SIB'
train, test = load_dataset(d, split=['train', 'test'])
new_text, new_label, trans = transform_dataset(train, num_transforms=n, task_type=task, tran_type=tran)
new_text = np.array(new_text)
# Squeeze each label individually before stacking them.
new_label = np.array([np.squeeze(x) for x in new_label])
# np.bytes_ is the same dtype as np.string_, whose alias was removed in NumPy 2.0.
trans = np.array(trans, dtype=np.bytes_)
save_dir = os.path.join('assets', d, task, tran)
npy_save(os.path.join(save_dir, 'text'), new_text)
npy_save(os.path.join(save_dir, 'label'), new_label)
npy_save(os.path.join(save_dir, 'trans'), trans)

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Fabrice\.cache\huggingface\datasets\ag_news\default\0.0.0\fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a)
100%|████████████████████████████████████████████████████████████████████████████████| 120/120 [00:08<00:00, 14.98it/s]


In [147]:
# Reload the saved SST2 sentiment arrays for the INV and SIB transformation types.
# NOTE(review): other cells write to 'assets/sst2/...' in lower case; the
# upper-case directory used here only matches on a case-insensitive filesystem.
sst2_root = "./assets/SST2/sentiment"

SST2_INV_text, SST2_INV_label = (
    npy_load(sst2_root + "/INV/" + stem + ".npy") for stem in ("text", "label")
)

# The SIB labels are read from label2.npy, not label.npy.
SST2_SIB_text, SST2_SIB_label = (
    npy_load(sst2_root + "/SIB/" + stem + ".npy") for stem in ("text", "label2")
)

In [146]:
# Squeeze every entry of SST2_SIB_label and store the stacked result as 'label2'
# in the SST2 sentiment SIB directory.
save_dir = os.path.join('assets', 'SST2', 'sentiment', 'SIB')
new_label = np.array(list(map(np.squeeze, SST2_SIB_label)))
npy_save(os.path.join(save_dir, 'label2'), new_label)

In [9]:
# Reload the saved AG_NEWS topic arrays for the INV and SIB transformation types.
# NOTE(review): other cells write to 'assets/ag_news/...' in lower case; the
# upper-case directory used here only matches on a case-insensitive filesystem.
ag_news_root = "./assets/AG_NEWS/topic"

INV_text, INV_label = (
    npy_load(ag_news_root + "/INV/" + stem + ".npy") for stem in ("text", "label")
)

SIB_text, SIB_label = (
    npy_load(ag_news_root + "/SIB/" + stem + ".npy") for stem in ("text", "label")
)

In [151]:
# Peek at the SIB label array.
# NOTE(review): execution counts are out of order, so confirm which load
# produced the array shown here.
SIB_label

array([[0.        , 0.        , 0.37305699, 0.62694301],
       [0.        , 0.        , 0.5256917 , 0.4743083 ],
       [0.        , 0.        , 1.        , 0.        ],
       ...,
       [0.        , 1.        , 0.        , 0.        ],
       [0.        , 0.45679012, 0.54320988, 0.        ],
       [0.40106952, 0.59893048, 0.        , 0.        ]])