### Dataset Details

In [None]:
# Imported here as well as in the imports cell: this cell calls os.makedirs,
# and it runs first when the notebook is executed top to bottom.
import os

# Source dataset and the directory holding its tokenised inputs.
work_dir = '/work/dir'
dataset = 'LF-AmazonTitles-131K'
seq_len = 32  # tokens per row in the *.dat token files
dataset_dir = f'{work_dir}/Datasets/{dataset}'
tok_dir = f'{dataset_dir}/bert-base-uncased-{seq_len}'

# Output locations for the augmented copy of the dataset ("-Aug" suffix).
aug_dataset_dir = f'{dataset_dir}-Aug'
aug_tok_dir = f'{aug_dataset_dir}/bert-base-uncased-{seq_len}'
os.makedirs(aug_dataset_dir, exist_ok=True)
os.makedirs(aug_tok_dir, exist_ok=True)

In [None]:
import os
import scipy.sparse as sp
import numpy as np
import subprocess

In [None]:
def copy_file(src_fpth, dst_fpth):
    """Copy ``src_fpth`` to ``dst_fpth``, or create a placeholder file.

    If the source exists it is copied with ``cp``. If it does not exist
    (expected for optional filter files), ``dst_fpth`` is created with
    ``touch`` so that the destination path is always present afterwards.

    Args:
        src_fpth: Path of the file to copy.
        dst_fpth: Path the copy (or placeholder) is written to.

    Raises:
        subprocess.CalledProcessError: If ``cp`` or ``touch`` exits with a
            non-zero status (e.g. the destination directory is missing).
    """
    if os.path.exists(src_fpth):
        # check=True: a failed copy must not pass silently, otherwise the
        # destination dataset ends up incomplete without any indication.
        subprocess.run(['cp', src_fpth, dst_fpth], check=True)
    else:  # for filter files create a dummy file
        print('Creating dummy for ', dst_fpth)
        subprocess.run(['touch', dst_fpth], check=True)
        
def copy_files(src_dir, dst_dir, files):
    """Copy every name in ``files`` from ``src_dir`` into ``dst_dir``.

    Each entry is announced on stdout and then handed to ``copy_file``,
    keeping the same file name on both sides.
    """
    for fname in files:
        print(f'Copying {fname}')
        source_path = f'{src_dir}/{fname}'
        target_path = f'{dst_dir}/{fname}'
        copy_file(source_path, target_path)

def write_mmap(np_arr, fpth, dtype=np.int64):
    """Write ``np_arr`` to ``fpth`` as a raw memory-mapped array.

    The file is created (or overwritten) with the same shape as ``np_arr``
    and the given ``dtype``; no header is stored, so readers must know the
    shape and dtype to map it back.

    Args:
        np_arr: Array whose contents are written.
        fpth: Destination file path.
        dtype: Element type of the on-disk array. Defaults to ``np.int64``.
    """
    np_arr_mmap = np.memmap(fpth, mode='w+', shape=np_arr.shape, dtype=dtype)
    np_arr_mmap[:] = np_arr
    # Flush explicitly instead of relying on the memmap object being
    # garbage-collected to get the data onto disk.
    np_arr_mmap.flush()

In [None]:
# Training label matrix; its first dimension sizes the document token files
# and its second dimension sizes the label token files below.
trn_X_Y = sp.load_npz(f'{dataset_dir}/trn_X_Y.npz')
num_trn_docs, num_lbls = trn_X_Y.shape

# Stack an identity block under the training matrix: one extra row per label,
# each marked with exactly that label.
identity = sp.diags(np.ones(num_lbls)).tocsr()
aug_trn_X_Y = sp.vstack([trn_X_Y, identity]).tocsr()

# Read-only maps over the tokenised inputs (seq_len int64 values per row).
doc_tok_shape = (num_trn_docs, seq_len)
lbl_tok_shape = (num_lbls, seq_len)
trn_doc_ii = np.memmap(f'{tok_dir}/trn_doc_input_ids.dat', dtype=np.int64, mode='r', shape=doc_tok_shape)
trn_doc_am = np.memmap(f'{tok_dir}/trn_doc_attention_mask.dat', dtype=np.int64, mode='r', shape=doc_tok_shape)
lbl_ii = np.memmap(f'{tok_dir}/lbl_input_ids.dat', dtype=np.int64, mode='r', shape=lbl_tok_shape)
lbl_am = np.memmap(f'{tok_dir}/lbl_attention_mask.dat', dtype=np.int64, mode='r', shape=lbl_tok_shape)

In [None]:
# Append the label token rows below the training-document token rows, in the
# same order as the rows of aug_trn_X_Y (documents first, then labels).
aug_trn_doc_ii = np.vstack((trn_doc_ii, lbl_ii))
aug_trn_doc_am = np.vstack((trn_doc_am, lbl_am))

In [None]:
# Files carried over unchanged into the augmented dataset; copy_file creates
# an empty placeholder for any of them that does not exist in the source.
unchanged_dataset_files = ['tst_X_Y.npz', 'trn_filter_labels.txt', 'tst_filter_labels.txt']
unchanged_tok_files = ['tst_doc_input_ids.dat', 'tst_doc_attention_mask.dat', 'lbl_input_ids.dat', 'lbl_attention_mask.dat']
copy_files(dataset_dir, aug_dataset_dir, unchanged_dataset_files)
copy_files(tok_dir, aug_tok_dir, unchanged_tok_files)

In [None]:
# Persist the augmented training split. The label matrix must be aug_trn_X_Y
# (original rows plus one identity row per label) so that its row count
# matches the augmented token arrays written next to it.
sp.save_npz(f'{aug_dataset_dir}/trn_X_Y.npz', aug_trn_X_Y)
write_mmap(aug_trn_doc_ii, f'{aug_tok_dir}/trn_doc_input_ids.dat')
write_mmap(aug_trn_doc_am, f'{aug_tok_dir}/trn_doc_attention_mask.dat')