# Notebook to generate Data Augmentation Cache

> Some augmentations, such as Word2Vec sentence replacement, take a huge amount of time. This notebook is the place to run those heavy augmentations and store the results as a cache. The cache saved here can then be loaded directly later on.

In [1]:
import os
import os.path as osp
import sys

import re
import pickle
import random
import argparse

import numpy as np
import pandas as pd

from collections import deque
from tqdm.auto import tqdm

sys.path.append('./codes/new_transformers_branch/transformers/src')

from new_transformers import DebertaV2TokenizerFast
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForTokenClassification

import gensim
from textaugment import Word2vec
from textaugment import EDA


## Load Data

In [2]:
from module.utils import get_all_texts

In [3]:
def get_config():
    """Build the configuration namespace used by this notebook.

    An empty argument list is parsed on purpose, so the returned namespace
    always carries the declared defaults, whatever ``sys.argv`` holds inside
    the kernel.

    Returns:
        argparse.Namespace: namespace exposing ``dataset_path``.
    """
    cli = argparse.ArgumentParser(description="use huggingface models")
    cli.add_argument(
        "--dataset_path",
        type=str,
        default='../../feedback-prize-2021',
    )
    return cli.parse_args([])

In [4]:
# Parse the defaults declared in get_config(); `args` is read by the data-loading cell below.
args = get_config()

In [5]:
# Essay bodies; indexed by text id further down (all_texts[text_id]).
# NOTE(review): presumably a dict of raw essay strings keyed by id - confirm in module.utils.
all_texts = get_all_texts(args)

# Annotation table: one row per discourse element.
train_csv_path = osp.join(args.dataset_path, 'train.csv')
df = pd.read_csv(train_csv_path)

# Distinct essay ids, in order of first appearance in train.csv.
text_ids = df['id'].unique().tolist()

## Generate Text List with all the data provided

In [6]:
def text2list(text, text_df, clean_text_df=True):
    """Split an essay into an ordered list of ``[category, segment]`` pairs.

    The essay is cut at the ``discourse_start`` / ``discourse_end`` offsets of
    every row in ``text_df``. Annotated spans keep their ``discourse_type``;
    any text before, between or after them is stored under the category
    ``"None"``. This layout is what the augmentation / noise-injection code
    works on.

    Example (a "Lead" span followed by a "Claim" span)::

        "I'm working now quark!" -> [["Lead", "I'm working"],
                                     ["None", " "],
                                     ["Claim", "now quark!"]]

    Rows are processed in DataFrame order.
    NOTE(review): this assumes rows are sorted by ``discourse_start`` and do
    not overlap - confirm against the dataset.

    Args:
        text (str): full text of one essay.
        text_df (pandas.DataFrame): annotation rows of that essay; must have
            ``discourse_start``, ``discourse_end`` and ``discourse_type``.
        clean_text_df (bool): ``discourse_text`` in train.csv does not always
            match the "{text_id}.txt" files; when True, the column is
            overwritten with the slices taken from ``text``.

    Returns:
        text_list (list): ``[category, segment]`` pairs covering the essay.
        text_df (pandas.DataFrame): a copy of the input frame (the caller's
            frame is never modified), cleaned when ``clean_text_df`` is True.
    """
    text_df = text_df.copy()

    text_list = []
    cursor = 0  # offset just past the last stored entity (0 before the first one)
    for row in text_df.itertuples():
        begin, finish = int(row.discourse_start), int(row.discourse_end)

        # un-annotated text in front of this entity (leading text or a gap)
        if cursor != begin:
            text_list.append(["None", text[cursor:begin]])

        # the entity itself
        text_list.append([row.discourse_type, text[begin:finish]])
        cursor = finish

    # un-annotated text after the last entity
    if cursor != len(text):
        text_list.append(["None", text[cursor:]])

    if clean_text_df:
        # keep only the annotated segments, in order, and write them back
        entity_texts = [segment for category, segment in text_list if category != 'None']
        text_df.loc[text_df.index, 'discourse_text'] = entity_texts

    return text_list, text_df


In [7]:
# Build {text_id: {'text_list': ..., 'text_df': ...}} for every essay.
#
# The frame is grouped once instead of running `df.query('id == @text_id')`
# for each id, which re-scanned the whole table once per essay.
# `sort=False` keeps the groups in order of first appearance, i.e. the same
# order as `text_ids` (built with `df.id.unique()`).
data_dict = {}
for text_id, text_df in tqdm(df.groupby('id', sort=False), total=len(text_ids)):
    # load data
    text = all_texts[text_id]
    text_df = text_df.reset_index(drop=True)

    # convert to list and clean the text_df
    text_list, text_df = text2list(text, text_df, clean_text_df=True)

    # save as dictionary format
    data_dict[text_id] = {
        'text_list': text_list,
        'text_df': text_df,
    }

  0%|          | 0/15594 [00:00<?, ?it/s]

## Word2Vec Sentence Exchanging Implementation

### Load Google Word2Vec Model

In [8]:
# Binary word2vec vectors read from the local ./augmentation folder.
word2vec_path = './augmentation/GoogleNews-vectors-negative300.bin'
google_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# textaugment wrapper around the loaded vectors; its .augment(text) is used below.
word2vec_model = Word2vec(model=google_model)

### Convert sentence using Word2Vec

In [9]:
from multiprocessing import Pool

In [10]:
def convert_sentence(text_id, min_length=15):
    """Return a word2vec-augmented copy of one essay's segment list.

    Every ``[category, segment]`` pair stored in
    ``data_dict[text_id]['text_list']`` is copied into a new list. Segments
    whose stripped length is greater than ``min_length`` are passed through
    ``word2vec_model.augment``; shorter ones are kept verbatim.

    The lists held in ``data_dict`` are NOT modified, so the function can be
    called repeatedly (or after a manual experiment) without augmenting text
    that has already been augmented.

    Args:
        text_id: key of the essay in the module-level ``data_dict``.
        min_length (int): segments with ``len(segment.strip()) <= min_length``
            are left untouched. Defaults to 15, the previously hard-coded value.

    Returns:
        list: new list of ``[category, segment]`` pairs.
    """
    converted = []
    for category, segment in data_dict[text_id]['text_list']:
        # replace words with similar ones, but only in segments that are long enough
        if len(segment.strip()) > min_length:
            segment = word2vec_model.augment(segment)
        converted.append([category, segment])

    return converted

In [11]:
# Segment list of the first essay, used for a quick manual check.
# NOTE: no copy is made - `test` is the very same list object that is stored in data_dict.
test = data_dict[text_ids[0]]['text_list']

In [12]:
 # Smoke-test the augmentation on a single essay.
 # `test` is rebound to a NEW list of new pairs instead of being edited in place:
 # it starts out as the list stored inside data_dict, and in-place edits would leave
 # already-augmented text in data_dict for any later processing of that essay.
 test = [
     # replace words with similar ones only in segments longer than 15 characters
     [category, word2vec_model.augment(segment) if len(segment.strip()) > 15 else segment]
     for category, segment in test
 ]

In [None]:
# Augment every essay in parallel and collect the results.
# pool.imap yields results in the same order as `text_ids`.
# NOTE(review): workers presumably see data_dict / word2vec_model through fork;
# confirm this on platforms that start processes with spawn.
converted_textlist = []
# The context manager shuts the worker pool down once all results are collected.
with Pool(processes=4) as pool, tqdm(total=len(text_ids)) as pbar:
    for text_list in pool.imap(convert_sentence, text_ids):
        # store the worker's result (previously the accumulator was appended to itself)
        converted_textlist.append(text_list)
        pbar.update(1)

  0%|          | 0/15594 [00:00<?, ?it/s]

## Tokenizer Test

In [19]:
# Fast DeBERTa-v3 tokenizer, using the class imported from `new_transformers`.
tokenizer = DebertaV2TokenizerFast.from_pretrained('microsoft/deberta-v3-large')
# Shrink the advertised maximum length to 4 ids.
# NOTE(review): presumably done to see how over-length inputs are handled - confirm.
tokenizer.model_max_length = 4

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
# Probe: a lone newline. The recorded output has one id (507) between ids 1 and 2.
tokenizer('\n')

{'input_ids': [1, 507, 2], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [21]:
# Probe: two space-separated letters. The recorded output has two content ids (266, 2165).
tokenizer('a b')

{'input_ids': [1, 266, 2165, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [39]:
# Same call as the previous cell, run again; the recorded output is identical.
tokenizer('a b')

{'input_ids': [1, 266, 2165, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [46]:
# Probe: the same letters without a space. The recorded output has a single content id (25191).
tokenizer('ab')

{'input_ids': [1, 25191, 2], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [36]:
# Probe: 'to' preceded by a space. The recorded output gives it id 264.
tokenizer('word to')

{'input_ids': [1, 1180, 264, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [31]:
# Probe: input that encodes to 5 ids, above model_max_length = 4.
# The recorded output shows a warning, and all 5 ids are still returned.
tokenizer('hello this is')

Token indices sequence length is longer than the specified maximum sequence length for this model (5 > 4). Running this sequence through the model will result in indexing errors


{'input_ids': [1, 12018, 291, 269, 2], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [34]:
# Probe: '^' as a separator. In the recorded output each '^' gets its own id (9922).
tokenizer('hello^this^is')

{'input_ids': [1, 12018, 9922, 5343, 9922, 1890, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [26]:
# Probe: 'to' glued to the previous word. The recorded output gives id 725 instead of 264.
tokenizer('wordto')

{'input_ids': [1, 1180, 725, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [29]:
# Decode id 725 between ids 1 and 2. The recorded output is 'to' with no leading space.
tokenizer.decode([1, 725, 2])

'[CLS]to[SEP]'

In [30]:
# Decode id 264 between ids 1 and 2. The recorded output is ' to' with a leading space.
tokenizer.decode([1, 264, 2])

'[CLS] to[SEP]'