# Notebook to generate Data Augmentation Cache

> Some augmentations, such as Word2Vec sentence replacement, take a huge amount of time. This notebook is a place to run those heavy augmentations and store the results as a cache. The cache saved here can then be loaded directly later on.

In [15]:
import os
import os.path as osp
import sys

import re
import pickle
import random

import numpy as np
import pandas as pd

from collections import deque
from tqdm.auto import tqdm

sys.path.append('./codes/new_transformers_branch/transformers/src')

from new_transformers import DebertaV2TokenizerFast
from transformers import AutoConfig, AutoModel, AutoTokenizer, AutoModelForTokenClassification

import gensim
from textaugment import Word2vec
from textaugment import EDA


## Load Data

## Generate Text List with all the data provided

In [None]:
def text2list(text: str, text_df: pd.DataFrame, clean_text_df: bool = True):
    """Split a text into an ordered list of ``[category, segment]`` pairs.

    This is mainly used for data augmentation and noise injection. Every
    row of ``text_df`` contributes one segment labelled with its
    ``discourse_type``; any text lying between two annotated spans (or
    before the first / after the last one) is stored as a ``"None"``
    segment.

    Example::

        "I'm working now quark!" -> [["Lead", "I'm working"],
                                     ["None", " "],
                                     ["Claim", "now quark!"]]

    Rows are processed in the order they appear in ``text_df``; gap
    segments are only meaningful when the rows are ordered by
    ``discourse_start``.

    Args:
        text (str): the full text belonging to one text id.
        text_df (pandas.DataFrame): annotation rows for that text. The
            columns ``discourse_start``, ``discourse_end`` and
            ``discourse_type`` are read; the frame itself is not modified.
        clean_text_df (bool): when True, the ``discourse_text`` column of
            the returned copy is overwritten with the substrings sliced
            from ``text``, so that it agrees with the text itself.

    Returns:
        text_list (list): ``[category, segment]`` pairs in row order.
        text_df (pandas.DataFrame): a copy of the input frame, with
            ``discourse_text`` replaced when ``clean_text_df`` is True.
    """
    text_df = text_df.copy()

    text_list = []
    discourse_texts = []
    last_end_idx = 0
    for row in text_df.itertuples():
        start_idx = int(row.discourse_start)
        end_idx = int(row.discourse_end)

        # Un-annotated text between the previous span and this one. Using
        # ">" (not "!=") avoids emitting an empty "None" segment when this
        # span overlaps the previous one.
        if start_idx > last_end_idx:
            text_list.append(["None", text[last_end_idx:start_idx]])

        # The annotated span itself.
        discourse_text = text[start_idx:end_idx]
        text_list.append([row.discourse_type, discourse_text])
        discourse_texts.append(discourse_text)
        last_end_idx = end_idx

    # Remaining un-annotated tail. Using "<" avoids an empty "None" segment
    # when the last discourse_end lies beyond the end of the text.
    if last_end_idx < len(text):
        text_list.append(["None", text[last_end_idx:]])

    if clean_text_df:
        # One sliced text per row, in row order, so a positional column
        # assignment keeps each text aligned with its row.
        text_df["discourse_text"] = discourse_texts

    return text_list, text_df
