# Data Augmentation

In [19]:
import pandas as pd
import requests
import string
import googletrans
from googletrans import Translator

In [20]:
# Load the labelled text commands from the Excel workbook and preview the first five rows.
# data_init is the un-augmented dataset that every augmentation step below starts from.
data_init = pd.read_excel('data/NLP_Data.xlsx')
data_init.head()

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,assign completed status to X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",assign
1,move X to completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",move
2,X completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
3,Completed X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
4,X in progress,Task,X,In Progress,"[1,0]","[0,1,0,0,0]",in progress


## Method 1: Back-translation

Link to googletrans API: https://pypi.org/project/googletrans/

In [22]:
"""
Quick googletrans how-to.
This method will fail the first time, just run it again!
"""
# Create the translator object (module-level: back_trans below reuses it)
translator = Translator()

# Single translation from French to English
# src = language code of the source text
# dest = language code to translate to
result = translator.translate('bonjour', src='fr', dest='en')

# result.origin is the text that was sent, result.text is the translation
print('Origin text: ', result.origin)
print('Translated text: ', result.text)

Origin text:  bonjour
Translated text:  Hello


In [23]:
# Language codes known to googletrans, mapped to language names.
# The keys are the values passed as src / dest to translator.translate (e.g. 'fr', 'en', 'ja').
googletrans.LANGUAGES

{'af': 'afrikaans',
 'sq': 'albanian',
 'am': 'amharic',
 'ar': 'arabic',
 'hy': 'armenian',
 'az': 'azerbaijani',
 'eu': 'basque',
 'be': 'belarusian',
 'bn': 'bengali',
 'bs': 'bosnian',
 'bg': 'bulgarian',
 'ca': 'catalan',
 'ceb': 'cebuano',
 'ny': 'chichewa',
 'zh-cn': 'chinese (simplified)',
 'zh-tw': 'chinese (traditional)',
 'co': 'corsican',
 'hr': 'croatian',
 'cs': 'czech',
 'da': 'danish',
 'nl': 'dutch',
 'en': 'english',
 'eo': 'esperanto',
 'et': 'estonian',
 'tl': 'filipino',
 'fi': 'finnish',
 'fr': 'french',
 'fy': 'frisian',
 'gl': 'galician',
 'ka': 'georgian',
 'de': 'german',
 'el': 'greek',
 'gu': 'gujarati',
 'ht': 'haitian creole',
 'ha': 'hausa',
 'haw': 'hawaiian',
 'iw': 'hebrew',
 'he': 'hebrew',
 'hi': 'hindi',
 'hmn': 'hmong',
 'hu': 'hungarian',
 'is': 'icelandic',
 'ig': 'igbo',
 'id': 'indonesian',
 'ga': 'irish',
 'it': 'italian',
 'ja': 'japanese',
 'jw': 'javanese',
 'kn': 'kannada',
 'kk': 'kazakh',
 'km': 'khmer',
 'ko': 'korean',
 'ku': 'kurdish 

In [24]:
def back_trans(df, dest, sample_frac=1, source='en'):
    """Augment ``df`` with back-translated copies of a random sample of its rows.

    The text of every sampled row is translated from ``source`` to ``dest`` and
    back to ``source`` using the notebook-level googletrans ``translator``.
    Punctuation is stripped from the round-tripped text. The new rows keep the
    labels of the rows they were derived from, get ``'BACKTRANSLATED'`` in the
    ``'Verb/Noun'`` column and are appended below the original rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled commands. The text is read from the first column and the
        back-translated text is written to ``'Text Command'``.
        NOTE(review): this assumes the first column *is* ``'Text Command'``,
        as in the data shown in this notebook -- confirm for other inputs.
    dest : str
        googletrans language code of the intermediate language, e.g. ``'ja'``.
    sample_frac : float, default 1
        Fraction of the rows of ``df`` to back-translate.
    source : str, default 'en'
        googletrans language code of the text in ``df``. It is used both as
        the source of the first translation and the target of the second one
        (previously this argument was ignored and ``'en'`` was hard-coded).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` followed by the back-translated rows, re-indexed
        from 0. ``df`` itself is not modified.
    """
    # Work on an explicit copy of the sample so the caller's frame is untouched
    data_btrans = df.sample(frac=sample_frac).copy()
    original_texts = data_btrans.iloc[:, 0].tolist()

    # Forward pass: source language -> intermediate language
    forward = translator.translate(original_texts, src=source, dest=dest)
    intermediate_texts = [item.text for item in forward]

    # Backward pass: intermediate language -> source language
    backward = translator.translate(intermediate_texts, src=dest, dest=source)

    # The translation service tends to add punctuation the commands never had
    strip_punctuation = str.maketrans('', '', string.punctuation)
    round_trip_texts = [item.text.translate(strip_punctuation) for item in backward]

    # Replace the text but keep every label (topic, action, ...) of the source row
    data_btrans['Text Command'] = round_trip_texts
    data_btrans['Verb/Noun'] = 'BACKTRANSLATED'

    # Original rows first, new rows after, with a clean 0..n-1 index
    return pd.concat([df, data_btrans]).reset_index(drop=True)

In [25]:
# Demo: back-translate a random 10% sample of the commands through Japanese ('ja')
# and display the original rows followed by the new BACKTRANSLATED rows.
data_btrans = back_trans(data_init, 'ja', sample_frac=0.1)
data_btrans

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,assign completed status to X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",assign
1,move X to completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",move
2,X completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
3,Completed X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
4,X in progress,Task,X,In Progress,"[1,0]","[0,1,0,0,0]",in progress
...,...,...,...,...,...,...,...
251,Completed project X,Project,X,Completed,"[0,1]","[0,0,0,0,1]",BACKTRANSLATED
252,Put Project X back on track,Project,X,On Target,"[0,1]","[0,1,0,0,0]",BACKTRANSLATED
253,Created a new X project,Project,X,Create,"[0,1]","[1,0,0,0,0]",BACKTRANSLATED
254,In review task X,Task,X,In Review,"[1,0]","[0,0,1,0,0]",BACKTRANSLATED


## Method 2: Synonym Replacement

Link to Datamuse API: https://www.datamuse.com/api/

In [26]:
"""
Script for creating new data by substituting words for their synonyms. Only the words which are not included
in the identifier are eligible for being replaced.
"""
def syn_rep(df, syn_num, sample_frac=1, synonym_fn=None):
    """Augment ``df`` with sentences in which one word is swapped for a synonym.

    For every sampled row, each eligible word of the text (first column) is
    replaced, one word at a time, by each of its synonyms. Every replacement
    yields one new row that keeps the labels of the row it came from and gets
    ``'SYNREPLACED'`` in the ``'Verb/Noun'`` column.

    A word is *not* eligible when it is one of the words of the row's
    identifier (third column, matched case-sensitively) or one of the
    articles ``a`` / ``an`` (matched case-insensitively).

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled commands: text in the first column, identifier in the third
        column, and a ``'Verb/Noun'`` column.
    syn_num : int
        Maximum number of synonyms requested for each eligible word.
    sample_frac : float, default 1
        Fraction of the rows of ``df`` to generate variants for.
    synonym_fn : callable, optional
        ``synonym_fn(word, max_results)`` returning an iterable of synonym
        strings for the lower-cased ``word``. Defaults to a Datamuse API
        lookup; pass another callable to use a different or offline source.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` followed by the generated rows, re-indexed from 0.
        ``df`` itself is not modified.
    """
    if synonym_fn is None:
        synonym_fn = _datamuse_synonyms

    # Articles for which synonym replacement does not make sense
    stop_words = {'a', 'an'}

    # Sample of the original dataframe
    df_sample = df.sample(frac=sample_frac).reset_index(drop=True)

    synonym_cache = {}  # lower-cased word -> synonyms, so each word is looked up once
    new_rows = []
    for i in range(len(df_sample)):
        # Explicit positional access; Series[int] position lookups are deprecated
        text = df_sample.iloc[i, 0]
        identifier = df_sample.iloc[i, 2]
        labels = df_sample.iloc[i, 1:].tolist()

        tokens = text.split()
        # Protect every word of the identifier, not only the first one
        protected = set(identifier.split())

        for j, token in enumerate(tokens):
            word = token.lower()
            if token in protected or word in stop_words:
                continue

            if word not in synonym_cache:
                synonym_cache[word] = list(synonym_fn(word, syn_num))

            for synonym in synonym_cache[word]:
                new_sentence = ' '.join(tokens[:j] + [synonym] + tokens[j + 1:])
                new_rows.append([new_sentence] + labels)

    if not new_rows:
        # Nothing generated (empty sample or no synonyms found)
        return df.reset_index(drop=True)

    # Build the new frame once instead of growing it row by row
    added_data = pd.DataFrame(new_rows, columns=list(df.columns))
    added_data['Verb/Noun'] = 'SYNREPLACED'

    return pd.concat([df, added_data]).reset_index(drop=True)


def _datamuse_synonyms(word, max_results):
    """Return up to ``max_results`` synonyms of ``word`` from the Datamuse API."""
    response = requests.get(
        'https://api.datamuse.com/words',
        # Passing params lets requests URL-encode the word
        params={'rel_syn': word, 'max': max_results},
        timeout=10,
    )
    response.raise_for_status()
    return [entry['word'] for entry in response.json()]

In [27]:
# Demo: for a random 10% sample of the commands, request up to 5 synonyms per eligible word
# and display the original rows followed by the new SYNREPLACED rows.
data_synreplaced = syn_rep(data_init, 5, sample_frac=0.1)
data_synreplaced

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,assign completed status to X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",assign
1,move X to completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",move
2,X completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
3,Completed X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
4,X in progress,Task,X,In Progress,"[1,0]","[0,1,0,0,0]",in progress
...,...,...,...,...,...,...,...
479,Project X over,Project,X,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
480,Project X good,Project,X,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
481,Project X all,Project,X,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
482,Project X sound,Project,X,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED


## Augment Data Main Method

In [28]:
# Keep in mind the sample_frac is for how much of the data you want to augment
def augment(df, lang, syn_num=5, sample_frac=1):
    """Run both augmentation methods: back-translation, then synonym replacement.

    ``df`` is first passed through ``back_trans`` with ``lang`` as the
    intermediate language; the result (original plus back-translated rows) is
    then passed through ``syn_rep`` with ``syn_num`` synonyms per word.
    ``sample_frac`` is forwarded unchanged to both steps.

    Returns the frame produced by ``syn_rep``. Duplicate text commands are
    not removed here.
    """
    # Need a step to remove duplicates
    return syn_rep(
        back_trans(df, lang, sample_frac=sample_frac),
        syn_num,
        sample_frac=sample_frac,
    )

In [None]:
# Full run: back-translate through Japanese and synonym-replace with sample_frac=1
# (every row is sampled in both steps). syn_rep issues one web request per eligible word.
data_aug = augment(data_init, lang='ja', syn_num=5, sample_frac=1)

In [31]:
# Remove duplicate commands: keep the first row for every distinct text
# (first column) and drop later repeats. The surviving rows keep their index labels.
data_aug = data_aug.drop_duplicates(subset=[data_aug.columns[0]], keep='first')

In [34]:
# Number of rows in the augmented dataset
len(data_aug)

3723

**Save augmented data to csv:**

In [33]:
# index=False keeps the DataFrame index out of the CSV file
data_aug.to_csv('data/Augmented_Data.csv', index=False)

## Augmenting specified actions/rows for exploration purposes:

In [15]:
# num = None
# pd.set_option("display.max_rows", num)
# pd.set_option("display.min_rows", num)

In [1]:
# # Limiting data to just completed tasks for API exploration purposes
# data_action5 = data_init[data_init.Action == 'Completed'].reset_index(drop=True)
# data_action5

In [3]:
# # Running the script for just the first X rows
# data_action5_aug = aug_data(data_action5.iloc[0:5], 5)

In [4]:
# data_action5_aug