# Data Augmentation

In [1]:
import pandas as pd
import numpy as np
import requests
import string
import googletrans
from langdetect import detect
import re
import time

In [2]:
# Import the manually synthesized seed commands.
# The augmentation helpers below read the command text from the first column and
# the identifier from the third column, so the spreadsheet's column order matters.
data_init = pd.read_excel('data/NLP_Data.xlsx')
data_init

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,x to do,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
1,to do x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
2,new x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",new
3,make x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",make
4,create x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",create
...,...,...,...,...,...,...,...
604,assign project x completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",assign
605,assign x project completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",assign
606,move x to completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",move
607,move project x to completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",move


In [3]:
# Normalise the free-text columns to lower case (updates data_init in place)
for text_col in ('Text Command', 'Identifier'):
    data_init[text_col] = data_init[text_col].str.lower()
data_init

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,x to do,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
1,to do x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
2,new x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",new
3,make x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",make
4,create x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",create
...,...,...,...,...,...,...,...
604,assign project x completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",assign
605,assign x project completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",assign
606,move x to completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",move
607,move project x to completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",move


In [4]:
# Number of rows pandas renders for a DataFrame; raise `num` to see more rows
# while analysing/debugging
num = 10
for option_name in ("display.max_rows", "display.min_rows"):
    pd.set_option(option_name, num)

## Method 1: Back-translation

Link to googletrans API: https://pypi.org/project/googletrans/

In [5]:
"""
Quick googletrans how-to.
This method might fail the first time, just run it again!
"""
# # Create translator object
# translator = googletrans.Translator()

# # Single translation from French to English
# # src = language code of the source text
# # dest = language code to translate into
# result = translator.translate('bonjour', src='fr', dest='en')
# print('Origin text: ', result.origin)
# print('Translated text: ', result.text)

'\nQuick googletrans how-to.\nThis method might fail the first time, just run it again!\n'

In [6]:
# Language codes known to googletrans (the keys are the values to pass as src/dest)
googletrans.LANGUAGES

{'af': 'afrikaans',
 'sq': 'albanian',
 'am': 'amharic',
 'ar': 'arabic',
 'hy': 'armenian',
 'az': 'azerbaijani',
 'eu': 'basque',
 'be': 'belarusian',
 'bn': 'bengali',
 'bs': 'bosnian',
 'bg': 'bulgarian',
 'ca': 'catalan',
 'ceb': 'cebuano',
 'ny': 'chichewa',
 'zh-cn': 'chinese (simplified)',
 'zh-tw': 'chinese (traditional)',
 'co': 'corsican',
 'hr': 'croatian',
 'cs': 'czech',
 'da': 'danish',
 'nl': 'dutch',
 'en': 'english',
 'eo': 'esperanto',
 'et': 'estonian',
 'tl': 'filipino',
 'fi': 'finnish',
 'fr': 'french',
 'fy': 'frisian',
 'gl': 'galician',
 'ka': 'georgian',
 'de': 'german',
 'el': 'greek',
 'gu': 'gujarati',
 'ht': 'haitian creole',
 'ha': 'hausa',
 'haw': 'hawaiian',
 'iw': 'hebrew',
 'he': 'hebrew',
 'hi': 'hindi',
 'hmn': 'hmong',
 'hu': 'hungarian',
 'is': 'icelandic',
 'ig': 'igbo',
 'id': 'indonesian',
 'ga': 'irish',
 'it': 'italian',
 'ja': 'japanese',
 'jw': 'javanese',
 'kn': 'kannada',
 'kk': 'kazakh',
 'km': 'khmer',
 'ko': 'korean',
 'ku': 'kurdish 

In [7]:
"""
Backup: original back_trans method
"""
# def back_trans(df, dest, sample_frac=1, source='en'):

#     # Sample df from input df
#     data_input = df.sample(frac=sample_frac, random_state=0)

#     # Text commands from the sample df
#     data_input_text = data_input.iloc[:, 0].tolist()

#     # Translation from english to another language
#     data_ja = translator.translate(data_input_text, src='en', dest=dest)
#     data_ja_list = []
#     for trans in data_ja:
#         data_ja_list.append(trans.text)

#     # Translation back to english\
#     data_en = translator.translate(data_ja_list, src=dest, dest='en')
#     data_en_list = []
#     for trans in data_en:
#         # Removes punctuation in translated text
#         data_en_list.append(trans.text.translate(str.maketrans('', '', string.punctuation)))

#     # Adding back-translated commands back to input data to respective labels (action, topic...)
#     data_input['Text Command'] = data_en_list

#     # Change few properties of back-translated df
#     data_btrans = data_input.copy()
#     data_btrans['Verb/Noun'] = 'BACKTRANSLATED'

#     # Adding back-translated df (data_input) to initial data (df)
#     data_aug = pd.concat([df, data_btrans])
#     data_aug.reset_index(drop=True, inplace=True)

#     # Adding back-translated df (data_input) to initial data (df)
#     data_aug = pd.concat([df, data_btrans])
#     data_aug.reset_index(drop=True, inplace=True)

#     return data_aug

'\nBackup: original back_trans method\n'

In [8]:
def _run_translation_pass(translator, pending, src, dest, total, label,
                          clean=None, tolerance=0.05, max_rounds=6, retry_delay=0.0):
    """Translate `pending` ({row label: text}) from `src` to `dest`, retrying failures.

    A result that `detect` still classifies as `src` counts as untranslated and is
    sent again in the next round. The pass stops once fewer than `total * tolerance`
    texts remain untranslated, or after `max_rounds` requests.

    Returns a tuple `(translated, n_untranslated)` where `translated` maps row labels
    to the translated text (passed through `clean` when given).
    """
    translated = dict()
    n_untranslated = len(pending)
    for attempt in range(max_rounds):
        # Optional pause before a retry to give the translation service time to recover
        if attempt and retry_delay:
            time.sleep(retry_delay)

        results = translator.translate(list(pending.values()), src=src, dest=dest)

        # Anything still detected as the source language goes into the next round
        still_pending = dict()
        for index, result in zip(list(pending.keys()), results):
            if detect(result.text) == src:
                still_pending[index] = result.text
            else:
                translated[index] = clean(result.text) if clean else result.text
        pending = still_pending
        n_untranslated = len(still_pending)

        if n_untranslated < total * tolerance:
            break
    else:
        # Every allowed round was used without reaching the target
        print('Warning: ' + label.lower() + ' took too long and was terminated.')

    print(label + ' || ', 'Untranslated (#): ', n_untranslated,
          ' | Untranslated (%): ', round(100 * n_untranslated / total, 2), '%')
    return translated, n_untranslated


def back_trans(df, lang, source='en', tolerance=0.05, max_rounds=6, retry_delay=0.0):
    """Augment `df` with back-translated copies of its text commands.

    Every command (first column of `df`) is translated from `source` to `lang` and
    back again with googletrans. The back-translated text has punctuation removed and
    is lower-cased, keeps the labels of the row it came from, and is tagged with
    'Verb/Noun' = 'BACKTRANSLATED'. Rows that could not be translated in either
    direction are left out of the augmentation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; the first column holds the text and a 'Text Command' column is
        expected. `df` itself is not modified.
    lang : str
        Intermediate language code (e.g. 'ja').
    source : str, default 'en'
        Language code of the input text.
    tolerance : float, default 0.05
        A pass stops retrying once fewer than this fraction of all commands is
        still untranslated.
    max_rounds : int, default 6
        Maximum number of translation requests per direction.
    retry_delay : float, default 0.0
        Seconds to wait before each retry round.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` followed by the back-translated rows, with a fresh 0..n-1 index.
    """
    # Nothing to translate: return a copy instead of dividing by zero in the statistics
    if len(df) == 0:
        return df.reset_index(drop=True)

    # Deep copy of input df
    data_sample = df.copy()

    # Dictionary of row label -> text command
    data_index_text = dict(zip(data_sample.index.values,
                               data_sample.iloc[:, 0].tolist()))
    total = len(data_index_text)

    # Initiate googletrans translator
    translator = googletrans.Translator()

    # Built once; used to strip punctuation from every back-translated command
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # source -> lang
    data_trans, untranslated_fwd = _run_translation_pass(
        translator, data_index_text, source, lang, total, 'Translation',
        tolerance=tolerance, max_rounds=max_rounds, retry_delay=retry_delay)

    # lang -> source, only for the commands that made it through the first pass
    data_btrans, untranslated_back = _run_translation_pass(
        translator, data_trans, lang, source, total, 'Back-translation',
        clean=lambda text: text.translate(strip_punctuation).lower(),
        tolerance=tolerance, max_rounds=max_rounds, retry_delay=retry_delay)

    untranslated_total = untranslated_fwd + untranslated_back
    print('Total || ', 'Untranslated (#): ', untranslated_total,
          ' | Untranslated (%): ', round(100 * untranslated_total / total, 2), '%')

    # No command survived both passes: there is nothing to append
    if not data_btrans:
        return df.reset_index(drop=True)

    # Adding back-translated commands back to input data to respective labels (action, topic...)
    df_btrans = pd.DataFrame(index=list(data_btrans.keys()), data=list(data_btrans.values()))
    df_btrans = (pd.merge(df_btrans, data_sample, how='left', left_index=True, right_index=True)
                   .drop(columns='Text Command')
                   .rename(columns={0: 'Text Command'}))

    # Mark the new rows so they can be told apart from the originals
    df_btrans['Verb/Noun'] = 'BACKTRANSLATED'

    # Adding back-translated df to input df
    data_aug = pd.concat([df, df_btrans]).reset_index(drop=True)

    return data_aug

In [9]:
# translator = googletrans.Translator()
# translator.translate('bonjour', src='fr', dest='en')

In [10]:
"""
Rerun method a few times if there is an AttributeError.
If issue persists, run the commented out code in above cell.
"""
# data_btrans = back_trans(data_init, lang='ja')
# data_btrans

'\nRerun method a few times if there is an AttributeError.\nIf issue persists, run the commented out code in above cell.\n'

## Method 2: Synonym Replacement

Datamuse API: https://www.datamuse.com/api/

In [11]:
"""
Script for creating new data by substituting words for their synonyms.
Only the words which are not included in the identifier are eligible for being replaced.
"""
# Stop words that are never replaced because swapping them for a synonym did not make sense
_SYN_REP_STOP_WORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which",
    "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be",
    "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by",
    "for", "with", "about", "against", "between", "into", "through", "during", "before",
    "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
    "under", "again", "further", "then", "once", "here", "there", "when", "where", "why",
    "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will",
    "just", "don", "should", "now",
])

_DATAMUSE_WORDS_URL = 'https://api.datamuse.com/words'


def syn_rep(df, syn_num):
    """Augment `df` with copies of each command in which one word is swapped for a synonym.

    For every row, each word of the command (first column) that is neither a stop word
    nor part of the row's identifier (third column) is looked up in the Datamuse API.
    One new row is produced per returned synonym; it keeps the labels of the row it was
    derived from and is tagged with 'Verb/Noun' = 'SYNREPLACED'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data with the command text in the first column and the identifier in
        the third. `df` itself is not modified and its index may be arbitrary.
    syn_num : int
        Maximum number of synonyms requested for each eligible word.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` followed by the synonym-replaced rows, with a fresh 0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If the Datamuse API answers with an error status.
    """
    # Each distinct word is looked up only once, however many commands contain it
    synonyms_by_word = dict()
    new_rows = []

    # Rows are read positionally so the result does not depend on the index labels of df
    for row in df.itertuples(index=False, name=None):
        text, identifier_text = row[0], row[2]
        sentence_lst = text.split()
        identifier_words = set(identifier_text.split())

        # For every word in a command, replace the word with a synonym for new command
        for position, word in enumerate(sentence_lst):
            if word in identifier_words or word.lower() in _SYN_REP_STOP_WORDS:
                continue
            word = word.lower()

            if word not in synonyms_by_word:
                # rel_syn returns synonyms of the word; max caps how many are returned
                response = requests.get(_DATAMUSE_WORDS_URL,
                                        params={'rel_syn': word, 'max': syn_num},
                                        timeout=10)
                response.raise_for_status()
                synonyms_by_word[word] = [entry['word'] for entry in response.json()]

            # Create new commands for each synonym to a given word
            for synonym in synonyms_by_word[word]:
                new_sentence = " ".join(sentence_lst[:position] + [synonym]
                                        + sentence_lst[position + 1:])
                new_rows.append((new_sentence,) + tuple(row[1:]))

    # Build the new rows in one go instead of growing a DataFrame row by row
    added_data = pd.DataFrame(new_rows, columns=list(df.columns))

    # Mark the new rows so they can be told apart from the originals
    added_data['Verb/Noun'] = 'SYNREPLACED'

    # Adding synonym-replaced df to input df
    augmented_data = pd.concat([df, added_data]).reset_index(drop=True)

    return augmented_data

In [12]:
# api_url = 'https://api.datamuse.com/words?rel_syn={0}&max=5'.format('review')
# word_synonyms = requests.get(api_url)
# word_synonyms = word_synonyms.json()
# for i in range(len(word_synonyms)):
#     print(word_synonyms[i]['word'])

In [13]:
# data_synrep = syn_rep(data_init, syn_num=5)
# data_synrep

## Other Scripts

In [14]:
"""
Script to remove duplicates
"""
def remove_dup(df):
    """Drop rows whose text command (first column) already appeared in an earlier row.

    The first occurrence of every command is kept, the original row order is
    preserved and the result gets a fresh 0..n-1 index. Missing values in the
    first column are treated as equal to each other. `df` is not modified.
    """
    # No rows (or no columns) means there is nothing to compare
    if df.empty:
        return df.reset_index(drop=True)

    # Positional boolean mask, so duplicate or non-default index labels are harmless
    first_occurrence = ~df.iloc[:, 0].duplicated(keep='first').to_numpy()
    data_aug = df[first_occurrence].reset_index(drop=True)
    return data_aug

In [15]:
"""
Script to distribute identifiers throughout the data. For example, 10 identifiers
will be distributed equivalently to 100 instances each for a dataset of 1000 instances.
Note that output dataframe will be shuffled.
"""
def identifier(df, identifiers=None, placeholder='x'):
    """Replace the placeholder identifier in every command with a concrete identifier.

    The rows of `df` are shuffled (fixed seed, so the result is reproducible) and cut
    into `len(identifiers)` consecutive groups of near-equal size; when the rows do not
    divide evenly the first groups get one extra row. Within a group every stand-alone
    occurrence of `placeholder` in 'Text Command' is replaced by that group's
    identifier, and the 'Identifier' column is set to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data with a 'Text Command' column of strings. `df` is not modified.
    identifiers : list of str, optional
        Identifiers to distribute; defaults to a built-in list of nine marketing tasks.
    placeholder : str, default 'x'
        Token that stands in for the identifier. It is only replaced where it is not
        directly preceded or followed by a letter, digit or underscore.

    Returns
    -------
    pandas.DataFrame
        The shuffled data with identifiers filled in and a 0..n-1 index.

    Raises
    ------
    ValueError
        If `identifiers` is empty.
    """
    if identifiers is None:
        identifiers = ['Track trends', 'Monitor competition', 'Plan campaign',
                       'Direct marketing', 'Market research', 'Media advertising',
                       'Advertise product', 'Promote product', 'Evaluate profits']
    identifiers = list(identifiers)
    if not identifiers:
        raise ValueError('identifiers must contain at least one identifier')

    # Shuffle data so the identifiers are distributed randomly (sample returns a new frame)
    data_shuffle = df.sample(frac=1, random_state=0).reset_index(drop=True)

    # One identifier per row, in consecutive groups of near-equal size
    base_size, n_larger = divmod(len(data_shuffle), len(identifiers))
    assigned = []
    for position, name in enumerate(identifiers):
        group_size = base_size + 1 if position < n_larger else base_size
        assigned.extend([name] * group_size)

    # Lookarounds keep the neighbouring characters untouched; words that merely
    # contain the placeholder are not matched.
    pattern = re.compile(r'(?<!\w)' + re.escape(placeholder) + r'(?!\w)')

    def fill_in(text, name):
        # A function replacement inserts the identifier literally, so backslashes
        # in an identifier are not interpreted as regex escapes.
        return pattern.sub(lambda _match: name, text)

    data_shuffle['Text Command'] = [fill_in(text, name)
                                    for text, name in zip(data_shuffle['Text Command'], assigned)]
    data_shuffle['Identifier'] = assigned

    return data_shuffle

This is a list of 125 common marketing tasks collected from an online source. These tasks will serve as a second identifier set to substitute into our data.

In [16]:
tmp = """1. Developing editorial calendar for content sharing
2. Updating Facebook Pages
3. Updating Google+
4. Updating LinkedIn
5. Updating Twitter
6. Answering LinkedIn questions
7. Updating Tumblr, Instagram, Vine, and other social networks
8. Managing your Yelp profile
9. Bookmarking blog content on social bookmarking sites
10. Building out custom Facebook pages
11. Designing cover images for social profiles
12. Developing social media marketing plans
13. Guest writing for industry blogs or websites
14. Interacting with consumers via social media
15. Recruiting guest bloggers
16. Researching bloggers
17. Writing a social media corporate policy
18. Scheduling social media status updates to be published
19. Responding to questions on Quora
20. Researching bloggers
21. Monitoring competitors’ social media updates

Promotions

22. Developing contests to promote a business or product
23. Designing contest pages
24. Writing contest rules
25. Promoting contests to contest directories
26. Promoting contests online on company website
27. Promoting contests online through company’s social profiles
28. Promoting contests through company’s email newsletters
29. Tracking progress of contest entries
30. Choosing winners of contests
31. Announcing contest winners
32. Notify winners of contests
33. Distributing contest prizing
34. Distributing coupon codes online
35. Listing your events with local online events calendars
36. Ordering premiums created to hand out
37. Writing white papers
38. Posting white papers online on sites like Scribd.com

 Analytics

39. Analyzing Google Analytics
40. Measuring search engine optimization results
41. Performing competitor keyword analysis
42. Reviewing ecommerce sales data
43. Review social media metrics, like Facebook Insights
44. Running reports to track growth, response or ROI

Sales

45. Writing proposal
46. Creating lead capture forms for the company website
47. Writing sales follow up copy
48. Writing sales scripts
49. Submitting ecommerce products to shopping aggregators
50. Using lead scoring for prospects
51. Developing demonstrations or tutorials
52. Networking – in person!
53. Writing tips & how-to articles to share with prospects or customers
54. Researching affiliate programs

Email Marketing

55. Choosing an email marketing software vendor
56. Designing email newsletter templates
57. Distributing email newsletter
58. Writing email newsletter copy
59. Setting up trigger emails
60. Setting up an account with the email marketing software vendor
61. Sending test copies of the email newsletter
62. Segmenting email newsletter lists
63. Reviewing email newsletter metrics
64. Cleaning up the database  (fixing typos, adding in new data)

Video

65. Developing video content
66. Hiring a vendor to develop video content
67. Writing video scripts
68. Develop keyword lists for tagging videos
69. Writing video descriptions
70. Publishing video to YouTube
71. Publishing video to company website

Advertising

72. Writing ad copy
73. Designing ad units
74. Monitoring Facebook Ad campaigns
75. Researching affiliate programs
76. Signing up for affiliate programs
77. Running retargeting campaigns
78. Running ad campaigns in Google AdWords
79. Running ads on LinkedIn to promote the business
80. Running estimates of audience size for Facebook ads
81. Buying ad space on specific or niche sites

Public Relations

82. Commenting on articles or blog posts online
83. Documenting online media placements
84. Following up with people product samples were sent to
85. Pitching bloggers to review a product
86. Writing press releases
87. Submitting press releases thru online wire services
88. Sending out product samples
89. Searching forums for influencers
90. Reviewing forums for consumer feedback

Website Management

91. Designing web graphics
92. Editing web copy
93. Fixing broken links on the company website
94. Installing tracking codes on the company website
95. Publishing blog posts
96. Renewing domains and hosting plans
97. Sourcing images to accompany blog posts and status updates

Customer Service

98. Developing customer service plans
99. Responding to customers questions or concerns on social media

Project Management

100. Getting content approved by key stakeholders
101.  Posting jobs or RFPs online to recruit staff or vendors
102. Giving presentations on your work
103. Formatting documents (Excel, Powerpoint, Word)
104. Writing case studies of projects or clients
105. Updating editorial calendars
106. Training staff on social media involvement
107. Speaking at industry conferences or meetings
108. Sitting through many demos of marketing tools & dashboards
109. Setting up Google Alerts on key companies, people or products
110. Sending thank you notes or emails to customers
111. Reviewing metrics, metrics, metrics
112. Reviewing reports with clients
113. Researching the latest digital marketing trends & tools

 Search Engine Optimization

114.  Developing keyword lists
115. Appropriately naming images on the company website
116. Choosing anchor text for backlinks
117. Distributing articles to article submission websites
118. Listing the company in online local business directories
119. Listing the company website in niche directories
120. Managing relationships with the major search engines
121. Pinging sites after new content has been published
122. Renaming files so they are optimized for search engines
123. Writing blog content based on a keyword strategy
124. Writing article content to publish online
125. Reviewing website backlinks"""

In [17]:
# Section headings inside the raw task list; they are not tasks and are skipped
categories = ['Social Media', 'Promotions', 'Analytics', 'Sales', 'Email Marketing',
              'Video', 'Advertising', 'Public Relations', 'Website Management',
              'Customer Service', 'Project Management', 'Search Engine Optimization']

# Turn the raw text into a clean list of task names. `tmp` is left untouched so
# this cell can be re-run. A few tasks occur more than once in the raw list
# (e.g. 'Researching bloggers') and are kept as they are.
identifiers_marketing = []
for raw_line in tmp.split('\n'):
    # Strip the leading "12. " numbering together with any surrounding whitespace
    task = re.sub(r'^\s*\d+\.\s*', '', raw_line).strip()
    # Skip blank lines and section headings
    if task and task not in categories:
        identifiers_marketing.append(task)
identifiers_marketing

['Developing editorial calendar for content sharing',
 'Updating Facebook Pages',
 'Updating Google+',
 'Updating LinkedIn',
 'Updating Twitter',
 'Answering LinkedIn questions',
 'Updating Tumblr, Instagram, Vine, and other social networks',
 'Managing your Yelp profile',
 'Bookmarking blog content on social bookmarking sites',
 'Building out custom Facebook pages',
 'Designing cover images for social profiles',
 'Developing social media marketing plans',
 'Guest writing for industry blogs or websites',
 'Interacting with consumers via social media',
 'Recruiting guest bloggers',
 'Researching bloggers',
 'Writing a social media corporate policy',
 'Scheduling social media status updates to be published',
 'Responding to questions on Quora',
 'Researching bloggers',
 'Monitoring competitors’ social media updates',
 'Developing contests to promote a business or product',
 'Designing contest pages',
 'Writing contest rules',
 'Promoting contests to contest directories',
 'Promoting cont

## Running Augmentation Methods

There are three different types of data we can test to see if there are any differences:
- Data from just back-translation
- Data from just synonym replacement
- Data from both augmentation techniques

### Back-translation Augmentation

In [18]:
"""
Rerun method a few times if there is an AttributeError.
If issue persists, run the commented out code in the cell
below the back-translation method code until output is seen.
"""
# data_aug = back_trans(data_init, lang='ja')
# data_aug = remove_dup(data_aug)
# data_aug = identifier(data_aug)
# data_aug

'\nRerun method a few times if there is an AttributeError.\nIf issue persists, run the commented out code in the cell\nbelow the back-translation method code until output is seen.\n'

In [19]:
# data_aug.to_csv('data/Augmented_Data_Btrans.csv', index=False)

### Synonym Replacement Augmentation

In [20]:
# data_aug = syn_rep(data_init, syn_num=5)
# data_aug = remove_dup(data_aug)
# data_aug = identifier(data_aug)
# data_aug

In [21]:
# data_aug.to_csv('data/Augmented_Data_Synrep.csv', index=False)

### Full Data Augmentation

**Old Method**

In [22]:
"""
Method that runs both back-translation, synonym replacement, and duplicate removal
for data augmentation.

Rerun method a few times if there is an AttributeError.
If issue persists, run the commented out code in the cell
below the back-translation method code until output is seen.
"""
# def augment(df, lang, syn_num=5, sample_frac=1):

#     data_btrans = back_trans(df, lang, sample_frac=sample_frac)
#     data_btrans = remove_dup(data_btrans)
#     data_aug = syn_rep(data_btrans, syn_num, sample_frac=sample_frac)
#     data_aug = remove_dup(data_aug)
#     data_aug = identifier(data_aug)

#     return data_aug

'\nMethod that runs both back-translation, synonym replacement, and duplicate removal\nfor data augmentation.\n\nRerun method a few times if there is an AttributeError.\nIf issue persists, run the commented out code in the cell\nbelow the back-translation method code until output is seen.\n'

In [23]:
# data_aug = augment(data_init, lang='ja', syn_num=5, sample_frac=1)
# data_aug

In [24]:
# data_aug.to_csv('data/Augmented_Data.csv', index=False)

**New Method**

The original method had to be adjusted after the manually synthesized data grew dramatically. Currently, the method is not automated. An automated version would wrap the back-translation call in a try/except inside a while loop (the call sometimes raises a random error) and pause for an appropriate time.sleep() duration between back-translation calls.

In [100]:
"""
Due to request limitations on official Google Translate API through unofficial
googletrans package, input data is broken down into smaller batches so all data can be
translated one batch at a time. The back-translated batches are saved to
individual csv's before imported and concat into one final back-translated csv.
"""

"\nDue to request limitations on official Google Translate API through unofficial\ngoogletrans package, input data is broken down into smaller batches so all data can be\ntranslated one batch at a time. The back-translated batches are saved to\nindividual csv's before imported and concat into one final back-translated csv.\n"

In [16]:
# Initial data is split into 4 batches of near-equal size so that each translation
# request stays small (the target is about 150 instances per batch).
# NOTE(review): the data_init preview above shows 608 rows, i.e. 152 per batch, which is
# slightly over 150 - confirm the limit or raise the number of batches.
data_split_arr = np.array_split(data_init, 4)

In [44]:
# Batch 1 of 4: back-translation via Japanese ('ja'), removing duplicates, and writing to csv.
# Do not be alarmed by all the 0% untranslated, all requests are expected to succeed when
# sent in smaller batches (<150) and with an appropriate time period in between (30 min)
btrans1_dup = back_trans(data_split_arr[0], 'ja')
btrans1 = remove_dup(btrans1_dup)
btrans1.to_csv('data/Data_Btrans1.csv', index=False)

Translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Back-translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Total ||  Untranslated (#):  0  | Untranslated (%):  0.0 %


In [62]:
# Batch 2 of 4: back-translate via Japanese, remove duplicates, write to csv
btrans2_dup = back_trans(data_split_arr[1], 'ja')
btrans2 = remove_dup(btrans2_dup)
btrans2.to_csv('data/Data_Btrans2.csv', index=False)

Translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Back-translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Total ||  Untranslated (#):  0  | Untranslated (%):  0.0 %


In [65]:
# Batch 3 of 4: back-translate via Japanese, remove duplicates, write to csv
btrans3_dup = back_trans(data_split_arr[2], 'ja')
btrans3 = remove_dup(btrans3_dup)
btrans3.to_csv('data/Data_Btrans3.csv', index=False)

Translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Back-translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Total ||  Untranslated (#):  0  | Untranslated (%):  0.0 %


In [68]:
# Batch 4 of 4: back-translate via Japanese, remove duplicates, write to csv
btrans4_dup = back_trans(data_split_arr[3], 'ja')
btrans4 = remove_dup(btrans4_dup)
btrans4.to_csv('data/Data_Btrans4.csv', index=False)

Translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Back-translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Total ||  Untranslated (#):  0  | Untranslated (%):  0.0 %


In [124]:
# Read the four back-translated batch files back in from csv
btrans1 = pd.read_csv('data/Data_Btrans1.csv')
btrans2 = pd.read_csv('data/Data_Btrans2.csv')
btrans3 = pd.read_csv('data/Data_Btrans3.csv')
btrans4 = pd.read_csv('data/Data_Btrans4.csv')

In [91]:
# Stack the four batches and keep only the first occurrence of each text command
btrans_dup = pd.concat([btrans1, btrans2, btrans3, btrans4]).reset_index(drop=True)
btrans = remove_dup(btrans_dup)
btrans

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,x to do,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
1,to do x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
2,new x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",new
3,make x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",make
4,create x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",create
...,...,...,...,...,...,...,...
640,end of project x,Project,x,Completed,"[1,0]","[0,0,0,0,1]",BACKTRANSLATED
641,x end of project,Project,x,Completed,"[1,0]","[0,0,0,0,1]",BACKTRANSLATED
642,project x has finished,Project,x,Completed,"[1,0]","[0,0,0,0,1]",BACKTRANSLATED
643,x the project has ended,Project,x,Completed,"[1,0]","[0,0,0,0,1]",BACKTRANSLATED


In [94]:
# Add synonym-replaced commands (at most 5 synonyms per eligible word) on top of the
# back-translated data, then remove duplicate text commands
data_syn_dup = syn_rep(btrans, syn_num=5)
data_syn = remove_dup(data_syn_dup)
data_syn

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,x to do,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
1,to do x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
2,new x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",new
3,make x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",make
4,create x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",create
...,...,...,...,...,...,...,...
5186,x complete the cast,Project,x,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED
5187,x complete the see,Project,x,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED
5188,x complete the plan,Project,x,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED
5189,x complete the design,Project,x,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED


In [96]:
# Replace the 'x' placeholder with the default identifier set; the output rows are shuffled
data_aug = identifier(data_syn)
data_aug

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,see Track trends was complete,Project,Track trends,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED
1,Track trends see to do,Project,Track trends,Create,"[0,1]","[1,0,0,0,0]",SYNREPLACED
2,out of use Track trends,Project,Track trends,Danger,"[0,1]","[0,0,0,1,0]",SYNREPLACED
3,added project Track trends,Project,Track trends,Create,"[0,1]","[1,0,0,0,0]",add
4,project Track trends in progress,Task,Track trends,In Progress,"[1,0]","[0,1,0,0,0]",SYNREPLACED
...,...,...,...,...,...,...,...
5186,throw Evaluate profits is late,Project,Evaluate profits,At Risk,"[0,1]","[0,0,1,0,0]",SYNREPLACED
5187,devising Evaluate profits project,Project,Evaluate profits,Create,"[0,1]","[1,0,0,0,0]",SYNREPLACED
5188,under recap Evaluate profits,Task,Evaluate profits,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
5189,through with Evaluate profits task,Task,Evaluate profits,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED


In [99]:
# Increase in data compared to original manual synthesized set
# NOTE(review): 5191 and 2991 are hard-coded row counts, presumably the augmented and
# the manually synthesized set sizes from an earlier run (the table above shows 5190
# rows) - confirm, or compute the ratio from the DataFrame lengths instead.
(5191-2991)/5191

0.42381044114814104

In [97]:
# Save the augmented dataset (default identifiers) without the index column
data_aug.to_csv('data/Augmented_Data.csv', index=False)

**Using second set of identifiers for an alternative dataset:**

In [229]:
# Same augmentation step on the same synonym-replaced frame, but passing
# identifiers_marketing as the identifier set instead of identifier()'s default.
data_aug2 = identifier(data_syn, identifiers=identifiers_marketing)
data_aug2

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,see Developing editorial calendar for content ...,Project,Developing editorial calendar for content sharing,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED
1,Developing editorial calendar for content shar...,Project,Developing editorial calendar for content sharing,Create,"[0,1]","[1,0,0,0,0]",SYNREPLACED
2,out of use Developing editorial calendar for c...,Project,Developing editorial calendar for content sharing,Danger,"[0,1]","[0,0,0,1,0]",SYNREPLACED
3,added project Developing editorial calendar fo...,Project,Developing editorial calendar for content sharing,Create,"[0,1]","[1,0,0,0,0]",add
4,project Developing editorial calendar for cont...,Task,Developing editorial calendar for content sharing,In Progress,"[1,0]","[0,1,0,0,0]",SYNREPLACED
...,...,...,...,...,...,...,...
5186,throw Reviewing website backlinks is late,Project,Reviewing website backlinks,At Risk,"[0,1]","[0,0,1,0,0]",SYNREPLACED
5187,devising Reviewing website backlinks project,Project,Reviewing website backlinks,Create,"[0,1]","[1,0,0,0,0]",SYNREPLACED
5188,under recap Reviewing website backlinks,Task,Reviewing website backlinks,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
5189,through with Reviewing website backlinks task,Task,Reviewing website backlinks,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED


In [230]:
# Write the alternative (marketing-identifier) dataset to its own CSV; index=False keeps the DataFrame index out of the file.
data_aug2.to_csv('data/Augmented_Data2.csv', index=False)

### Update: Adding "assign [action] to x" and "move x to [action]" commands

In [111]:
# Add "assign" and "move" commands
# Take the rows of data_init from index label 537 onward (.loc slices by label
# and includes the start label) as an independent copy.
# NOTE(review): 537 is presumably where the newly added "assign"/"move" rows
# begin in the spreadsheet — confirm, since the value is hard-coded.
new_data = data_init.loc[537:, :].copy()
# Pass the new rows through back_trans with 'ja'.
# NOTE(review): 'ja' is presumably the pivot language code (Japanese) — confirm
# against back_trans, which is defined outside this section.
btrans5_dup = back_trans(new_data, 'ja')
# remove_dup is presumably a de-duplication pass — confirm in its definition.
btrans5 = remove_dup(btrans5_dup)
# Save this batch to disk; the next cell reads it back together with batches 1-4.
btrans5.to_csv('data/Data_Btrans5.csv', index=False)

Translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Back-translation ||  Untranslated (#):  0  | Untranslated (%):  0.0 %
Total ||  Untranslated (#):  0  | Untranslated (%):  0.0 %


In [133]:
# Reload the five saved back-translation batches from disk in one pass;
# each CSV is read with pd.read_csv and bound to its own btransN name.
btrans1, btrans2, btrans3, btrans4, btrans5 = map(pd.read_csv, [
    'data/Data_Btrans1.csv',
    'data/Data_Btrans2.csv',
    'data/Data_Btrans3.csv',
    'data/Data_Btrans4.csv',
    'data/Data_Btrans5.csv',
])

In [134]:
# Stack all five back-translation batches into one frame with a fresh
# 0..n-1 row index, then pass the combined frame through remove_dup.
btrans_dup = pd.concat(
    [btrans1, btrans2, btrans3, btrans4, btrans5],
    ignore_index=True,
)
btrans = remove_dup(btrans_dup)
btrans

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,x to do,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
1,to do x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
2,new x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",new
3,make x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",make
4,create x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",create
...,...,...,...,...,...,...,...
773,assign to a completed x project,Project,x,Completed,"[0,1]","[0,0,0,0,1]",BACKTRANSLATED
774,project x allocation complete,Project,x,Completed,"[0,1]","[0,0,0,0,1]",BACKTRANSLATED
775,x project allocation completed,Project,x,Completed,"[0,1]","[0,0,0,0,1]",BACKTRANSLATED
776,move project x to complete,Project,x,Completed,"[0,1]","[0,0,0,0,1]",BACKTRANSLATED


In [135]:
# Run syn_rep over the merged back-translated set with syn_num=5, then pass the
# result through remove_dup. In the output below the generated rows carry
# SYNREPLACED in the Verb/Noun column.
# NOTE(review): syn_num presumably limits how many synonyms are used per
# replacement — confirm in syn_rep, which is defined outside this section.
data_syn_dup = syn_rep(btrans, syn_num=5)
data_syn = remove_dup(data_syn_dup)
data_syn

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,x to do,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
1,to do x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",to do
2,new x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",new
3,make x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",make
4,create x,Task,x,To Do,"[1,0]","[1,0,0,0,0]",create
...,...,...,...,...,...,...,...
6974,x move project to over,Project,x,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
6975,x move project to good,Project,x,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
6976,x move project to all,Project,x,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
6977,x move project to sound,Project,x,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED


In [136]:
# Rebuild the augmented dataset from the updated synonym-replaced frame (which
# now includes the "assign"/"move" commands) using identifier()'s default
# identifier set. This rebinds data_aug from the earlier run.
data_aug = identifier(data_syn)
data_aug

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,finish plan Track trends,Project,Track trends,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
1,move plan Track trends to a risky location,Project,Track trends,At Risk,"[0,1]","[0,0,1,0,0]",SYNREPLACED
2,fixed Track trends tax,Task,Track trends,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
3,audit task Track trends,Task,Track trends,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
4,offset on Track trends task,Task,Track trends,In Progress,"[1,0]","[0,1,0,0,0]",SYNREPLACED
...,...,...,...,...,...,...,...
6974,end throw Evaluate profits,Project,Evaluate profits,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
6975,to do Evaluate profits plan,Project,Evaluate profits,Create,"[0,1]","[1,0,0,0,0]",SYNREPLACED
6976,Evaluate profits project,Task,Evaluate profits,To Do,"[1,0]","[1,0,0,0,0]",SYNREPLACED
6977,Evaluate profits chore was complete,Task,Evaluate profits,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED


In [137]:
# Write the updated augmented dataset to CSV. This is the same path used by the
# earlier save cell, so that file is overwritten; index=False keeps the index out.
data_aug.to_csv('data/Augmented_Data.csv', index=False)

**Using a second set of identifiers for an alternative dataset:**

In [138]:
# Rebuild the alternative dataset from the updated synonym-replaced frame,
# passing identifiers_marketing as the identifier set instead of the default.
data_aug2 = identifier(data_syn, identifiers=identifiers_marketing)
data_aug2

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,finish plan Developing editorial calendar for ...,Project,Developing editorial calendar for content sharing,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
1,move plan Developing editorial calendar for co...,Project,Developing editorial calendar for content sharing,At Risk,"[0,1]","[0,0,1,0,0]",SYNREPLACED
2,fixed Developing editorial calendar for conten...,Task,Developing editorial calendar for content sharing,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
3,audit task Developing editorial calendar for c...,Task,Developing editorial calendar for content sharing,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
4,offset on Developing editorial calendar for co...,Task,Developing editorial calendar for content sharing,In Progress,"[1,0]","[0,1,0,0,0]",SYNREPLACED
...,...,...,...,...,...,...,...
6974,end throw Reviewing website backlinks,Project,Reviewing website backlinks,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
6975,to do Reviewing website backlinks plan,Project,Reviewing website backlinks,Create,"[0,1]","[1,0,0,0,0]",SYNREPLACED
6976,Reviewing website backlinks project,Task,Reviewing website backlinks,To Do,"[1,0]","[1,0,0,0,0]",SYNREPLACED
6977,Reviewing website backlinks chore was complete,Task,Reviewing website backlinks,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED


In [139]:
# Write the updated alternative dataset to CSV. This is the same path used by the
# earlier save cell, so that file is overwritten; index=False keeps the index out.
data_aug2.to_csv('data/Augmented_Data2.csv', index=False)