In [1]:
import pandas as pd
import requests
import string

In [7]:
# Load the source dataset from the Excel workbook into a DataFrame
# (the path is relative to the notebook's working directory)
data_init = pd.read_excel('data/NLP_Data.xlsx')

In [3]:
# Preview the first rows of the DataFrame containing the data that our team created
data_init.head()

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,assign completed status to X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",assign
1,move X to completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",move
2,X completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
3,Completed X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
4,X in progress,Task,X,In Progress,"[1,0]","[0,1,0,0,0]",in progress


## Augmenting total data for modeling purposes:

In [4]:
def _fetch_synonyms(word, syn_num):
    """Return up to ``syn_num`` synonyms of ``word`` from the Datamuse API.

    The query is passed through ``params`` so that the word is URL-encoded
    correctly, a timeout keeps a stalled connection from hanging the notebook,
    and HTTP errors are raised instead of being parsed as if they were results.
    """
    response = requests.get(
        'https://api.datamuse.com/words',
        params={'rel_syn': word, 'max': syn_num},
        timeout=10,
    )
    response.raise_for_status()
    return [entry['word'] for entry in response.json()]


def aug_data(df, syn_num, fetch_synonyms=None):
    """Create new commands by substituting words with their synonyms.

    Columns are addressed by position: column 0 must hold the command text and
    column 2 the identifier. For every row, each word of the command is replaced
    in turn by each of its synonyms, producing one new row per (word, synonym)
    pair; all remaining columns are copied unchanged from the source row.

    Words that are never replaced:
      * 'a' and 'an' (compared case-insensitively), because substituting them
        did not produce sensible sentences;
      * the first word of the row's identifier (exact, case-sensitive match).
        Note that only the FIRST identifier word is protected.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Any index is accepted; rows are read by position.
    syn_num : int
        Maximum number of synonyms requested for each eligible word.
    fetch_synonyms : callable, optional
        ``fetch_synonyms(word, syn_num) -> list[str]``. Defaults to a lookup
        against the Datamuse web API (requires network access). Supplying a
        custom callable allows offline use and testing.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` followed by the generated rows. Generated rows are
        labelled consecutively starting at ``len(df)``.

    Raises
    ------
    requests.RequestException
        From the default fetcher, when the API request fails or times out.
    """
    if fetch_synonyms is None:
        fetch_synonyms = _fetch_synonyms

    stop_words = {'a', 'an'}
    # Each distinct word is looked up once per call instead of once per occurrence.
    synonym_cache = {}
    new_rows = []

    # Plain tuples give positional access without depending on the index labels
    # and without the deprecated integer lookup on a label-indexed Series.
    for row in df.itertuples(index=False, name=None):
        text, identifier = row[0], row[2]
        if not isinstance(text, str):
            # Missing command text (e.g. NaN): nothing to augment for this row.
            continue

        tokens = text.split()
        identifier_tokens = identifier.split() if isinstance(identifier, str) else []
        protected = identifier_tokens[0] if identifier_tokens else None

        for pos, token in enumerate(tokens):
            word = token.lower()
            if token == protected or word in stop_words:
                continue

            if word not in synonym_cache:
                synonym_cache[word] = fetch_synonyms(word, syn_num)

            for synonym in synonym_cache[word]:
                sentence = ' '.join(tokens[:pos] + [synonym] + tokens[pos + 1:])
                new_rows.append([sentence, *row[1:]])

    if not new_rows:
        # Nothing generated: still hand back a new object, as concat would.
        return df.copy()

    # Build the additions in one go rather than growing a frame row by row.
    added_data = pd.DataFrame(
        new_rows,
        columns=df.columns,
        index=range(len(df), len(df) + len(new_rows)),
    )
    return pd.concat([df, added_data])

In [5]:
# Augment the full dataset, requesting at most 5 synonyms per eligible word.
# NOTE: aug_data queries the Datamuse web API, so this cell needs network access and can be slow.
new_data = aug_data(data_init, 5)

In [6]:
# Display the augmented data: the original rows followed by the generated variants
new_data

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,assign completed status to X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",assign
1,move X to completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",move
2,X completed,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
3,Completed X,Task,X,Completed,"[1,0]","[0,0,0,0,1]",completed
4,X in progress,Task,X,In Progress,"[1,0]","[0,1,0,0,0]",in progress
...,...,...,...,...,...,...,...
2602,X project is on dog,Project,X,On Target,"[0,1]","[0,1,0,0,0]",on track
2603,X project is on lead,Project,X,On Target,"[0,1]","[0,1,0,0,0]",on track
2604,X project is on course,Project,X,On Target,"[0,1]","[0,1,0,0,0]",on track
2605,X project is on cross,Project,X,On Target,"[0,1]","[0,1,0,0,0]",on track


In [9]:
# Save the augmented dataset as CSV; index=False leaves the row labels out of the file
new_data.to_csv('data/Augmented_Data.csv', index=False)

## Augmenting specified actions/rows for exploration purposes:

In [26]:
# pd.set_option("display.max_rows", 10)

In [10]:
# Keep only the rows whose Action is 'Completed' (index renumbered from 0) for API exploration
data_action5 = data_init.loc[data_init['Action'] == 'Completed'].reset_index(drop=True)
data_action5

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,assign completed status to X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",assign
1,move X to completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",move
2,X completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed
3,Completed X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed
4,X is completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",is
5,X complete,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",
6,X was completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",was
7,X was complete,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",was
8,X is complete,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",is
9,Task X completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed


In [11]:
# Run the augmentation on only the first 5 'Completed' rows, requesting at most 5 synonyms per eligible word
data_action5_aug = aug_data(data_action5.iloc[0:5], 5)

In [12]:
# Display the 5 source rows followed by their synonym-substituted variants
data_action5_aug

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,assign completed status to X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",assign
1,move X to completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",move
2,X completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed
3,Completed X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed
4,X is completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",is
5,put completed status to X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",assign
6,attribute completed status to X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",assign
7,ascribe completed status to X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",assign
8,delegate completed status to X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",assign
9,designate completed status to X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",assign
