In [2]:
import math
import pandas as pd

In [3]:
# Load the augmented command dataset; later cells read its 'Text Command',
# 'Action' and 'Verb/Noun' columns.
augmented_data = pd.read_csv('data/augmented_data.csv')

In [None]:
class NaiveBayes(object):
    """Bag-of-words Naive Bayes classifier that maps a text command to an action.

    The constructor shuffles ``data`` and splits it into a training and a
    testing frame.  ``train`` estimates the probabilities from the training
    frame and ``test`` scores the held-out frame.  ``data`` must provide the
    'Text Command' and 'Action' columns.
    """

    # Floor applied before taking a logarithm so a zero probability cannot raise.
    _MIN_PROBABILITY = 1e-12

    #initialize model
    def __init__(self,data,training_split):
        """Shuffle ``data`` and split it; ``training_split`` is the training fraction."""
        self.num_rows = len(data)
        #shuffle data and reset indexes
        self.shuffled_data = data.sample(frac=1).reset_index(drop=True)
        #split the data into training and testing sets using input proportion
        (self.training_data,self.testing_data) = self.training_testing_split(self.shuffled_data,training_split)
        self.num_training_rows,self.num_testing_rows = len(self.training_data),len(self.testing_data)

    def training_testing_split(self,all_data,training_split):
        """Return ``(training, testing)`` frames that share no row.

        The first ``int(len(all_data) * training_split)`` rows form the training
        frame (all columns); the remaining rows form the testing frame, reduced
        to the 'Text Command' and 'Action' columns.
        """
        breakoff = int(len(all_data)*training_split)
        # Positional slicing is half-open.  Label slicing with .loc includes
        # both end points and would place row `breakoff` in both frames.
        training = all_data.iloc[:breakoff]
        testing = all_data[['Text Command','Action']].iloc[breakoff:]
        return (training,testing)

    def action_map(self):
        """Return the class index -> action name mapping."""
        return {0: 'To Do',1: 'In Progress',2: 'In Review',3: 'Blocked',4: 'Completed'}

    def index_action(self):
        """Return the action name -> class index mapping (inverse of ``action_map``)."""
        return {'To Do': 0,'In Progress': 1,'In Review': 2,'Blocked': 3,'Completed': 4}

    #words to ignore when calculating probabilities
    def stopwords(self):
        """Return the list of lower-case words excluded from all counts."""
        return ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

    def compute_probabilities(self,text_command_training_data):
        """Estimate the model probabilities from a two-column frame.

        Column 0 holds the text command and column 1 the action label.

        Returns:
            ``(action_probabilities, action_dict, word_dict)`` where
            ``action_probabilities[action]`` is the share of rows with that
            label, ``action_dict[action][word]`` is the share of that action's
            non-stop-word tokens equal to ``word``, and ``word_dict[word]`` is
            the share of all non-stop-word tokens equal to ``word``.

        Raises:
            KeyError: if a row with countable words carries a label that is
                not in ``action_map``.
        """
        stopwords = set(self.stopwords())  # set gives O(1) membership tests
        action_probabilities = dict()
        action_dict = {action: dict() for action in self.action_map().values()}
        word_dict = dict()
        word_counter = 0
        num_rows = len(text_command_training_data)
        # Select the columns by position so the row index labels are irrelevant.
        commands = text_command_training_data.iloc[:, 0]
        actions = text_command_training_data.iloc[:, 1]
        for text_command, action in zip(commands, actions):
            action_probabilities[action] = action_probabilities.get(action,0) + 1
            for word in text_command.lower().split():
                if word not in stopwords:
                    word_counter += 1
                    word_dict[word] = word_dict.get(word,0) + 1
                    action_dict[action][word] = action_dict[action].get(word,0) + 1
        for action, counts in action_dict.items():
            # Normalise by the total token count (not the number of distinct
            # words) so the values for one action sum to 1.
            num_words = sum(counts.values())
            for word in counts:
                counts[word] = counts[word]/num_words
        for action in action_probabilities:
            action_probabilities[action] = action_probabilities[action]/num_rows

        for word in word_dict:
            word_dict[word] = word_dict[word]/word_counter

        return (action_probabilities,action_dict,word_dict)

    def train(self):
        """Fit the model on the training frame and store the probabilities on ``self``."""
        text_command_and_action = self.training_data[['Text Command','Action']]
        (self.action_probabilities,self.action_dict,self.word_dict) = self.compute_probabilities(text_command_and_action)

    def predict(self,data):
        """Predict an action for every row of ``data`` (text command in column 0).

        Returns a DataFrame indexed like ``data`` with the columns
        'Text Command', 'Predicted Action' and 'Predicted Probabilities'
        (one softmax value per class, in ``action_map`` order).

        NOTE(review): a word never seen with an action contributes a factor of
        0 (no smoothing), and the softmax is applied to the raw joint
        probabilities; both are kept from the original model definition.
        """
        action_map = self.action_map()
        stopwords = set(self.stopwords())
        rows = []
        for text_command in data.iloc[:, 0]:
            words = [word for word in text_command.lower().split() if word not in stopwords]
            scores = []
            for action in action_map.values():
                # An action absent from the training data has prior 0.
                probability = self.action_probabilities.get(action,0)
                likelihoods = self.action_dict[action]
                for word in words:
                    probability = probability * likelihoods.get(word,0)
                scores.append(probability)
            probabilities = self.softmax(scores)
            best = probabilities.index(max(probabilities))
            rows.append([text_command,action_map[best],probabilities])
        # Build the frame once instead of growing it one row at a time.
        return pd.DataFrame(rows,columns=['Text Command', 'Predicted Action','Predicted Probabilities'],index=data.index)

    def test(self):
        """Score the testing frame and return the summed per-class loss.

        Also stores the fraction of exactly matching predictions in
        ``self.accuracy`` (0.0 when the testing frame is empty).
        """
        preds = self.predict(self.testing_data)
        actions = self.testing_data['Action']
        # Compare whole labels; comparing only the first character would treat
        # 'In Progress' and 'In Review' as equal.
        matches = actions.to_numpy() == preds['Predicted Action'].to_numpy()
        correct = int(matches.sum())
        self.accuracy = correct/len(actions) if len(actions) else 0.0
        losses = self.cross_entropy_loss(pd.concat([actions,preds['Predicted Probabilities']],axis=1))

        losses_total = 0
        for loss_idx in losses:
            losses_total += losses[loss_idx]
        print('Total Loss: ', losses_total, end='\r')
        return losses_total

    def softmax(self,labels):
        """Return the softmax of a list of numbers (an empty list yields ``[]``)."""
        if not labels:
            return []
        # Subtracting the maximum leaves the result unchanged mathematically
        # and keeps math.exp from overflowing on large inputs.
        peak = max(labels)
        exponentials = [math.exp(label - peak) for label in labels]
        denominator = sum(exponentials)
        return [value/denominator for value in exponentials]

    def cross_entropy_loss(self,labels):
        """Return the mean cross-entropy loss per class index.

        ``labels`` holds the true action name in column 0 and the list of
        predicted class probabilities in column 1.  A class with no rows gets
        a loss of 0.0 instead of raising ZeroDivisionError.
        """
        index_action = self.index_action()
        losses = {idx: [] for idx in self.action_map()}
        for action, probabilities in zip(labels.iloc[:, 0], labels.iloc[:, 1]):
            idx = index_action[action]
            probability = max(probabilities[idx], self._MIN_PROBABILITY)
            losses[idx].append(-math.log(probability))
        for label in losses:
            values = losses[label]
            losses[label] = sum(values)/len(values) if values else 0.0
        return losses

In [None]:
# Repeat the shuffle / train / test cycle on the full dataset and report the
# mean of the per-run total losses.
n_runs = 100
losses = []
for i in range(n_runs):
    classifier = NaiveBayes(augmented_data, 0.8)
    classifier.train()
    run_loss = classifier.test()
    losses.append(run_loss)
mean_loss = sum(losses) / len(losses)
print('Average total loss: ', mean_loss)

In [None]:
# Same experiment after dropping every row whose 'Verb/Noun' tag is 'assign'.
augmented_data2 = augmented_data.loc[augmented_data['Verb/Noun'].ne('assign')]

n_runs = 100
losses = []
for i in range(n_runs):
    classifier = NaiveBayes(augmented_data2, 0.8)
    classifier.train()
    run_loss = classifier.test()
    losses.append(run_loss)
mean_loss = sum(losses) / len(losses)
print('Average total loss: ', mean_loss)

In [None]:
# Same experiment after dropping the rows tagged 'assign' or 'move'.
augmented_data3 = augmented_data[~augmented_data['Verb/Noun'].isin(['assign', 'move'])]

n_runs = 100
losses = []
for i in range(n_runs):
    classifier = NaiveBayes(augmented_data3, 0.8)
    classifier.train()
    run_loss = classifier.test()
    losses.append(run_loss)
mean_loss = sum(losses) / len(losses)
print('Average total loss: ', mean_loss)

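Testing stuff: scratch cells that step through the train/test split and the probability computation on a small subset of the data.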

In [None]:
class NaiveBayes(object):
    """Bag-of-words Naive Bayes classifier that maps a text command to an action.

    The constructor shuffles ``data`` and splits it into a training and a
    testing frame.  ``train`` estimates the probabilities from the training
    frame and ``test`` scores the held-out frame.  ``data`` must provide the
    'Text Command' and 'Action' columns.
    """

    # Floor applied before taking a logarithm so a zero probability cannot raise.
    _MIN_PROBABILITY = 1e-12

    #initialize model
    def __init__(self,data,training_split):
        """Shuffle ``data`` and split it; ``training_split`` is the training fraction."""
        self.num_rows = len(data)
        #shuffle data and reset indexes
        self.shuffled_data = data.sample(frac=1).reset_index(drop=True)
        #split the data into training and testing sets using input proportion
        (self.training_data,self.testing_data) = self.training_testing_split(self.shuffled_data,training_split)
        self.num_training_rows,self.num_testing_rows = len(self.training_data),len(self.testing_data)

    def training_testing_split(self,all_data,training_split):
        """Return ``(training, testing)`` frames that share no row.

        The first ``int(len(all_data) * training_split)`` rows form the training
        frame (all columns); the remaining rows form the testing frame, reduced
        to the 'Text Command' and 'Action' columns.
        """
        breakoff = int(len(all_data)*training_split)
        # Positional slicing is half-open.  Label slicing with .loc includes
        # both end points and would place row `breakoff` in both frames.
        training = all_data.iloc[:breakoff]
        testing = all_data[['Text Command','Action']].iloc[breakoff:]
        return (training,testing)

    def action_map(self):
        """Return the class index -> action name mapping."""
        return {0: 'To Do',1: 'In Progress',2: 'In Review',3: 'Blocked',4: 'Completed'}

    def index_action(self):
        """Return the action name -> class index mapping (inverse of ``action_map``)."""
        return {'To Do': 0,'In Progress': 1,'In Review': 2,'Blocked': 3,'Completed': 4}

    #words to ignore when calculating probabilities
    def stopwords(self):
        """Return the list of lower-case words excluded from all counts."""
        return ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

    def compute_probabilities(self,text_command_training_data):
        """Estimate the model probabilities from a two-column frame.

        Column 0 holds the text command and column 1 the action label.

        Returns:
            ``(action_probabilities, action_dict, word_dict)`` where
            ``action_probabilities[action]`` is the share of rows with that
            label, ``action_dict[action][word]`` is the share of that action's
            non-stop-word tokens equal to ``word``, and ``word_dict[word]`` is
            the share of all non-stop-word tokens equal to ``word``.

        Raises:
            KeyError: if a row with countable words carries a label that is
                not in ``action_map``.
        """
        stopwords = set(self.stopwords())  # set gives O(1) membership tests
        action_probabilities = dict()
        action_dict = {action: dict() for action in self.action_map().values()}
        word_dict = dict()
        word_counter = 0
        num_rows = len(text_command_training_data)
        # Select the columns by position so the row index labels are irrelevant.
        commands = text_command_training_data.iloc[:, 0]
        actions = text_command_training_data.iloc[:, 1]
        for text_command, action in zip(commands, actions):
            action_probabilities[action] = action_probabilities.get(action,0) + 1
            for word in text_command.lower().split():
                if word not in stopwords:
                    word_counter += 1
                    word_dict[word] = word_dict.get(word,0) + 1
                    action_dict[action][word] = action_dict[action].get(word,0) + 1
        for action, counts in action_dict.items():
            # Normalise by the total token count (not the number of distinct
            # words) so the values for one action sum to 1.
            num_words = sum(counts.values())
            for word in counts:
                counts[word] = counts[word]/num_words
        for action in action_probabilities:
            action_probabilities[action] = action_probabilities[action]/num_rows

        for word in word_dict:
            word_dict[word] = word_dict[word]/word_counter

        return (action_probabilities,action_dict,word_dict)

    def train(self):
        """Fit the model on the training frame and store the probabilities on ``self``."""
        text_command_and_action = self.training_data[['Text Command','Action']]
        (self.action_probabilities,self.action_dict,self.word_dict) = self.compute_probabilities(text_command_and_action)

    def predict(self,data):
        """Predict an action for every row of ``data`` (text command in column 0).

        Returns a DataFrame indexed like ``data`` with the columns
        'Text Command', 'Predicted Action' and 'Predicted Probabilities'
        (one softmax value per class, in ``action_map`` order).

        NOTE(review): a word never seen with an action contributes a factor of
        0 (no smoothing), and the softmax is applied to the raw joint
        probabilities; both are kept from the original model definition.
        """
        action_map = self.action_map()
        stopwords = set(self.stopwords())
        rows = []
        for text_command in data.iloc[:, 0]:
            words = [word for word in text_command.lower().split() if word not in stopwords]
            scores = []
            for action in action_map.values():
                # An action absent from the training data has prior 0.
                probability = self.action_probabilities.get(action,0)
                likelihoods = self.action_dict[action]
                for word in words:
                    probability = probability * likelihoods.get(word,0)
                scores.append(probability)
            probabilities = self.softmax(scores)
            best = probabilities.index(max(probabilities))
            rows.append([text_command,action_map[best],probabilities])
        # Build the frame once instead of growing it one row at a time.
        return pd.DataFrame(rows,columns=['Text Command', 'Predicted Action','Predicted Probabilities'],index=data.index)

    def test(self):
        """Score the testing frame and return the summed per-class loss.

        Also stores the fraction of exactly matching predictions in
        ``self.accuracy`` (0.0 when the testing frame is empty).
        """
        preds = self.predict(self.testing_data)
        actions = self.testing_data['Action']
        # Compare whole labels; comparing only the first character would treat
        # 'In Progress' and 'In Review' as equal.
        matches = actions.to_numpy() == preds['Predicted Action'].to_numpy()
        correct = int(matches.sum())
        self.accuracy = correct/len(actions) if len(actions) else 0.0
        losses = self.cross_entropy_loss(pd.concat([actions,preds['Predicted Probabilities']],axis=1))

        losses_total = 0
        for loss_idx in losses:
            losses_total += losses[loss_idx]
        print('Total Loss: ', losses_total, end='\r')
        return losses_total

    def softmax(self,labels):
        """Return the softmax of a list of numbers (an empty list yields ``[]``)."""
        if not labels:
            return []
        # Subtracting the maximum leaves the result unchanged mathematically
        # and keeps math.exp from overflowing on large inputs.
        peak = max(labels)
        exponentials = [math.exp(label - peak) for label in labels]
        denominator = sum(exponentials)
        return [value/denominator for value in exponentials]

    def cross_entropy_loss(self,labels):
        """Return the mean cross-entropy loss per class index.

        ``labels`` holds the true action name in column 0 and the list of
        predicted class probabilities in column 1.  A class with no rows gets
        a loss of 0.0 instead of raising ZeroDivisionError.
        """
        index_action = self.index_action()
        losses = {idx: [] for idx in self.action_map()}
        for action, probabilities in zip(labels.iloc[:, 0], labels.iloc[:, 1]):
            idx = index_action[action]
            probability = max(probabilities[idx], self._MIN_PROBABILITY)
            losses[idx].append(-math.log(probability))
        for label in losses:
            values = losses[label]
            losses[label] = sum(values)/len(values) if values else 0.0
        return losses

In [4]:
# Drop the 'assign' and 'move' rows, then keep the first 102 remaining rows
# (re-indexed from 0) as a small scratch dataset.
augmented_data3 = augmented_data[~augmented_data['Verb/Noun'].isin(['assign', 'move'])]
temp_data = augmented_data3.iloc[:102].reset_index(drop=True)
temp_data

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,X completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed
1,Completed X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed
2,X in progress,Task,X,In Progress,"[1,0,0]","[0,1,0,0,0]",progress
3,Create new task called X,Task,X,To Do,"[1,0,0]","[1,0,0,0,0]",create
4,Create task X,Task,X,To Do,"[1,0,0]","[1,0,0,0,0]",create
...,...,...,...,...,...,...,...
97,X in build,Task,X,In Progress,"[1,0,0]","[0,1,0,0,0]",progress
98,X in advance,Task,X,In Progress,"[1,0,0]","[0,1,0,0,0]",progress
99,X in progression,Task,X,In Progress,"[1,0,0]","[0,1,0,0,0]",progress
100,X in advancement,Task,X,In Progress,"[1,0,0]","[0,1,0,0,0]",progress


In [5]:
def training_testing_split(all_data,training_split):
    """Split ``all_data`` into disjoint ``(training, testing)`` frames.

    The first ``int(len(all_data) * training_split)`` rows form the training
    frame (all columns); the remaining rows form the testing frame, reduced to
    the 'Text Command' and 'Action' columns.
    """
    breakoff = int(len(all_data)*training_split)
    # Positional slicing is half-open.  Label slicing with .loc includes both
    # end points and would place row `breakoff` in both frames.
    training = all_data.iloc[:breakoff]
    testing = all_data[['Text Command','Action']].iloc[breakoff:]
    return (training,testing)
# Split the scratch frame with an 0.8 training fraction and display the training part.
training_data, testing_data = training_testing_split(temp_data, training_split=0.8)
training_data

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,X completed,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed
1,Completed X,Task,X,Completed,"[1,0,0]","[0,0,0,0,1]",completed
2,X in progress,Task,X,In Progress,"[1,0,0]","[0,1,0,0,0]",progress
3,Create new task called X,Task,X,To Do,"[1,0,0]","[1,0,0,0,0]",create
4,Create task X,Task,X,To Do,"[1,0,0]","[1,0,0,0,0]",create
...,...,...,...,...,...,...,...
77,Problem with task X,Task,X,Blocked,"[1,0,0]","[0,0,0,1,0]",problem
78,Problem with X task,Task,X,Blocked,"[1,0,0]","[0,0,0,1,0]",problem
79,There is a problem with task X,Task,X,Blocked,"[1,0,0]","[0,0,0,1,0]",problem
80,There is a problem with X task,Task,X,Blocked,"[1,0,0]","[0,0,0,1,0]",problem


In [None]:
# NOTE(review): no visible cell defines `training`; presumably `training_data`
# was meant — confirm before running this cell.
training

In [117]:
# Display the held-out rows returned by training_testing_split; that frame
# keeps only the 'Text Command' and 'Action' columns.
testing_data

Unnamed: 0,Text Command,Action
81,X in review,In Review
82,Task X in review,In Review
83,X task in review,In Review
84,X complete,Completed
85,X accomplished,Completed
86,X realized,Completed
87,X consummated,Completed
88,complete X,Completed
89,accomplished X,Completed
90,realized X,Completed


In [111]:
# Scratch setup for stepping through the probability computation by hand,
# using a shortened stop-word list.
# NOTE(review): the row count is taken from temp_data rather than
# training_data — confirm this is intended.
action_map = {0: 'To Do',1: 'In Progress',2: 'In Review',3: 'Blocked',4: 'Completed'}
stopwords = ["i", "me", "my", "myself", "we", "our"]
num_training_rows = len(temp_data)
word_counter = 0
action_probabilities, word_dict, action_dict = {}, {}, {}
# One empty word-count table per action name.
for action in action_map.values():
    action_dict[action] = {}
action_dict

{'To Do': {},
 'In Progress': {},
 'In Review': {},
 'Blocked': {},
 'Completed': {}}

In [None]:
# Scratch lookup: second field of the row labelled `i`.
# NOTE(review): no visible cell defines `text_command_training_data` at module
# level (it is only a method parameter) — confirm before running this cell.
text_command_training_data.loc[i][1]

In [None]:
# Module-level walk-through of the counting logic from
# NaiveBayes.compute_probabilities, using a shortened stop-word list.
# The pasted method body could not run here: it used `self.`, a frame that no
# cell defines, and a top-level `return`.  It now reads the `training_data`
# frame produced by the split cell above.
text_command_training_data = training_data[['Text Command','Action']]
num_training_rows = len(text_command_training_data)
action_map = {0: 'To Do',1: 'In Progress',2: 'In Review',3: 'Blocked',4: 'Completed'}
action_probabilities = dict()
action_dict = dict()
word_dict = dict()
word_counter = 0
stopwords = ["i", "me", "my", "myself", "we", "our"]
for action in action_map.values():
    action_dict[action] = dict()
# Iterate the two columns in lockstep so the row index labels are irrelevant.
for text_command, action in zip(text_command_training_data['Text Command'],
                                text_command_training_data['Action']):
    action_probabilities[action] = action_probabilities.get(action,0) + 1
    for word in text_command.lower().split():
        if word not in stopwords:
            word_counter += 1
            word_dict[word] = word_dict.get(word,0) + 1
            action_dict[action][word] = action_dict[action].get(word,0) + 1
for action in action_dict:
    # Normalise by the total token count (not the number of distinct words)
    # so the values for one action sum to 1.
    num_words = sum(action_dict[action].values())
    for word in action_dict[action]:
        action_dict[action][word] = action_dict[action][word]/num_words
for action in action_probabilities:
    action_probabilities[action] = action_probabilities[action]/num_training_rows

for word in word_dict:
    word_dict[word] = word_dict[word]/word_counter

# Display the three tables (a bare expression replaces the invalid `return`).
(action_probabilities,action_dict,word_dict)