In [2]:
#import libraries
import math
import pandas as pd

In [3]:
#load dataset 
augmented_data = pd.read_csv('./data/Augmented_Data2.csv')

In [4]:
augmented_data

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,finish plan Developing editorial calendar for ...,Project,Developing editorial calendar for content sharing,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
1,move plan Developing editorial calendar for co...,Project,Developing editorial calendar for content sharing,At Risk,"[0,1]","[0,0,1,0,0]",SYNREPLACED
2,fixed Developing editorial calendar for conten...,Task,Developing editorial calendar for content sharing,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
3,audit task Developing editorial calendar for c...,Task,Developing editorial calendar for content sharing,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
4,offset on Developing editorial calendar for co...,Task,Developing editorial calendar for content sharing,In Progress,"[1,0]","[0,1,0,0,0]",SYNREPLACED
...,...,...,...,...,...,...,...
6974,end throw Reviewing website backlinks,Project,Reviewing website backlinks,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
6975,to do Reviewing website backlinks plan,Project,Reviewing website backlinks,Create,"[0,1]","[1,0,0,0,0]",SYNREPLACED
6976,Reviewing website backlinks project,Task,Reviewing website backlinks,To Do,"[1,0]","[1,0,0,0,0]",SYNREPLACED
6977,Reviewing website backlinks chore was complete,Task,Reviewing website backlinks,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED


In [8]:
#Naive Bayes classifier
class NaiveBayes(object):
    #initialize model
    def __init__(self,data,training_split,use_case,laplace):
        '''
        use_case: what is being predicted by the Naive Bayes classifier
        data: dataset that is being trained on
        training_split: percent of data that should be trained on expressed as a decimal
        laplace: smoothing constant for laplace smoothing
        '''
        self.use_case = use_case
        #depending on the use case of the model, select the appropriate data that will be trained on
        if self.use_case == 'topic':
            self.columns = ['Text Command', 'Topic']
        else:
            self.columns = ['Text Command','Action']
            #depending on whether the model is predicting task or project actions, select the appropriate data
            if self.use_case == 'task action':
                data = data[data['Topic'] == 'Task']
            else:
                data = data[data['Topic'] == 'Project']
        self.num_rows = len(data)
        #shuffle data and reset indexes
        self.shuffled_data = data.sample(frac=1).reset_index(drop=True)
        #split the data into training and testing sets using inputted training split parameter 
        (self.training_data,self.testing_data) = self.training_testing_split(self.shuffled_data,training_split)
        self.num_training_rows,self.num_testing_rows = len(self.training_data),len(self.testing_data)
        self.smoothing_constant = laplace
        #depending on the use case of the model, create a dictionary object that maps labels to indexes
        if self.use_case == 'topic':
            index_action = {'Task':0, 'Project':1}
        if self.use_case == 'task action':
            index_action = {'To Do': 0, 'In Progress':1, 'In Review':2, 'Blocked':3, 'Completed':4}
        if self.use_case == 'project action':
            index_action = {'Create':0, 'On Target':1, 'At Risk':2, 'Danger':3, 'Completed':4}
        self.index_action = index_action
        #create another dictionary object that reverses the index_action dictionary and maps indexes to labels
        self.label_map = dict()
        for key in self.index_action:
            self.label_map[self.index_action[key]] = key
            
    #function for returning training and testing data based on inputted data and percentage split 
    def training_testing_split(self,all_data,training_split):
        breakoff = int(self.num_rows*training_split)
        return (all_data.loc[:breakoff],all_data[self.columns].loc[breakoff:])
    
    #words to ignore when calculating probabilities
    def stopwords(self):
        return ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
    
    #function for calculating probabilities that will be used for making predictions on text commands
    def compute_probabilities(self,text_command_training_data):
        action_map = self.label_map
        #dictionary for updating probabilities of a certain label occuring
        action_probabilities = dict()
        #nested dictionary for storing the probabilities of words occuring in text commands with a specific label
        action_dict = dict()
        #dictionary for stroing probability of a word occuring 
        word_dict = dict()
        word_counter = 0
        stopwords = self.stopwords()
        #iterate over possible labels for given use case
        for action in list(action_map.values()):
            #create a dictionary for every possible label that contains probabilities of words 
            action_dict[action] = dict()
        #iterate over index values in training dataset
        for i in list(text_command_training_data.index.values):
            #pick out true label from training dataset 
            action = text_command_training_data.loc[i][1]
            #update the count of a label occuring in the dataset
            action_probabilities[action] = action_probabilities.get(action,0) + 1
            #pick out the text command from the training dataset
            text_command = text_command_training_data.loc[i][0]
            #iterate over the words in the text command
            for word in text_command.lower().split():
                #if the words is not a stopword
                if word not in stopwords:
                    #update the total number of words occuring in the training dataset
                    word_counter += 1
                    #update the count of a specific word 
                    word_dict[word] = word_dict.get(word,0) + 1
                    #update the count of a specific word occuring in a text command with a specific label
                    action_dict[action][word] = action_dict[action].get(word,0) + 1
        #number of unique words that have been seen in the training dataset
        num_unique_words = len(word_dict)
        #iterate over possible labels 
        for action in action_dict:
            #total number of words that occur in text commands with a specific label 
            num_words = sum(list(action_dict[action].values()))
            #iterate over the word counts for specific words in text command with a specific label
            for word in action_dict[action]:
                '''
                probability for a word occuring in a text command with a specific label is given by:
                (number of times word occurs in text commands with a specific label + smoothing constant) / (total number of words in text commands with a specific label + smoothing constant * number of unique words in text commands across entire dataset)
                '''
                action_dict[action][word] = (action_dict[action][word]+self.smoothing_constant)/(num_words+self.smoothing_constant*num_unique_words)
        #iterate over possible labels
        for action in action_probabilities:
            #convert the count of specific label into probability by dividing count by number of rows in training dataset
            action_probabilities[action] = action_probabilities[action]/self.num_training_rows
        #iterate over words that occur in the training dataset
        for word in word_dict:
            #convert the count of a specific word into probability using same formula
            word_dict[word] = (word_dict[word]+self.smoothing_constant)/(word_counter+self.smoothing_constant*num_unique_words)
        self.word_counter = word_counter
        return (action_probabilities,action_dict,word_dict)
    
    def train(self):
        #select the appropriate data to calculate probabilities 
        text_command_and_action = self.training_data[self.columns]
        #compute and return probabilties needed to make label predictions
        (self.action_probabilities,self.action_dict,self.word_dict) = self.compute_probabilities(text_command_and_action)
    
    
    #function for making label prediction on inputted dataset with text commands and labels
    '''
    predict function below uses Bayes Theorem: P(A|B) = P(A and B)/P(B) = (P(B|A) * P(A))/P(B)
    using this theorem for the purposes of making label predictions on text commands: 
    P(label|text command) = (P(text command|label)*P(label))/P(text command)
    where P(text command|label) = P(1st word in text command|label)*P(2nd word in text command|label)*...
    and P(text command) = P(text command|label1)*P(label1) + P(text command|label2)*P(label2) + ...
    
    '''
    def predict(self,data):
        num_unique_words = len(self.word_dict)
        action_map = self.label_map
        predictions = pd.DataFrame(columns=[self.columns[0], 'Predicted Action','Predicted Probabilities'])
        stopwords = self.stopwords()
        #iterate over inputted dataset
        for i in list(data.index.values):
            #pick out the words in the text command
            words = data.loc[i][0].lower().split()
            #list that will have probabilities assigned to each label
            action_probabilities = []
            #placeholder for denominator in Bayes Theorem
            denominator = 0
            #iterate over the values in the dictionary object mapping index to label
            for action in list(action_map.values()):
                #total number of words in text commands with specific label
                num_words = sum(list(self.action_dict[action].values()))
                #placeholder for term in denominator in Bayes Theorem
                probability = 1
                #iterate over words in text command
                for word in words:
                    if word not in stopwords:
                        #updates numerator probability by multiplying numerator by P(word|label) which is calculated previously
                        probability = probability * self.action_dict[action].get(word,(self.smoothing_constant)/(num_words+self.smoothing_constant*num_unique_words))
                #update denominator probability by adding P(text command|label)*P(label)
                denominator += probability*self.action_probabilities[action]
            #iterate over values in dictionary object mapping index to label
            for action in list(action_map.values()):
                #total number of words in text commands with specific label
                num_words = sum(list(self.action_dict[action].values()))
                #probability of specific label occuring
                action_probability = self.action_probabilities[action]
                #placeholder for numerator in Bayes Theorem
                probability = 1
                #iterate over words in text command
                for word in words:
                    if word not in stopwords:
                        #update numerator probability by multiplying numerator by P(word|label)
                        probability = probability * self.action_dict[action].get(word,(self.smoothing_constant)/(num_words+self.smoothing_constant*num_unique_words))
                #probability of label given text command    
                probability = (probability * action_probability)/denominator
    
                #add probability of text command being specific label to list
                action_probabilities.append(probability)
            #add prediction to prediction dataset by finding the index of the maximum probability and then use that index in the index to label dictionary to find 
            #predicted label
            predictions.loc[i]= [data.loc[i][0],action_map[action_probabilities.index(max(action_probabilities))],action_probabilities]
        return predictions
    
    #test trained probabilites on testing dataset
    def test(self):
        action_map = self.label_map
        count = dict()
        #track how many predictions are correct
        correct = 0
        #dataset with predictions from testing data
        preds = self.predict(self.testing_data)
        #ground truth labels in the testing dataset
        actions = self.testing_data[self.columns[1]]
        #iterate over rows in testing dataset
        for i in list(self.testing_data.index.values):
            #check if predicted label equals ground truth label
            if actions.loc[i][0] == preds['Predicted Action'].loc[i][0]:
                correct += 1
        #compute the average NLL loss for each label
        losses = self.cross_entropy_loss(pd.concat([actions,preds['Predicted Probabilities']],axis=1))
        #calculate accuracy of testing predictions
        accuracy = correct/self.num_testing_rows
        return accuracy, losses, preds
    
    #function for computing average negative log likelihood loss for each label
    def cross_entropy_loss(self,labels):
        index_action = self.index_action
        losses = dict()
        for key in self.label_map:
            losses[key] = []
        for i in list(labels.index.values):
            idx = index_action[labels.loc[i][0]]
            loss = - math.log(labels.loc[i][1][idx])
            losses[idx].append(loss)
        for label in losses:
            losses[label] = sum(losses[label])/len(losses[label])
        return losses
            
        
        
    

In [9]:
topic_accuracies = []
cross_entropy_losses = [0,0]
for i in range(100):
    classifier = NaiveBayes(augmented_data,0.8,'topic',1)
    classifier.train()
    accuracy, losses, preds = classifier.test()
    losses = list(losses.values())
    counts = [0,0]
    
    topic_accuracies.append(accuracy)
    cross_entropy_losses = [sum(x) for x in zip(cross_entropy_losses,losses)]
    if i%10 == 0:
        print(i)
print('Accuracy: ' + str(sum(topic_accuracies)/len(topic_accuracies)))
print('Cross Entropy Losses: ' + str([x/len(topic_accuracies) for x in cross_entropy_losses]))

0
10
20
30
40
50
60
70
80
90
Accuracy: 0.8982020057306589
Cross Entropy Losses: [0.25483009624163233, 0.26600454722051753]


In [6]:
topic_accuracies = []
cross_entropy_losses = [0,0,0,0,0]
for i in range(100):
    classifier = NaiveBayes(augmented_data,0.8,'task action',1)
    classifier.train()
    accuracy, losses, preds = classifier.test()
    losses = list(losses.values())
    topic_accuracies.append(accuracy)
    cross_entropy_losses = [sum(x) for x in zip(cross_entropy_losses,losses)]
    if i%10 == 0:
        print(i)
print('Accuracy: ' + str(sum(topic_accuracies)/len(topic_accuracies)))
print('Cross Entropy Losses: ' + str([x/len(topic_accuracies) for x in cross_entropy_losses]))

0
10
20
30
40
50
60
70
80
90
0.5969129287598949
[2.065536718281523, 1.2081753466600305, 1.1558338185991048, 1.5290459803963214, 1.2461902320459644]


In [7]:
topic_accuracies = []
cross_entropy_losses = [0,0,0,0,0]
for i in range(100):
    classifier = NaiveBayes(augmented_data,0.8,'project action',1)
    classifier.train()
    accuracy, losses, preds = classifier.test()
    losses = list(losses.values())
    topic_accuracies.append(accuracy)
    cross_entropy_losses = [sum(x) for x in zip(cross_entropy_losses,losses)]
    if i%10 == 0:
        print(i)
print('Accuracy: ' + str(sum(topic_accuracies)/len(topic_accuracies)))
print('Cross Entropy Losses: ' + str([x/len(topic_accuracies) for x in cross_entropy_losses]))

0
10
20
30
40
50
60
70
80
90
0.6427586206896551
[1.8113355350334601, 0.9430479019411666, 1.067495212015915, 1.6394451040277886, 1.035731916757897]
