In [1]:
import math
import pandas as pd

In [2]:
augmented_data = pd.read_csv('./data/Augmented_Data2.csv')

In [3]:
augmented_data

Unnamed: 0,Text Command,Topic,Identifier,Action,One Hot Encoded Topic,One Hot Encoded Action,Verb/Noun
0,finish plan Developing editorial calendar for ...,Project,Developing editorial calendar for content sharing,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
1,move plan Developing editorial calendar for co...,Project,Developing editorial calendar for content sharing,At Risk,"[0,1]","[0,0,1,0,0]",SYNREPLACED
2,fixed Developing editorial calendar for conten...,Task,Developing editorial calendar for content sharing,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
3,audit task Developing editorial calendar for c...,Task,Developing editorial calendar for content sharing,In Review,"[1,0]","[0,0,1,0,0]",SYNREPLACED
4,offset on Developing editorial calendar for co...,Task,Developing editorial calendar for content sharing,In Progress,"[1,0]","[0,1,0,0,0]",SYNREPLACED
...,...,...,...,...,...,...,...
6974,end throw Reviewing website backlinks,Project,Reviewing website backlinks,Completed,"[0,1]","[0,0,0,0,1]",SYNREPLACED
6975,to do Reviewing website backlinks plan,Project,Reviewing website backlinks,Create,"[0,1]","[1,0,0,0,0]",SYNREPLACED
6976,Reviewing website backlinks project,Task,Reviewing website backlinks,To Do,"[1,0]","[1,0,0,0,0]",SYNREPLACED
6977,Reviewing website backlinks chore was complete,Task,Reviewing website backlinks,Completed,"[1,0]","[0,0,0,0,1]",SYNREPLACED


In [4]:
class NaiveBayes(object):
    #initialize model
    def __init__(self,data,training_split,use_case,laplace):
        self.use_case = use_case
        if self.use_case == 'topic':
            self.columns = ['Text Command', 'Topic']
        else:
            self.columns = ['Text Command','Action']
            if self.use_case == 'task action':
                data = data[data['Topic'] == 'Task']
            else:
                data = data[data['Topic'] == 'Project']
        self.num_rows = len(data)
        #shuffle data and reset indexes
        self.shuffled_data = data.sample(frac=1).reset_index(drop=True)
        #split the data into training and testing sets using input proportion 
        (self.training_data,self.testing_data) = self.training_testing_split(self.shuffled_data,training_split)
        self.num_training_rows,self.num_testing_rows = len(self.training_data),len(self.testing_data)
        self.smoothing_constant = laplace
        if self.use_case == 'topic':
            index_action = {'Task':0, 'Project':1}
        if self.use_case == 'task action':
            index_action = {'To Do': 0, 'In Progress':1, 'In Review':2, 'Blocked':3, 'Completed':4}
        if self.use_case == 'project action':
            index_action = {'Create':0, 'On Target':1, 'At Risk':2, 'Danger':3, 'Completed':4}
        self.index_action = index_action
        self.label_map = dict()
        for key in self.index_action:
            self.label_map[self.index_action[key]] = key
            
        
    def training_testing_split(self,all_data,training_split):
        
        breakoff = int(self.num_rows*training_split)
        return (all_data.loc[:breakoff],all_data[self.columns].loc[breakoff:])
    
    #words to ignore when calculating probabilities
    def stopwords(self):
        return ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

    def compute_probabilities(self,text_command_training_data):
        action_map = self.label_map
        action_probabilities = dict()
        action_dict = dict()
        word_dict = dict()
        word_counter = 0
        stopwords = self.stopwords()
        for action in list(action_map.values()):
            action_dict[action] = dict()
        for i in list(text_command_training_data.index.values):
            action = text_command_training_data.loc[i][1]
            action_probabilities[action] = action_probabilities.get(action,0) + 1
            text_command = text_command_training_data.loc[i][0]
            for word in text_command.lower().split():
                if word not in stopwords:
                    word_counter += 1
                    word_dict[word] = word_dict.get(word,0) + 1
                    action_dict[action][word] = action_dict[action].get(word,0) + 1
        num_unique_words = len(word_dict)
        for action in action_dict:
            num_words = sum(list(action_dict[action].values()))
            for word in action_dict[action]:
                action_dict[action][word] = (action_dict[action][word]+self.smoothing_constant)/(num_words+self.smoothing_constant*num_unique_words)
        for action in action_probabilities:
            action_probabilities[action] = action_probabilities[action]/self.num_training_rows
        
        for word in word_dict:
            word_dict[word] = (word_dict[word]+self.smoothing_constant)/(word_counter+self.smoothing_constant*num_unique_words)
        self.word_counter = word_counter
        return (action_probabilities,action_dict,word_dict)
    
    def train(self):
        text_command_and_action = self.training_data[self.columns]
        (self.action_probabilities,self.action_dict,self.word_dict) = self.compute_probabilities(text_command_and_action)
    
    def predict(self,data):
        num_unique_words = len(self.word_dict)
        action_map = self.label_map
        predictions = pd.DataFrame(columns=[self.columns[0], 'Predicted Action','Predicted Probabilities'])
        stopwords = self.stopwords()
        for i in list(data.index.values):
            words = data.loc[i][0].lower().split()
            action_probabilities = []
            denominator = 0
            for action in list(action_map.values()):
                num_words = sum(list(self.action_dict[action].values()))
                probability = 1
                for word in words:
                    if word not in stopwords:
                        probability = probability * self.action_dict[action].get(word,(self.smoothing_constant)/(num_words+self.smoothing_constant*num_unique_words))
                denominator += probability*self.action_probabilities[action]
            for action in list(action_map.values()):
                num_words = sum(list(self.action_dict[action].values()))
                action_probability = self.action_probabilities[action]
                probability = 1
                for word in words:
                    if word not in stopwords:
                        probability = probability * self.action_dict[action].get(word,(self.smoothing_constant)/(num_words+self.smoothing_constant*num_unique_words))
                        
                probability = (probability * action_probability)/denominator
    
                
                action_probabilities.append(probability)
            predictions.loc[i]= [data.loc[i][0],action_map[action_probabilities.index(max(action_probabilities))],action_probabilities]
        return predictions
    def test(self):
        action_map = self.label_map
        correct = 0
        preds = self.predict(self.testing_data)
        actions = self.testing_data[self.columns[1]]
        for i in list(self.testing_data.index.values):
            if actions.loc[i][0] == preds['Predicted Action'].loc[i][0]:
                correct += 1
        losses = self.cross_entropy_loss(pd.concat([actions,preds['Predicted Probabilities']],axis=1))
        accuracy = correct/self.num_testing_rows
        return accuracy, losses, preds
        
    def softmax(self,labels):
        softmax_labels = []
        denominator = sum([math.exp(label) for label in labels])
        for label in labels:
            softmax_labels.append(math.exp(label)/denominator)
        return softmax_labels
    
    def cross_entropy_loss(self,labels):
        index_action = self.index_action
        losses = dict()
        for key in self.label_map:
            losses[key] = []
        for i in list(labels.index.values):
            idx = index_action[labels.loc[i][0]]
            loss = - math.log(labels.loc[i][1][idx])
            losses[idx].append(loss)
        for label in losses:
            losses[label] = sum(losses[label])/len(losses[label])
        return losses
            
        
        
    

In [5]:
topic_accuracies = []
cross_entropy_losses = [0,0]
for i in range(100):
    classifier = NaiveBayes(augmented_data,0.8,'topic',1)
    classifier.train()
    accuracy, losses, preds = classifier.test()
    losses = list(losses.values())
    topic_accuracies.append(accuracy)
    cross_entropy_losses = [sum(x) for x in zip(cross_entropy_losses,losses)]
    if i%10 == 0:
        print(i)
print(sum(topic_accuracies)/len(topic_accuracies))
print([x/len(topic_accuracies) for x in cross_entropy_losses])

0
10
20
30
40
50
60
70
80
90
0.8973209169054442
[0.25792071298506464, 0.26536989597043403]


In [6]:
topic_accuracies = []
cross_entropy_losses = [0,0,0,0,0]
for i in range(100):
    classifier = NaiveBayes(augmented_data,0.8,'task action',1)
    classifier.train()
    accuracy, losses, preds = classifier.test()
    losses = list(losses.values())
    topic_accuracies.append(accuracy)
    cross_entropy_losses = [sum(x) for x in zip(cross_entropy_losses,losses)]
    if i%10 == 0:
        print(i)
print(sum(topic_accuracies)/len(topic_accuracies))
print([x/len(topic_accuracies) for x in cross_entropy_losses])

0
10
20
30
40
50
60
70
80
90
0.5969129287598949
[2.065536718281523, 1.2081753466600305, 1.1558338185991048, 1.5290459803963214, 1.2461902320459644]


In [7]:
topic_accuracies = []
cross_entropy_losses = [0,0,0,0,0]
for i in range(100):
    classifier = NaiveBayes(augmented_data,0.8,'project action',1)
    classifier.train()
    accuracy, losses, preds = classifier.test()
    losses = list(losses.values())
    topic_accuracies.append(accuracy)
    cross_entropy_losses = [sum(x) for x in zip(cross_entropy_losses,losses)]
    if i%10 == 0:
        print(i)
print(sum(topic_accuracies)/len(topic_accuracies))
print([x/len(topic_accuracies) for x in cross_entropy_losses])

0
10
20
30
40
50
60
70
80
90
0.6427586206896551
[1.8113355350334601, 0.9430479019411666, 1.067495212015915, 1.6394451040277886, 1.035731916757897]
