In [3]:
import math
import pandas as pd

In [5]:
# Load the dataset from the CSV file into a pandas DataFrame.
augmented_data = pd.read_csv('./data/augmented_data.csv')

In [35]:
class NaiveBayes(object):
    """Bag-of-words naive Bayes classifier mapping a text command to an action.

    The input frame must provide the columns 'Text Command' and 'Action'.
    Every action label has to be one of the values returned by
    ``action_map``; any other label raises ``KeyError`` during training.
    """

    def __init__(self, data, training_split, laplace):
        """Shuffle ``data`` and split it into training and testing sets.

        Args:
            data: DataFrame with 'Text Command' and 'Action' columns.
            training_split: Fraction of the rows used for training.
            laplace: Additive smoothing constant. It should be positive,
                otherwise words unseen during training get probability 0
                and prediction fails with ``ZeroDivisionError``.
        """
        self.num_rows = len(data)
        # Shuffle the rows and renumber them 0..n-1.
        self.shuffled_data = data.sample(frac=1).reset_index(drop=True)
        self.training_data, self.testing_data = self.training_testing_split(
            self.shuffled_data, training_split)
        self.num_training_rows = len(self.training_data)
        self.num_testing_rows = len(self.testing_data)
        self.smoothing_constant = laplace

    def training_testing_split(self, all_data, training_split):
        """Return ``(training, testing)`` frames that share no row.

        The first ``int(len(all_data) * training_split)`` rows form the
        training set; the remaining rows, restricted to the
        'Text Command' and 'Action' columns, form the testing set.
        """
        breakoff = int(len(all_data) * training_split)
        # Positional slicing is end-exclusive. Label slicing with .loc
        # includes its end label, which would put row `breakoff` in both
        # sets.
        training = all_data.iloc[:breakoff]
        testing = all_data[['Text Command', 'Action']].iloc[breakoff:]
        return (training, testing)

    def action_map(self):
        """Return the mapping from class index to action label."""
        return {
            0: 'To Do',
            1: 'In Progress',
            2: 'In Review',
            3: 'Blocked',
            4: 'Completed',
        }

    def index_action(self):
        """Return the mapping from action label to class index."""
        return {
            'To Do': 0,
            'In Progress': 1,
            'In Review': 2,
            'Blocked': 3,
            'Completed': 4,
        }

    def stopwords(self):
        """Return the words that are ignored when computing probabilities."""
        return [
            "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves", "he", "him",
            "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves",
            "what", "which", "who", "whom", "this", "that", "these",
            "those", "am", "is", "are", "was", "were", "be", "been",
            "being", "have", "has", "had", "having", "do", "does", "did",
            "doing", "a", "an", "the", "and", "but", "if", "or", "because",
            "as", "until", "while", "of", "at", "by", "for", "with",
            "about", "against", "between", "into", "through", "during",
            "before", "after", "above", "below", "to", "from", "up",
            "down", "in", "out", "on", "off", "over", "under", "again",
            "further", "then", "once", "here", "there", "when", "where",
            "why", "how", "all", "any", "both", "each", "few", "more",
            "most", "other", "some", "such", "no", "nor", "not", "only",
            "own", "same", "so", "than", "too", "very", "s", "t", "can",
            "will", "just", "don", "should", "now",
        ]

    def _unseen_probability(self, total_words, vocabulary_size):
        """Return the smoothed probability of a word with a zero count."""
        denominator = total_words + self.smoothing_constant * vocabulary_size
        return self.smoothing_constant / denominator if denominator else 0.0

    def compute_probabilities(self, text_command_training_data):
        """Estimate the probability tables from the training rows.

        Args:
            text_command_training_data: Frame whose first column holds the
                text command and whose second column holds the action.

        Returns:
            A tuple ``(action_probabilities, action_dict, word_dict)``:
            the share of rows per action, the smoothed per-action word
            probabilities, and the smoothed overall word probabilities.

        Also stores ``self.word_counter`` (number of counted words) and
        ``self.action_word_counts`` (number of counted words per action),
        which ``predict`` needs to smooth words it has never seen.
        """
        stopwords = set(self.stopwords())
        smoothing = self.smoothing_constant
        action_counts = {}
        action_dict = {action: {} for action in self.action_map().values()}
        word_dict = {}
        word_counter = 0

        commands = text_command_training_data.iloc[:, 0]
        actions = text_command_training_data.iloc[:, 1]
        for text_command, action in zip(commands, actions):
            action_counts[action] = action_counts.get(action, 0) + 1
            word_counts = action_dict[action]
            for word in text_command.lower().split():
                if word in stopwords:
                    continue
                word_counter += 1
                word_dict[word] = word_dict.get(word, 0) + 1
                word_counts[word] = word_counts.get(word, 0) + 1

        num_unique_words = len(word_dict)

        # Keep the raw per-action totals before the counts are replaced
        # by probabilities.
        action_word_counts = {}
        for action, word_counts in action_dict.items():
            num_words = sum(word_counts.values())
            action_word_counts[action] = num_words
            denominator = num_words + smoothing * num_unique_words
            for word in word_counts:
                word_counts[word] = (word_counts[word] + smoothing) / denominator

        num_rows = len(text_command_training_data)
        action_probabilities = {
            action: count / num_rows for action, count in action_counts.items()
        }

        word_denominator = word_counter + smoothing * num_unique_words
        for word in word_dict:
            word_dict[word] = (word_dict[word] + smoothing) / word_denominator

        self.word_counter = word_counter
        self.action_word_counts = action_word_counts
        return (action_probabilities, action_dict, word_dict)

    def train(self):
        """Fit the probability tables on the training set."""
        text_command_and_action = self.training_data[['Text Command', 'Action']]
        (self.action_probabilities,
         self.action_dict,
         self.word_dict) = self.compute_probabilities(text_command_and_action)

    def predict(self, data):
        """Predict an action for every row of ``data``.

        Args:
            data: Frame whose first column holds the text commands.

        Returns:
            DataFrame indexed like ``data`` with the columns
            'Text Command', 'Predicted Action' and
            'Predicted Probabilities' (softmax of the per-action scores,
            ordered by class index).
        """
        actions = list(self.action_map().values())
        stopwords = set(self.stopwords())
        num_unique_words = len(self.word_dict)

        # Fallback probabilities for words missing from a table. The
        # per-action fallback uses the raw word totals saved at training
        # time, because action_dict now holds probabilities, not counts.
        unseen_word = self._unseen_probability(
            self.word_counter, num_unique_words)
        unseen_in_action = {
            action: self._unseen_probability(
                self.action_word_counts.get(action, 0), num_unique_words)
            for action in actions
        }

        texts = []
        predicted_actions = []
        predicted_probabilities = []
        for text in data.iloc[:, 0]:
            words = [w for w in text.lower().split() if w not in stopwords]
            scores = []
            for action in actions:
                word_probabilities = self.action_dict[action]
                # An action absent from the training rows has prior 0.
                score = self.action_probabilities.get(action, 0.0)
                for word in words:
                    # Multiply ratio by ratio so numerator and denominator
                    # cannot underflow separately into 0/0.
                    score *= (
                        word_probabilities.get(word, unseen_in_action[action])
                        / self.word_dict.get(word, unseen_word)
                    )
                scores.append(score)
            probabilities = self.softmax(scores)
            best = probabilities.index(max(probabilities))
            texts.append(text)
            predicted_actions.append(actions[best])
            predicted_probabilities.append(probabilities)

        # Build the frame once instead of enlarging it row by row.
        return pd.DataFrame(
            {
                'Text Command': texts,
                'Predicted Action': predicted_actions,
                'Predicted Probabilities': predicted_probabilities,
            },
            index=data.index,
            columns=['Text Command', 'Predicted Action',
                     'Predicted Probabilities'],
        )

    def test(self):
        """Evaluate on the testing set and print the accuracy.

        The results are also kept on the instance: ``self.accuracy`` holds
        the printed value and ``self.losses`` the per-class mean
        cross-entropy returned by ``cross_entropy_loss``.
        """
        preds = self.predict(self.testing_data)
        actions = self.testing_data['Action']
        # Compare whole labels; comparing only the first character would
        # treat 'In Progress' and 'In Review' as equal.
        correct = sum(
            1 for actual, predicted in zip(actions, preds['Predicted Action'])
            if actual == predicted
        )
        total = len(self.testing_data)
        self.accuracy = correct / total if total else float('nan')
        self.losses = self.cross_entropy_loss(
            pd.concat([actions, preds['Predicted Probabilities']], axis=1))

        print('Accuracy: ' + str(self.accuracy))

    def softmax(self, labels):
        """Return the softmax of ``labels`` as a list (empty in, empty out)."""
        if not labels:
            return []
        # Subtracting the maximum leaves the result unchanged but keeps
        # math.exp from overflowing on large scores.
        peak = max(labels)
        exponentials = [math.exp(label - peak) for label in labels]
        denominator = sum(exponentials)
        return [value / denominator for value in exponentials]

    def cross_entropy_loss(self, labels):
        """Return the mean cross-entropy loss per class index.

        Args:
            labels: Frame whose first column holds the true action label
                and whose second column holds the list of predicted
                probabilities ordered by class index.

        Returns:
            Dict mapping each class index to the mean of ``-log(p)`` over
            the rows of that class, where ``p`` is the probability
            predicted for the true class. Classes without any row map to
            NaN; a predicted probability of 0 contributes ``inf``.
        """
        index_action = self.index_action()
        per_class = {idx: [] for idx in index_action.values()}
        for action, probabilities in zip(labels.iloc[:, 0], labels.iloc[:, 1]):
            idx = index_action[action]
            probability = probabilities[idx]
            loss = -math.log(probability) if probability > 0 else math.inf
            per_class[idx].append(loss)
        return {
            idx: (sum(values) / len(values) if values else float('nan'))
            for idx, values in per_class.items()
        }
            
        
        
    

In [36]:
# Build the classifier: training_split=0.8, laplace (smoothing constant)=1.
classifier = NaiveBayes(augmented_data,0.8,1)

In [37]:
# Compute the probability tables from the training portion of the data.
classifier.train()

In [38]:
# Predict on the testing portion of the data and print the accuracy.
classifier.test()

Accuracy: 0.974025974025974
