In [74]:
import os
import sys
import numpy as np

def getFileContents(filename):
    """Return every line of the text file *filename* as a list of strings.

    Line terminators are kept on each element. Errors raised by ``open``
    propagate to the caller.
    """
    with open(filename, 'r') as handle:
        # Iterating a text file yields the same lines readlines() returns.
        return list(handle)

def getFileFromCommandLine():
    """Read the file named by the first command-line argument.

    Returns the list of lines produced by getFileContents(). Raises
    IndexError when no argument was supplied on the command line.
    """
    return getFileContents(sys.argv[1])

class NaiveBayes(object):
    """Word/label co-occurrence counter for a Naive Bayes text classifier.

    Each training line is expected to have the form
    ``<id> <truthfulness> <emotion> <word> <word> ...`` with single spaces
    between the fields. Both labels of a line share one label index space,
    so every word gets one count column per distinct label name.

    Typical use: construct with the raw lines, call fit(), then
    smoothObservations() and calculateProbabilities().
    """

    # Rows start with this many label columns. fit() never produces a
    # narrower matrix, which keeps the (n_words, 4) shape for data with up
    # to four distinct labels.
    _MIN_CLASSES = 4

    def __init__(self, data):
        """Store the raw training lines; no counting happens until fit()."""
        self.raw_data = data
        self.targets = []
        # token -> row index in word_to_class_map
        self.word_to_index = {}
        # label name -> column index in word_to_class_map
        self.target_to_index = {}
        # List of per-word count rows while fitting; ndarray after fit().
        self.word_to_class_map = []
        self.total_words = 0
        # Reverse lookups, filled in by reverseMapping().
        self.index_to_word = {}
        self.index_to_target = {}

    def splitClassNData(self, line):
        """Split one training line into (id, truthfulness, emotion, words).

        The line is stripped and split on single spaces, so consecutive
        spaces produce empty-string tokens in ``words``. Raises IndexError
        when the line has fewer than three fields.
        """
        tokens = line.strip().split(' ')
        data_id = tokens[0]
        truthfulness = tokens[1]
        emotion = tokens[2]
        data = tokens[3:]
        return (data_id, truthfulness, emotion, data)

    def reverseMapping(self):
        """Fill index_to_word / index_to_target from the forward mappings."""
        # items() exists on both Python 2 and 3; iteritems() is 2-only.
        self.index_to_word.update(
            (index, word) for word, index in self.word_to_index.items())
        self.index_to_target.update(
            (index, name) for name, index in self.target_to_index.items())

    def addWordsToClass(self, target_names, words):
        """Add one occurrence of every word to each label in target_names.

        Unseen words get the next free row index; unseen labels are
        registered through getTargetIndexFromName().
        """
        for word in words:
            word_index = self.word_to_index.get(word)
            if word_index is None:
                # New word is encountered
                word_index = self.total_words
                self.word_to_index[word] = word_index
                self.word_to_class_map.append([0] * self._MIN_CLASSES)
                self.total_words += 1

            row = self.word_to_class_map[word_index]
            for target_name in target_names:
                target_index = self.getTargetIndexFromName(target_name)
                if target_index >= len(row):
                    # More labels than the row was created for: widen it
                    # instead of failing with IndexError.
                    row.extend([0] * (target_index + 1 - len(row)))
                row[target_index] += 1

    def getTargetIndexFromName(self, target_name):
        """Return the column index for target_name, registering it if new.

        A message is printed the first time each label is seen; the new
        label receives the next unused index.
        """
        if target_name not in self.target_to_index:
            # Single-argument print(...) emits the same text on Python 2
            # and Python 3.
            print("Exception Raise : Target Name Not Found. %s" % (target_name,))
            self.target_to_index[target_name] = len(self.target_to_index)
        return self.target_to_index[target_name]

    def smoothObservations(self):
        """Add one to every count (add-one smoothing). Call after fit()."""
        # asarray is a no-op on the ndarray built by fit() and also accepts
        # the plain list that exists before fit().
        self.word_to_class_map = np.asarray(self.word_to_class_map) + 1

    def calculateProbabilities(self):
        """Set word_to_class_probabilities to counts / per-label totals.

        Every column is divided by its own sum. A column whose sum is zero
        divides by zero, so call smoothObservations() first.
        """
        counts = np.asarray(self.word_to_class_map, dtype=float)
        self.word_to_class_probabilities = counts / counts.sum(axis=0, keepdims=True)

    def fit(self):
        """Count word/label co-occurrences over raw_data.

        Blank lines are skipped. Afterwards word_to_class_map is an ndarray
        with one row per word and one column per label (at least four
        columns), and the reverse index mappings are populated.
        """
        for line in self.raw_data:
            if not line.strip():
                # e.g. a trailing empty line in the input file
                continue
            _, truthfulness, emotion, words = self.splitClassNData(line)
            self.addWordsToClass([truthfulness, emotion], words)
        self.reverseMapping()

        # Rows created before a late label appeared are narrower; pad them
        # so the matrix is rectangular.
        width = max([self._MIN_CLASSES] + [len(row) for row in self.word_to_class_map])
        for row in self.word_to_class_map:
            if len(row) < width:
                row.extend([0] * (width - len(row)))
        self.word_to_class_map = np.array(self.word_to_class_map)
            
            
            

In [75]:
if __name__ == '__main__':
    # Load the labelled training lines from a fixed relative path. The
    # command-line variant below is left disabled, presumably because
    # sys.argv is not meaningful inside a notebook session -- TODO confirm.
#     tagged_data = getFileFromCommandLine()
    tagged_data = getFileContents('data/train-labeled.txt')
    # Constructing the model only stores the lines; counting starts in fit().
    model = NaiveBayes(tagged_data)
    

In [76]:
# Count word/label occurrences over the training lines. The messages printed
# below are emitted by getTargetIndexFromName the first time each label is seen.
model.fit()

Exception Raise : Target Name Not Found. Fake
Exception Raise : Target Name Not Found. Neg
Exception Raise : Target Name Not Found. True
Exception Raise : Target Name Not Found. Pos


In [77]:
# Per-label counts for the word stored at row 955; columns follow the label
# registration order printed by fit() above (Fake, Neg, True, Pos).
model.word_to_class_map[955]

array([1, 2, 1, 0])

In [78]:
# Look up the row index that fit() assigned to the token 'Amit'.
model.word_to_index['Amit']

955

In [79]:
# Small 2x4 integer array used to try out the numpy expressions that
# smoothObservations and calculateProbabilities rely on.
am = np.array([[1, 2, 3, 4], [4, 5, 6, 2]])

In [80]:
# Adding a scalar adds it to every element and returns a new array
# (the same operation smoothObservations applies to the counts).
am + 1

array([[2, 3, 4, 5],
       [5, 6, 7, 3]])

In [81]:
# Confirm that the expression above left `am` itself unmodified.
am

array([[1, 2, 3, 4],
       [4, 5, 6, 2]])

In [83]:
# Divide each column by its own sum; the 1.0 factor makes the array float
# before dividing. Same expression form as calculateProbabilities.
am*1.0/am.sum(axis=0, keepdims=True)

array([[ 0.2       ,  0.28571429,  0.33333333,  0.66666667],
       [ 0.8       ,  0.71428571,  0.66666667,  0.33333333]])