In [339]:
import os
import sys
import numpy as np

def getFileContents(filename):
    data = None
    with open(filename, 'r') as f:
        data = f.readlines()
    return data

def getFileFromCommandLine():
    filename = sys.argv[1]
    return getFileContents(filename)

class NaiveBayes(object):
    def __init__(self, data):
        self.raw_data = data
        self.targets = []
        self.word_to_index = {}
        self.target_to_index = {'True': 0, 'Fake': 1, 'Pos': 2, 'Neg': 3}
        self.word_to_class_map = []
        self.total_words = 0
        self.index_to_word = {}
        self.index_to_target = {}
        self.target_counts = [0, 0, 0, 0]
        self.word_to_class_probabilities = None
        self.target_probabilities = None
        
    def clean_sentence(self, sentence):
        chars_to_remove = ['.', '!', '?', "'", '@', '#', '$', '%',\
                           '^', '&', ',', '(', ')', '-', '_', '+',\
                           '=', '<', '>', ';', ':', '"', '[', ']',\
                           '\\', '|', '~', '0', '1', '2', '3', '4',\
                           '5', '6', '7', '8', '9']
        sentence = sentence.lower()
        for char in chars_to_remove:
            sentence = sentence.replace(char, ' ')
        words = sentence.split()
        words = [word for word in words if len(word) > 4]
#         words = [word if not word.endswith('ed') else word[:-2] for word in words]
#         words = [word if not word.endswith('ing') else word[:-3] for word in words]
        return words
    
    def splitClassNData(self, line):
        tokens = line.strip().split(' ')
        data_id = tokens[0]
        truthfulness = tokens[1]
        emotion = tokens[2]
        data = ' '.join(tokens[3:])
        data = self.clean_sentence(data)
        return (data_id, truthfulness, emotion, data)
    
    def reverseMapping(self):
        for word, index in self.word_to_index.iteritems():
            self.index_to_word[index] = word
        for target_name, index in self.target_to_index.iteritems():
            self.index_to_target[index] = target_name
    
    
    def addWordsToClass(self, target_names, words):
        for word in words:
            try:
                word_index = self.word_to_index[word]
            except KeyError as ex:
                #New word is encountered
                word_index = self.total_words
                self.word_to_index[word] = word_index
                self.word_to_class_map.append([0, 0, 0, 0])
                self.total_words += 1
                
            for target_name in target_names:
                self.word_to_class_map[word_index][self.getTargetIndexFromName(target_name)] += 1
        
    def getTargetIndexFromName(self, target_name):
        return self.target_to_index[target_name]
        
    def smoothObservations(self):
        self.word_to_class_map = self.word_to_class_map + 1
    
    def calculateProbabilities(self):
        self.word_to_class_probabilities = (self.word_to_class_map*1.0)/self.word_to_class_map.sum(axis=0, keepdims=True)
        self.target_probabilities = np.array(self.target_counts, dtype=np.float64) / len(self.raw_data)
        print self.target_probabilities
    
    def getSentenceProbabilityWithClass(self, sentence):
        prob_true = self.target_probabilities[0]
        prob_fake = self.target_probabilities[1]
        prob_pos = self.target_probabilities[2]
        prob_neg = self.target_probabilities[3]
        
        sentence_id = sentence.strip().split(' ')[0]
        words = self.clean_sentence(sentence)
        if sentence_id == 'iL7CkJ5':
            print ' '.join(words)
        for word in words[1:]:
            try:
                word_index = self.word_to_index[word]
            except KeyError as ex:
                continue
            prob_true *= self.word_to_class_probabilities[word_index][0]
            prob_fake *= self.word_to_class_probabilities[word_index][1]
            prob_pos *= self.word_to_class_probabilities[word_index][2]
            prob_neg *= self.word_to_class_probabilities[word_index][3]
            
        truthfulness = 'True' if prob_true > prob_fake else 'Fake'
        emotion = 'Pos' if prob_pos > prob_neg else 'Neg'
        
        return '%s %s %s'%(sentence_id, truthfulness, emotion)
    
    def predict(self, untagged_data):
        output = []
        for sentence in untagged_data:
            output.append('%s\n'%(self.getSentenceProbabilityWithClass(sentence)))
        return output
            
    
    def incrementTargetCounts(self, target_name):
        self.target_counts[self.target_to_index[target_name]] += 1
        
    def fit(self):
        for line in self.raw_data:
            data_id, truthfulness, emotion, words = self.splitClassNData(line)
            self.addWordsToClass([truthfulness, emotion], words)
            self.incrementTargetCounts(truthfulness)
            self.incrementTargetCounts(emotion)
            
        self.reverseMapping()
        self.word_to_class_map = np.array(self.word_to_class_map, dtype=np.float64)
        self.smoothObservations()
        self.calculateProbabilities()
    
            
            

In [340]:
def computeAccuracy(predicted):
    total = 0
    correct = 0
    index = 0
    expected = getFileContents('data/dev-key.txt')
    for exp, pred in zip(expected, predicted):
        if exp == pred:
            correct += 1
#             print exp, pred
        else:
            if exp.split()[0] == 'iL7CkJ5':
                print untagged_data[index]
                print exp.strip(), pred.strip()
#             print untagged_data[index], '\n\n\n'
            pass
        total += 1
        index += 1
    return "Accuracy %f %%"%(correct*100.0/total)

In [341]:
if __name__ == '__main__':
#     tagged_data = getFileFromCommandLine()
    tagged_data = getFileContents('data/train-labeled.txt')
    model = NaiveBayes(tagged_data)
    model.fit()
    untagged_data = getFileContents('data/dev-text.txt')
    output = model.predict(untagged_data)
    print len(model.raw_data)
#     print model.word_to_class_probabilities

[ 0.5  0.5  0.5  0.5]
stayed hyatt regency chicago first above expectations night rooms comfortable great through travelzoo night aware parking night first night there valet yelling couldn where point drive around minutes looking closed parking garages saturday evening finally found would blocks hotel outside luggage carry service hotel inside apparently running nightclub thier lobby called harddrive noisy issue controls broken first night second night blazing there degrees couldn sleep miserable engineer determined there definately problem opened window floors never enough there window night below night window being would another either checkout monday morning after taxes second which internet parking billed reciepts wednesday online banking checking account hyatt charged another charge almost reciepts checkout furious called hotel nothing resolved phone treated something another additional authorization falls settle could waited friday check account again charges cleared should corre

In [342]:
accuracy = computeAccuracy(output)

iL7CkJ5 This was the 2nd time that we have stayed at the Hyatt Regency Chicago. The first time was far above our expectations for a $69/per night room. The rooms are very nice, and the beds are comfortable. This time we got a great rate through travelzoo @ $104/ night for a room with a view. We were aware of the $41 parking per night. The first night we got there the valet was yelling at us and said that we couldn't park they were full. We had no clue where to park at this point and had to drive around for over 45 minutes looking at closed parking garages. (It was a Saturday evening) Finally found a spot and were told that it would be $38. This was 6 blocks from the hotel. Mind you its -6 outside and we have luggage to carry too. Not very good service for a 5-star hotel. We got inside and apparently they were running a nightclub out of thier lobby, its called the harddrive. That's cool and all, yes it was very noisy but once we got to our room, it wasn't an issue. The HVAC controls to 

In [343]:
print accuracy

Accuracy 81.875000 %
