In [1]:
import os
import sys
import operator
import numpy as np

def getFileContents(filename):
    data = None
    with open(filename, 'r') as f:
        data = f.readlines()
    return data

def getFileFromCommandLine():
    filename = sys.argv[1]
    return getFileContents(filename)

class NaiveBayes(object):
    def __init__(self):
        self.SEPERATOR = '****************#################*****************###############\n'
        self.readModelFromFile()
        
    def readModelFromFile(self):
        data = getFileContents('nbmodel.txt')
        self.target_to_index = {}
        self.target_probabilities = [0.0, 0.0, 0.0, 0.0]
        self.word_to_class_probabilities = []
        self.word_to_index = {}
        
        switch = 0
        for line in data:
            if line == self.SEPERATOR:
                switch += 1
                continue
                
            if switch == 0:
                target_name, index, probability = line.strip().split('\t')
                index = int(index)
                self.target_to_index[target_name] = index
                self.target_probabilities[index] = float(probability)
                
            if switch == 1:
                word_probs = map(float, line.strip().split('\t'))
                self.word_to_class_probabilities.append(word_probs)
            
            if switch == 2:
                word, index = line.strip().split('\t')
                index = int(index)
                self.word_to_index[word] = index
                
        
    def negateWords(self, sentence):
        return sentence
        words = sentence.split()
        start_negation = False
        new_words = []
        for word in words:
            if word.lower() in ['not', "don't", "haven't"]:
                start_negation = True
            if start_negation:
                new_words.append('not_' + word)
            else:
                new_words.append(word)
            if word[-1] in [',', '.']:
                start_negation = False
        return ' '.join(new_words)
                
            
            
    def clean_sentence(self, sentence):
        sentence = self.negateWords(sentence)
        chars_to_remove = ['.', '!', '?', "'", '@', '#', '$', '%',\
                           '^', '&', ',', '(', ')', '-', '_', '+',\
                           '=', '<', '>', ';', ':', '"', '[', ']',\
                           '\\', '|', '~', '0', '1', '2', '3', '4',\
                           '5', '6', '7', '8', '9']
        sentence = sentence.lower()
        for char in chars_to_remove:
            sentence = sentence.replace(char, ' ')
        words = sentence.split()
        words = [word for word in words if len(word) > 4]
#         words = [word if not word.endswith('ed') else word[:-2] for word in words]
#         words = [word if not word.endswith('ing') else word[:-3] for word in words]
        return words
    
    
    def getTargetIndexFromName(self, target_name):
        return self.target_to_index[target_name]
    
    def getSentenceProbabilityWithClass(self, sentence):
        prob_true = self.target_probabilities[0]
        prob_fake = self.target_probabilities[1]
        prob_pos = self.target_probabilities[2]
        prob_neg = self.target_probabilities[3]
        
        sentence_id = sentence.strip().split(' ')[0]
        sentence = self.negateWords(sentence)
        words = self.clean_sentence(sentence)
        if sentence_id == 'iL7CkJ5':
            print ' '.join(words)
        for word in words[1:]:
            try:
                word_index = self.word_to_index[word]
            except KeyError as ex:
                continue
            prob_true += self.word_to_class_probabilities[word_index][0]
            prob_fake += self.word_to_class_probabilities[word_index][1]
            prob_pos += self.word_to_class_probabilities[word_index][2]
            prob_neg += self.word_to_class_probabilities[word_index][3]
            
        truthfulness = 'True' if prob_true > prob_fake else 'Fake'
        emotion = 'Pos' if prob_pos > prob_neg else 'Neg'
        
        return '%s %s %s'%(sentence_id, truthfulness, emotion)
    
    def predict(self, untagged_data):
        output = []
        for sentence in untagged_data:
            output.append('%s\n'%(self.getSentenceProbabilityWithClass(sentence)))
        return output

In [2]:
if __name__ == '__main__':
    model = NaiveBayes()
    untagged_data = getFileContents('data/dev-text.txt')
    predicted = model.predict(untagged_data)

stayed hyatt regency chicago first above expectations night rooms comfortable great through travelzoo night aware parking night first night there valet yelling couldn where point drive around minutes looking closed parking garages saturday evening finally found would blocks hotel outside luggage carry service hotel inside apparently running nightclub thier lobby called harddrive noisy issue controls broken first night second night blazing there degrees couldn sleep miserable engineer determined there definately problem opened window floors never enough there window night below night window being would another either checkout monday morning after taxes second which internet parking billed reciepts wednesday online banking checking account hyatt charged another charge almost reciepts checkout furious called hotel nothing resolved phone treated something another additional authorization falls settle could waited friday check account again charges cleared should correct original amounts st

In [3]:
def computeAccuracy(predicted):
    total = 0
    correct = 0
    index = 0
    expected = getFileContents('data/dev-key.txt')
    for exp, pred in zip(expected, predicted):
        if exp == pred:
            correct += 1
#             print exp, pred
        else:
            if exp.strip().split()[-1] == 'Neg' and pred.strip().split()[-1] == 'Pos':
                print untagged_data[index]
                print exp.strip(), pred.strip()
#             print untagged_data[index], '\n\n\n'
            pass
        total += 1
        index += 1
    return "Accuracy %f %%"%(correct*100.0/total)

In [4]:
accuracy = computeAccuracy(predicted)

0IGLewq Chicago is my favorite city visit. I wanted to share this special city with a new love and booked us a room at the Swissotel. It is advertised as close to downtown and close to the museums of Chicago. I wanted to share my new love. It was indeed close to the museums but to get to either the shopping district or the Chicago Art Institute it was a long walk, full of detours, and it was not easy to grab a cab. While the rooms seemed to be luxuriously decorated the beds were uncomfortable and the rooms just were not very clean. The Room Service was slow, the food arrived cold, and the total experience was way so expensive for what we got. I do not think I have ever experienced the level of noise in such a "grand hotel" as the Swissotel seems. I would not recommend the Swissotel because disappointment, noise, and cold food should just not cost that much.

0IGLewq Fake Neg 0IGLewq Fake Pos
0aYKFLe The Swissotel Chicago hotel aspires to be a tourist's paradise, a hotel so grand and lu

In [5]:
print accuracy

Accuracy 82.500000 %
