# Preprossing the Training Data

In [96]:
import string

# To remove punctuations
translator = str.maketrans('', '', string.punctuation)


path = 'data/affectivetext_trial.xml'
training_data = {}

# Read the training data
# Example data point <instance id="1">Mortar assault leaves at least 18 dead</instance>
with open(path) as f:
    for line in f:
        if "instance" in line:
            line_id = int(line[line.find("id=")+3:line.find(">")].strip("\""))          
            line = line[line.find(">")+1:line.find("</")]
            
            # Removing punctuations
            line = line.translate(translator)
        
            if line_id <= 2:
               print("id %s text %s" % (line_id, line.split()))
            
            # Keep the lower-cased words of each document
            training_data[line_id] = [word.lower() for word in line.split()]
            

id 1 text ['Mortar', 'assault', 'leaves', 'at', 'least', '18', 'dead']
id 2 text ['Goal', 'delight', 'for', 'Sheva']


In [97]:
# Read the gold labels: Valence 
valence_labels = {}
path = "data/affectivetext_trial.valence.gold"
with open(path) as f:
    for line in f:
        line_id, valence = line.split()
        valence = int(valence)
        line_id = int(line_id)
        # Discretize the scores. Use scores for regression. #TODO
        if valence > 0: # positive 
            valence_labels[line_id] = 1
        else: 
            valence_labels[line_id] = -1
        
#print(valence_labels) 

# The Naive Bayes Classifier

In [98]:
import numpy as np

def naive_bayes(documents, labels, classes, smoothing=1):
    # Total number of documents
    n_doc = len(documents)
    # List of all the word types in the documents
    vocab = set([word for doc_id in documents for word in documents[doc_id]])
    vocab_size = len(vocab)
    print("Number of word types %d \n" % len(vocab))
    
    prior, likelihood = {}, {}
    # Initialize likelihood for each class
    for c in classes:
        likelihood[c] = {}
    for c in classes:
        # Number of documents labeled c
        n_c = list(labels.values()).count(c)
        # The (log) prior probability of class c: n_c/n_doc
        prior[c] = np.log(n_c) - np.log(n_doc)
        print("C %d, prior prob. %.2f" % (c, prior[c]))
        
        # Frequency of each word in all documents with label c
        doc_c = {}
        for doc_id in documents:
            if labels[doc_id] == c:  # Document has label c
                for word in documents[doc_id]:
                    if word not in doc_c:
                        doc_c[word] = 0
                    doc_c[word] += 1
        print("Number of word types in c %d" % len(doc_c))
        
        # Sum of all the word counts in c
        denom = np.sum(list(doc_c.values())) + vocab_size * smoothing
        denom = np.log(denom)
        for word in vocab:
            # Is the sum correct?
            word_count = 0
            if word in doc_c:
                word_count = doc_c[word]
            
            likelihood[c][word] = np.log(word_count + smoothing) - denom 

            #print(c, word, likelihood[c][word])
    return prior, likelihood, vocab

prior, likelihood, vocab = naive_bayes(training_data, valence_labels, [-1, 1])
        

Number of word types 1004 

C -1, prior prob. -0.47
Number of word types in c 667
C 1, prior prob. -0.99
Number of word types in c 422


## Testing the classifer

In [101]:
def classify(testdoc, classes, prior, likelihood, vocab):
    posterior = {}
    for c in classes:
        posterior[c] = prior[c]
        for word in testdoc:
            word = word.lower()
            if word in vocab:
                #print(word)
                posterior[c] += likelihood[c][word]
        print(c, posterior[c])
    return posterior

classify(["I", "love", "Bayes", "rule"], [-1, 1], prior, likelihood, vocab)

classify(["happy", "valentine's", "day"], [-1, 1], prior, likelihood, vocab)

-1 -7.38687329667
1 -8.37780752233
-1 -8.08002047723
1 -6.99151316121


{-1: -8.0800204772250126, 1: -6.9915131612075356}

## Improving the Results

* Remove the stop words (Highly frequent words or function words)
* Binarize the frequency of words
* Deal with negation
* Change the smoothing parameter

# Experiment with Emotions

The same dataset also includes an emotion corpora, where each document is annotated with scores for emotions such as joy. Train a mulit-class Naive Bayes using the emotion corpora.

