# Naive Bayes Estimator for Sentiment Analysis

Implementation of a sentiment analysis system based on the “Large Movie Review Dataset”. Each text is classified into either “positive” or “negative”. The dataset is available at http://ai.stanford.edu/~amaas/data/sentiment/.

In [2]:
#Removes stop words that exist in the given array.
function remove_stop_words(text)
    for stop_word in stop_words
        text = replace(text, stop_word => " ")
    end
    text
end

stop_words = [" a ", " the ", " i ", " an ", " on ", " it ", 
              " as ", " in ", " is ", " that ", " he ", " she ", 
                " at ", " my ", " me ", " to ", "i'm",
                " there ", " you ", " his ", " her ", " their ", 
                " him ", " of ", " and ", " has ", " are " , " by ",
                " will ", " with "];

In [3]:
function preprocess(text)
    text = lowercase(text)
    text = replace(text, r"\d+" => " ") #Removes digits.
    text = replace(text, r"<[^>]*>" => " ") #Removes html tags
    text = replace(text, r"[.,\\\".:!()<>/?*;\-'&]" => " ") #Removes punctuations
    text = remove_stop_words(text) #Removes stop words
    #Removes single letters followed by spaces. This was needed due to punctuation removal.
    #For example, `i'm` will be converted to `i m` which does not contribute to the classification task.
    text = replace(text, r"\s[a-zA-Z]\s" => " ") 
    return text
end

preprocess (generic function with 1 method)

In [4]:
function get_n_grams(words, n)
    """Given a list of words and a positive integer n, returns a vector of n-grams extracted from the given list."""
    n_grams = Vector{}()

    for i in 1:(length(words) - n + 1)
        push!(n_grams, join(words[i:i+n-1], " "))
    end
    n_grams
end   

get_n_grams (generic function with 1 method)

In [5]:
wdict = Dict()
w2i(x) = get!(wdict, x, 1+length(wdict)) #Insert a new index for unseen grams.
UNK = w2i("<unk>")

1

In [6]:
n = 3 #n-gram length to use.

3

In [7]:
#Read positive training samples.
positive_samples = Vector{}() 

for file in readdir(".\\aclImdb_v1\\aclImdb\\train\\pos"; join=true)
    open(file) do f
        text = readline(f)
        text = preprocess(text)
        text = split(text, " ")
        text = filter((i) -> i != "", text)
        text = get_n_grams(text, 3)
        word_ids = w2i.(text)
        push!(positive_samples, word_ids)
    end
end

In [8]:
#Read negative training samples.
negative_samples = Vector{}()

for file in readdir(".\\aclImdb_v1\\aclImdb\\train\\neg"; join=true)
    open(file) do f
        text = readline(f)
        text = preprocess(text)
        text = split(text, " ")
        text = filter((i) -> i != "", text)
        text = get_n_grams(text, 3)
        word_ids = w2i.(text)
        push!(negative_samples, word_ids)
#         println(text)
    end
end

In [9]:
w2i(x) = get(wdict, x, UNK) # unk if not found

w2i (generic function with 1 method)

In [10]:
#Read positive test samples.
positive_test_samples = Vector{}()

for file in readdir(".\\aclImdb_v1\\aclImdb\\test\\pos"; join=true)
    open(file) do f
        text = readline(f)
        text = preprocess(text)
        text = split(text, " ")
        text = filter((i) -> i != "", text)
        text = get_n_grams(text, 3)
        word_ids = w2i.(text)
        push!(positive_test_samples, word_ids)
    end
end

In [11]:
#Read negative test samples.
negative_test_samples = Vector{}()

for file in readdir(".\\aclImdb_v1\\aclImdb\\test\\neg"; join=true)
    open(file) do f
        text = readline(f)
        text = preprocess(text)
        text = split(text, " ")
        text = filter((i) -> i != "", text)
        text = get_n_grams(text, 3)
        word_ids = w2i.(text)
        push!(negative_test_samples, word_ids)
    end
end

In [12]:
#Class priors
q_positive = size(positive_samples)[1] / (size(positive_samples)[1] + size(negative_samples)[1])
q_negative = size(negative_samples)[1] / (size(positive_samples)[1] + size(negative_samples)[1])

0.5

In [13]:
q_x_positive = zeros(length(wdict))
q_x_negative = zeros(length(wdict));

In [14]:
#Use add-one smoothing

for i = 1:length(wdict)
    q_x_positive[i] += 1
    q_x_negative[i] += 1
end

In [15]:
for sentence in positive_samples
    for word in sentence
        q_x_positive[word] += 1
    end
end

for sentence in negative_samples
    for word in sentence
        q_x_negative[word] += 1
    end
end

In [16]:
#Common denominator of the probabilities
positive_denominator = length(q_x_positive)
negative_denominator = length(q_x_negative)

3395810

In [17]:
#To prevent underflow, take logarithm of the probabilities
q_x_positive = log.(q_x_positive)
q_x_negative = log.(q_x_negative);

In [18]:
#Since division transforms to subtraction, subtract denominators from the probabilities.
q_x_positive = q_x_positive .- positive_denominator
q_x_negative = q_x_negative .- negative_denominator;

In [19]:
#Testing for positive samples.

true_positive_count = 0

for sentence in positive_test_samples
    positive_score = q_positive
    for word in sentence
        positive_score += q_x_positive[word]
    end
    
    negative_score = q_negative
    for word in sentence
        negative_score += q_x_negative[word]
    end

    if positive_score >= negative_score
        true_positive_count +=1
    end
end

In [20]:
#Testing for negative samples

true_negative_count = 0

for sentence in negative_test_samples
    positive_score = q_positive
    for word in sentence
        positive_score += q_x_positive[word]
    end
    
    negative_score = q_negative
    for word in sentence
        negative_score += q_x_negative[word]
    end
    if positive_score < negative_score
        true_negative_count +=1
    end
end

In [21]:
positive_accuracy = true_positive_count / length(positive_test_samples)

0.73584

In [22]:
negative_accuracy = true_negative_count / length(negative_test_samples)

0.90792

In [23]:
(positive_accuracy + negative_accuracy) / 2

0.8218799999999999