# 1. Naive Bayes for Sentiment Analysis

## 1.1 Load Data

In [1]:
import numpy as np
from collections import Counter

In [9]:
# Root of the IMDB review corpus and its two split sub-directories
path = "aclImdb/"
train_path = "%strain/" % path
test_path = "%stest/" % path

In [11]:
# One RDD per sentiment class, built from every .txt file in that class folder.
# NOTE(review): `sc` is not defined in the visible cells -- presumably the
# SparkContext injected by the PySpark kernel; confirm before a fresh run.
data_raw_pos = sc.textFile("%spos/*.txt" % train_path)
data_raw_neg = sc.textFile("%sneg/*.txt" % train_path)

In [12]:
# Peek at one element: each element is the text of a whole review
data_raw_pos.first()

u'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [13]:
# Keep roughly 20% of each class: sampling without replacement, fixed seed 1.
# The names are rebound in place, so re-running this cell samples the sample.
data_raw_pos = data_raw_pos.sample(withReplacement=False, fraction=0.2, seed=1)
data_raw_neg = data_raw_neg.sample(withReplacement=False, fraction=0.2, seed=1)

In [14]:
# Number of partitions backing the sampled positive-review RDD
data_raw_pos.getNumPartitions()

12500

In [15]:
# Repartitioning (or coalescing) is optional; the rule of thumb used here is
# 3 to 4 partitions per available CPU.
num_partitions = 8
data_raw_pos, data_raw_neg = (
    data_raw_pos.repartition(num_partitions),
    data_raw_neg.repartition(num_partitions),
)

In [16]:
# Size of each sampled class (2529 elements apiece in the recorded run)
print(data_raw_pos.count())
print(data_raw_neg.count())

2529
2529


## 1.2 Training NB

In [68]:
import string
replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))

In [84]:
# Tokeniser shared by training and prediction
def clean(review):
    """Split one review into a list of words.

    The text is encoded to UTF-8 bytes, every punctuation byte is replaced by
    a space through the ``replace_punctuation`` table, and the result is split
    on whitespace. Letter case is left untouched.
    """
    encoded = review.encode("utf8", "ignore")
    without_punctuation = encoded.translate(replace_punctuation)
    return without_punctuation.split()

In [130]:
# Split every review into words with `clean` (stop-word filtering or further
# cleaning could be added at this step).
# Earlier variant, plain whitespace split:
#data_pos = data_raw_pos.flatMap(lambda x: x.split())
data_pos = data_raw_pos.flatMap(clean)
data_pos.take(10)

['A', 'lot', 'of', 'people', 'are', 'saying', 'that', 'Al', 'Pacino', 'over']

In [131]:
# Pair each lower-cased word with a 1 so occurrences can be summed per word
data_pos = data_pos.map(lambda word: (word.lower(), 1))
data_pos.take(10)

[('a', 1),
 ('lot', 1),
 ('of', 1),
 ('people', 1),
 ('are', 1),
 ('saying', 1),
 ('that', 1),
 ('al', 1),
 ('pacino', 1),
 ('over', 1)]

In [132]:
# Add up the 1s of each word to obtain its total number of occurrences
data_pos = data_pos.reduceByKey(lambda left, right: left + right)
data_pos.take(10)

[('unimaginative', 3),
 ('aided', 7),
 ('damon', 17),
 ('jaffar', 2),
 ('blackend', 1),
 ('nun', 5),
 ('bloopers', 1),
 ('joshua', 1),
 ('almghandi', 1),
 ('four', 96)]

In [133]:
# The same three steps for the negative reviews, chained into one pipeline.
# Earlier variant, plain whitespace split without lower-casing:
#data_neg = data_raw_neg.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y)
data_neg = (
    data_raw_neg
    .flatMap(clean)
    .map(lambda word: (word.lower(), 1))
    .reduceByKey(lambda left, right: left + right)
)
data_neg.take(10)

[('unimaginative', 6),
 ('aided', 9),
 ('damon', 2),
 ('yvan', 1),
 ('grout', 1),
 ('unflavored', 1),
 ('nun', 5),
 ('roadrunners', 1),
 ('joshua', 8),
 ('needlessly', 5)]

In [134]:
# Drop every word that occurs only once within its class
data_pos = data_pos.filter(lambda word_count: word_count[1] > 1)
data_neg = data_neg.filter(lambda word_count: word_count[1] > 1)

In [135]:
# Total number of retained word occurrences in each class
count_pos = data_pos.map(lambda word_count: word_count[1]).sum()
count_neg = data_neg.map(lambda word_count: word_count[1]).sum()

In [136]:
# Show both class totals on one line, separated by a space
print("%s %s" % (count_pos, count_neg))

613125 582524


In [137]:
## Vocabulary size V: number of distinct words retained in either class
v1 = data_pos.map(lambda word_count: word_count[0])  # pos vocabulary
v2 = data_neg.map(lambda word_count: word_count[0])  # neg vocabulary
v = v1.union(v2)  # still contains words shared by both classes twice
#v.count()
v0 = v.distinct()
V = v0.count()
print(V)

21462


In [138]:
# Denominator of the smoothed word probabilities, one per class:
# total word count of the class + vocabulary size + 1.
# NOTE(review): the extra +1 presumably reserves one slot for words outside
# the vocabulary -- confirm this is the intended smoothing.
pos_denom, neg_denom = float(count_pos + V + 1), float(count_neg + V + 1)

In [139]:
# Smoothed log-probability of every word given the class:
# log((count + 1) / denominator)
pos_prob = data_pos.mapValues(lambda count: np.log(float(count + 1) / pos_denom))
neg_prob = data_neg.mapValues(lambda count: np.log(float(count + 1) / neg_denom))

In [140]:
# Inspect a few (word, log-probability) pairs of the positive class
pos_prob.take(10)

[('unimaginative', -11.974436887283231),
 ('aided', -11.281289706723285),
 ('damon', -10.470359490506958),
 ('jaffar', -12.262118959735012),
 ('nun', -11.568971779175067),
 ('four', -8.7860202698997387),
 ('catch', -9.1560386290121567),
 ('consists', -10.875824598615122),
 ('zombie', -10.182677418055176),
 ('woodhouse', -11.974436887283231)]

In [141]:
# Collect the (word, log-probability) pairs into plain dictionaries.
# The names are rebound from RDDs to dicts, so this cell cannot be re-run
# without first re-running the cell that builds the RDDs.
pos_prob = pos_prob.collectAsMap()
neg_prob = neg_prob.collectAsMap()

In [142]:
# Broadcast both lookup dictionaries: a broadcast variable is shared by all nodes
# and is read through its `.value` attribute
pos_prob_b = sc.broadcast(pos_prob)
neg_prob_b = sc.broadcast(neg_prob)

## 1.3 Prediction

In [55]:
# Load the test reviews, one RDD per sentiment class
test_raw_pos = sc.textFile("%spos/*.txt" % test_path)
test_raw_neg = sc.textFile("%sneg/*.txt" % test_path)

# Keep roughly 20% of each class: sampling without replacement, fixed seed 1
test_raw_pos = test_raw_pos.sample(withReplacement=False, fraction=0.2, seed=1)
test_raw_neg = test_raw_neg.sample(withReplacement=False, fraction=0.2, seed=1)

# Repartitioning is left disabled for the test set
num_partitions = 8
#test_raw_pos = test_raw_pos.repartition(num_partitions)
#test_raw_neg = test_raw_neg.repartition(num_partitions)

# Size of each sampled test class
print(test_raw_pos.count())
print(test_raw_neg.count())

In [None]:
#test_pos = test_raw_pos.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y)
#test_neg = test_raw_neg.flatMap(lambda x: x.split()).map(lambda x: (x, 1)).reduceByKey(lambda x, y: x+y)

In [56]:
# Use the first positive test review as a worked example
doc = test_raw_pos.first()
print(doc)

Why this film was only released in 4 states is beyond me. I thought this film was a divine story. The name says it all: Seeing Other People. This movie has more logic than laughs, which I suppose is why it works so well. Common sense also makes an appearance in what would seem to be another puerile sex comedy. Alice is getting her feet frozen in the cold, when she feels irrationally about the way she might perform for her fiancé, not just sexually, but as a partner, and friend etc. This starts what seems to be an almost archetypal journey for the both of them. One fling after another leads to trouble, as if it wasn't a bad idea from the start. Witty dialogue and comic set-ups make this one funny as hell! Nicholson and Mohr set the tone of the film early on, and keep the promise they anticipate. Other highlights are Lauren Graham, Andy Richter, and Helen Slater(in her first theatrical film in 10 years!). Climax begins to take an insane turn, but a simple ending makes this one far more e

In [146]:
from itertools import dropwhile

def pred_class(doc, min_count=2):
    """Classify one review as "pos" or "neg" with the trained word tables.

    The review is tokenised with ``clean`` and lower-cased, then only the
    words occurring at least ``min_count`` times inside the review are scored.
    Each scored word adds ``count * log-probability`` to the score of each
    class, read from the broadcast tables; a word missing from a table falls
    back to ``log(1 / denominator)`` of that class.

    Args:
        doc: raw text of a single review.
        min_count: minimum number of occurrences within ``doc`` for a word to
            be scored. The default of 2 keeps the existing behaviour (words
            seen once in the review are ignored); pass 1 to score every word.

    Returns:
        "pos" when the positive score is strictly greater, otherwise "neg".
        Ties -- including a review in which no word reaches ``min_count`` --
        therefore resolve to "neg".
    """
    #words = doc.split()
    counts = Counter(w.lower() for w in clean(doc))

    # Loop invariants: read the broadcast tables and compute the fallback
    # log-probabilities once per review instead of once per word.
    pos_table = pos_prob_b.value
    neg_table = neg_prob_b.value
    pos_floor = np.log(1.0 / pos_denom)
    neg_floor = np.log(1.0 / neg_denom)

    log_pos = 0.0
    log_neg = 0.0
    for w, count in counts.items():
        if count < min_count:
            continue
        log_pos += count * pos_table.get(w, pos_floor)
        log_neg += count * neg_table.get(w, neg_floor)
    if log_pos > log_neg:
        return "pos"
    return "neg"

In [147]:
# Sanity check: classify the single example review
pred_class(doc)

'pos'

In [148]:
# Predict a label for every positive test review and look at the first ten
test_pos_res = test_raw_pos.map(pred_class)
test_pos_res.take(10)

['pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'pos']

## Using `split` method

In [65]:
# Tally the predicted labels of the positive test reviews
test_pos_res = (
    test_raw_pos
    .map(pred_class)
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
)
pos_results = test_pos_res.collectAsMap()
print(pos_results)

{'neg': 575, 'pos': 1954}


In [66]:
# Tally the predicted labels of the negative test reviews
test_neg_res = (
    test_raw_neg
    .map(pred_class)
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
)
neg_results = test_neg_res.collectAsMap()
print(neg_results)

{'neg': 2153, 'pos': 376}


In [67]:
# compute accuracy: correctly labelled reviews / all labelled reviews.
# The result dicts only hold labels that were predicted at least once, so a
# label that never occurred is counted as 0 instead of raising KeyError.
total = sum(neg_results.values()) + sum(pos_results.values())
acc = float(neg_results.get("neg", 0) + pos_results.get("pos", 0)) / float(total)
print(acc)

0.811981020166


## Using `clean` method

In [99]:
# Tally the predicted labels of the positive test reviews
test_pos_res = (
    test_raw_pos
    .map(pred_class)
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
)
pos_results = test_pos_res.collectAsMap()
print(pos_results)
# Tally the predicted labels of the negative test reviews
test_neg_res = (
    test_raw_neg
    .map(pred_class)
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
)
neg_results = test_neg_res.collectAsMap()
print(neg_results)
# compute accuracy: correctly labelled reviews / all labelled reviews.
# A label that was never predicted is absent from the dict, so it is counted
# as 0 instead of raising KeyError.
total = sum(neg_results.values()) + sum(pos_results.values())
acc = float(neg_results.get("neg", 0) + pos_results.get("pos", 0)) / float(total)
print(acc)

{'neg': 641, 'pos': 1888}
{'neg': 2165, 'pos': 364}
0.801304863582


## Using `remove singletons` method

In [120]:
# Tally the predicted labels of the positive test reviews.
# No `count > 1` filter here: at this point the pairs are (label, count), not
# (word, count), so such a filter would drop a label predicted exactly once
# and make both `total` and the accuracy wrong.
test_pos_res = (
    test_raw_pos
    .map(pred_class)
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
)
pos_results = test_pos_res.collectAsMap()
print(pos_results)
# Tally the predicted labels of the negative test reviews
test_neg_res = (
    test_raw_neg
    .map(pred_class)
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
)
neg_results = test_neg_res.collectAsMap()
print(neg_results)
# compute accuracy: correctly labelled reviews / all labelled reviews.
# A label that was never predicted is absent from the dict, so it is counted
# as 0 instead of raising KeyError.
total = sum(neg_results.values()) + sum(pos_results.values())
acc = float(neg_results.get("neg", 0) + pos_results.get("pos", 0)) / float(total)
print(acc)

{'neg': 822, 'pos': 1707}
{'neg': 1844, 'pos': 685}
0.702056148675


## Using `lower` method

In [149]:
# Tally the predicted labels of the positive test reviews.
# The labels returned by pred_class are already lower-case ("pos"/"neg"), so
# no .lower() is needed here, and no `count > 1` filter either: the pairs are
# (label, count), so that filter would drop a label predicted exactly once
# and make both `total` and the accuracy wrong.
test_pos_res = (
    test_raw_pos
    .map(pred_class)
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
)
pos_results = test_pos_res.collectAsMap()
print(pos_results)
# Tally the predicted labels of the negative test reviews
test_neg_res = (
    test_raw_neg
    .map(pred_class)
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
)
neg_results = test_neg_res.collectAsMap()
print(neg_results)
# compute accuracy: correctly labelled reviews / all labelled reviews.
# A label that was never predicted is absent from the dict, so it is counted
# as 0 instead of raising KeyError.
total = sum(neg_results.values()) + sum(pos_results.values())
acc = float(neg_results.get("neg", 0) + pos_results.get("pos", 0)) / float(total)
print(acc)

{'neg': 862, 'pos': 1667}
{'neg': 1884, 'pos': 645}
0.702056148675
