# Load Data

In [1]:
import pandas as pd
data = pd.read_csv("./rt_reviews.csv", encoding = "ISO-8859-1" )
data=pd.DataFrame(data)
data.head()

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...


In [2]:
data.isnull().any()

Freshness    False
Review       False
dtype: bool

# Split Data

In [3]:
train_size = int(0.7*len(data))
development_size = int(0.2*len(data))
test_size = int(0.1*len(data))

train=data[:train_size]
development=data[train_size:train_size+development_size]
test=data[train_size+development_size:]

print(train.head(),"\n",development.head(),"\n",test.head())

  Freshness                                             Review
0     fresh   Manakamana doesn't answer any questions, yet ...
1     fresh   Wilfully offensive and powered by a chest-thu...
2    rotten   It would be difficult to imagine material mor...
3    rotten   Despite the gusto its star brings to the role...
4    rotten   If there was a good idea at the core of this ... 
        Freshness                                             Review
336000     fresh   Ultimate X is a ride, basically the kind of g...
336001    rotten   Viewers will be mystified by the existence of...
336002     fresh   The story might be familiar but the setting i...
336003    rotten   A pleasant time-waster with non-abysmal perfo...
336004     fresh   The difficulty and the key lies in taking a l... 
        Freshness                                             Review
432000     fresh   It's the directorial debut of Clea Duvall and...
432001    rotten   "Joe" is not handsome or especially smart, bu...
432002

# Build Vocabulary List

In [4]:
documents = list(x.lower() for x in data.Review.values)
positive_documents = list(x.lower() for x in data[data["Freshness"]=="fresh"].Review.values)
negative_documents = list(x.lower() for x in data[data["Freshness"]=="rotten"].Review.values)

vocabulary= {}
for words in documents:
  words = list(map(str,str(words).split(" ")))
  for word in words:
    if word not in vocabulary:
      vocabulary[word] = 1
    else:
      vocabulary[word] +=1
vocabulary = {word: count for word, count in vocabulary.items() if count >= 5}

positive_vocabulary= {}
for words in positive_documents:
  words = list(map(str,str(words).split(" ")))
  for word in words:
    if word not in positive_vocabulary:
      positive_vocabulary[word] = 1
    else:
      positive_vocabulary[word] +=1
positive_vocabulary = {word: count for word, count in positive_vocabulary.items() if count >= 5}

negative_vocabulary= {}
for words in negative_documents:
  words = list(map(str,str(words).split(" ")))
  for word in words:
    if word not in negative_vocabulary:
      negative_vocabulary[word] = 1
    else:
      negative_vocabulary[word] +=1
negative_vocabulary = {word: count for word, count in negative_vocabulary.items() if count >= 5}


# Calculate Probabilities

In [5]:
def P(word):
  word=word.lower()
  w = {}
  w[word] = 0
  for row in documents:
    review = list(map(str, str(row).split(" ")))
    if word in review:
      w[word] += 1
  a = w[word]/vocabulary[word]
  result = f'{a:.10f}'
  return result

def CP(word):

  word=word.lower()
  pword = {}
  pword[word] = 0
  nword = {}
  nword[word] = 0
  
  if word in positive_vocabulary:
    for row in positive_documents:
      review = list(map(str, str(row).split(" ")))
      if word in review:
        pword[word] += 1
    a = pword[word]/positive_vocabulary[word]
    result1 = f'{a:.10f}'
  else:
    result1 = 0
  if word in negative_vocabulary:
    for row in negative_documents:
      review = list(map(str, str(row).split(" ")))
      if word in review:
        nword[word] += 1
    a = nword[word]/negative_vocabulary[word]
    result2 = f'{a:.10f}'
  else:
    result2 = 0
  return [result1,result2]

In [6]:
cp= CP("the")
print("P('the')",P("the"))
print("P(the|positive)",cp[0],"P(the|negative)", cp[1])

P('the') 0.5670200047
P(the|positive) 0.5633968243 P(the|negative) 0.5707046994


# Calculate accuracy using Development data:

In [7]:
def predict_class(review):
  rwords = [word.lower() for word in review.split()]
  words = []
  for word in rwords:
    if len(word)>3:
      words.append(word)
  pp = 1
  np = 1
  
  for word in words:
    if  word in vocabulary and P(word) > str(0.001):
      cp = CP(word)
      pp *= float(cp[0])
      np *= float(cp[1])
      if np<pp:
        return "fresh"
      else:
        return "rotten"
  return "rotten"

#Calculate accuracy using Development data
def evaluate(docs, n):
  c=0
  w=0
  accuracy = []
  loss = []
  i=1
  for row in docs.values:
    if(predict_class(row[1])==row[0]):
      c+=1
      accuracy.append(c/i)
    else:
      w+=1
      loss.append(w/i)
    i+=1
    if i==n:
      break
  return [accuracy,loss]

In [8]:
dev_eval10 = evaluate(development, 10)

In [9]:
v_acc = sum(dev_eval10[0])/len(dev_eval10[0])*100
print("Validation Accuracy:",v_acc)

Validation Accuracy: 51.11111111111111


In [10]:
v_loss = sum(dev_eval10[1])/len(dev_eval10[1])*100
print("Validation loss:",v_loss)


Validation loss: 70.95238095238095


# Derive top 10 words that classifies each class:

In [11]:
sorted_positive_vocabulary = dict(sorted(positive_vocabulary.items(), key=lambda x: x[1], reverse=True))

In [12]:
i=1
j=0
print("Top 10 words for positive review:")
for key, value in list(sorted_positive_vocabulary.items())[:1000]:
  if len(key)>5:
    print(key, value)
    j+=1
  i+=1
  if j == 10:
    break

Top 10 words for positive review:
review 6450
little 5702
spanish] 5579
that's 5532
director 5392
enough 5234
doesn't 5197
comedy 5180
there's 5101
action 4916


In [13]:
sorted_negative_vocabulary = dict(sorted(negative_vocabulary.items(), key=lambda x: x[1], reverse=True))

In [14]:
i=1
j=0
print("Top 10 words for negative review:")
for key, value in list(sorted_negative_vocabulary.items())[:1000]:
  if len(key)>5:
    print(key, value)
    j+=1
  i+=1
  if j == 10:
    break

Top 10 words for negative review:
doesn't 8736
little 7962
there's 7109
enough 6050
that's 5708
comedy 5519
characters 5486
really 5460
nothing 4874
something 4844


# Apply Smoothing

In [39]:
def smooth_predict_class(review):
  rwords = [word.lower() for word in review.split()]
  words = []
  for word in rwords:
    if len(word)>3:
      words.append(word)
  pp = 1
  np = 1
  
  for word in words:
    if  word in vocabulary and P(word) > str(0.001):
      
      cp = CP(word)
      pp *= float(cp[0])
      np *= float(cp[1])
      #Apply Smoothing:
      pwp = pp+1/vocabulary[word]*len(vocabulary)
      pwn = np+1/vocabulary[word]*len(vocabulary)
      if pwn<=pwp:
        return "fresh"
      else:
        return "rotten"
  return "rotten"

#Calculate accuracy using Development data
def smooth_evaluate(docs, n):
  c=0
  w=0
  accuracy = []
  loss = []
  i=1
  for row in docs.values:
    if(smooth_predict_class(row[1])==row[0]):
      c+=1
      accuracy.append(c/i)
    else:
      w+=1
      loss.append(w/i)
    i+=1
    if i==n:
      break
  return [accuracy,loss]

In [40]:
smooth_dev_eval10 = smooth_evaluate(development, 10)

In [41]:
smooth_v_acc = sum(smooth_dev_eval10[0])/len(smooth_dev_eval10[0])*100
print("Validation Accuracy after Smoothing:",smooth_v_acc)

Validation Accuracy after Smoothing: 57.833333333333336


# Using test data for calculating accuracy:

In [18]:
test_eval10 = evaluate(test, 10)

In [20]:
test_acc = sum(test_eval10[0])/len(test_eval10[0])*100
print("Test Accuracy:",test_acc)

Validation Accuracy after Smoothing: 62.26190476190476
