## Data Mining Assignment - 3
#### This assignment is an attempt to solve the classification of rotten tomatoes, using a from-scratch implementation of the Naive Bayes classifier

In [59]:
import os
import pandas as pd
import re
import math
from sklearn.metrics import classification_report

In [60]:
reviews = pd.read_csv('rt_reviews.csv', encoding='latin-1')
reviews

Unnamed: 0,Freshness,Review
0,fresh,"Manakamana doesn't answer any questions, yet ..."
1,fresh,Wilfully offensive and powered by a chest-thu...
2,rotten,It would be difficult to imagine material mor...
3,rotten,Despite the gusto its star brings to the role...
4,rotten,If there was a good idea at the core of this ...
...,...,...
479995,rotten,Zemeckis seems unable to admit that the motio...
479996,fresh,Movies like The Kids Are All Right -- beautif...
479997,rotten,Film-savvy audiences soon will catch onto Win...
479998,fresh,An odd yet enjoyable film.


In [61]:
train_df, test_df = train_test_split(reviews, test_size=0.2, random_state=42)
print(train_df.shape)
print(test_df.shape)

(384000, 2)
(96000, 2)


In [69]:
unique_words = set()
total_words = 0
num_fresh = 0
num_rotten = 0
fresh_words = {}
rotten_words = {}

prepositions = ['about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', 'as', 'at', 
                'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', 'but', 'by', 
                'concerning', 'considering', 'despite', 'down', 'during', 'except', 'for', 'from', 'in', 
                'inside', 'into', 'like', 'near', 'of', 'off', 'on', 'onto', 'out', 'outside', 'over', 
                'past', 'regarding', 'round', 'since', 'through', 'throughout', 'to', 'toward', 'under', 
                'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']

pronouns = ['I', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them', 'myself', 
            'yourself', 'himself', 'herself', 'itself', 'ourselves', 'themselves', 'mine', 'yours', 'his', 'hers', 
            'its', 'ours', 'theirs', 'who', 'whom', 'whose', 'which', 'what', 'whatever', 'whoever', 'whomever', 
            'this', 'that', 'these', 'those', 'such', 'none', 'someone', 'somebody', 'something', 'anyone', 
            'anybody', 'anything', 'everyone', 'everybody', 'everything', 'no one', 'nobody', 'nothing', 'all', 
            'both', 'few', 'many', 'several', 'some', 'any', 'more', 'most', 'none', 'some', 'such']

articles = ['a', 'an', 'the', 'some', 'any', 'every', 'each', 'all', 'few', 'several', 'many', 'most', 'no', 
            'neither', 'either', 'both', 'other', 'another', 'such', 'what', 'which', 'that', 'these', 'those', 
            'everybody', 'anybody', 'nobody', 'everyone', 'someone', 'anyone', 'no one', 'something', 'anything', 
            'nothing', 'everything', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 
            'eleven', 'twelve', 'many', 'few', 'several']

common = ['path', 'from', 'review', 'reviews', 'subject', 'message', '', ' ']

for index, row in train_df.iterrows():
    word_list = row['Review'].split()
    for word in word_list:
        if word not in prepositions and word not in pronouns and word not in articles and word not in common:
            unique_words.add(word)
            if row['Freshness'] == 'fresh':
                num_fresh += 1
                if word not in fresh_words.keys():
                    fresh_words[word] = 1
                else:
                    fresh_words[word] += 1
            else:
                num_rotten += 1
                if word not in rotten_words.keys():
                    rotten_words[word] = 1
                else:
                    rotten_words[word] += 1
total_words = len(unique_words)
print("Fresh words : \n")
print(fresh_words,"\n")
print("Rotten words : \n")
print(rotten_words)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [70]:
print("Total words : ",total_words,"\n")

Total words :  253250 



In [71]:
def get_label(word_list):
    fresh_prob = num_fresh / reviews.shape[0]
    rotten_prob = num_rotten / reviews.shape[0]
    label = None
    for word in word_list:
        if word in fresh_words.keys():
            fresh_prob *= (fresh_words[word] / total_words)
        if word in rotten_words.keys():
            rotten_prob *= (rotten_words[word] / total_words)
    if fresh_prob >= rotten_prob:
        label = 'fresh'
    else:
        label = 'rotten'
    return label

In [72]:
actual = []
predictions = []
for index, row in test_df.iterrows():
    actual.append(row['Freshness'])
    prediction = get_label(row['Review'].split())
    predictions.append(prediction)
print("actual \n",actual,"\n")
print("pred \n",predictions)

actual 
 ['fresh', 'rotten', 'rotten', 'fresh', 'fresh', 'rotten', 'fresh', 'fresh', 'rotten', 'fresh', 'fresh', 'rotten', 'rotten', 'rotten', 'fresh', 'fresh', 'fresh', 'fresh', 'rotten', 'fresh', 'rotten', 'fresh', 'fresh', 'fresh', 'fresh', 'fresh', 'fresh', 'fresh', 'fresh', 'rotten', 'rotten', 'rotten', 'fresh', 'fresh', 'fresh', 'rotten', 'rotten', 'rotten', 'rotten', 'fresh', 'fresh', 'fresh', 'rotten', 'fresh', 'fresh', 'rotten', 'fresh', 'rotten', 'rotten', 'rotten', 'rotten', 'fresh', 'rotten', 'rotten', 'rotten', 'fresh', 'rotten', 'fresh', 'rotten', 'fresh', 'rotten', 'fresh', 'rotten', 'fresh', 'fresh', 'fresh', 'rotten', 'rotten', 'rotten', 'fresh', 'fresh', 'fresh', 'rotten', 'fresh', 'fresh', 'rotten', 'rotten', 'fresh', 'fresh', 'fresh', 'rotten', 'fresh', 'fresh', 'rotten', 'rotten', 'rotten', 'rotten', 'fresh', 'rotten', 'rotten', 'fresh', 'fresh', 'fresh', 'rotten', 'rotten', 'rotten', 'rotten', 'rotten', 'rotten', 'fresh', 'rotten', 'rotten', 'rotten', 'fresh', 'fr

In [73]:
def map_strings_to_numbers(str_list, str_num_map):
    num_list = []
    for s in str_list:
        if s in str_num_map:
            num_list.append(str_num_map[s])
    return num_list

In [74]:
str_map = {
    'fresh':0,
    'rotten':1
}
print(str_map)
num_pred = map_strings_to_numbers(predictions, str_map)
num_act = map_strings_to_numbers(actual, str_map)
print("\npred\n")
print(num_pred)
print("\nact\n")
print(num_act)

{'fresh': 0, 'rotten': 1}

pred

[1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 

In [75]:
print(classification_report(num_pred, num_act))

              precision    recall  f1-score   support

           0       0.58      0.55      0.56     50902
           1       0.52      0.55      0.54     45098

    accuracy                           0.55     96000
   macro avg       0.55      0.55      0.55     96000
weighted avg       0.55      0.55      0.55     96000

