In [2]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Load the Rotten Tomatoes reviews from a tab-separated file.
# csv.QUOTE_NONE (the named form of the former magic number 3) tells the parser to
# treat quote characters as ordinary text, so a review containing a stray double
# quote is not misread as the start of a quoted field.
dataset = pd.read_csv('rotten_tomatoes_reviews.tsv', delimiter='\t', quoting=csv.QUOTE_NONE)

In [6]:
# Work on the first 2000 reviews only, then peek at the last few rows of that subset.
dataset = dataset.head(2000)
dataset.tail()

Unnamed: 0,Freshness,Review
1995,rotten,...a bit like ordering egg rolls at McDonald's.
1996,fresh,""" A superb character piece, an affecting yet h..."
1997,fresh,""" The Florida Project could easily be a Best P..."
1998,rotten,You're better off watching Kingsman and The B...
1999,rotten,Mann must have felt he could overcome the scr...


In [5]:
#Import libraries that help in data cleaning
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [7]:
# Build the cleaned corpus: one normalised, stemmed string per review.
# The stop-word set and the stemmer do not change between reviews, so they are
# created once here instead of once per word / once per review.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 2000) so the
# loop follows the actual number of rows and does not depend on index labels.
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, lower-case, and split into words.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop English stop words and reduce the remaining words to their Porter stems.
    stems = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stems))

In [8]:
# Show the first ten cleaned reviews as a sanity check.
corpus[:10]

['lens skill particular end',
 'baggag claim focus much plot littl charact',
 'watch fenc never doubt live matter good nobl thing also awar mayb awar much movi want matter',
 'johanna hamilton may dynam immedi documentari execut produc laura poitra citizenfour look earliest iter america secur state essenti',
 'delay much debat good dinosaur perfectli good famili friendli anim featur taken grant account simpler pleasur',
 'sheridan suggest object game stop drug trade perpetu game',
 'horror film camera obscura disappoint',
 'michael bay worst enemi',
 'opiat induc caffein super octan trip',
 'fall apart final third feel particularli sloppi nearli destroy entir film']

In [9]:
# Bag-of-words baseline with NO vocabulary cap: every distinct token in the corpus
# becomes a column, which shows how large the full vocabulary is.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()  # max_features deliberately left unset here
X = cv.fit_transform(corpus).toarray()
# Select the label column by name rather than by position, so an extra leading
# column in the file cannot silently become the target.
y = dataset['Freshness'].values
X.shape

(2000, 6426)

In [10]:
# Bag-of-words model actually used for training: the vocabulary is capped to the
# MAX_FEATURES most frequent tokens to keep the dense feature matrix small.
from sklearn.feature_extraction.text import CountVectorizer
MAX_FEATURES = 1000
cv = CountVectorizer(max_features=MAX_FEATURES)
X = cv.fit_transform(corpus).toarray()
# Select the label column by name rather than by position, so an extra leading
# column in the file cannot silently become the target.
y = dataset['Freshness'].values
X.shape

(2000, 1000)

In [11]:
from sklearn.preprocessing import LabelEncoder
# Encode the string labels in y as integer class codes.
labelencoder = LabelEncoder()
labelencoder.fit(y)
y = labelencoder.transform(y)

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [14]:
# Train a Gaussian Naive Bayes classifier on the training split.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB().fit(X_train, y_train)  # fit() returns the fitted estimator itself
classifier

GaussianNB(priors=None)

In [15]:
# Predict a class label for every row of the held-out test split.
y_pred = classifier.predict(X_test)

In [16]:
from sklearn.metrics import confusion_matrix, classification_report

# Cross-tabulate the true test labels against the classifier's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [17]:
# Display the confusion matrix built from (y_test, y_pred).
# Per scikit-learn's convention, rows are true classes and columns are predicted classes.
cm

array([[120,  71],
       [ 63, 146]], dtype=int64)

In [18]:
# Per-class precision, recall, F1 and support on the test split.
# print() is used so the multi-line text report renders with its line breaks.
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.66      0.63      0.64       191
          1       0.67      0.70      0.69       209

avg / total       0.66      0.67      0.66       400



In [19]:
# Overall accuracy = correctly classified samples / all samples.
# The diagonal of the confusion matrix holds the correct predictions and the full
# sum is the test-set size, so nothing here depends on a hard-coded sample count.
acc = cm.trace() / cm.sum()
acc*100

66.5