In [1]:
#Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

In [2]:
#Load the dataset
IMDB_data = pd.read_csv("IMDB Dataset.csv")
IMDB_data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


#### Data Analysis

In [3]:
#Check for null data
IMDB_data.isnull().any()

review       False
sentiment    False
dtype: bool

In [4]:
np.unique(IMDB_data['sentiment'],return_counts=True)

(array(['negative', 'positive'], dtype=object),
 array([25000, 25000], dtype=int64))

#### Data Preprocessing

In [5]:
vectorizer = TfidfVectorizer(strip_accents='ascii',lowercase=True,stop_words = stopwords.words('english'))

In [6]:
X = vectorizer.fit_transform(IMDB_data['review'])

In [7]:
print(np.array(X))

  (0, 81735)	0.04692247277576888
  (0, 22625)	0.07419429343840393
  (0, 91779)	0.057538817730462544
  (0, 90143)	0.06205586219391211
  (0, 96993)	0.05233732786853565
  (0, 94123)	0.07138248027809192
  (0, 18632)	0.07312701519388139
  (0, 8982)	0.04515458812637841
  (0, 56534)	0.03699927796746985
  (0, 31346)	0.04818419434808489
  (0, 82436)	0.06481667768152764
  (0, 86423)	0.055018425747524996
  (0, 50949)	0.04874153884178886
  (0, 10378)	0.10069636079727004
  (0, 93290)	0.04970717793322766
  (0, 17537)	0.05179036899122877
  (0, 58076)	0.04982556982178736
  (0, 55518)	0.08431980729476911
  (0, 98647)	0.025150235896227887
  (0, 37010)	0.052903606204067866
  (0, 64469)	0.04986530831894351
  (0, 49632)	0.04803063502804875
  (0, 45652)	0.1700402454457881
  (0, 62186)	0.10397926209555218
  (0, 83694)	0.07002382721974176
  :	:
  (49999, 43277)	0.08262134450972523
  (49999, 32082)	0.09642527783262565
  (49999, 85442)	0.09818782505482437
  (49999, 100256)	0.08952743325416027
  (49999, 60248)	0

In [8]:
X.shape

(50000, 101865)

In [9]:
IMDB_data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [10]:
y = IMDB_data['sentiment'].replace(('positive','negative'),(1,0))

In [11]:
np.unique(y,return_counts=True)

(array([0, 1], dtype=int64), array([25000, 25000], dtype=int64))

In [12]:
vectorizer.vocabulary_

{'one': 64112,
 'reviewers': 75520,
 'mentioned': 57555,
 'watching': 98242,
 'oz': 65461,
 'episode': 30092,
 'hooked': 42857,
 'right': 75923,
 'exactly': 30957,
 'happened': 40434,
 'br': 12068,
 'first': 33530,
 'thing': 90394,
 'struck': 86608,
 'brutality': 12998,
 'unflinching': 94646,
 'scenes': 78732,
 'violence': 97171,
 'set': 80305,
 'word': 100126,
 'go': 37791,
 'trust': 92981,
 'show': 81428,
 'faint': 31905,
 'hearted': 41159,
 'timid': 90985,
 'pulls': 71605,
 'punches': 71652,
 'regards': 74110,
 'drugs': 27436,
 'sex': 80403,
 'hardcore': 40496,
 'classic': 17541,
 'use': 95846,
 'called': 14109,
 'nickname': 62205,
 'given': 37474,
 'oswald': 64752,
 'maximum': 56527,
 'security': 79676,
 'state': 85560,
 'penitentary': 67066,
 'focuses': 34255,
 'mainly': 54973,
 'emerald': 29235,
 'city': 17388,
 'experimental': 31354,
 'section': 79655,
 'prison': 70619,
 'cells': 15591,
 'glass': 37549,
 'fronts': 35441,
 'face': 31782,
 'inwards': 46580,
 'privacy': 70635,
 'hi

#### Model Building

In [13]:
#Test Train split

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=2,test_size = 0.3)

In [14]:
#naive bayes classifier

model = naive_bayes.MultinomialNB()
model.fit(X_train,y_train)
result = model.predict(X_test)

In [15]:
result

array([0, 0, 1, ..., 1, 0, 1], dtype=int64)

In [16]:
# test our model's accuracy
roc_auc_score(y_test,result)

0.866667944296319

In [17]:
# how does it work?

movie_reviews_array = np.array(["Batla House is a terrific movie"])

movie_review_vector = vectorizer.transform(movie_reviews_array)


#predicting result
model.predict(movie_review_vector)

array([1], dtype=int64)