In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('IMDB Dataset.csv')

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

# Text Cleaning
1. Sample 10000 rows
2. Remove html tage
3. Remove special characters
4. Converting every thing to lower case
5. Removing stop words
6. Stemming

In [5]:
df = data.sample(10000)

In [6]:
df.shape

(10000, 2)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 1948 to 5503
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [8]:
df['sentiment'].replace({'positive':1, 'negative': 0}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sentiment'].replace({'positive':1, 'negative': 0}, inplace=True)
  df['sentiment'].replace({'positive':1, 'negative': 0}, inplace=True)


In [9]:
df.head()

Unnamed: 0,review,sentiment
1948,The small California town of Diablo is plagued...,0
10930,SPOILERS (ALTHOUGH NONE THAT AREN'T REVEALED I...,0
25268,"This is the worst film I have ever seen, so ba...",0
36190,Eddie Murphy spends his time looking for lost ...,1
26739,"Of all the film noirs of the 1940s and 1950s, ...",1


In [10]:
import re
clean = re.compile('<.*?>')
re.sub(clean, '', df.iloc[2].review)

'This is the worst film I have ever seen, so bad it is astonishing. I am glad that I have never seen that black sidekick in any other film: OK, it wasn\'t his fault that someone gave him those lines, but he could have refused the role, and tried to learn how to act instead. How did anyone get the money to put this film together. Is there some corporation in Hollywood that deals with trash for male college students with no brain? "Oh yeah, they will love this one: it\'s got no believable plot, some kungfu movements, Chuck Norris, a black sidekick with bad corny lines, a sweet little Israeli (or is he an Arab, or does anyone care?) boy pickpocket, and the devil." Brilliant, and many thanks to all concerned for enriching the human race.'

In [11]:
def clean_html(text):
  clean = re.compile('<.*?>')
  return re.sub(clean, '', text)

In [12]:
df['review'] = df['review'].apply(clean_html)

In [13]:
# Converting everything to lower
def convert_lower(text):
  return text.lower()

In [14]:
df['review'] = df['review'].apply(convert_lower)

In [15]:
# function to remove special characters
def remove_special(text):
  x=''

  for i in text:
    if i.isalnum():
      x = x + i
    else:
      x = x + ' '
  return x

In [16]:
remove_special(' th%e @ classic use of the word.')

' th e   classic use of the word '

In [17]:
df['review'] = df['review'].apply(remove_special)

In [18]:
# Remove the stop words
import nltk

In [23]:
from nltk.corpus import stopwords

In [20]:
df

Unnamed: 0,review,sentiment
1948,the small california town of diablo is plagued...,0
10930,spoilers although none that aren t revealed i...,0
25268,this is the worst film i have ever seen so ba...,0
36190,eddie murphy spends his time looking for lost ...,1
26739,of all the film noirs of the 1940s and 1950s ...,1
...,...,...
9235,this movie is a real waste of time and effort ...,0
36489,richard brooks excellent 1967 film of truman c...,1
14402,harmony korrine hate him or hate him on thi...,0
32443,it has past almost 25 years since i saw this m...,1


In [22]:
pip install nltk



In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [26]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [27]:
len(stopwords.words('english'))

198

In [28]:
def remove_stopwords(text):
  x=[]
  for i in text.split():
    if i not in stopwords.words('english'):
      x.append(i)
  y = x[:]
  x.clear()
  return y

In [29]:
df['review'] = df['review'].apply(remove_stopwords)

In [30]:
df

Unnamed: 0,review,sentiment
1948,"[small, california, town, diablo, plagued, mys...",0
10930,"[spoilers, although, none, revealed, first, tw...",0
25268,"[worst, film, ever, seen, bad, astonishing, gl...",0
36190,"[eddie, murphy, spends, time, looking, lost, c...",1
26739,"[film, noirs, 1940s, 1950s, rank, one, strange...",1
...,...,...
9235,"[movie, real, waste, time, effort, film, lacks...",0
36489,"[richard, brooks, excellent, 1967, film, truma...",1
14402,"[harmony, korrine, hate, hate, evidence, loath...",0
32443,"[past, almost, 25, years, since, saw, movie, w...",1


In [31]:
# Perform Stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [32]:
y = []
def stem_words(text):
  for i in text:
    y.append(ps.stem(i))
  z = y[:]
  y.clear()
  return z

In [33]:
stem_words(['I','loved','it'])

['i', 'love', 'it']

In [34]:
df['review'] = df['review'].apply(stem_words)

In [35]:
# Join back
def join_back(list_input):
  return ' '.join(list_input)

In [36]:
df['review'] = df['review'].apply(join_back)

In [37]:
df['review']

Unnamed: 0,review
1948,small california town diablo plagu mysteri dea...
10930,spoiler although none reveal first two minut m...
25268,worst film ever seen bad astonish glad never s...
36190,eddi murphi spend time look lost children spec...
26739,film noir 1940 1950 rank one strangest fun wat...
...,...
9235,movi real wast time effort film lack plot dept...
36489,richard brook excel 1967 film truman capot nov...
14402,harmoni korrin hate hate evid loath might bett...
32443,past almost 25 year sinc saw movi would consid...


In [38]:
X = df.iloc[:,0:1].values

In [39]:
X.shape

(10000, 1)

In [81]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500)

In [82]:
X = cv.fit_transform(df['review']).toarray()

In [83]:
X.shape

(10000, 500)

In [84]:
y = df.iloc[:,-1].values

In [85]:
y.shape

(10000,)

In [86]:
# Training and Testing Sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [87]:
X_train.shape

(8000, 500)

In [88]:
X_test.shape

(2000, 500)

In [89]:
y_train.shape

(8000,)

In [90]:
y_test.shape

(2000,)

In [91]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

In [92]:
clf1 = GaussianNB()
clf2 = MultinomialNB()
clf3 = BernoulliNB()

In [93]:
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)

In [94]:
y_pred1 = clf1.predict(X_test)
y_pred2 = clf2.predict(X_test)
y_pred3 = clf3.predict(X_test)

In [95]:
y_test.shape

(2000,)

In [96]:
from sklearn.metrics import accuracy_score

In [97]:
print('Gaussian: ', accuracy_score(y_test, y_pred1))
print('Multinomial: ', accuracy_score(y_test, y_pred2))
print('Bernoulli: ', accuracy_score(y_test, y_pred3))

Gaussian:  0.796
Multinomial:  0.8385
Bernoulli:  0.8335
