In [14]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aryansharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aryansharma/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score

In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('~/Desktop/data/IMDB.csv')

In [3]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.sentiment.replace('positive',1,inplace=True)
df.sentiment.replace('negative',0,inplace=True)

# Steps to clean reviews

1.Remove HTML tags
2.Remove special characters
3.Convert everything to lowercase
4.Remove stopwords
5.Stemming

Removing HTML tags

In [6]:
import re
def clean_text(text):
    cleaned = re.compile(r'<.*?>')
    return re.sub(cleaned,'',text)
df.review=df.review.apply(clean_text)


Removing Special characters

In [7]:
def is_special(text):
    rem=' '
    for i in text:
        if i.isalnum():
            rem=rem+i
        else:
            rem=rem+' '
    return rem
df.review=df.review.apply(is_special)


In [10]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that...,1
1,a wonderful little production the filming te...,1
2,i thought this was a wonderful way to spend t...,1
3,basically there s a family where a little boy...,0
4,petter mattei s love in the time of money i...,1


Convert reviews to lowercase

In [9]:
def to_lower(text):
    return text.lower()
df.review=df.review.apply(to_lower)

Removing Stopwords

In [15]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def rem_stopwords(text):
    stop_words=set(stopwords.words('english'))
    token=word_tokenize(text)
    return [w for w in token if w not in stop_words]
df.review=df.review.apply(rem_stopwords)
df.review[0]

['one',
 'reviewers',
 'mentioned',
 'watching',
 '1',
 'oz',
 'episode',
 'hooked',
 'right',
 'exactly',
 'happened',
 'first',
 'thing',
 'struck',
 'oz',
 'brutality',
 'unflinching',
 'scenes',
 'violence',
 'set',
 'right',
 'word',
 'go',
 'trust',
 'show',
 'faint',
 'hearted',
 'timid',
 'show',
 'pulls',
 'punches',
 'regards',
 'drugs',
 'sex',
 'violence',
 'hardcore',
 'classic',
 'use',
 'word',
 'called',
 'oz',
 'nickname',
 'given',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'focuses',
 'mainly',
 'emerald',
 'city',
 'experimental',
 'section',
 'prison',
 'cells',
 'glass',
 'fronts',
 'face',
 'inwards',
 'privacy',
 'high',
 'agenda',
 'em',
 'city',
 'home',
 'many',
 'aryans',
 'muslims',
 'gangstas',
 'latinos',
 'christians',
 'italians',
 'irish',
 'scuffles',
 'death',
 'stares',
 'dodgy',
 'dealings',
 'shady',
 'agreements',
 'never',
 'far',
 'away',
 'would',
 'say',
 'main',
 'appeal',
 'show',
 'due',
 'fact',
 'goes',
 'shows',
 'da

Stemming

In [16]:
from nltk.stem import SnowballStemmer
def stemm(text):
    ss=SnowballStemmer('english')
    return " ".join(ss.stem(w) for w in text)
df.review=df.review.apply(stemm)    

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [25]:
x = np.array(df.iloc[:,0].values)
y = np.array(df.sentiment.values)
x.shape,y.shape

((50000,), (50000,))

In [27]:
cv=CountVectorizer(max_features=1000)
x=cv.fit_transform(x).toarray()
x.shape

(50000, 1000)

In [28]:
trainx,testx,trainy,testy = train_test_split(x,y,test_size=0.2,random_state=9)
trainx.shape,trainy.shape,testx.shape,testy.shape

((40000, 1000), (40000,), (10000, 1000), (10000,))

In [29]:
gnb,mnb,bnb = GaussianNB(),MultinomialNB(alpha=1.0,fit_prior=True),BernoulliNB(alpha=1.0,fit_prior=True)
gnb.fit(trainx,trainy)
mnb.fit(trainx,trainy)
bnb.fit(trainx,trainy)

BernoulliNB()

In [30]:
ypg = gnb.predict(testx)
ypm = mnb.predict(testx)
ypb = bnb.predict(testx)

print("Gaussian = ",accuracy_score(testy,ypg))
print("Multinomial = ",accuracy_score(testy,ypm))
print("Bernoulli = ",accuracy_score(testy,ypb))

Gaussian =  0.7843
Multinomial =  0.831
Bernoulli =  0.8386
