In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import sklearn as sk

In [2]:
# Load the IMDB reviews CSV into a DataFrame and preview the first rows.
df=pd.read_csv('IMDB dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Data Cleaning
1. remove HTML tags
2. convert to lowercase
3. remove special characters
4. remove stop words
5. stem the remaining words

In [3]:


# Encode the sentiment labels as integers: 1 = positive, 0 = negative.
# The mapped column is assigned back to df rather than calling
# replace(..., inplace=True) on the column selection: that chained in-place
# call may act on a temporary object (it does under pandas Copy-on-Write) and
# leave df unchanged.
# Note: any label other than 'positive'/'negative' would become NaN here.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


Remove HTML tags

In [4]:
import re

# Non-greedy pattern matching anything between '<' and the next '>'.
clean = re.compile('<.*?>')

# Try the pattern on a single review (row 2) before applying it everywhere.
clean.sub('', df.iloc[2].review)

'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.This was the most I\'d laughed at one of Woody\'s comedies in years (dare I say a decade?). While I\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her "sexy" image and jumped right into a average, but spirited young woman.This may not be the crown jewel of his career, but it was wittier than "Devil Wears Prada" and more interesting than "Superman" a great comedy to go see with friends.'

In [5]:
def clean_html(text):
    """Return ``text`` with every ``<...>`` span removed.

    The pattern is non-greedy, so each match ends at the first ``>`` that
    follows a ``<``; the matched span is replaced by an empty string.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

In [6]:
# Apply clean_html to every review and store the result back in the column.
df['review']=df['review'].apply(clean_html)

In [7]:
# Preview the first rows after HTML-tag removal.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming tec...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


Converting to lowercase

In [8]:
def convert_lower(text):
    """Return a lowercase copy of the string ``text``."""
    lowered = text.lower()
    return lowered
# Lowercase every review, overwrite the column, then preview the result.
df['review']=df['review'].apply(convert_lower)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


Remove special characters

In [9]:
def remove_special_charecters(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Alphanumeric characters (as decided by ``str.isalnum``) are kept
    unchanged; every other character, including existing whitespace and
    punctuation, becomes a single space. The result therefore has the same
    length as the input.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Build the result with one join instead of growing a string inside a
    # loop, which can degrade to quadratic time on long reviews.
    return ''.join(ch if ch.isalnum() else ' ' for ch in text)
# Replace special characters in every review with spaces, then preview.
df['review'] = df['review'].apply(remove_special_charecters)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tec...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there s a family where a little boy ...,0
4,petter mattei s love in the time of money is...,1


Remove Stopwords

In [10]:
import nltk
from nltk.corpus import stopwords
# stopwords.words('english')
# Lazily built cache of NLTK's English stop-word list (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Split ``text`` on whitespace and drop the stop words.

    Args:
        text: The string to tokenize.
        stop_words: Optional container of words to drop. When omitted,
            NLTK's English stop-word list is used.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    if stop_words is None:
        # Load the stop-word list once and reuse it. Calling
        # stopwords.words('english') for every token re-reads the list and
        # scans it linearly, which dominates the run time on a large corpus.
        stop_words = _english_stopwords()
    return [word for word in text.split() if word not in stop_words]
    

In [11]:
# Replace each review string with the list of tokens returned by remove_stopwords.
df['review']=df['review'].apply(remove_stopwords)

In [12]:
# Preview: each review is now a list of tokens rather than a string.
df.head()

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",1
1,"[wonderful, little, production, filming, techn...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, family, little, boy, jake, thinks,...",0
4,"[petter, mattei, love, time, money, visually, ...",1


Stemming

In [13]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, used by stem_words in the next cell.
ps=PorterStemmer()


In [14]:
def stem_words(text, stemmer=None):
    """Return the stemmed form of every token in ``text``.

    Args:
        text: An iterable of word tokens (one review's token list).
        stemmer: Optional object exposing ``stem(word)``. When omitted, the
            notebook-level Porter stemmer ``ps`` is used.

    Returns:
        A new list holding the stem of each token, in the original order.
    """
    if stemmer is None:
        stemmer = ps
    # Build a fresh local list on every call. A module-level accumulator
    # named ``y`` would be rebound when the label vector is later assigned
    # to ``y``, and this function would then fail if its cell were re-run.
    return [stemmer.stem(word) for word in text]
    

In [15]:
# Stem every token of every review (each cell holds a list of tokens here).
df['review']=df['review'].apply(stem_words)

In [16]:
# Preview the stemmed token lists.
df.head()

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, h...",1
1,"[wonder, littl, product, film, techniqu, unass...",1
2,"[thought, wonder, way, spend, time, hot, summe...",1
3,"[basic, famili, littl, boy, jake, think, zombi...",0
4,"[petter, mattei, love, time, money, visual, st...",1


Joining the text back together

In [17]:
def join_back(list_input):
    """Concatenate the tokens in ``list_input`` into one space-separated string."""
    separator = " "
    return separator.join(list_input)

In [18]:
# Turn each token list back into a single space-separated string, then preview.
df['review']=df['review'].apply(join_back)
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod hook righ...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic famili littl boy jake think zombi closet...,0
4,petter mattei love time money visual stun film...,1


In [19]:
# Take the first column of df as a 2-D array and show its shape.
# NOTE(review): X is reassigned by the CountVectorizer cell below before it is
# used anywhere, so this assignment only serves as a shape check.
X=df.iloc[:,0:1].values
X.shape

(50000, 1)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based text vectorizer limited to max_features=1000 output columns.
cv=CountVectorizer(max_features=1000)

In [25]:
# Fit the vectorizer on all cleaned reviews and materialise the result as an
# array with one row per review and one column per vocabulary term.
# NOTE(review): the vectorizer is fit on the full dataset before the
# train/test split below, so its vocabulary also reflects the test reviews —
# consider fitting on the training portion only.
X=cv.fit_transform(df['review']).toarray()
X.shape

(50000, 1000)

In [26]:
# Target vector: the last column of df (the encoded sentiment) as an array,
# displayed together with its shape.
y=df.iloc[:,-1].values
y,y.shape

(array([1, 1, 1, ..., 0, 0, 0], dtype=int64), (50000,))

Splitting it into train and test set

In [27]:
from sklearn.model_selection import  train_test_split
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle seed.
x_train,x_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)


# Model 

In [31]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# One classifier per Naive Bayes variant, all trained on the same split.
clf1, clf2, clf3 = GaussianNB(), MultinomialNB(), BernoulliNB()

clf1.fit(x_train, y_train)
clf2.fit(x_train, y_train)
clf3.fit(x_train, y_train)



Predictions

In [32]:
# Predict test-set labels with each fitted classifier, in the order
# Gaussian, Multinomial, Bernoulli.
y_pres1, y_pres2, y_pres3 = (
    model.predict(x_test) for model in (clf1, clf2, clf3)
)


Accuracy

In [33]:
from sklearn.metrics import accuracy_score

# Print the test-set accuracy of each classifier, one line per model.
for label, predictions in (
    ("Gaussian classification accuracy", y_pres1),
    ("Mulinomial classification accuracy", y_pres2),
    ("Bernauli classification accuracy", y_pres3),
):
    print(label, accuracy_score(y_test, predictions))

Gaussian classification accuracy 0.7886
Mulinomial classification accuracy 0.8247
Bernauli classification accuracy 0.8314
