In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the IMDB reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('IMDB Dataset.csv')

In [9]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Text Preprocessing
The raw reviews contain noise that needs to be cleaned up, such as `<br />` tags, punctuation, and special characters.

In [10]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

## Text Cleaning
1. Sample 10000 rows
2. Remove HTML tags
3. Remove special characters
4. Convert everything to lowercase
5. Remove stop words
6. Stemming

In [11]:
# Work on a 10,000-row random subset to keep preprocessing and training fast.
# random_state pins the draw so a fresh "Restart & Run All" selects the same rows
# and reproduces the outputs shown in the cells that follow.
df = df.sample(10000, random_state=42)

In [12]:
df.shape

(10000, 2)

In [13]:
df.info()  

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 43844 to 8447
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [14]:
# function to remove html tags
import re
def clean_html(text):
    """Return ``text`` with every ``<...>`` tag replaced by a single space.

    The match is non-greedy, so each tag is replaced separately and the text
    between two tags is kept.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', text)

In [15]:
# Replace HTML tags with spaces in every review (element-wise via Series.apply).
df['review'] = df['review'].apply(clean_html)

In [16]:
# converting everything to lower
def convert_lower(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

In [17]:
# Lower-case every review so that tokens differing only in case are treated alike.
df['review'] = df['review'].apply(convert_lower)

In [18]:
# function to remove special characters

def remove_special(text):
    """Replace every non-alphanumeric character in ``text`` with a space.

    Alphanumeric characters (as judged by ``str.isalnum``) are kept unchanged;
    anything else -- punctuation, whitespace, symbols -- becomes a single space.

    The returned string always starts with one extra leading space. That matches
    the historical output of this function and is kept for compatibility.

    Args:
        text: The string to clean.

    Returns:
        A string of length ``len(text) + 1``.
    """
    # Build the result with one join instead of per-character concatenation,
    # which re-copies the growing string on every iteration.
    return ' ' + ''.join(ch if ch.isalnum() else ' ' for ch in text)

In [19]:
# Blank out punctuation and other non-alphanumeric characters in every review.
df['review'] = df['review'].apply(remove_special)

In [20]:
# Remove the stop words

# nltk = natural language tool kit
import nltk
from nltk.corpus import stopwords

In [21]:
df

Unnamed: 0,review,sentiment
43844,where to start i feel violated ...,negative
35799,this film was done in really poor taste the ...,negative
27807,airplane apart i don t think i ve ever laugh...,positive
37572,a bad one oh my this is one of the movie...,negative
9775,there are no spoilers for this film as nothin...,negative
...,...,...
26357,this low budget film about a writer who goes ...,negative
41414,i m 60 years old a guitarist lead rhythm ...,positive
24200,carnosaur 3 is bad awfully bad bad to the...,negative
11637,dominion tank police is an exercise in contra...,positive


In [22]:
! pip3 install certifi



In [23]:
import ssl
import nltk
import certifi

# Build an SSL context whose trusted CA list is certifi's bundle
# (certifi.where() supplies the bundle's file path).
ssl_context = ssl.create_default_context(cafile=certifi.where())

# Monkey-patch ssl's private default-HTTPS-context factory so it hands back the
# context above. NOTE(review): presumably a workaround for certificate
# verification failures during the NLTK download -- confirm it is still needed.
ssl._create_default_https_context = lambda: ssl_context

# Fetch the NLTK stop-word corpus (NLTK reports it as up to date when already present).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/apoorvmittal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
_ENGLISH_STOPWORDS = None  # lazily built cache of NLTK's English stop-word list


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Split ``text`` on whitespace and drop every token that is a stop word.

    Args:
        text: The string to tokenize and filter.
        stop_words: Optional iterable of words to remove. When omitted, NLTK's
            English stop-word list is used; it is loaded once and cached rather
            than being re-read for every token.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        # Normalize to a set so each membership test is a hash lookup.
        stop_words = frozenset(stop_words)
    return [token for token in text.split() if token not in stop_words]

In [25]:
# Tokenize each review and drop stop words; from here on the 'review' column
# holds lists of tokens instead of strings.
df['review'] = df['review'].apply(remove_stopwords)

In [26]:
df.head()

Unnamed: 0,review,sentiment
43844,"[start, feel, violated, thats, right, violated...",negative
35799,"[film, done, really, poor, taste, script, real...",negative
27807,"[airplane, apart, think, ever, laughed, film, ...",positive
37572,"[bad, one, oh, one, movies, even, one, positiv...",negative
9775,"[spoilers, film, nothing, could, written, coul...",negative


In [27]:
# perform stemming
from nltk.stem.porter import PorterStemmer
# One stemmer instance, reused by stem_words for every token.
ps = PorterStemmer()

In [28]:
def stem_words(text, stemmer=None):
    """Stem every token in ``text`` and return the stems as a new list.

    The result is built in a local list. The previous version accumulated into
    a module-level scratch list, so an exception part-way through one review
    left tokens behind that leaked into the next call.

    Args:
        text: Iterable of tokens (the 'review' column holds token lists at this
            point in the pipeline).
        stemmer: Optional object with a ``stem(word)`` method. Defaults to the
            module-level ``ps`` stemmer.

    Returns:
        A list with one stem per input token, in the same order.
    """
    if stemmer is None:
        stemmer = ps  # looked up at call time, so the cell defining ``ps`` must have run
    return [stemmer.stem(token) for token in text]

In [29]:
# Reduce every token in each review's token list to its stem.
df['review'] = df['review'].apply(stem_words)

In [31]:
# join back
def join_back(list_input):
    """Concatenate the tokens in ``list_input`` into one space-separated string."""
    separator = " "
    return separator.join(list_input)

In [32]:
# Turn each token list back into a single string, the form passed to the vectorizer below.
df['review'] = df['review'].apply(join_back)

In [33]:
df

Unnamed: 0,review,sentiment
43844,start feel violat that right violat spent 1 5h...,negative
35799,film done realli poor tast script realli bad f...,negative
27807,airplan apart think ever laugh film much life ...,positive
37572,bad one oh one movi even one posit effect ever...,negative
9775,spoiler film noth could written could make wor...,negative
...,...,...
26357,low budget film writer goe work london casino ...,negative
41414,60 year old guitarist lead rhythm last forti y...,positive
24200,carnosaur 3 bad aw bad bad point funni matter ...,negative
11637,dominion tank polic exercis contradict film ma...,positive


In [47]:
# Positional split: the first column becomes a one-column feature frame and the
# last column becomes the label array.
# NOTE(review): this X is only inspected via X.shape; it is overwritten by the
# CountVectorizer output a few cells below before any model sees it.
X = df.iloc[:,0:1]
Y= df.iloc[:,-1].values

In [48]:
X.shape

(10000, 1)

In [49]:
Y.shape

(10000,)

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
# Count-based text vectorizer created with the library defaults (no arguments overridden).
cv = CountVectorizer()

In [52]:
# Fit the vectorizer on all reviews and build the document-term count matrix.
# .toarray() converts the result to a dense array (shape shown below:
# 10000 x 35687), which is memory-hungry.
# NOTE(review): the vocabulary is fitted before the train/test split, so test
# reviews contribute to it -- confirm this is acceptable for the evaluation.
X = cv.fit_transform(df['review']).toarray()

In [53]:
X.shape

(10000, 35687)

In [55]:
# X,Y
# training set
# testing set

In [56]:
from sklearn.model_selection import train_test_split

In [74]:
# Hold out 20% of the rows for evaluation. random_state fixes the shuffle so the
# split, and the accuracy figures computed from it, are repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [75]:
x_train.shape

(8000, 35687)

In [76]:
y_train.shape

(8000,)

In [77]:
x_test.shape

(2000, 35687)

In [78]:
y_test.shape

(2000,)

In [71]:
from sklearn.naive_bayes import GaussianNB , MultinomialNB 

In [80]:
# Two Naive Bayes variants, both with default settings, trained on the same
# features so their accuracies can be compared directly.
clf1 = GaussianNB()
clf2 = MultinomialNB()

In [81]:
# Fit both classifiers on the same training split.
clf1.fit(x_train , y_train)
clf2.fit(x_train , y_train)

In [82]:
# Predict labels for the held-out reviews with each model
# (y_pred1: GaussianNB, y_pred2: MultinomialNB).
y_pred1 = clf1.predict(x_test)
y_pred2 = clf2.predict(x_test)

In [83]:
from sklearn.metrics import accuracy_score

In [84]:
accuracy_score(y_test , y_pred1)

0.6325

In [85]:
accuracy_score(y_test , y_pred2)

0.8505