## Import required Libraries

In [27]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load Dataset

In [2]:
df = pd.read_csv(r"IMDB Dataset.csv")
df.head(2)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive


## Split Data in train and test

In [3]:
x_train,x_test,y_train,y_test = train_test_split(df.review,df.sentiment,test_size=0.25,random_state=42,stratify=df.sentiment)

## Tokenization

In [4]:
def tokenization(data):
    tokens = word_tokenize(data)
    return tokens
x_train_tokens = x_train.apply(tokenization)
x_test_tokens  = x_test.apply(tokenization)

In [5]:
x_test_tokens

46506    [do, n't, see, this, ., this, was, one, of, th...
36513    [This, movie, could, have, been, great, ., It,...
12376    [I, sort, of, liked, this, movie, ,, not, a, g...
21915    [This, movie, could, have, been, so, much, bet...
30272    [It, seems, that, it, is, becoming, fashionabl...
                               ...                        
28091    [This, Documentary, (, Now, available, free, o...
5474     [Poor, Basil, Rathbone, ,, an, egotistical, co...
43110    [In, April, of, 1965, ,, CBS, broadcast, the, ...
5125     [Having, seen, other, Bollywood, flicks, with,...
42868    [Brought, to, you, by, the, following, among, ...
Name: review, Length: 12500, dtype: object

## Data cleaning

In [6]:
def cleaning(data):
    clean_text = [i for i in data if (i not in punctuation) and (i.isalpha()) and (len(i)>1)]
    return clean_text

x_train_clean = x_train_tokens.apply(cleaning)
x_test_clean  = x_test_tokens.apply(cleaning)

In [7]:
x_test_clean

46506    [do, see, this, this, was, one, of, the, dumbe...
36513    [This, movie, could, have, been, great, It, is...
12376    [sort, of, liked, this, movie, not, good, one,...
21915    [This, movie, could, have, been, so, much, bet...
30272    [It, seems, that, it, is, becoming, fashionabl...
                               ...                        
28091    [This, Documentary, Now, available, free, on, ...
5474     [Poor, Basil, Rathbone, an, egotistical, compo...
43110    [In, April, of, CBS, broadcast, the, first, of...
5125     [Having, seen, other, Bollywood, flicks, with,...
42868    [Brought, to, you, by, the, following, among, ...
Name: review, Length: 12500, dtype: object

## Convert data to lowercase

In [8]:
def text_normal(data) : 
    text = [i.lower() for i in data ]
    return text 

x_train_lower = x_train_clean.apply(text_normal)
x_test_lower  = x_test_clean.apply(text_normal)

In [9]:
x_train_lower

17949    [just, saw, adam, had, four, sons, for, the, f...
5786     [for, one, have, shamelessly, enjoyed, every, ...
42175    [this, movie, is, journey, through, the, mind,...
39484    [this, was, absolutely, one, of, the, best, mo...
34209    [oh, geez, there, are, so, many, other, films,...
                               ...                        
1950     [besides, the, fact, that, my, list, of, favor...
22917    [the, first, and, only, time, saw, shades, was...
47481    [this, was, such, waste, of, time, danger, if,...
35597    [this, is, by, far, the, most, pathetic, movie...
27491    [this, movie, forever, left, an, impression, o...
Name: review, Length: 37500, dtype: object

## Stopwords removal

In [10]:
stop = stopwords.words('english')
def Stopword_Removal(data):
    text = [ i for i in data if i not in stop]
    return text 
x_train_withoutstop = x_train_lower.apply(Stopword_Removal)
x_test_withoutstop  = x_test_lower.apply(Stopword_Removal)

In [11]:
x_train_withoutstop

17949    [saw, adam, four, sons, first, time, thing, st...
5786     [one, shamelessly, enjoyed, every, episode, pu...
42175    [movie, journey, mind, screenwriter, caught, p...
39484    [absolutely, one, best, movies, seen, br, br, ...
34209    [oh, geez, many, films, want, see, got, stuck,...
                               ...                        
1950     [besides, fact, list, favorite, movie, makers,...
22917    [first, time, saw, shades, sneakpreview, even,...
47481    [waste, time, danger, watch, tempted, tear, dv...
35597    [far, pathetic, movie, indian, cinema, cinema,...
27491    [movie, forever, left, impression, watched, fr...
Name: review, Length: 37500, dtype: object

## Lemmatization 

In [12]:
def lemmatization(data):
    lemma = WordNetLemmatizer()
    l1 = []
    for i in data :
        text = lemma.lemmatize(i)
        l1.append(text)
    return l1

x_train_lemma = x_train_withoutstop.apply(lemmatization)
x_test_lemma  = x_test_withoutstop.apply(lemmatization)

In [13]:
x_train_lemma

17949    [saw, adam, four, son, first, time, thing, str...
5786     [one, shamelessly, enjoyed, every, episode, pu...
42175    [movie, journey, mind, screenwriter, caught, p...
39484    [absolutely, one, best, movie, seen, br, br, e...
34209    [oh, geez, many, film, want, see, got, stuck, ...
                               ...                        
1950     [besides, fact, list, favorite, movie, maker, ...
22917    [first, time, saw, shade, sneakpreview, even, ...
47481    [waste, time, danger, watch, tempted, tear, dv...
35597    [far, pathetic, movie, indian, cinema, cinema,...
27491    [movie, forever, left, impression, watched, fr...
Name: review, Length: 37500, dtype: object

## Concatenating cleaned data

In [14]:
def join_list(data):
    text = " ".join(data)
    return text
x_train_final = x_train_lemma.apply(join_list)
x_test_final  = x_test_lemma.apply(join_list)

In [15]:
x_train_final

17949    saw adam four son first time thing struck beli...
5786     one shamelessly enjoyed every episode pushing ...
42175    movie journey mind screenwriter caught paradox...
39484    absolutely one best movie seen br br excellent...
34209    oh geez many film want see got stuck nephew we...
                               ...                        
1950     besides fact list favorite movie maker stanley...
22917    first time saw shade sneakpreview even premier...
47481    waste time danger watch tempted tear dvd wall ...
35597    far pathetic movie indian cinema cinema come t...
27491    movie forever left impression watched freshman...
Name: review, Length: 37500, dtype: object

## Text to Number 

In [16]:
cv = CountVectorizer(lowercase=True,stop_words='english',max_df=0.95,max_features=1000)
count_train = cv.fit_transform(x_train_final)
count_test = cv.transform(x_test_final)

In [17]:
count_train

<37500x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 1671413 stored elements in Compressed Sparse Row format>

In [18]:
count_test

<12500x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 559184 stored elements in Compressed Sparse Row format>

In [19]:
df1 = pd.DataFrame(count_train.A,columns = cv.get_feature_names())
df1.head(10)



Unnamed: 0,ability,able,absolutely,accent,act,acted,acting,action,actor,actress,...,written,wrong,wrote,yeah,year,yes,york,young,younger,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model Building Using Naive Bayes algoritham

In [20]:
mnb = MultinomialNB()
mnb.fit(count_train.A,y_train)

In [23]:
pred_mnb = mnb.predict(count_test.A)
pred_mnb

array(['negative', 'negative', 'negative', ..., 'positive', 'positive',
       'negative'], dtype='<U8')

In [24]:
train_pred_mnb = mnb.predict(count_train.A)
train_pred_mnb

array(['positive', 'positive', 'positive', ..., 'negative', 'negative',
       'negative'], dtype='<U8')

In [25]:
accuracy_score(y_train,train_pred_mnb)

0.8281333333333334

In [28]:
print(classification_report(y_test, pred_mnb))

              precision    recall  f1-score   support

    negative       0.83      0.82      0.83      6250
    positive       0.82      0.83      0.83      6250

    accuracy                           0.83     12500
   macro avg       0.83      0.83      0.83     12500
weighted avg       0.83      0.83      0.83     12500

