# DATA SCIENCE PROJECT - SENTIMENT ANALYSIS NLP

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("IMDB Dataset.csv")

In [3]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
data = data.sample(5000)

In [5]:
data

Unnamed: 0,review,sentiment
9055,Everybody interested in Texas needs to have th...,positive
5054,This is an example of why the majority of acti...,negative
49412,"You see a movie titled 'battlespace', what are...",negative
26189,Ocean's twelve is probably better than Ocean's...,positive
43872,Personal taste rules when it comes to talking ...,positive
...,...,...
20687,Pure schlock from beginning to end. The averag...,negative
42458,Hoppity is a charming if slightly phycadelic a...,positive
5604,"Like Margot in ""Fear of Fear"" falls victim of ...",positive
9389,RIFIFI (Jules Dassin - France 1955)<br /><br /...,positive


In [6]:
data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [7]:
data['sentiment'].replace({'positive':1,'negative':0},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['sentiment'].replace({'positive':1,'negative':0},inplace=True)
  data['sentiment'].replace({'positive':1,'negative':0},inplace=True)


In [8]:
data.head()

Unnamed: 0,review,sentiment
9055,Everybody interested in Texas needs to have th...,1
5054,This is an example of why the majority of acti...,0
49412,"You see a movie titled 'battlespace', what are...",0
26189,Ocean's twelve is probably better than Ocean's...,1
43872,Personal taste rules when it comes to talking ...,1


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 9055 to 27199
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     5000 non-null   object
 1   sentiment  5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 117.2+ KB


## Removing html < > Tags

In [10]:
import re
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean,'',text)

In [11]:
data['review']=data['review'].apply(clean_html)

In [12]:
data

Unnamed: 0,review,sentiment
9055,Everybody interested in Texas needs to have th...,1
5054,This is an example of why the majority of acti...,0
49412,"You see a movie titled 'battlespace', what are...",0
26189,Ocean's twelve is probably better than Ocean's...,1
43872,Personal taste rules when it comes to talking ...,1
...,...,...
20687,Pure schlock from beginning to end. The averag...,0
42458,Hoppity is a charming if slightly phycadelic a...,1
5604,"Like Margot in ""Fear of Fear"" falls victim of ...",1
9389,"RIFIFI (Jules Dassin - France 1955)To me, it s...",1


In [13]:
data['review'].iloc[3]

'Ocean\'s twelve is probably better than Ocean\'s eleven. I know most people would disagree, But I actually liked it more. After three years, it was good seeing the gang return. The reason behind the heist is a bit more inspired the second time around. I see why they stole from Benedict(Andy Garcia) in the last film. This film they have a bit more motivation the second time around. Ocean\'s twelve is more entertaining, and cooler than Ocean\'s eleven. With a funny cameo by Topher Grace saying "I just walked in that new Dennis Quaid movie" and other things. I think Ocean\'s Twelve is probably the best in the series.The Plot: A year or so after Ocean\'s Twelve, Terry Benedict(the guy they robbed in the last film) is back and says that if Ocean\'s eleven doesn\'t pay him back the money they stole, he\'s going to call the cops. So Danny Ocean and the gang go to Europe, where Rusty meets his old cop girlfriend Isabelle. After she meets him again, she begins to follow them around. Also, the 

## Removing Special Character

In [14]:
def remove_special(text):
    x = ''
    for i in text:
        if i.isalnum():
            x = x+i
        else:
            x = x+' '
    return x        

In [15]:
data['review']=data['review'].apply(remove_special)

In [16]:
data['review'].iloc[3]

'Ocean s twelve is probably better than Ocean s eleven  I know most people would disagree  But I actually liked it more  After three years  it was good seeing the gang return  The reason behind the heist is a bit more inspired the second time around  I see why they stole from Benedict Andy Garcia  in the last film  This film they have a bit more motivation the second time around  Ocean s twelve is more entertaining  and cooler than Ocean s eleven  With a funny cameo by Topher Grace saying  I just walked in that new Dennis Quaid movie  and other things  I think Ocean s Twelve is probably the best in the series The Plot  A year or so after Ocean s Twelve  Terry Benedict the guy they robbed in the last film  is back and says that if Ocean s eleven doesn t pay him back the money they stole  he s going to call the cops  So Danny Ocean and the gang go to Europe  where Rusty meets his old cop girlfriend Isabelle  After she meets him again  she begins to follow them around  Also  the gang lear

## Convert To Lowercase

In [17]:
def convert_lower(text):
    return text.lower()

In [18]:
data['review']=data['review'].apply(convert_lower)

In [19]:
data['review'].iloc[3]

'ocean s twelve is probably better than ocean s eleven  i know most people would disagree  but i actually liked it more  after three years  it was good seeing the gang return  the reason behind the heist is a bit more inspired the second time around  i see why they stole from benedict andy garcia  in the last film  this film they have a bit more motivation the second time around  ocean s twelve is more entertaining  and cooler than ocean s eleven  with a funny cameo by topher grace saying  i just walked in that new dennis quaid movie  and other things  i think ocean s twelve is probably the best in the series the plot  a year or so after ocean s twelve  terry benedict the guy they robbed in the last film  is back and says that if ocean s eleven doesn t pay him back the money they stole  he s going to call the cops  so danny ocean and the gang go to europe  where rusty meets his old cop girlfriend isabelle  after she meets him again  she begins to follow them around  also  the gang lear

## Removing Stopwords

In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\balaj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
from nltk.corpus import stopwords

In [22]:
a = stopwords.words('english')
a

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [23]:
def remove_stopwords(text):
    x = []
    for i in text.split():
        if i not in a:
            x.append(i)
        else:
            pass
    return x        

In [24]:
data['review']=data['review'].apply(remove_stopwords)

In [25]:
data['review'].iloc[3]

['ocean',
 'twelve',
 'probably',
 'better',
 'ocean',
 'eleven',
 'know',
 'people',
 'would',
 'disagree',
 'actually',
 'liked',
 'three',
 'years',
 'good',
 'seeing',
 'gang',
 'return',
 'reason',
 'behind',
 'heist',
 'bit',
 'inspired',
 'second',
 'time',
 'around',
 'see',
 'stole',
 'benedict',
 'andy',
 'garcia',
 'last',
 'film',
 'film',
 'bit',
 'motivation',
 'second',
 'time',
 'around',
 'ocean',
 'twelve',
 'entertaining',
 'cooler',
 'ocean',
 'eleven',
 'funny',
 'cameo',
 'topher',
 'grace',
 'saying',
 'walked',
 'new',
 'dennis',
 'quaid',
 'movie',
 'things',
 'think',
 'ocean',
 'twelve',
 'probably',
 'best',
 'series',
 'plot',
 'year',
 'ocean',
 'twelve',
 'terry',
 'benedict',
 'guy',
 'robbed',
 'last',
 'film',
 'back',
 'says',
 'ocean',
 'eleven',
 'pay',
 'back',
 'money',
 'stole',
 'going',
 'call',
 'cops',
 'danny',
 'ocean',
 'gang',
 'go',
 'europe',
 'rusty',
 'meets',
 'old',
 'cop',
 'girlfriend',
 'isabelle',
 'meets',
 'begins',
 'follow',

## Stemming

In [26]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [27]:
def stem_words(text):
    x = []
    for i in text:
        x.append(ps.stem(i))
    return x    

In [28]:
data['review']=data['review'].apply(stem_words)

In [29]:
data['review'].iloc[3]

['ocean',
 'twelv',
 'probabl',
 'better',
 'ocean',
 'eleven',
 'know',
 'peopl',
 'would',
 'disagre',
 'actual',
 'like',
 'three',
 'year',
 'good',
 'see',
 'gang',
 'return',
 'reason',
 'behind',
 'heist',
 'bit',
 'inspir',
 'second',
 'time',
 'around',
 'see',
 'stole',
 'benedict',
 'andi',
 'garcia',
 'last',
 'film',
 'film',
 'bit',
 'motiv',
 'second',
 'time',
 'around',
 'ocean',
 'twelv',
 'entertain',
 'cooler',
 'ocean',
 'eleven',
 'funni',
 'cameo',
 'topher',
 'grace',
 'say',
 'walk',
 'new',
 'denni',
 'quaid',
 'movi',
 'thing',
 'think',
 'ocean',
 'twelv',
 'probabl',
 'best',
 'seri',
 'plot',
 'year',
 'ocean',
 'twelv',
 'terri',
 'benedict',
 'guy',
 'rob',
 'last',
 'film',
 'back',
 'say',
 'ocean',
 'eleven',
 'pay',
 'back',
 'money',
 'stole',
 'go',
 'call',
 'cop',
 'danni',
 'ocean',
 'gang',
 'go',
 'europ',
 'rusti',
 'meet',
 'old',
 'cop',
 'girlfriend',
 'isabel',
 'meet',
 'begin',
 'follow',
 'around',
 'also',
 'gang',
 'learn',
 'enemi',

## Joining

In [30]:
def join(list_input):
    return ' '.join(list_input)

In [31]:
data['review']=data['review'].apply(join)

In [32]:
data['review'].iloc[3]

'ocean twelv probabl better ocean eleven know peopl would disagre actual like three year good see gang return reason behind heist bit inspir second time around see stole benedict andi garcia last film film bit motiv second time around ocean twelv entertain cooler ocean eleven funni cameo topher grace say walk new denni quaid movi thing think ocean twelv probabl best seri plot year ocean twelv terri benedict guy rob last film back say ocean eleven pay back money stole go call cop danni ocean gang go europ rusti meet old cop girlfriend isabel meet begin follow around also gang learn enemi also thief littl better mani funni scene like tess julia robert goe europ pretend julia robert ocean twelv pretti clever film cooler funnier entertain ocean eleven'

# APPLYING MACHINE LEARNING ALGOS

In [33]:
X = data['review']
y = data['sentiment']

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
cv = CountVectorizer()

In [36]:
cv.fit_transform(data['review'])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 466793 stored elements and shape (5000, 26820)>

In [37]:
X = cv.fit_transform(data['review']).toarray()

In [38]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5000, 26820))

In [39]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [40]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix

In [41]:
nb = GaussianNB()

In [42]:
nb.fit(X_train,y_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [43]:
y_pred = nb.predict(X_test)
y_pred

array([0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,

In [44]:
y_test

15633    1
31318    1
27691    0
7527     1
40659    1
        ..
9501     0
909      1
48474    1
23375    0
25805    1
Name: sentiment, Length: 1000, dtype: int64

In [45]:
cm =confusion_matrix(y_test,y_pred)
cm 

array([[348, 143],
       [241, 268]])