In [1]:
import numpy as np
import pandas as pd

In [2]:
import gensim
import os

### Importing the Data

In [3]:
df=pd.read_csv('IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### Text Preprocessing

In [4]:
df['review']=df['review'].str.lower()

In [5]:
import re
def remove_HTML_tag(text):
    pattern=re.compile('<.*?>')
    return pattern.sub(r'',text)
    
df['review']=df['review'].apply(remove_HTML_tag)

In [6]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [7]:
import string
string.punctuation

exclude=string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('','',exclude))

df['review']=df['review'].apply(remove_punc)


In [8]:
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess

In [9]:
story=[]
for doc in df['review']:
    raw_sent=sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))
        

### Applying Word2Vec

In [10]:
from gensim.models import Word2Vec
model=Word2Vec(window=10,min_count=2)

In [11]:
model.build_vocab(story)

In [12]:
model.train(story,total_examples=model.corpus_count,epochs=5)

(41021414, 53829365)

In [13]:
len(model.wv.index_to_key)

80914

In [14]:
def document_vector(doc):
    #remove out-of-vocabulary words
    doc=[word for word in doc.split() if word in model.wv.index_to_key]
    return np.mean(model.wv[doc],axis=0)
    

In [15]:
document_vector(df['review'].values[0])

array([ 1.68668285e-01, -5.09473026e-01,  1.85305495e-02,  1.30628002e+00,
        1.41556174e-01,  3.59363943e-01,  3.19560260e-01, -4.11430776e-01,
       -1.12994527e-02,  8.01704884e-01,  2.03676105e-01,  5.30967176e-01,
        3.17641258e-01, -3.63550261e-02, -3.80973935e-01, -2.97440141e-01,
       -2.86719382e-01,  4.24620539e-01, -3.76477152e-01,  2.75009394e-01,
       -9.53930020e-02, -8.73646885e-02, -3.15744758e-01, -1.67071462e-01,
        8.08409274e-01,  3.30759615e-01, -3.16114694e-01,  8.07944000e-01,
        1.17756221e-02, -4.43372160e-01, -2.40895435e-01,  5.34538388e-01,
       -8.44068289e-01, -2.28390340e-02,  6.02697909e-01, -1.51894882e-01,
        4.07571495e-01,  2.44004384e-01,  3.00955418e-02,  4.84590270e-02,
       -3.50497989e-03, -2.38129959e-01,  3.12909074e-02,  1.02124262e+00,
        3.03163379e-01, -1.29104629e-01,  3.61984521e-01,  6.56693950e-02,
        4.18238282e-01,  3.78251910e-01, -8.79416168e-02, -7.35243678e-01,
       -8.71763468e-01, -

In [16]:
from tqdm import tqdm
X=[]
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|██████████████████████████████████████████████████████████████████████████| 50000/50000 [2:15:34<00:00,  6.15it/s]


In [17]:
X=np.array(X)

In [18]:
X.shape

(50000, 100)

In [19]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

Y=le.fit_transform(df['sentiment'])

In [20]:
Y

array([1, 1, 1, ..., 0, 0, 0])

### Model Building

In [21]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.2,random_state=1)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [24]:
rf=RandomForestClassifier()
rf.fit(x_train,y_train)

In [25]:
y_pred=rf.predict(x_test)
print('Accuracy: ',accuracy_score(y_test,y_pred))

Accuracy:  0.8263


### Taking the Review as Input

In [60]:
review=input()
a=document_vector(review)

 it was a disastrous movie


In [61]:
b=np.expand_dims(a,axis=0)
rf.predict(b)

array([0])

### Creating Pickle File

In [57]:
import pickle

with open('Sentiment_Analysis.pkl', 'wb') as f:
    pickle.dump(rf, f)