In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [7]:
df = pd.read_csv('/IMDB Dataset.csv')

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
df['review'] = df['review'].str.lower()

In [10]:
#removing html tags and https links 
import re
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext

In [11]:
def remove_URL(text):
    return re.sub(r"http\S+", "", text)

In [12]:
df['review'] = df['review'].apply(cleanhtml)

In [13]:
df['review'] = df['review'].apply(remove_URL)

In [14]:
#removing punctuation 
import string 
exclude = string.punctuation 

In [17]:
def remove_punct(text):
  return text.translate(str.maketrans('','',exclude))

In [18]:
df['review'] = df['review'].apply(remove_punct)

In [19]:
df['review'][0]

'one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with methe first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the wordit is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far awayi would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty pictur

In [33]:
#removing stopwords
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
def remove_stopwords(text):
    new = []
    for t in text.split():
        if t in sw:
            new.append('')
        else:
            new.append(t)
    return " ".join(new)
            

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
remove_stopwords(df['review'][0])

'one    reviewers  mentioned   watching  1 oz episode youll  hooked   right    exactly  happened  methe first thing  struck   oz   brutality  unflinching scenes  violence  set  right   word go trust      show   faint hearted  timid  show pulls  punches  regards  drugs sex  violence   hardcore   classic use   wordit  called oz     nickname given   oswald maximum security state penitentary  focuses mainly  emerald city  experimental section   prison    cells  glass fronts  face inwards  privacy   high   agenda em city  home  manyaryans muslims gangstas latinos christians italians irish  moreso scuffles death stares dodgy dealings  shady agreements  never far awayi would say  main appeal   show  due   fact   goes   shows wouldnt dare forget pretty pictures painted  mainstream audiences forget charm forget romanceoz doesnt mess around  first episode  ever saw struck    nasty   surreal  couldnt say   ready      watched   developed  taste  oz  got accustomed   high levels  graphic violence  

In [34]:
df['review'] = df['review'].apply(remove_stopwords)

In [35]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz e...,positive
1,wonderful little production filming techniqu...,positive
2,thought wonderful way spend time hot s...,positive
3,basically theres family little boy jake thi...,negative
4,petter matteis love time money visually s...,positive


In [40]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [41]:
def n_w(text):
  return len(word_tokenize(text))

In [42]:
n_w(df['review'][0])

168

In [45]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz e...,positive
1,wonderful little production filming techniqu...,positive
2,thought wonderful way spend time hot s...,positive
3,basically theres family little boy jake thi...,negative
4,petter matteis love time money visually s...,positive


In [46]:
df['no_of_words'] = df['review'].apply(n_w)

In [47]:
df.describe()

Unnamed: 0,no_of_words
count,50000.0
mean,119.9303
std,90.161935
min,3.0
25%,64.0
50%,89.0
75%,146.0
max,1429.0


In [48]:
sent_length = 1429

In [49]:
from sklearn.preprocessing import LabelEncoder

In [50]:
lb = LabelEncoder()

In [52]:
df['sentiment'] = lb.fit_transform(df['sentiment'])

In [53]:
df.head()

Unnamed: 0,review,sentiment,no_of_words
0,one reviewers mentioned watching 1 oz e...,1,168
1,wonderful little production filming techniqu...,1,84
2,thought wonderful way spend time hot s...,1,86
3,basically theres family little boy jake thi...,0,67
4,petter matteis love time money visually s...,1,125


In [54]:
#converting the text to numbers 

In [55]:
voc_size = 50000

In [56]:
from tensorflow.keras.preprocessing.text import one_hot

In [57]:
one_hot_rep = [one_hot(word,voc_size) for word in df['review']]

In [60]:
len(one_hot_rep)

50000

In [62]:
len(one_hot_rep[0]) #since first review contains 168 words the one_hot_rep[0] also contains 168

168

In [63]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [64]:
encoded_rep = pad_sequences(one_hot_rep,padding='pre',maxlen=sent_length)

In [67]:
len(encoded_rep[0])

1429

In [69]:
from tensorflow.keras.layers import Embedding

In [70]:
from tensorflow.keras.layers import LSTM 
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [71]:
model = Sequential()

In [73]:
model.add(Embedding(voc_size,300,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [74]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1429, 300)         15000000  
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
 embedding_1 (Embedding)     (None, 1, 300)            15000000  
                                                                 
 lstm_1 (LSTM)               (None, 100)               160400    
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 30,321,002
Trainable params: 30,321,002
No

In [75]:
X = np.array(encoded_rep)

In [76]:
Y = np.array(df['sentiment'])

In [77]:
Y

array([1, 1, 1, ..., 0, 0, 0])

In [78]:
X

array([[    0,     0,     0, ..., 45233, 45744, 42636],
       [    0,     0,     0, ..., 42202, 48377,  5522],
       [    0,     0,     0, ..., 39101,  9454, 29525],
       ...,
       [    0,     0,     0, ..., 26027, 47686, 23847],
       [    0,     0,     0, ..., 17119, 22262, 44242],
       [    0,     0,     0, ..., 34239,  6871, 41167]], dtype=int32)

In [79]:
from sklearn.model_selection import train_test_split

In [80]:
X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.3)

In [82]:
model.fit(X_train , Y_train , validation_data = (X_test,Y_test) , epochs=10 , batch_size=64)

Epoch 1/10
  5/547 [..............................] - ETA: 10:46 - loss: 0.6932 - accuracy: 0.4625

KeyboardInterrupt: ignored