In [1]:
import os
from nltk.corpus import stopwords
from nltk import word_tokenize
from string import punctuation
import random

In [2]:
stop_words = stopwords.words('english')

In [3]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv("IMDB Dataset.csv")

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
negative_documents = []
max_len_negative = 0
positive_documents = []
max_len_positive = 0
for ind, row in df.iterrows():
    if row['sentiment']=="negative":
        text = row['review']
        tokens = word_tokenize(text)
        translator = str.maketrans('', '', punctuation)
        tokens = [w.translate(translator) for w in tokens]
        tokens = [w for w in tokens if not w in stop_words]
        if len(tokens) > max_len_negative:
            max_len_negative = len(tokens)
        negative_documents.append(' '.join(tokens))
    else:
        text = row['review']
        tokens = word_tokenize(text)
        translator = str.maketrans('', '', punctuation)
        tokens = [w.translate(translator) for w in tokens]
        tokens = [w for w in tokens if not w in stop_words]
        if len(tokens) > max_len_positive:
            max_len_positive = len(tokens)
        positive_documents.append(' '.join(tokens))

In [8]:
len(negative_documents)

25000

In [9]:
len(positive_documents)

25000

In [10]:
max_len_negative

1287

In [11]:
max_len_positive

1824

In [12]:
max_len = max(max_len_negative, max_len_positive)

In [13]:
random.shuffle(negative_documents)
random.shuffle(positive_documents)

In [14]:
X_train = negative_documents[:20000] + positive_documents[:20000]

In [15]:
len(X_train)

40000

In [16]:
y_train = [0 for _ in range(20000)] + [1 for _ in range(20000)]

In [17]:
len(y_train)

40000

In [18]:
X_test = negative_documents[20000:] + positive_documents[20000:]
len(X_test)

10000

In [19]:
y_test = [0 for _ in range(5000)] + [1 for _ in range(5000)]

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Embedding, Conv1D, MaxPool1D, Dropout
from tensorflow.keras.layers import concatenate

In [75]:
max_len

1824

In [76]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [77]:
import pickle
with open('tokenizer.h5', 'wb') as f:
    pickle.dump(tokenizer, f)

In [78]:
tokenizer

<keras.preprocessing.text.Tokenizer at 0x1da2a15deb0>

In [79]:
vocab_len = len(tokenizer.word_index) + 1

In [80]:
vocab_len

123623

In [81]:
encoded = tokenizer.texts_to_sequences(X_train)

In [82]:
encoded[0]

[108,
 5,
 1796,
 15,
 21,
 462,
 2815,
 477,
 9736,
 9737,
 675,
 4072,
 19953,
 79,
 394,
 7236,
 3118,
 4,
 2018,
 1625,
 565,
 336,
 101,
 349,
 772,
 109,
 232,
 3933,
 83,
 4072,
 12574,
 4179,
 599,
 1,
 1,
 1580,
 152,
 1641,
 2,
 202,
 2,
 2313,
 24,
 2177,
 8448,
 316,
 124,
 15,
 11,
 99,
 195,
 67]

In [83]:
padded = pad_sequences(encoded, maxlen=max_len, padding='post')

In [84]:
padded.shape

(40000, 1824)

In [85]:
input1 = Input(shape=(max_len,))
embedding1 = Embedding(vocab_len, 100)(input1)
conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
drop1 = Dropout(0.5)(conv1)
pool1 = MaxPool1D(pool_size=2)(drop1)
flat1 = Flatten()(pool1)
dense1 = Dense(10, activation='relu')(flat1)
output = Dense(1, activation='sigmoid')(dense1)

In [86]:
model = Model(inputs=[input1], outputs=output)

In [87]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [88]:
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1824)]            0         
                                                                 
 embedding_1 (Embedding)     (None, 1824, 100)         12362300  
                                                                 
 conv1d_1 (Conv1D)           (None, 1821, 32)          12832     
                                                                 
 dropout_1 (Dropout)         (None, 1821, 32)          0         
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 910, 32)          0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 29120)             0         
                                                           

In [90]:
encoded_test = tokenizer.texts_to_sequences(X_test)
padded_test = pad_sequences(encoded_test, maxlen=max_len, padding='post')

In [91]:
padded_test.shape

(10000, 1824)

In [92]:
import numpy as np

In [140]:
model.fit([padded], np.array(y_train), epochs=50, batch_size=20, validation_data=([padded_test], np.array(y_test)))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1da2e951940>

In [141]:
model.save('textcnn.h5')

In [142]:
from tensorflow.keras.models import load_model

In [143]:
model = load_model('textcnn.h5')

In [144]:
with open('tokenizer.h5', 'rb') as f:
    tokenizer = pickle.load(f)

In [145]:
tokenizer.word_index

{'br': 1,
 'i': 2,
 'the': 3,
 'movie': 4,
 'film': 5,
 'nt': 6,
 'one': 7,
 'like': 8,
 'it': 9,
 'this': 10,
 'good': 11,
 'would': 12,
 'even': 13,
 'time': 14,
 'really': 15,
 'story': 16,
 'see': 17,
 'well': 18,
 'much': 19,
 'could': 20,
 'get': 21,
 'bad': 22,
 'people': 23,
 'great': 24,
 'also': 25,
 'first': 26,
 'made': 27,
 'way': 28,
 'make': 29,
 'movies': 30,
 'but': 31,
 'characters': 32,
 'think': 33,
 'and': 34,
 'watch': 35,
 'films': 36,
 'character': 37,
 'many': 38,
 'seen': 39,
 'two': 40,
 'never': 41,
 'acting': 42,
 'love': 43,
 'plot': 44,
 'best': 45,
 'show': 46,
 'know': 47,
 'little': 48,
 'a': 49,
 'life': 50,
 'in': 51,
 'there': 52,
 'ever': 53,
 'better': 54,
 'man': 55,
 'end': 56,
 'if': 57,
 'scene': 58,
 'still': 59,
 'say': 60,
 'he': 61,
 'scenes': 62,
 'something': 63,
 'go': 64,
 'back': 65,
 'thing': 66,
 'watching': 67,
 'real': 68,
 'though': 69,
 'actors': 70,
 'years': 71,
 'funny': 72,
 'actually': 73,
 'another': 74,
 'work': 75,
 'mak

In [146]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [165]:
text = df['review'][3]

In [167]:
tokens = word_tokenize(text)
translator = str.maketrans('', '', punctuation)
tokens = [w.translate(translator) for w in tokens]
tokens = [w for w in tokens if not w in stop_words]
text = ' '.join(tokens)

In [168]:
text

'Basically family little boy  Jake  thinks zombie closet  parents fighting time  br    br   This movie slower soap opera  suddenly  Jake decides become Rambo kill zombie  br    br   OK  first going make film must Decide thriller drama  As drama movie watchable  Parents divorcing  arguing like real life  And Jake closet totally ruins film  I expected see BOOGEYMAN similar movie  instead watched drama meaningless thriller spots  br    br   3 10 well playing parents  descent dialogs  As shots Jake  ignore '

In [169]:
text = tokenizer.texts_to_sequences([text])[0]
text

[582,
 142,
 48,
 328,
 3114,
 1135,
 910,
 4244,
 661,
 893,
 14,
 1,
 1,
 10,
 4,
 8370,
 1868,
 1207,
 998,
 3114,
 988,
 335,
 5949,
 414,
 910,
 1,
 1,
 470,
 26,
 81,
 29,
 5,
 120,
 1040,
 676,
 368,
 110,
 368,
 4,
 1666,
 661,
 25388,
 6807,
 8,
 68,
 50,
 34,
 3114,
 4244,
 361,
 3943,
 5,
 2,
 769,
 17,
 9935,
 639,
 4,
 206,
 202,
 368,
 3587,
 676,
 3347,
 1,
 1,
 348,
 200,
 18,
 306,
 661,
 4631,
 3260,
 110,
 539,
 3114,
 2651]

In [170]:
text = pad_sequences([text], maxlen=max_len, padding='post')
text

array([[582, 142,  48, ...,   0,   0,   0]])

In [171]:
text.shape

(1, 1824)

In [172]:
model.predict(text)



array([[1.3217071e-09]], dtype=float32)

In [173]:
pred = model.predict(text)
if float(pred) > 0.5:
    print('Positive')
else:
    print('Negative')

Negative
