In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Bidirectional,Dropout,SpatialDropout1D
from tensorflow.keras.datasets import imdb
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from time import time
import pickle


In [0]:
max_features = 1000
oov_tok = '<oov>'
maxlen = 100

In [6]:
data = pd.read_csv('imdb_reviews.csv',encoding='latin-1')
print('dataset loaded')

dataset loaded


In [0]:
data = data.drop(['Unnamed: 0','type','file'],axis=1)
data.columns = ["review","sentiment"]

In [0]:
data = data[data.sentiment != 'unsup']
data['sentiment'] = data['sentiment'].map({'pos': 1, 'neg': 0})

In [0]:
tokenizer = Tokenizer(num_words = max_features, oov_token = oov_tok, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
tokenizer.fit_on_texts(data['review'])
list_tokenized_train = tokenizer.texts_to_sequences(data['review'])
num_words = len(tokenizer.word_index) + 1

In [0]:
train_x = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
train_y = data['sentiment']

In [0]:
train_y = to_categorical(train_y)

In [12]:
model = Sequential()
model.add(Embedding(num_words,64))
model.add(SpatialDropout1D(0.4))
model.add(Bidirectional(LSTM(64,return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(32, dropout=0.2,recurrent_dropout=0.2)))
model.add(Dropout(0.05))
model.add(Dense(2, activation='softmax'))

model.summary()






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          7915712   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, None, 64)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 128)         66048     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                41216     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
T

In [13]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

model.fit(train_x,train_y,batch_size=5000,epochs=25,verbose=1)



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/25





Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fa9483c3eb8>

In [14]:
s = 'Excellent movie'

test = tokenizer.texts_to_sequences([s])

test = sequence.pad_sequences(test, maxlen=maxlen)

sentiment = model.predict(test)[0]

if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")
sentiment

positive


array([0.10695802, 0.893042  ], dtype=float32)

In [0]:
out = open('model.pkl','wb')
pickle.dump(model,out)
out.close()

In [0]:
out = open('tokenizer.pkl','wb')
pickle.dump(tokenizer,out)
out.close()