In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report



def setup_gpu():
    """Configure TensorFlow's GPU memory behaviour and numeric policy.

    Enables memory growth on every physical GPU that TensorFlow reports, then
    installs 'mixed_float16' as the global Keras mixed-precision policy.
    """
    physical_gpus = tf.config.list_physical_devices('GPU')
    for device in physical_gpus:
        # Let TensorFlow grow its GPU allocation as needed.
        tf.config.experimental.set_memory_growth(device, True)

    tf.keras.mixed_precision.set_global_policy('mixed_float16')


setup_gpu()

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: GeForce RTX 2070 SUPER, compute capability 7.5


In [2]:
# Default matplotlib figure size, applied through seaborn's rc parameters.
FIGURE_SIZE = (16, 8)
sns.set(rc={'figure.figsize': FIGURE_SIZE})

In [3]:
# Vocabulary size: passed as num_words when loading IMDB and as the
# Embedding layer's input dimension.
TOP_WORDS = 80_000

In [4]:
# Load the IMDB reviews as integer-encoded sequences, limiting the vocabulary
# to TOP_WORDS entries via num_words.
(raw_x_train, raw_y_train), (raw_x_test, raw_y_test) = tf.keras.datasets.imdb.load_data(num_words=TOP_WORDS)

# Mapping from word to its integer index; its length is shown as the cell output.
word_index = tf.keras.datasets.imdb.get_word_index()
len(word_index)


88584

In [5]:
# Length passed as maxlen to pad_sequences for every review.
MAX_WORDS = 256

pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
padded_x_train = pad_sequences(raw_x_train, maxlen=MAX_WORDS)
padded_x_test = pad_sequences(raw_x_test, maxlen=MAX_WORDS)


In [6]:
# Rebuild the training set: keep a random half of the positive reviews, repeat
# that half twice, and combine it with all negative reviews.
SEED = 42  # fixed so the sampled subset is identical after Restart & Run All
rng = np.random.default_rng(SEED)

positive_idx = np.where(raw_y_train > 0)[0]
negative_idx = np.where(raw_y_train == 0)[0]

# Seeded permutation instead of an unseeded in-place shuffle, so the cell
# produces the same selection every time it is executed.
positive_last = int(len(positive_idx) * 0.5)
positive_kept = rng.permutation(positive_idx)[:positive_last]

# Duplicate the kept half so the positive count matches the original size.
positive_idx = np.tile(positive_kept, 2)

idx = np.concatenate((positive_idx, negative_idx))

x_train = padded_x_train[idx]
y_train = raw_y_train[idx]

# To train on the unmodified data instead, assign padded_x_train and
# raw_y_train to x_train and y_train directly.

# The test split is used as-is.
x_test = padded_x_test
y_test = raw_y_test

def vectorize(sequences, longest = MAX_WORDS):
    """Multi-hot encode integer index sequences into a 2-D array.

    Args:
        sequences: Sized iterable of integer index sequences.
        longest: Number of columns in the result. Every index in a sequence
            must be smaller than this value, otherwise NumPy raises IndexError.

    Returns:
        Array of shape (len(sequences), longest) filled with zeros, where row
        i holds 1 in each column listed in sequences[i].
    """
    # Size the array from the `longest` parameter; the body previously used
    # an undefined name here and failed with NameError on every call.
    results = np.zeros((len(sequences), longest))
    for i, sequence in enumerate(sequences):
        # Fancy indexing sets every listed column of row i in one assignment.
        results[i, sequence] = 1
    return results

In [7]:
# Summarise the array shapes of each split and the vocabulary size.
for name, array in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"{name} shape: {array.shape}")

print(f"word_index size: {len(word_index)}")

x_train shape: (25000, 256)
y_train shape: (25000,)
x_test shape: (25000, 256)
y_test shape: (25000,)
word_index size: 88584


In [8]:
# Row indices of the positive (label > 0) and negative (label == 0) training samples.
(train_positive,) = np.where(y_train > 0)
(train_negative,) = np.where(y_train == 0)

In [9]:
# Show the shape of each class's index array in the training set.
for label, indices in (("Positive", train_positive), ("Negative", train_negative)):
    print(f"{label} reviews: {indices.shape}")

Positive reviews: (12500,)
Negative reviews: (12500,)


In [10]:
# Model hyperparameters.
EMBEDDING_SIZE = 16
LSTM_SIZE = 256
HIDDEN_LAYER_SIZE = 128
DROPOUT = 0.2

# Embedding -> bidirectional LSTM -> dense hidden layer -> sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(TOP_WORDS, EMBEDDING_SIZE, input_length=MAX_WORDS),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LSTM_SIZE, dropout=DROPOUT)),
    tf.keras.layers.Dense(HIDDEN_LAYER_SIZE, activation='relu'),
    # The global policy is mixed_float16; force the output layer to float32 so
    # the sigmoid probabilities fed to the loss are not computed in float16.
    tf.keras.layers.Dense(1, activation='sigmoid', dtype='float32')
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 256, 16)           1280000   
_________________________________________________________________
bidirectional (Bidirectional (None, 512)               559104    
_________________________________________________________________
dense (Dense)                (None, 128)               65664     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 1,904,897
Trainable params: 1,904,897
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Pass the optimizer instance itself; it was previously created but ignored
# because compile() received the string 'adam' instead.
opt = tf.keras.optimizers.Adam()
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the val_*
# metrics are not an independent estimate if they are used for tuning.
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          epochs=2,
          batch_size=128,
          verbose=2
         )



Epoch 1/2
196/196 - 12s - loss: 0.6170 - accuracy: 0.6775 - val_loss: 0.3796 - val_accuracy: 0.8305
Epoch 2/2
196/196 - 9s - loss: 0.2392 - accuracy: 0.9057 - val_loss: 0.3600 - val_accuracy: 0.8444


<tensorflow.python.keras.callbacks.History at 0x7f07e1956370>

In [12]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is accuracy.
scores = model.evaluate(x_test, y_test, verbose=0)
test_accuracy = scores[1]
print("Accuracy: %.2f%%" % (test_accuracy * 100))

Accuracy: 84.44%


In [13]:
# Predicted probabilities for the test set, thresholded at 0.5 into class decisions.
y_prob = model.predict(x_test)
y_hat = y_prob >= 0.5

# Per-class precision / recall / F1 summary.
report = classification_report(y_test, y_hat)

In [14]:
# print() is used so the report string's line breaks render as a table.
print(report)

              precision    recall  f1-score   support

           0       0.83      0.86      0.85     12500
           1       0.86      0.82      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.85      0.84      0.84     25000
weighted avg       0.85      0.84      0.84     25000

