## Análise de texto do IMDB utilizando o TensorFlow para a atividade de aprofundamento da trilha 04 de Deep Learning.

Anderson Rosa Cascalho

In [None]:
import tensorflow as tf
from tensorflow import keras

import numpy as np


In [None]:
# Load the IMDB review dataset that ships with Keras. num_words=10000 caps the
# vocabulary at the 10,000 most frequent words.
imdb = keras.datasets.imdb
train_split, test_split = imdb.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

## Tratamento dos dados

In [None]:
# Report how many training reviews and labels were loaded.
entry_count = len(train_data)
label_count = len(train_labels)
print("Training entries: {}, labels: {}".format(entry_count, label_count))

Training entries: 25000, labels: 25000


In [None]:
# Show the first review as stored in the dataset: a sequence of integer word indices.
first_review = train_data[0]
print(first_review)

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [None]:
# Lengths of the first two reviews before any padding is applied.
tuple(len(review) for review in train_data[:2])

(218, 189)

In [None]:
# Build the word -> index vocabulary. Every index from the dataset is shifted
# up by 3 because the first indices are reserved for the special tokens below.
word_index = {word: idx + 3 for word, idx in imdb.get_word_index().items()}
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,  # unknown
    "<UNUSED>": 3,
})

# Inverse lookup (index -> word), used to turn encoded reviews back into text.
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode_review(text):
    """Convert a sequence of word indices back into a readable string.

    Each index in ``text`` is looked up in the module-level
    ``reverse_word_index``; indices that are not present are rendered as
    ``'?'``. The resulting words are joined with single spaces.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

In [None]:
# Bring every review to the same length (256 entries) so the model receives
# inputs of a uniform size. Shorter reviews are filled at the end
# (padding='post') with the <PAD> index.
pad_options = dict(value=word_index["<PAD>"], padding='post', maxlen=256)

train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

In [None]:
# After padding, the first two reviews should report the same length.
tuple(map(len, train_data[:2]))

(256, 256)

In [None]:
# Inspect the first review after padding.
first_padded_review = train_data[0]
first_padded_review

array([   1,   14,   22,   16,   43,  530,  973, 1622, 1385,   65,  458,
       4468,   66, 3941,    4,  173,   36,  256,    5,   25,  100,   43,
        838,  112,   50,  670,    2,    9,   35,  480,  284,    5,  150,
          4,  172,  112,  167,    2,  336,  385,   39,    4,  172, 4536,
       1111,   17,  546,   38,   13,  447,    4,  192,   50,   16,    6,
        147, 2025,   19,   14,   22,    4, 1920, 4613,  469,    4,   22,
         71,   87,   12,   16,   43,  530,   38,   76,   15,   13, 1247,
          4,   22,   17,  515,   17,   12,   16,  626,   18,    2,    5,
         62,  386,   12,    8,  316,    8,  106,    5,    4, 2223, 5244,
         16,  480,   66, 3785,   33,    4,  130,   12,   16,   38,  619,
          5,   25,  124,   51,   36,  135,   48,   25, 1415,   33,    6,
         22,   12,  215,   28,   77,   52,    5,   14,  407,   16,   82,
          2,    8,    4,  107,  117, 5952,   15,  256,    4,    2,    7,
       3766,    5,  723,   36,   71,   43,  530,  4

## Modelo

In [None]:
!pip install -q -U keras-tuner

In [None]:
# `keras_tuner` is the current import name of the keras-tuner package;
# `kerastuner` is the legacy name and is kept only as a fallback for older
# installations.
try:
    import keras_tuner as kt
except ImportError:
    import kerastuner as kt

In [None]:
def model_builder(hp):
  """Build and compile a tunable binary classifier for the padded reviews.

  The network is: Embedding(10000, 16) -> GlobalAveragePooling1D -> one tuned
  Dense layer -> a tuned number of tuned Dense layers -> Dense(16) ->
  Dense(1, sigmoid).

  Registered hyperparameters:
    - 'hp_units_first': units of the first Dense layer (32-512, step 32).
    - 'num_layers': number of additional hidden Dense layers (2-6).
    - 'hp_units_<i>': units of hidden layer i, for i in range(num_layers)
      (16-256, step 4).
    - 'learning_rate': Adam learning rate, one of 1e-2, 1e-3, 1e-4.

  Args:
    hp: the Keras Tuner hyperparameter container supplied by the tuner.

  Returns:
    The compiled keras.Sequential model.
  """
  model = keras.Sequential()
  model.add(keras.layers.Embedding(10000, 16))
  model.add(keras.layers.GlobalAveragePooling1D())

  # Tune the number of units in the first Dense layer (32-512).
  # The name must not look like 'hp_units_<i>': hyperparameters are identified
  # by name, so the previous 'hp_units_1' collided with hidden layer i == 1.
  hp_units_first = hp.Int('hp_units_first', min_value=32, max_value=512, step=32)
  model.add(keras.layers.Dense(units=hp_units_first, activation='relu'))

  # Tune the number of hidden layers (2-6) and the units of each one (16-256).
  for i in range(hp.Int('num_layers', 2, 6)):
    model.add(keras.layers.Dense(units=hp.Int('hp_units_' + str(i),
                                            min_value=16,
                                            max_value=256,
                                            step=4),
                               activation='relu'))

  model.add(keras.layers.Dense(16, activation=tf.nn.relu))
  model.add(keras.layers.Dense(1, activation='sigmoid'))

  # Tune the optimizer's learning rate; the results cell reads
  # 'learning_rate' from the best hyperparameters, so it must be registered.
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(optimizer=tf.optimizers.Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model

In [None]:
# Hyperband tuner that ranks candidate models by validation accuracy.
# Results are stored under mydir/teste25_dl_course.
hyperband_options = dict(
    objective='val_accuracy',
    max_epochs=50,
    factor=3,
    directory='mydir',
    project_name='teste25_dl_course',
)
tuner = kt.Hyperband(model_builder, **hyperband_options)

In [None]:
# Early-stopping callback watching validation loss with a patience of 5 epochs.
early_stopping_options = {'monitor': 'val_loss', 'patience': 5}
stop_early = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

In [None]:
# Hold out the first 10,000 training reviews (and their labels) for
# validation; the remainder is used for fitting during the search.
x_val, partial_x_train = train_data[:10000], train_data[10000:]
y_val, partial_y_train = train_labels[:10000], train_labels[10000:]

# Run the hyperparameter search, validating each trial on the held-out split.
search_options = dict(
    batch_size=512,
    epochs=40,
    validation_data=(x_val, y_val),
    verbose=1,
    callbacks=[stop_early],
)
tuner.search(partial_x_train, partial_y_train, **search_options)



Trial 90 Complete [00h 00m 22s]
val_accuracy: 0.8822000026702881

Best val_accuracy So Far: 0.8863999843597412
Total elapsed time: 00h 14m 00s
INFO:tensorflow:Oracle triggered exit


In [None]:
# Ask the tuner for its single best hyperparameter set.
top_candidates = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_candidates[0]

In [None]:
# Summarize the best hyperparameters found by the search.
best_values = best_hps.values  # dict: hyperparameter name -> chosen value

# Only report the learning rate when it was actually part of the search
# space; reading a name that was never registered would raise an error.
if 'learning_rate' in best_values:
  print(f"""
The hyperparameter search is complete.

The optimal learning rate for the optimizer is {best_values['learning_rate']}.
""")
else:
  print("""
The hyperparameter search is complete.

The learning rate was not part of the search space.
""")


print('Best layers units: \n')
# model_builder names the hidden layers 'hp_units_0' .. 'hp_units_<n-1>',
# where n is the tuned 'num_layers'; print exactly those, numbered from 1.
for i in range(best_values['num_layers']):
  print('Layer: ', i + 1, ' , ', best_values['hp_units_' + str(i)], ' units')


The hyperparameter search is complete.

The optimal learning rate for the optimizer is 0.0001.

Best layers units: 

Layer:  1  ,  244  units
Layer:  2  ,  224  units
Layer:  3  ,  212  units
Layer:  4  ,  52  units


In [None]:
# Build the model with the optimal hyperparameters and train it for 40 epochs.
model = tuner.hypermodel.build(best_hps)

# Validate on the same held-out split (x_val, y_val) that the search used,
# instead of carving a second validation set out of the training portion.
history = model.fit(partial_x_train, partial_y_train,
                    batch_size=512,
                    epochs=40,
                    validation_data=(x_val, y_val),
                    verbose=1)

# Pick the epoch with the highest validation accuracy (+1 because epochs are
# reported 1-based while list indices are 0-based).
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Best epoch: 1


In [None]:
# Rebuild a fresh model from the best hyperparameters so training restarts
# from newly initialized weights.
hypermodel = tuner.hypermodel.build(best_hps)

# Retrain the model for exactly best_epoch epochs. The batch size and the
# validation data match the run that determined best_epoch; with a different
# batch size the number of weight updates per epoch changes and the epoch
# count would no longer be comparable.
retrain_history = hypermodel.fit(partial_x_train, partial_y_train,
                                 batch_size=512,
                                 epochs=best_epoch,
                                 validation_data=(x_val, y_val))



<keras.callbacks.History at 0x7f963f847310>

In [None]:
# Evaluate the retrained model on the test set and report loss and accuracy.
eval_result = hypermodel.evaluate(x=test_data, y=test_labels)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [0.35073956847190857, 0.8503599762916565]
