# Ejemplo IMDB

## Importar Data

In [1]:
import warnings
# ignore every warning from here on (this also hides library deprecation notices)
warnings.simplefilter('ignore')

In [3]:
from keras.datasets import imdb

# (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
# loading the data directly like this may raise a pickle error, so the call is
# left commented out and the data is loaded through a patched np.load instead

In [4]:
# Load the IMDB data through a temporarily patched np.load, which works around
# the pickle error raised when imdb.load_data is called directly.

import numpy as np

# keep a reference to the original np.load so it can be restored afterwards
np_load_old = np.load

# Force allow_pickle=True on every np.load call made while the patch is active.
# Merging into the keyword dict (instead of passing allow_pickle separately)
# avoids a "multiple values for keyword argument" TypeError if the caller
# already supplies allow_pickle itself.
# NOTE(review): allow_pickle=True lets np.load execute pickled code; only use
# this with dataset files from a trusted source.
np.load = lambda *a, **k: np_load_old(*a, **{**k, 'allow_pickle': True})

try:
    # load the data, keeping only the 10000 most frequent words
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
finally:
    # always put the original np.load back, even if loading failed, so the
    # patch never leaks into the rest of the session
    np.load = np_load_old

In [6]:
# first ten entries of the first training review (integer word indices)
train_data[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [8]:
# label of the first training review (presumably 1 = positive, 0 = negative; verify against the dataset docs)
train_labels[0]

1

In [9]:
# the data was loaded with num_words=10000; show the largest word index in the training set
max(max(review) for review in train_data)

9999

In [10]:
# dictionary mapping each word to its integer index
word_index = imdb.get_word_index()

# inverted dictionary: integer index -> word
reverse_word_index = {index: word for word, index in word_index.items()}

# decode the first review; indices are shifted by 3 because 0, 1 and 2 are
# reserved (0 = padding, 1 = start of sequence, 2 = unknown word).
# Indices with no entry in the inverted dictionary are rendered as '?'.
decoded_review = ' '.join(
    reverse_word_index.get(token - 3, '?') for token in train_data[0])

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


## Preparando Data

In [14]:
# se crea un matriz de todo ceros

def vectorize_sequences(sequences, dimension=10000, dtype=float):
    """Multi-hot encode sequences of integer indices into a 2-D array.

    Parameters
    ----------
    sequences : sequence of sequences of int
        One inner sequence of indices per sample.
    dimension : int, optional
        Number of columns in the output. Defaults to 10000.
    dtype : data-type, optional
        Element type of the returned array. Defaults to ``float`` (float64),
        which is what ``np.zeros`` uses when no dtype is given. A smaller
        type such as ``np.float32`` reduces the memory used by the dense
        ``len(sequences) x dimension`` matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), dimension)`` where ``result[i, j]``
        is 1 if index ``j`` appears in ``sequences[i]`` and 0 otherwise.
        Repeated indices within one sequence still yield a single 1.
    """
    # start from an all-zeros matrix, one row per sequence
    results = np.zeros((len(sequences), dimension), dtype=dtype)
    for i, sequence in enumerate(sequences):
        # fancy indexing: set every column listed in `sequence` to one
        results[i, sequence] = 1.
    return results

In [15]:
# vectorize the training and the test reviews

x_train, x_test = vectorize_sequences(train_data), vectorize_sequences(test_data)

In [16]:
# first vectorized training review
x_train[0]

array([0., 1., 1., ..., 0., 0., 0.])

In [17]:
# convert both label sets to float32 arrays

y_train, y_test = (
    np.asarray(labels).astype('float32')
    for labels in (train_labels, test_labels)
)

## Modelo

In [19]:
from keras import models
from keras import layers

# two hidden layers of 16 relu units on the 10000-dimensional input,
# followed by a single sigmoid output unit
model = models.Sequential([
    layers.Dense(16, activation='relu', input_shape=(10000,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# compile with string shortcuts for optimizer, loss and metric
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [20]:
# compile again, this time passing a configured optimizer instance
from keras import optimizers

rmsprop = optimizers.RMSprop(lr=0.001)
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=rmsprop,
)

In [22]:
# compile again, passing the loss and the metric as function objects
# instead of string names
from keras import losses
from keras import metrics

model.compile(
    loss=losses.binary_crossentropy,
    metrics=[metrics.binary_accuracy],
    optimizer=optimizers.RMSprop(lr=0.001),
)

## Validación

In [23]:
# hold out the first 10000 samples for validation; train on the rest

x_val, partial_x_train = x_train[:10000], x_train[10000:]
y_val, partial_y_train = y_train[:10000], y_train[10000:]

## Entrenamiento

In [24]:
# compile using the short metric name 'acc'
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

# train for 20 epochs in batches of 512, evaluating on the validation set;
# the object returned by fit is kept in `history`
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    batch_size=512,
    epochs=20,
    validation_data=(x_val, y_val),
)


Train on 15000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [25]:
# dictionary of the values recorded during training; display its keys
history_dict = history.history
history_dict.keys()

dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])