# Example with Embeddings - IMDB dataset

#### Loading and pre-processing the IMDB dataset

In [1]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.layers import Embedding

# Vocabulary size (10k words); passed to load_data as num_words.
max_features = 10000
# Target length for pad_sequences, which converts a list of sequences into a
# 2D array whose width is maxlen (or the longest sequence when maxlen is unset).
maxlen = 20

(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=max_features,
)

# Bring every review to the same length so the data forms a 2D array.
x_train = preprocessing.sequence.pad_sequences(sequences=x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(sequences=x_test, maxlen=maxlen)

# Shapes of the train and test splits, one per line.
print(x_train.shape, y_train.shape, sep="\n")

print(x_test.shape, y_test.shape, sep="\n")

# First padded training sequence followed by its label.
print(x_train[0], y_train[0], sep="\n")

(25000, 20)
(25000,)
(25000, 20)
(25000,)
[  65   16   38 1334   88   12   16  283    5   16 4472  113  103   32
   15   16 5345   19  178   32]
1


#### Define and train the model

In [None]:
# Note: this model does not take word order into account; it treats each word independently.

In [2]:
# Size of the learned vector that represents each word. A larger size gives the
# model more capacity to tell words apart, but it also adds parameters and can
# overfit, so it does not by itself guarantee better accuracy.
embedding_dim = 8

model = Sequential()
# The Embedding vocabulary size must match the num_words limit given to
# imdb.load_data, so reuse max_features instead of repeating the literal 10000.
# Output shape: (batch, maxlen, embedding_dim).
model.add(Embedding(max_features, embedding_dim, input_length=maxlen))
# Flatten to (batch, maxlen * embedding_dim) so the Dense layer can consume it.
model.add(Flatten())
# Single sigmoid unit: outputs a value in [0, 1] for the binary label.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# validation_split=0.2 holds out 20% of the training samples for validation.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 8)             80000     
                                                                 
 flatten (Flatten)           (None, 160)               0         
                                                                 
 dense (Dense)               (None, 1)                 161       
                                                                 
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


#### Model's performance evaluation

In [3]:
# Evaluate on the held-out test set; with the compiled metrics this returns
# [loss, acc]. verbose=0 suppresses the progress bar.
results = model.evaluate(x=x_test, y=y_test, verbose=0)
print(results)

[0.5124567747116089, 0.7588000297546387]
