# Tokens & Embeddings

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Toy training corpus: six short Spanish reviews.
sentences = [
    'la familia es estupenda',
    'Quiero visitar la Sagrada familia',
    'voy a visitar a mi familia',
    'odio Barcelona',
    'La sagrada familia es fea',
    'no me gustan las visitas'
]

# Create the tokenizer. A word that is missing from the fitted vocabulary
# is encoded with the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=100)
tokenizer.fit_on_texts(sentences)  # learn the vocabulary from the corpus

# Inspect the word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
print('word index ==>', word_index)

# Encode every sentence as a list of word ids.
sequences = tokenizer.texts_to_sequences(sentences)
print('sequences ==>', sequences)

word index ==> {'<OOV>': 1, 'familia': 2, 'la': 3, 'es': 4, 'visitar': 5, 'sagrada': 6, 'a': 7, 'estupenda': 8, 'quiero': 9, 'voy': 10, 'mi': 11, 'odio': 12, 'barcelona': 13, 'fea': 14, 'no': 15, 'me': 16, 'gustan': 17, 'las': 18, 'visitas': 19}
sequences ==> [[3, 2, 4, 8], [9, 5, 3, 6, 2], [10, 7, 5, 7, 11, 2], [12, 13], [3, 6, 2, 4, 14], [15, 16, 17, 18, 19]]


In [None]:
# Padding: zeros are added so that every sequence has a length of 7.
padded = pad_sequences(sequences, maxlen=7)
print("\nPadded Sequences:", padded, sep="\n")


Padded Sequences:
[[ 0  0  0  3  2  4  8]
 [ 0  0  9  5  3  6  2]
 [ 0 10  7  5  7 11  2]
 [ 0  0  0  0  0 12 13]
 [ 0  0  3  6  2  4 14]
 [ 0  0 15 16 17 18 19]]


In [None]:
# Held-out sentences used to evaluate the model.
test_data = [
    'voy a visitar a mis primos',
    'La sagrada familia esta en Barcelona',
    'adoro las visitas',
    'es horrible el turismo masivo'
]

# Do NOT call tokenizer.fit_on_texts() here: refitting on the test data
# re-ranks the vocabulary, so the integer ids would stop matching the ids
# used for the training sequences (and the test set would leak into the
# vocabulary). Words unseen during fitting are encoded as "<OOV>" instead.
word_index = tokenizer.word_index
print('word index ==>', word_index)
test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

# Pad to the same length (7) as the training sequences.
test_padded = pad_sequences(test_seq, maxlen=7)
print("\nPadded Test Sequence: ")
print(test_padded)

word index ==> {'<OOV>': 1, 'visitar': 2, 'familia': 3, 'a': 4, 'es': 5, 'adoro': 6, 'horrible': 7, 'la': 8, 'mi': 9, 'sagrada': 10, 'madrid': 11, 'para': 12, 'voy': 13, 'barcelona': 14, 'las': 15, 'visitas': 16, 'mis': 17, 'primos': 18, 'esta': 19, 'en': 20, 'el': 21, 'turismo': 22, 'masivo': 23, 'estupenda': 24, 'quiero': 25, 'odio': 26, 'fea': 27, 'no': 28, 'me': 29, 'gustan': 30}

Test Sequence =  [[13, 4, 2, 4, 17, 18], [8, 10, 3, 19, 20, 14], [6, 15, 16], [5, 7, 21, 22, 23]]

Padded Test Sequence: 
[[ 0 13  4  2  4 17 18]
 [ 0  8 10  3 19 20 14]
 [ 0  0  0  0  6 15 16]
 [ 0  0  5  7 21 22 23]]


In [None]:
# Labels for the training sentences: the first three reviews are positive (1)
# and the last three are negative (0).
sentiments = np.array([1, 1, 1, 0, 0, 0])
training_padded = np.array(padded)

# Guard against stale notebook state: if `padded` was overwritten by another
# cell, fail here with a clear message instead of deep inside model.fit().
assert len(training_padded) == len(sentiments), (
    "training_padded has %d rows but there are %d labels; "
    "re-run the tokenizing/padding cells for the training sentences"
    % (len(training_padded), len(sentiments))
)
print(sentiments)
print(training_padded)

[1 1 1 0 0 0]
[[ 0  0  7  2  4  6  3]
 [ 0  0 10  5  8 11  2]]


In [None]:
# Hyperparameters shared by the model and the padding steps.
max_length = 7        # length of each padded input sequence
embedding_dim = 20    # size of each embedding vector
vocab_size = 10000    # input dimension passed to the Embedding layer

In [None]:
# Fix the random seeds so weight initialisation (and therefore the training
# result) is reproducible across runs.
keras.utils.set_random_seed(42)

# Embedding -> Flatten -> one sigmoid unit for binary sentiment.
# The input shape is declared with an explicit Input layer because the
# Embedding `input_length` argument is deprecated in Keras 3; declaring the
# shape up front also lets the model be built (and summarised) before fit().
model = Sequential([
    keras.Input(shape=(max_length,)),
    Embedding(vocab_size, embedding_dim),
    Flatten(),
    Dense(1, activation='sigmoid'),
])



In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported under the name 'acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the cell output.
model.summary()

None


In [None]:
# Train for 200 epochs on the padded training sequences and their labels,
# showing progress output (verbose=1).
model.fit(
    x=training_padded,
    y=sentiments,
    epochs=200,
    verbose=1,
)

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 2
'y' sizes: 6


In [None]:
# Labels for the four test sentences (1 = positive, 0 = negative).
test_sentiments = np.array([1, 1, 1, 0])
# Ensure the padded test sequences are held as a NumPy array.
test_padded = np.array(test_padded)
print(test_padded, test_sentiments, sep="\n")

[[ 0 13  4  2  4 17 18]
 [ 0  8 10  3 19 20 14]
 [ 0  0  0  0  6 15 16]
 [ 0  0  5  7 21 22 23]]
[1 1 1 0]


In [None]:
# Score the trained model on the test set without progress output and
# report the accuracy as a percentage.
loss, accuracy = model.evaluate(
    x=test_padded,
    y=test_sentiments,
    verbose=0,
)
print('Accuracy: %f' % (100 * accuracy))

Accuracy: 100.000000


In [None]:
# New, unlabeled sentences to score with the trained model.
sentence = ["adoro visitar a mi familia", 'Madrid es horrible para visitar']

# The tokenizer is deliberately NOT refitted here: fit_on_texts() re-ranks the
# vocabulary, so the ids would stop matching the ones the model was trained
# on. Words unseen during fitting are encoded as "<OOV>".
word_index = tokenizer.word_index
print(word_index)

# Use dedicated names instead of reusing `sequences`/`padded`: those hold the
# training data, and overwriting them breaks the training cells on re-run.
new_sequences = tokenizer.texts_to_sequences(sentence)
print(new_sequences)
new_padded = pad_sequences(new_sequences, maxlen=max_length)
print(new_padded)
print(model.predict(new_padded))

{'<OOV>': 1, 'visitar': 2, 'familia': 3, 'a': 4, 'es': 5, 'mi': 6, 'adoro': 7, 'horrible': 8, 'la': 9, 'madrid': 10, 'para': 11, 'sagrada': 12, 'voy': 13, 'barcelona': 14, 'las': 15, 'visitas': 16, 'estupenda': 17, 'quiero': 18, 'odio': 19, 'fea': 20, 'no': 21, 'me': 22, 'gustan': 23, 'mis': 24, 'primos': 25, 'esta': 26, 'en': 27, 'el': 28, 'turismo': 29, 'masivo': 30}
[[7, 2, 4, 6, 3], [10, 5, 8, 11, 2]]
[[ 0  0  7  2  4  6  3]
 [ 0  0 10  5  8 11  2]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[[0.6140113 ]
 [0.89589375]]
