In [1]:
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout


In [2]:
import numpy as np
import pandas as pd
import sys

Importamos los datos

In [3]:
train_df = pd.read_csv('train.csv')
#Largo de las frases
train_df['phrase_len'] = train_df.text.str.split().str.len()

Limitamos el dataset a frases de 30 palabras para simplificar

In [4]:
phrases = train_df[train_df['phrase_len'] <= 30].reset_index()
max_phrase_len = phrases['phrase_len'].max()
max_phrase_len

30

Usando un tokenizer, codificamos los textos

In [5]:
tokenizer = Tokenizer(filters= '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'')
tokenizer.fit_on_texts(phrases['text'])
vocab_len = len(tokenizer.word_index) + 1
print(f'We have a {vocab_len}-word vocabulary')

We have a 17693-word vocabulary


Aplicamos padding para que sean de la misma longitud

In [6]:
phrases['sequences'] = tokenizer.texts_to_sequences(phrases['text'])
sequences = phrases['sequences'].values
sequences_padded = pad_sequences(phrases['sequences'], maxlen=max_phrase_len)

In [7]:
sequences_padded.shape

(13337, 30)


--- probamos esto y no funcionó ---
Agregamos mas muestras desplazando las frases hacia la derecha.
El objetivo del modelo va a ser predecir la ultima palabra de cada frase

In [8]:
# %%time
# upsampled_sequences = list()
# #upsampled_sequences.append(sequences_padded)
# for i in range(sequences_padded.shape[0]):
#     upsampled_sequences.append(sequences_padded[i])
#     if i % 100 == 0:
#         print(i, end='\r')
#     sequence = sequences_padded[i]
#     for _ in range(len(sequences[i])-3):
#         sequence = np.append(0,sequence[:-1])
#         upsampled_sequences.append(sequence)
# upsampled_sequences = np.array(upsampled_sequences)

Wall time: 1.85 s


In [8]:
# upsampled_sequences.shape

Importamos la matriz de embeddings

In [9]:
glove_file = './glove.6b/glove.6B.100d.txt'
glove_dim = 100
vocab = tokenizer.word_index.keys()
embeddings_index = {}
f = open(glove_file, encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    if word not in vocab:
        continue
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print(f'''
        Se encontraron {len(embeddings_index)} vectores de embedding, sobre un total de {vocab_len} palabras. 
        {vocab_len - len(embeddings_index)} palabras van a ser representadas con todos ceros''')


        Se encontraron 16338 vectores de embedding, sobre un total de 17693 palabras. 
        1355 palabras van a ser representadas con todos ceros


Relacionamos glove con el tokenizer definido anteriormente

In [10]:
embedding_matrix = np.zeros((vocab_len, glove_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Para simplificar, armamos un modelo con una capa embedding usando glove.
La salida del modelo van a ser las frases con sus vectores de embeddings asociados
Definimos el target como la ultima palabra de cada frase

In [13]:
embedding_model = Sequential()
embedding_layer = Embedding(vocab_len,
                            glove_dim,
                            weights=[embedding_matrix],
                            input_length=max_phrase_len,
                            trainable=False)
embedding_model.add(embedding_layer)
embedding_model.compile('rmsprop', 'mse')

embedded = embedding_model.predict(sequences_padded)
print(f'''
        embedded shape: {embedded.shape}. 
        {embedded.shape[0]} frases, 
        {embedded.shape[1]} palabras,
        {embedded.shape[2]} dimensiones de embedding
        ''')
x = embedded[:,:-1,:]
y = sequences_padded[:,-1]


        embedded shape: (13337, 30, 100). 
        13337 frases, 
        30 palabras,
        100 dimensiones de embedding
        


Definimos un modelo que intente predecir la ultima palabra

In [33]:
generator_input_layer = Input(shape=(x.shape[1:]))
generator_LSTM_layer_1 = LSTM(128, return_sequences=True)(generator_input_layer)
generator_LSTM_layer_2 = LSTM(256)(generator_LSTM_layer_1)
dense_layer_1 = Dense(1024, activation='relu')(generator_LSTM_layer_2)
output_layer = Dense(vocab_len, activation='softmax')(dense_layer_1)

generator_model = Model(generator_input_layer, output_layer, name="generator_model")
optimizer = keras.optimizers.RMSprop(lr=0.01)
generator_model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

In [34]:
generator_model.summary()

Model: "generator_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 29, 100)]         0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 29, 128)           117248    
_________________________________________________________________
lstm_5 (LSTM)                (None, 256)               394240    
_________________________________________________________________
dense_4 (Dense)              (None, 1024)              263168    
_________________________________________________________________
dense_5 (Dense)              (None, 17693)             18135325  
Total params: 18,909,981
Trainable params: 18,909,981
Non-trainable params: 0
_________________________________________________________________


Entrenamos el modelo

In [42]:
generator_model.fit(x, y,
              batch_size=128,
              epochs=40)

Train on 13337 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x163071211c8>

In [45]:
generator_model.save_weights("generator_weights.h5")

In [46]:
generator_json = generator_model.to_json()
with open("generator_json.json", "w") as json_file:
    json_file.write(generator_json)

In [47]:
import pickle

# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
#generator_model.load_weights('generator_weights.h5')

In [40]:
x.shape

(13337, 29, 100)

Probamos el modelo generando una frase de 30 palabras

In [44]:
random_sample = np.random.randint(0, x.shape[0] - 1)
generated_sequence = sequences_padded[random_sample]
print('random text: ')
print(f'{tokenizer.sequences_to_texts([generated_sequence,])}')
print(f'generated text:')
for i in range(30):
    generated_embedded = embedding_model.predict(generated_sequence.reshape(1,30))
    predicted_word_sequence = generator_model.predict(generated_embedded[:,:-1,:])
    #next_index = sample(predicted_word_sequence, 1)
    next_index = predicted_word_sequence.argmax()
    generated_sequence = np.append(generated_sequence,next_index)[1:]

    next_word = tokenizer.sequences_to_texts([[next_index,],])[0]
    sys.stdout.write(next_word + ' ')
    sys.stdout.flush()
print()

random text: 
['your acting is very natural as i live']
generated text:
live tell course it much be rowena so air lived angelic it course child appeared day by annoyance me ruin was her needed companions doré alone him true dust day 
