**Procesamiento Natural del Lenguaje**

**Desafio 4**

**Carlos Villalobos**

**Consigna.**


**2 - Preprocesamiento**

Realizar el preprocesamiento necesario para obtener:

    word2idx_inputs, max_input_len
    word2idx_outputs, max_out_len, num_words_output
    encoder_input_sequences, decoder_output_sequences, decoder_targets




**3 - Preparar los embeddings**

Utilizar los embeddings de Glove o FastText para transformar los tokens de entrada en vectores



**4 - Entrenar el modelo**

Entrenar un modelo basado en el esquema encoder-decoder utilizando los datos generados en los puntos anteriores. Utilce como referencias los ejemplos vistos en clase.



**5 - Inferencia**

Experimentar el funcionamiento de su modelo. Recuerde que debe realizar la inferencia de los modelos por separado de encoder y decoder.


In [22]:
import re

import numpy as np
import pandas as pd

import tensorflow as tf
# from keras.preprocessing.text import one_hot # esto causa error
from tensorflow.keras.preprocessing.text import one_hot # uso entonces tensorflow.keras
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, LSTM, SimpleRNN
from keras.models import Model
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split
# from keras.preprocessing.text import Tokenizer # esto también causa error
from tensorflow.keras.preprocessing.text import Tokenizer # uso entonces tensorflow.keras
from keras.layers import Input

In [23]:
# Descargar la carpeta de dataset
import os
import gdown
if os.access('data_volunteers.json', os.F_OK) is False:
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")


# dataset_file
import json

text_file = "data_volunteers.json"
with open(text_file) as f:
    data = json.load(f) # la variable data será un diccionario

El dataset ya se encuentra descargado


In [4]:
# Observar los campos disponibles en cada linea del dataset
data[0].keys()


chat_in = []
chat_out = []

input_sentences = []
output_sentences = []
output_sentences_inputs = []
max_len = 30

def clean_text(txt):
    txt = txt.lower()
    txt.replace("\'d", " had")
    txt.replace("\'s", " is")
    txt.replace("\'m", " am")
    txt.replace("don't", "do not")
    txt = re.sub(r'\W+', ' ', txt)

    return txt

for line in data:
    for i in range(len(line['dialog'])-1):
        # vamos separando el texto en "preguntas" (chat_in)
        # y "respuestas" (chat_out)
        chat_in = clean_text(line['dialog'][i]['text'])
        chat_out = clean_text(line['dialog'][i+1]['text'])

        if len(chat_in) >= max_len or len(chat_out) >= max_len:
            continue

        input_sentence, output = chat_in, chat_out

        # output sentence (decoder_output) tiene
        output_sentence = output + ' '
        # output sentence input (decoder_input) tiene
        output_sentence_input = ' ' + output

        input_sentences.append(input_sentence)
        output_sentences.append(output_sentence)
        output_sentences_inputs.append(output_sentence_input)

print("Cantidad de rows utilizadas:", len(input_sentences))


input_sentences[1], output_sentences[1], output_sentences_inputs[1]


Cantidad de rows utilizadas: 6033


('hi how are you ', 'not bad and you  ', ' not bad and you ')

In [5]:
# 2 - Preprocesamiento
tokenizer_inputs = Tokenizer()
tokenizer_inputs.fit_on_texts(input_sentences)
input_sequences = tokenizer_inputs.texts_to_sequences(input_sentences)

word2idx_inputs = tokenizer_inputs.word_index
max_input_len = max(len(seq) for seq in input_sequences)
encoder_input_sequences = pad_sequences(input_sequences, maxlen=max_input_len, padding='post')

tokenizer_outputs = Tokenizer()
tokenizer_outputs.fit_on_texts(output_sentences + output_sentences_inputs)
output_sequences = tokenizer_outputs.texts_to_sequences(output_sentences)
output_sequences_inputs = tokenizer_outputs.texts_to_sequences(output_sentences_inputs)

word2idx_outputs = tokenizer_outputs.word_index
num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(seq) for seq in output_sequences)

decoder_output_sequences = pad_sequences(output_sequences, maxlen=max_out_len, padding='post')
decoder_targets = decoder_output_sequences.reshape(decoder_output_sequences.shape[0], decoder_output_sequences.shape[1], 1)

In [7]:
!pip install gdown



In [8]:
# 3 - Preparar los embeddings
# Asumimos que estamos usando GloVe embeddings de 100 dimensiones
import os
import gdown

# Check if the file already exists, if not, download it
if not os.path.exists('glove.6B.100d.txt'):
    url = 'https://nlp.stanford.edu/data/glove.6B.zip'
    output = 'glove.6B.zip'
    gdown.download(url, output, quiet=False)

    # Unzip the downloaded file
    !unzip glove.6B.zip

embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embedding_dim = 100
embedding_matrix = np.zeros((num_words_output, embedding_dim))
for word, i in word2idx_outputs.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Downloading...
From: https://nlp.stanford.edu/data/glove.6B.zip
To: /content/glove.6B.zip
100%|██████████| 862M/862M [02:38<00:00, 5.43MB/s]


Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [9]:
# 4 - Entrenar el modelo
# Definición del modelo
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_input_len,))
encoder_embedding = Embedding(num_words_output, embedding_dim, weights=[embedding_matrix], trainable=False)(encoder_inputs)
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_words_output, embedding_dim, weights=[embedding_matrix], trainable=False)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_words_output, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [10]:
# Modelo completo
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compilación y entrenamiento
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
model.fit([encoder_input_sequences, decoder_output_sequences[:, :-1]], decoder_targets[:, 1:],
          batch_size=64, epochs=50, validation_split=0.2)

Epoch 1/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - loss: 3.4431 - val_loss: 2.2415
Epoch 2/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 1.9746 - val_loss: 1.9695
Epoch 3/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 1.7802 - val_loss: 1.8688
Epoch 4/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 1.6039 - val_loss: 1.7566
Epoch 5/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 1.4793 - val_loss: 1.7092
Epoch 6/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 1.4271 - val_loss: 1.6610
Epoch 7/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 1.3905 - val_loss: 1.6212
Epoch 8/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 1.3248 - val_loss: 1.6029
Epoch 9/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x78366a3649d0>

In [24]:
# 5 - Inferencia
# Modelo de codificación para inferencia
encoder_model = Model(encoder_inputs, encoder_states)

# Modelo de decodificación para inferencia
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# We need to reuse the decoder's embedding layer here
decoder_embedding = Embedding(num_words_output, embedding_dim, weights=[embedding_matrix], trainable=False)
decoder_inputs_inference = Input(shape=(None,))
decoder_embedding_inference = decoder_embedding(decoder_inputs_inference)

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embedding_inference, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs_inference] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# Función para la inferencia
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)

    # en lugar de una string vacía '', uso '<start>' or '<sos>'
    # si está presente en el vocabulario
    # debo asegurar que ese token se usa durante el entreamiento, para dar consistencia

    # por ejemplo, si '<start>' es el token inicial:
    target_seq = np.zeros((1, 1))
    if '<start>' in word2idx_outputs:
        target_seq[0, 0] = word2idx_outputs['<start>']
    else:
        # retrocedo al índice de la primera palabra si no se encuentra '<inicio>'.
        # Suponiendo que el índice 1 representa la primera palabra (ajústelo si es necesario).
        target_seq[0, 0] = 1

    stop_condition = False


In [25]:
# Ejemplo de uso
input_sentence = "this is"
input_seq = tokenizer_inputs.texts_to_sequences([input_sentence])
input_seq = pad_sequences(input_seq, maxlen=max_input_len, padding='post')
decoded_sentence = decode_sequence(input_seq)
print('Input:', input_sentence)
print('Output:', decoded_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 268ms/step
Input: this is
Output: None
