**Procesamiento Natural del Lenguaje**

**Desafio 4**

**Carlos Villalobos**

**Consigna.**

**2 - Preprocesamiento**

Realizar el preprocesamiento necesario para obtener:

word2idx_inputs, max_input_len
word2idx_outputs, max_out_len, num_words_output
encoder_input_sequences, decoder_output_sequences, decoder_targets
**3 - Preparar los embeddings**

Utilizar los embeddings de Glove o FastText para transformar los tokens de entrada en vectores

**4 - Entrenar el modelo**

Entrenar un modelo basado en el esquema encoder-decoder utilizando los datos generados en los puntos anteriores. Utilce como referencias los ejemplos vistos en clase.

**5 - Inferencia**

Experimentar el funcionamiento de su modelo. Recuerde que debe realizar la inferencia de los modelos por separado de encoder y decoder.

In [1]:
# 1. Importar librerías necesarias
import re
import os
import json
import gdown
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Input
from keras.layers import LSTM
from tensorflow.keras.layers import Embedding



In [2]:
# 2. Descargar y cargar datos
# Descargar el dataset si no existe
if not os.path.exists('data_volunteers.json'):
    url = 'https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download'
    output = 'data_volunteers.json'
    gdown.download(url, output, quiet=False)
else:
    print("El dataset ya se encuentra descargado")

# Cargar el dataset
with open('data_volunteers.json') as f:
    data = json.load(f)

print("Cantidad de diálogos en el dataset:", len(data))



Downloading...
From: https://drive.google.com/uc?id=1awUxYwImF84MIT5-jCaYAPe2QwSgS1hN&export=download
To: /content/data_volunteers.json
100%|██████████| 2.58M/2.58M [00:00<00:00, 155MB/s]

Cantidad de diálogos en el dataset: 1111





In [3]:
# 3. Funciones de procesamiento
def clean_text(txt):
    """Limpia y normaliza el texto"""
    txt = txt.lower()
    txt = txt.replace("\'d", " had")
    txt = txt.replace("\'s", " is")
    txt = txt.replace("\'m", " am")
    txt = txt.replace("don't", "do not")
    txt = re.sub(r'\W+', ' ', txt)
    return txt

def prepare_data(data, max_len=30):
    """Prepara los datos para el entrenamiento"""
    input_sentences = []
    output_sentences = []

    for line in data:
        for i in range(len(line['dialog'])-1):
            chat_in = clean_text(line['dialog'][i]['text'])
            chat_out = clean_text(line['dialog'][i+1]['text'])

            if len(chat_in) >= max_len or len(chat_out) >= max_len:
                continue

            input_sentences.append(chat_in)
            output_sentences.append('<start> ' + chat_out + ' <end>')

    print(f"Total de pares pregunta-respuesta: {len(input_sentences)}")
    return input_sentences, output_sentences

In [4]:
# 4. Preparación de datos
input_sentences, output_sentences = prepare_data(data)

# Mostrar algunos ejemplos
print("\nEjemplos de pares pregunta-respuesta:")
for i in range(3):
    print(f"\nPregunta: {input_sentences[i]}")
    print(f"Respuesta: {output_sentences[i]}")

# 5. Tokenización y secuenciación
def create_tokenizer(input_sentences, output_sentences):
    """Crea y configura el tokenizador"""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(['<start>', '<end>'] + input_sentences + output_sentences)

    input_sequences = tokenizer.texts_to_sequences(input_sentences)
    output_sequences = tokenizer.texts_to_sequences(output_sentences)

    word2idx = tokenizer.word_index
    num_words = len(word2idx) + 1
    max_input_len = max(len(seq) for seq in input_sequences)
    max_output_len = max(len(seq) for seq in output_sequences)

    encoder_input_sequences = pad_sequences(input_sequences, maxlen=max_input_len, padding='post')
    decoder_output_sequences = pad_sequences(output_sequences, maxlen=max_output_len, padding='post')
    decoder_targets = decoder_output_sequences.reshape(
        decoder_output_sequences.shape[0],
        decoder_output_sequences.shape[1],
        1
    )

    return (tokenizer, word2idx, num_words, max_input_len, max_output_len,
            encoder_input_sequences, decoder_output_sequences, decoder_targets)

# Procesar las secuencias
(tokenizer, word2idx, num_words, max_input_len, max_output_len,
 encoder_input_sequences, decoder_output_sequences, decoder_targets) = create_tokenizer(
    input_sentences, output_sentences
)

print(f"\nTamaño del vocabulario: {num_words}")
print(f"Longitud máxima de entrada: {max_input_len}")
print(f"Longitud máxima de salida: {max_output_len}")

Total de pares pregunta-respuesta: 5985

Ejemplos de pares pregunta-respuesta:

Pregunta: hello 
Respuesta: <start> hi how are you  <end>

Pregunta: hi how are you 
Respuesta: <start> not bad and you  <end>

Pregunta: hi 
Respuesta: <start> hello  <end>

Tamaño del vocabulario: 2146
Longitud máxima de entrada: 9
Longitud máxima de salida: 11


In [5]:
# 6. Cargar embeddings
if not os.path.exists('glove.6B.100d.txt'):
    url = 'https://nlp.stanford.edu/data/glove.6B.zip'
    output = 'glove.6B.zip'
    gdown.download(url, output, quiet=False)
    !unzip glove.6B.zip

def load_embeddings(word2idx, embedding_dim=100):
    """Carga y prepara la matriz de embeddings"""
    embeddings_index = {}
    with open('glove.6B.100d.txt', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    embedding_matrix = np.zeros((len(word2idx) + 1, embedding_dim))
    for word, i in word2idx.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

embedding_matrix = load_embeddings(word2idx)
print(f"\nForma de la matriz de embeddings: {embedding_matrix.shape}")

Downloading...
From: https://nlp.stanford.edu/data/glove.6B.zip
To: /content/glove.6B.zip
100%|██████████| 862M/862M [02:39<00:00, 5.41MB/s]


Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       

Forma de la matriz de embeddings: (2146, 100)


In [7]:
# 7. Construcción del modelo
def build_model(num_words, embedding_dim, embedding_matrix, max_input_len, max_output_len, latent_dim=256):
    """Construye los modelos de entrenamiento e inferencia"""
    # Encoder
    encoder_inputs = Input(shape=(max_input_len,))
    encoder_embedding = Embedding(
        num_words,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False
    )(encoder_inputs)

    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(None,))
    decoder_embedding_layer = Embedding(  # Rename to decoder_embedding_layer
        num_words,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False
    )
    decoder_embedding = decoder_embedding_layer(decoder_inputs) # Apply layer here

    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(num_words, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Modelo completo
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    # Modelos de inferencia
    encoder_model = Model(encoder_inputs, encoder_states)

    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_inputs_inference = Input(shape=(None,))
    decoder_embedding_inference = decoder_embedding_layer(decoder_inputs_inference) # Use decoder_embedding_layer

    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding_inference,
        initial_state=decoder_states_inputs
    )
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model = Model(
        [decoder_inputs_inference] + decoder_states_inputs,
        [decoder_outputs] + decoder_states
    )

    return model, encoder_model, decoder_model

In [8]:
# Construir modelos
model, encoder_model, decoder_model = build_model(
    num_words,
    100,  # embedding_dim
    embedding_matrix,
    max_input_len,
    max_output_len
)

# Compilar y mostrar resumen
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
model.summary()

In [10]:
# 8. Entrenamiento
history = model.fit(
    [encoder_input_sequences, decoder_output_sequences[:, :-1]],
    decoder_targets[:, 1:],
    batch_size=64,
    epochs=50,
    validation_split=0.2
)

Epoch 1/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 15ms/step - loss: 3.8060 - val_loss: 2.2295
Epoch 2/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 2.1121 - val_loss: 2.0271
Epoch 3/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 1.8621 - val_loss: 2.0173
Epoch 4/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 1.6829 - val_loss: 1.8448
Epoch 5/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 1.5721 - val_loss: 1.7580
Epoch 6/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 1.5452 - val_loss: 1.7365
Epoch 7/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - loss: 1.4788 - val_loss: 1.7006
Epoch 8/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - loss: 1.4349 - val_loss: 1.6742
Epoch 9/50
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [15]:
# 9. Inferencia
def decode_sequence(input_seq, encoder_model, decoder_model, tokenizer, max_output_len):
    """Genera una respuesta para una secuencia de entrada"""
    states_value = encoder_model.predict(input_seq)

    target_seq = np.zeros((1, 1))

     # Check if '<start>' exists in tokenizer.word_index
    # If it doesn't exist, use the index of the first word (usually 1) or a default value
    start_token_index = tokenizer.word_index.get('<start>', 1)  # 1 is a common default
    target_seq[0, 0] = start_token_index

    decoded_sentence = []

    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        sampled_word = ''
        for word, index in tokenizer.word_index.items():
            if index == sampled_token_index:
                sampled_word = word
                break

        if sampled_word == '<end>' or len(decoded_sentence) > max_output_len:
            break

        decoded_sentence.append(sampled_word)

        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ' '.join(decoded_sentence)

def chat(input_text, encoder_model, decoder_model, tokenizer, max_input_len, max_output_len):
    """Función para chatear con el modelo"""
    input_text = clean_text(input_text)
    input_seq = tokenizer.texts_to_sequences([input_text])
    input_seq = pad_sequences(input_seq, maxlen=max_input_len, padding='post')

    response = decode_sequence(
        input_seq,
        encoder_model,
        decoder_model,
        tokenizer,
        max_output_len
    )

    return response

In [16]:
# 10. Pruebas del modelo
print("\nPruebas del chatbot:")
test_inputs = [
    "hello how are you",
    "what is your name",
    "tell me a joke"
]

for test_input in test_inputs:
    response = chat(
        test_input,
        encoder_model,
        decoder_model,
        tokenizer,
        max_input_len,
        max_output_len
    )
    print(f"\nHumano: {test_input}")
    print(f"Bot: {response}")


Pruebas del chatbot:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m