**Desafio 3**

Carlos Villalobos

**Consigna**

- Seleccionar un corpus de texto sobre el cual entrenar el modelo de lenguaje.
- Realizar el pre-procesamiento adecuado para tokenizar el corpus, estructurar el dataset y separar entre datos de entrenamiento y validación.
- Proponer arquitecturas de redes neuronales basadas en unidades recurrentes para implementar un modelo de lenguaje.
- Con el o los modelos que consideren adecuados, generar nuevas secuencias a partir de secuencias de contexto con las estrategias de greedy search y beam search determinístico y estocástico. En este último caso, observar el efecto de la temperatura en la generación de secuencias.

**Sugerencias**

- Durante el entrenamiento, guiarse por el descenso de la perplejidad en los datos de validación para finalizar el entrenamiento. Para ello se provee un callback.
- Explorar utilizar SimpleRNN (celda de Elman), LSTM y GRU.
- rmsprop es el optimizador recomendado para la buena convergencia. No obstante se pueden explorar otros.

Second try from Claude.

In [None]:
import random
import io
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, SimpleRNN, GRU
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.utils import pad_sequences
from scipy.special import softmax

# ... [Keep the existing functions: load_corpus, preprocess_corpus, create_model, PplCallback] ...

# 1. Select and load the corpus
def load_corpus(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Text file used as the training corpus (path inside the Kaggle input directory).
CORPUS_PATH = '/kaggle/input/songs-dataset/songs_dataset/nirvana.txt'
corpus = load_corpus(CORPUS_PATH)

# 2. Preprocess the corpus
def preprocess_corpus(corpus, max_context_length=None):
    """Tokenize ``corpus`` and build next-word prediction samples.

    Each sample is a run of consecutive tokens whose last token is the
    label and whose preceding tokens are the (left-padded) context.

    Args:
        corpus: Raw text to fit the tokenizer on and to turn into samples.
        max_context_length: Maximum number of context tokens per sample.
            ``None`` (default) keeps the original behaviour: every sample is
            a prefix of the whole corpus, so for ``n`` tokens the padded
            matrix has ``(n - 1) * n`` entries. Pass a small positive
            integer to bound memory and training time.

    Returns:
        Tuple ``(predictors, label, tokenizer, max_sequence_length)`` where
        ``predictors`` holds the padded context ids, ``label`` the one-hot
        encoded next word, ``tokenizer`` the fitted tokenizer and
        ``max_sequence_length`` the padded sample length (context + label).

    Raises:
        ValueError: If the corpus yields fewer than two tokens or
            ``max_context_length`` is not a positive integer.
    """
    if max_context_length is not None and max_context_length < 1:
        raise ValueError("max_context_length must be a positive integer or None")

    # Tokenize the text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([corpus])

    # Convert text to a flat list of word ids
    token_ids = tokenizer.texts_to_sequences([corpus])[0]
    if len(token_ids) < 2:
        raise ValueError(
            "corpus must contain at least two tokens to build input/label pairs"
        )

    # A window holds the context tokens plus the single label token.
    if max_context_length is None:
        window = len(token_ids)
    else:
        window = max_context_length + 1

    # One sample per position: the tokens ending at `end`, capped to `window`.
    input_sequences = [
        token_ids[max(0, end - window):end]
        for end in range(2, len(token_ids) + 1)
    ]

    # Left-pad so that the label is always the last column
    max_sequence_length = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(
        input_sequences, maxlen=max_sequence_length, padding='pre'
    )

    # Split into predictors (all but last column) and one-hot label (last column)
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    # +1 because Keras word indices start at 1 and 0 is the padding id
    label = to_categorical(label, num_classes=len(tokenizer.word_index) + 1)

    return predictors, label, tokenizer, max_sequence_length

# Tokenize the corpus and build the (context, next-word) samples
(predictors, label, tokenizer, max_sequence_length) = preprocess_corpus(corpus)

# Hold out 20% of the samples for validation (fixed seed keeps the split repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    predictors,
    label,
    test_size=0.2,
    random_state=42,
)

# 3. Define model architectures
def create_model(model_type, vocab_size, max_sequence_length, embedding_dim=50, units=100):
    """Build and compile a two-layer recurrent language model.

    Architecture: Embedding -> recurrent layer (returning sequences) ->
    recurrent layer -> softmax over the vocabulary.

    Args:
        model_type: One of ``'SimpleRNN'``, ``'LSTM'`` or ``'GRU'``.
        vocab_size: Size of the embedding input and of the softmax output.
        max_sequence_length: Padded sample length including the label; the
            model input therefore has ``max_sequence_length - 1`` time steps.
        embedding_dim: Dimension of the embedding vectors.
        units: Number of units in each recurrent layer.

    Returns:
        The compiled model (categorical cross-entropy loss, rmsprop).

    Raises:
        ValueError: If ``model_type`` is not a supported architecture.
            Previously an unknown type silently produced a model with no
            recurrent layer at all.
    """
    supported = ('SimpleRNN', 'LSTM', 'GRU')
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}"
        )
    # Resolved after validation so an invalid type fails before any layer is built.
    recurrent_layer = {'SimpleRNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}[model_type]

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length-1))
    # The first recurrent layer must emit the full sequence to feed the second one.
    model.add(recurrent_layer(units, return_sequences=True))
    model.add(recurrent_layer(units))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

# Perplexity Callback
class PplCallback(keras.callbacks.Callback):
    """Early stopping driven by validation perplexity.

    Perplexity is computed as ``exp(val_loss)`` from the epoch logs. When it
    has not improved for ``patience`` consecutive epochs, training is stopped
    and the weights of the best epoch are restored.

    Args:
        val_data: Validation data; stored on the instance but not read by
            this callback (perplexity comes from the ``val_loss`` log entry).
        patience: Number of consecutive non-improving epochs tolerated.
    """

    def __init__(self, val_data, patience=5):
        super().__init__()
        self.val_data = val_data
        self.patience = patience
        self.min_perplexity = float('inf')
        self.wait = 0
        self.best_weights = None

    def on_train_begin(self, logs=None):
        """Reset the tracking state so the instance can be reused across fits."""
        self.min_perplexity = float('inf')
        self.wait = 0
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        """Update the best perplexity and stop training once patience runs out."""
        val_loss = (logs or {}).get('val_loss')
        if val_loss is None:
            # Without validation data there is nothing to monitor this epoch.
            print("\nValidation Perplexity unavailable: 'val_loss' missing from logs")
            return

        perplexity = np.exp(val_loss)
        print(f'\nValidation Perplexity: {perplexity:.4f}')

        if perplexity < self.min_perplexity:
            self.min_perplexity = perplexity
            self.wait = 0
            self.best_weights = self.model.get_weights()
            return

        self.wait += 1
        if self.wait >= self.patience:
            self.model.stop_training = True
            # best_weights stays None if no epoch ever improved (e.g. NaN loss).
            if self.best_weights is not None:
                print("Restoring model weights from the end of the best epoch.")
                self.model.set_weights(self.best_weights)

# Modify the training loop
# Train one model per recurrent architecture, keeping the model and its history
models = {}
histories = {}
for model_type in ('SimpleRNN', 'LSTM', 'GRU'):
    print(f"\nTraining {model_type} model:")
    model = create_model(
        model_type,
        len(tokenizer.word_index) + 1,  # +1 for the padding id 0
        max_sequence_length,
    )
    ppl_callback = PplCallback(val_data=(X_val, y_val))

    batch_size = 64
    epochs = 2  # Reduce epochs if still encountering memory issues

    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
        callbacks=[ppl_callback],
        verbose=1,
    )
    models[model_type], histories[model_type] = model, history

# Function to generate sequence with timeout
def generate_sequence_with_timeout(model, tokenizer, seed_text, max_length, num_words, temperature=1.0, mode='greedy', timeout=10):
    """Generate up to ``num_words`` words, giving up after ``timeout`` seconds.

    The text is extended one word at a time through the module-level
    ``generate_sequence`` function and the deadline is checked before each
    word. A single prediction that is already running cannot be interrupted,
    so the limit is enforced between words only.

    Args:
        model: Trained model passed through to ``generate_sequence``.
        tokenizer: Fitted tokenizer passed through to ``generate_sequence``.
        seed_text: Initial context text.
        max_length: Padded sample length passed through to ``generate_sequence``.
        num_words: Number of words to append.
        temperature: Sampling temperature passed through to ``generate_sequence``.
        mode: Generation mode passed through to ``generate_sequence``.
        timeout: Time budget in seconds.

    Returns:
        The generated text, ``"Timeout: Generation took too long"`` if the
        budget ran out, or ``"Error: <message>"`` if generation raised.
    """
    # Imported here because this cell's import block does not import `time`.
    import time

    deadline = time.monotonic() + timeout
    text = seed_text
    try:
        for _ in range(num_words):
            if time.monotonic() >= deadline:
                return "Timeout: Generation took too long"
            # One word per call so the deadline is re-checked between words.
            text = generate_sequence(model, tokenizer, text, max_length, 1, temperature, mode)
    except Exception as e:
        # Best-effort helper: report the failure instead of aborting the notebook.
        print(f"Error in generation: {e}")
        return f"Error: {str(e)}"
    return text

# Generate sequences using different methods
# Greedy generation from the same seed with every trained model
seed_text = "Come as you are"
for model_type, model in models.items():
    print(f"\n{model_type} Model Generation:")
    result = generate_sequence_with_timeout(
        model,
        tokenizer,
        seed_text,
        max_sequence_length,
        num_words=10,
        temperature=1.0,
        mode='greedy',
    )
    print(result)

# Plot the validation loss curve of every model on a single figure
fig, ax = plt.subplots(figsize=(12, 8))
for model_type, history in histories.items():
    ax.plot(history.history['val_loss'], label=f'{model_type} Validation Loss')
ax.set_title('Model Validation Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
fig.savefig('validation_loss.png')
plt.close(fig)

print("Validation loss graph saved as 'validation_loss.png'")

Otra vez

In [1]:
import random
import io
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout, SimpleRNN, GRU
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.utils import pad_sequences
from scipy.special import softmax
import time

# 1. Select and load the corpus
def load_corpus(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Text file used as the training corpus (path inside the Kaggle input directory).
CORPUS_PATH = '/kaggle/input/songs-dataset/songs_dataset/nirvana.txt'
corpus = load_corpus(CORPUS_PATH)

# 2. Preprocess the corpus
def preprocess_corpus(corpus, max_context_length=None):
    """Tokenize ``corpus`` and build next-word prediction samples.

    Each sample is a run of consecutive tokens whose last token is the
    label and whose preceding tokens are the (left-padded) context.

    Args:
        corpus: Raw text to fit the tokenizer on and to turn into samples.
        max_context_length: Maximum number of context tokens per sample.
            ``None`` (default) keeps the original behaviour: every sample is
            a prefix of the whole corpus, so for ``n`` tokens the padded
            matrix has ``(n - 1) * n`` entries. Pass a small positive
            integer to bound memory and training time.

    Returns:
        Tuple ``(predictors, label, tokenizer, max_sequence_length)`` where
        ``predictors`` holds the padded context ids, ``label`` the one-hot
        encoded next word, ``tokenizer`` the fitted tokenizer and
        ``max_sequence_length`` the padded sample length (context + label).

    Raises:
        ValueError: If the corpus yields fewer than two tokens or
            ``max_context_length`` is not a positive integer.
    """
    if max_context_length is not None and max_context_length < 1:
        raise ValueError("max_context_length must be a positive integer or None")

    # Tokenize the text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([corpus])

    # Convert text to a flat list of word ids
    token_ids = tokenizer.texts_to_sequences([corpus])[0]
    if len(token_ids) < 2:
        raise ValueError(
            "corpus must contain at least two tokens to build input/label pairs"
        )

    # A window holds the context tokens plus the single label token.
    if max_context_length is None:
        window = len(token_ids)
    else:
        window = max_context_length + 1

    # One sample per position: the tokens ending at `end`, capped to `window`.
    input_sequences = [
        token_ids[max(0, end - window):end]
        for end in range(2, len(token_ids) + 1)
    ]

    # Left-pad so that the label is always the last column
    max_sequence_length = max(len(seq) for seq in input_sequences)
    input_sequences = pad_sequences(
        input_sequences, maxlen=max_sequence_length, padding='pre'
    )

    # Split into predictors (all but last column) and one-hot label (last column)
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    # +1 because Keras word indices start at 1 and 0 is the padding id
    label = to_categorical(label, num_classes=len(tokenizer.word_index) + 1)

    return predictors, label, tokenizer, max_sequence_length

# Tokenize the corpus and build the (context, next-word) samples
(predictors, label, tokenizer, max_sequence_length) = preprocess_corpus(corpus)

# Hold out 20% of the samples for validation (fixed seed keeps the split repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    predictors,
    label,
    test_size=0.2,
    random_state=42,
)

# 3. Define model architectures
def create_model(model_type, vocab_size, max_sequence_length, embedding_dim=50, units=100):
    """Build and compile a two-layer recurrent language model.

    Architecture: Embedding -> recurrent layer (returning sequences) ->
    recurrent layer -> softmax over the vocabulary.

    Args:
        model_type: One of ``'SimpleRNN'``, ``'LSTM'`` or ``'GRU'``.
        vocab_size: Size of the embedding input and of the softmax output.
        max_sequence_length: Padded sample length including the label; the
            model input therefore has ``max_sequence_length - 1`` time steps.
        embedding_dim: Dimension of the embedding vectors.
        units: Number of units in each recurrent layer.

    Returns:
        The compiled model (categorical cross-entropy loss, rmsprop).

    Raises:
        ValueError: If ``model_type`` is not a supported architecture.
            Previously an unknown type silently produced a model with no
            recurrent layer at all.
    """
    supported = ('SimpleRNN', 'LSTM', 'GRU')
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}"
        )
    # Resolved after validation so an invalid type fails before any layer is built.
    recurrent_layer = {'SimpleRNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}[model_type]

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length-1))
    # The first recurrent layer must emit the full sequence to feed the second one.
    model.add(recurrent_layer(units, return_sequences=True))
    model.add(recurrent_layer(units))
    model.add(Dense(vocab_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

# Perplexity Callback
class PplCallback(keras.callbacks.Callback):
    """Early stopping driven by validation perplexity.

    Perplexity is computed as ``exp(val_loss)`` from the epoch logs. When it
    has not improved for ``patience`` consecutive epochs, training is stopped
    and the weights of the best epoch are restored.

    Args:
        val_data: Validation data; stored on the instance but not read by
            this callback (perplexity comes from the ``val_loss`` log entry).
        patience: Number of consecutive non-improving epochs tolerated.
    """

    def __init__(self, val_data, patience=5):
        super().__init__()
        self.val_data = val_data
        self.patience = patience
        self.min_perplexity = float('inf')
        self.wait = 0
        self.best_weights = None

    def on_train_begin(self, logs=None):
        """Reset the tracking state so the instance can be reused across fits."""
        self.min_perplexity = float('inf')
        self.wait = 0
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        """Update the best perplexity and stop training once patience runs out."""
        val_loss = (logs or {}).get('val_loss')
        if val_loss is None:
            # Without validation data there is nothing to monitor this epoch.
            print("\nValidation Perplexity unavailable: 'val_loss' missing from logs")
            return

        perplexity = np.exp(val_loss)
        print(f'\nValidation Perplexity: {perplexity:.4f}')

        if perplexity < self.min_perplexity:
            self.min_perplexity = perplexity
            self.wait = 0
            self.best_weights = self.model.get_weights()
            return

        self.wait += 1
        if self.wait >= self.patience:
            self.model.stop_training = True
            # best_weights stays None if no epoch ever improved (e.g. NaN loss).
            if self.best_weights is not None:
                print("Restoring model weights from the end of the best epoch.")
                self.model.set_weights(self.best_weights)

# Generate sequence function
def generate_sequence(model, tokenizer, seed_text, max_length, num_words, temperature=1.0, mode='greedy'):
    """Extend ``seed_text`` with ``num_words`` words predicted by ``model``.

    At every step the current text is tokenized, left-padded to
    ``max_length - 1`` ids and fed to the model; the chosen word is appended
    and the longer text becomes the context of the next step.

    Args:
        model: Model whose ``predict`` returns one probability row per input.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Initial context text.
        max_length: Padded sample length used at training time (context + label).
        num_words: Number of words to append.
        temperature: Sampling temperature; only used when ``mode='stochastic'``.
        mode: ``'greedy'`` picks the most probable word, ``'stochastic'``
            samples through ``sample_with_temperature``.

    Returns:
        ``seed_text`` followed by the generated words, space separated. An
        index with no word (such as the padding id 0) appends an empty word.

    Raises:
        ValueError: If ``mode`` is not ``'greedy'`` or ``'stochastic'``.
    """
    if mode not in ('greedy', 'stochastic'):
        raise ValueError(f"Unknown mode {mode!r}; expected 'greedy' or 'stochastic'")

    # Built once: avoids scanning the whole vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        encoded = pad_sequences([encoded], maxlen=max_length-1, padding='pre')

        # Single input row, so take the first (only) probability vector.
        probabilities = model.predict(encoded, verbose=0)[0]
        if mode == 'greedy':
            predicted = int(np.argmax(probabilities))
        else:
            predicted = int(sample_with_temperature(probabilities, temperature))

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text

def sample_with_temperature(preds, temperature):
    """Sample an index from ``preds`` after rescaling it by ``temperature``.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalized. Temperatures below 1 sharpen the distribution towards its
    most likely entry; temperatures above 1 flatten it.

    Args:
        preds: 1-D sequence of probabilities.
        temperature: Strictly positive scaling factor.

    Returns:
        The sampled index (as returned by ``np.argmax``).

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be strictly positive")

    preds = np.asarray(preds).astype('float64')
    scaled = np.log(preds) / temperature
    # Shift by the maximum before exponentiating: the result is unchanged
    # mathematically, but very low temperatures no longer underflow every
    # term to 0 (which produced NaN probabilities).
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Train models
# Train one model per recurrent architecture, keeping the model and its history
models = {}
histories = {}
for model_type in ('SimpleRNN', 'LSTM', 'GRU'):
    print(f"\nTraining {model_type} model:")
    model = create_model(
        model_type,
        len(tokenizer.word_index) + 1,  # +1 for the padding id 0
        max_sequence_length,
    )
    ppl_callback = PplCallback(val_data=(X_val, y_val))

    batch_size = 64
    epochs = 2  # Reduced epochs for faster execution

    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
        callbacks=[ppl_callback],
        verbose=1,
    )
    models[model_type], histories[model_type] = model, history

# Function to generate sequence with timeout
def generate_sequence_with_timeout(model, tokenizer, seed_text, max_length, num_words, temperature=1.0, mode='greedy', timeout=10):
    """Generate up to ``num_words`` words, giving up after ``timeout`` seconds.

    The text is extended one word at a time through ``generate_sequence``
    and the deadline is checked before each word. A single prediction that
    is already running cannot be interrupted, so the limit is enforced
    between words only. (The previous loop returned on its first pass, so
    the timeout never took effect.)

    Args:
        model: Trained model passed through to ``generate_sequence``.
        tokenizer: Fitted tokenizer passed through to ``generate_sequence``.
        seed_text: Initial context text.
        max_length: Padded sample length passed through to ``generate_sequence``.
        num_words: Number of words to append.
        temperature: Sampling temperature passed through to ``generate_sequence``.
        mode: Generation mode passed through to ``generate_sequence``.
        timeout: Time budget in seconds.

    Returns:
        The generated text, ``"Timeout: Generation took too long"`` if the
        budget ran out, or ``"Error: <message>"`` if generation raised.
    """
    deadline = time.monotonic() + timeout
    text = seed_text
    try:
        for _ in range(num_words):
            if time.monotonic() >= deadline:
                return "Timeout: Generation took too long"
            # One word per call so the deadline is re-checked between words.
            text = generate_sequence(model, tokenizer, text, max_length, 1, temperature, mode)
    except Exception as e:
        # Best-effort helper: report the failure instead of aborting the notebook.
        print(f"Error in generation: {e}")
        return f"Error: {str(e)}"
    return text

# Generate sequences using different methods
# Greedy generation from the same seed with every trained model
seed_text = "Come as you are"
for model_type, model in models.items():
    print(f"\n{model_type} Model Generation:")
    result = generate_sequence_with_timeout(
        model,
        tokenizer,
        seed_text,
        max_sequence_length,
        num_words=10,
        temperature=1.0,
        mode='greedy',
    )
    print(result)

# Plot the validation loss curve of every model on a single figure
fig, ax = plt.subplots(figsize=(12, 8))
for model_type, history in histories.items():
    ax.plot(history.history['val_loss'], label=f'{model_type} Validation Loss')
ax.set_title('Model Validation Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()
fig.savefig('validation_loss.png')
plt.close(fig)

print("Validation loss graph saved as 'validation_loss.png'")


Training SimpleRNN model:




Epoch 1/2


I0000 00:00:1727390307.937278    1118 service.cc:145] XLA service 0x7fb588012d60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1727390307.937344    1118 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1727390307.937351    1118 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1727390309.999538    1118 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.0197 - loss: 6.7911
Validation Perplexity: 419.2848
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m611s[0m 4s/step - accuracy: 0.0197 - loss: 6.7882 - val_accuracy: 0.0381 - val_loss: 6.0386
Epoch 2/2
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.0409 - loss: 5.9266
Validation Perplexity: 413.0592
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 2s/step - accuracy: 0.0409 - loss: 5.9267 - val_accuracy: 0.0296 - val_loss: 6.0236

Training LSTM model:
Epoch 1/2
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 867ms/step - accuracy: 0.0319 - loss: 6.5342
Validation Perplexity: 422.7978
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 964ms/step - accuracy: 0.0319 - loss: 6.5319 - val_accuracy: 0.0381 - val_loss: 6.0469
Epoch 2/2
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 