<h1 align="center">
  <b>Universidad Autónoma de Chihuahua</b>
</h1>
<h2 align="center">
  <b>Facultad de Ingeniería</b>
</h2>
<br>
<p align = "center">
  <a href="https://sega.uach.mx/">
     <img src="https://drive.google.com/uc?id=1n8NdPSF4WAZRxVomm74jf0zLU9ibdxqT">
  </a>
</p>

<h1 align="center">
  <b>Natural Language Processing with RNNs</b>
</h1>
<br>

<h2 align="center">
  <b>Data Science</b>
</h2>

<h3 align="center">
  <b>Proyecto Tercer Parcial</b>
</h3>
<br>
<p align="center">
<b>Link GitHub</b><br>
<href> </href>
</p>
<p align="center">
  <b>Alumnos: </b><br>
  Juan Luis Del Valle Sotelo - 338912 
  <br>
  Valeria Sofía Nevárez Juárez - 338811
</p>
<p align="center">
  <b>Profesor:</b>
  Jesús Roberto López Santillán
</p>
<br>



<p align="left">
  A miércoles 24 de mayo de 2023
</p>

In [None]:
import sys
assert sys.version_info >= (3, 5)

IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB:
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers

import sklearn
assert sklearn.__version__ >= "0.20"

import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

import numpy as np
import os

np.random.seed(42)
tf.random.set_seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into ``IMAGES_PATH``.

    Args:
        fig_id: Base file name (without extension); also echoed in the log line.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target_path = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target_path, format=fig_extension, dpi=resolution)

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

# Toy example on the integers 0..14: overlapping windows of n_steps values
# (stride 2), each split into an input (all but the last value) and a target
# (all but the first value), then grouped into batches of 3.
n_steps = 5
dataset = (
    tf.data.Dataset.from_tensor_slices(tf.range(15))
    .window(n_steps, shift=2, drop_remainder=True)
    # window() yields nested datasets; batching each one to its full length
    # turns it into a single tensor.
    .flat_map(lambda nested_window: nested_window.batch(n_steps))
    .shuffle(10)
    .map(lambda flat_window: (flat_window[:-1], flat_window[1:]))
    .batch(3)
    .prefetch(1)
)
for index, (X_batch, Y_batch) in enumerate(dataset):
    print("_" * 20, "Batch", index, "\nX_batch")
    print(X_batch.numpy())
    print("=" * 5, "\nY_batch")
    print(Y_batch.numpy())

____________________ Batch 0 
X_batch
[[6 7 8 9]
 [2 3 4 5]
 [4 5 6 7]]
===== 
Y_batch
[[ 7  8  9 10]
 [ 3  4  5  6]
 [ 5  6  7  8]]
____________________ Batch 1 
X_batch
[[ 0  1  2  3]
 [ 8  9 10 11]
 [10 11 12 13]]
===== 
Y_batch
[[ 1  2  3  4]
 [ 9 10 11 12]
 [11 12 13 14]]


## Loading the Data and Preparing the Dataset

In [None]:
# Location of the text file that is read below (a relative path).
file_path = "south_park.txt" 

In [None]:
# Read the whole file into memory. The encoding is stated explicitly because the
# text contains non-ASCII characters and open() would otherwise fall back to a
# platform-dependent default encoding.
with open(file_path, 'r', encoding="utf-8") as file:
    file_contents = file.read()

In [None]:
# Characters that are removed from the text before tokenization.
unwanted_chars = "àáâäèéëíîñóôöúüабйнтщў؟آابتثخدرزشعفلمنهوچکی™"

# Mapping a code point to None makes str.translate() delete that character.
translation_table = {ord(char): None for char in unwanted_chars}
south_park_text = file_contents.translate(translation_table)


In [None]:
# Preview the first 200 characters of the cleaned text.
print(south_park_text[:200])

Stan 
You guys, you guys! Chef is going away. 

Kyle
Going away? For how long?

Stan
Forever.

Chef
I'm sorry boys.

Stan
Chef said he's been bored, so he joining a group called the Super Adventure Cl


In [None]:
# Every distinct character of the lower-cased text, sorted and joined into one string.
"".join(sorted(set(south_park_text.lower())))

"\t\n !#$%&'()*+,-./0123456789:;=?@_abcdefghijklmnopqrstuvwxyz¡¿’"

In [None]:
# Character-level tokenizer: each character (not each word) gets an integer id.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# Note: a single string is passed here rather than a list of texts.
tokenizer.fit_on_texts(south_park_text)

In [None]:
# Encode a sample word into its sequence of character ids.
tokenizer.texts_to_sequences(["First"])

[[25, 7, 9, 8, 3]]

In [None]:
# Decode the ids back to text; the output shows lower-cased, space-separated characters.
tokenizer.sequences_to_texts([[25, 7, 9, 8, 3]])

['f i r s t']

In [None]:
# Number of distinct characters known to the tokenizer.
max_id = len(tokenizer.word_index) 
# NOTE(review): fit_on_texts() received one string, so this is presumably the
# total number of characters in the text -- TODO confirm.
dataset_size = tokenizer.document_count

In [None]:
# Encode the full text as one id sequence; subtracting 1 shifts the ids down by
# one (the generation code below adds 1 back before decoding).
[encoded] = np.array(tokenizer.texts_to_sequences([south_park_text])) - 1
# Only the first 90% of the sequence is used to build the training dataset.
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [None]:
n_steps = 100
# One extra element per window: the target is the input shifted by one position.
window_length = n_steps + 1 
# Overlapping windows advancing one element at a time; incomplete windows are dropped.
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [None]:
# Batch each window to its full length and flatten the result into the dataset.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [None]:
# Re-seed NumPy and TensorFlow before the shuffle in the next cell.
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
batch_size = 32
# Shuffle with a buffer of 10000 elements, then group into batches.
dataset = dataset.shuffle(10000).batch(batch_size)
# Inputs drop the last position of each window; targets drop the first.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [None]:
# One-hot encode the inputs only; the targets stay as integer ids.
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [None]:
# Prefetch one batch ahead.
dataset = dataset.prefetch(1)

In [None]:
# Sanity check: print the input and target shapes of the first batch.
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 62) (32, 100)


## Creating and Training the Model

In [None]:
# Two stacked GRU layers that emit an output at every time step, followed by a
# per-time-step softmax over the max_id character classes.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])
# Sparse loss: the dataset keeps its targets as integer ids, not one-hot vectors.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Using the Model to Generate Text

In [None]:
def preprocess(texts):
    """Turn a list of strings into one-hot encoded model inputs.

    The texts are encoded with the notebook-level ``tokenizer``, the ids are
    shifted down by one, and the result is one-hot encoded with depth ``max_id``.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    shifted_ids = np.array(token_ids) - 1
    return tf.one_hot(shifted_ids, depth=max_id)

In [None]:
X_new = preprocess(["How are yo"])
# Highest-scoring class id at every time step.
Y_pred = np.argmax(model(X_new), axis=-1)
# Add 1 back before decoding, then take the last character of the first decoded text.
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] 

'u'

In [None]:
tf.random.set_seed(42)

# Demo: draw 40 class indices from log-probabilities corresponding to 0.5 / 0.4 / 0.1.
tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()

array([[0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
        2, 0, 0, 1, 1, 1, 0, 0, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0]])

In [None]:
def next_char(text, temperature=1):
    """Sample one character to follow ``text`` using the notebook-level ``model``.

    Args:
        text: Seed string fed to the model.
        temperature: Divisor applied to the log-probabilities before sampling.

    Returns:
        The decoded text for the single sampled character id.
    """
    encoded_text = preprocess([text])
    # The -1: slice keeps the time axis, so the result stays two-dimensional.
    last_step_proba = model(encoded_text)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
    # Add 1 back: preprocess() subtracted 1 from the tokenizer ids.
    decoded = tokenizer.sequences_to_texts((sampled_id + 1).numpy())
    return decoded[0]

In [None]:
tf.random.set_seed(42)

# Sample a single next character for a short prompt.
next_char("How are yo", temperature=1)

'u'

In [None]:
def complete_text(text, n_chars=1000, temperature=1):
    """Extend ``text`` by ``n_chars`` sampled characters and return the result.

    Each step passes the whole text generated so far to ``next_char`` and
    appends the sampled character.
    """
    generated = text
    for _step in range(n_chars):
        generated = generated + next_char(generated, temperature)
    return generated

In [None]:
tf.random.set_seed(42)

# Generate text (default 1000 extra characters) from a one-letter seed.
print(complete_text("t"))

t i should help and together help!  kick you mysterion. branding them about what i think that you gotuen anymore, complete you guys?  tupperwar, and soon, is dead weth unshou sesurer tacklite because etoses sack.  or, in a stopping crilnd of the coon. i actually it feel lekens! the wrong is, he're actually even destroy factices! but the coon and friends how cthulhu is just started people is wrong?!

coon
look, what should i do? 

singer
right, fucking guy!

toolshed
aww! where happened?! 

mosquito
yeah, no. 

liane
yes, because it's my undically grautan' shop even and coon, 'cause i'll totally be done into another dimension. and an the gleas rainbow.

coon
what do you mean as this curse? be should we do?

memt reporter
coon one reamonion. without him.

toolshed
all when we're gonna die-

woman k
toolshed!

toolshed
gotard! dude. you soon, mysterion! let's go home! 

red goth
i'll took dembsibey, per-that evil called owe my name!

loane
so the dark lord age!

coon
no! baby ap
lot right

## Stateful RNN

In [None]:
# Re-seed TensorFlow before building the stateful pipeline and model.
tf.random.set_seed(42)

In [None]:
# Non-overlapping consecutive windows (shift=n_steps) in batches of one.
# NOTE(review): the following cell rebinds `dataset` before it is used for training.
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded[:train_size])
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda nested_window: nested_window.batch(window_length))
    .batch(1)
    # Inputs drop the last position of each window; targets drop the first.
    .map(lambda batch_windows: (batch_windows[:, :-1], batch_windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

In [None]:
batch_size = 32
# Split the training ids into batch_size contiguous parts and window each part
# separately; row i of every batch is then always taken from part i.
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
    part_windows = (
        tf.data.Dataset.from_tensor_slices(encoded_part)
        .window(window_length, shift=n_steps, drop_remainder=True)
        .flat_map(lambda nested_window: nested_window.batch(window_length))
    )
    datasets.append(part_windows)
dataset = (
    tf.data.Dataset.zip(tuple(datasets))
    # Stack one window from every part into a single batch tensor.
    .map(lambda *part_batch: tf.stack(part_batch))
    .map(lambda batch_windows: (batch_windows[:, :-1], batch_windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

In [None]:
# Stateful variant of the two-layer GRU model; the batch dimension is fixed to
# batch_size through batch_input_shape.
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2,
                     batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])

In [None]:
class ResetStatesCallback(keras.callbacks.Callback):
    """Callback that resets the model's recurrent states when an epoch begins."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states of the attached model.

        ``logs`` defaults to ``None`` to match the signature of the base
        ``Callback.on_epoch_begin`` hook, so the method also works when it is
        invoked without a ``logs`` argument.
        """
        self.model.reset_states()

In [None]:
# Train the stateful model; the callback resets its states at each epoch start.
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset, epochs=50,
                    callbacks=[ResetStatesCallback()])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Same layer sizes as the stateful model but without stateful=True and without a
# fixed batch size, so it accepts a batch of any size.
# NOTE(review): dropout is omitted here -- presumably because this copy is only
# used for text generation; confirm this is intentional.
stateless_model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation="softmax"))
])

In [None]:
# Build the model for inputs with unspecified batch size and sequence length.
stateless_model.build(tf.TensorShape([None, None, max_id]))

In [None]:
# Copy the trained weights over from the stateful model.
stateless_model.set_weights(model.get_weights())
# Rebind `model`: next_char() reads this notebook-level name, so generation
# below uses the stateless copy.
model = stateless_model

In [None]:
tf.random.set_seed(42)

# Generate text from the seed "cartman" with the weights trained statefully.
print(complete_text("cartman"))

cartman
the fuck here are to the walk, then you thanks to fig brand?

mr. mackey
get arout warchist you, buke, but, i'm just going to be of the night, and you grist soes to the las somebody aren't real?

cartman
isn't a little new of said's knowed to keep shut the dad, i'm gonna did it to really addew here, we had to deptine and inshur. i've way-

stan
what is that?!

butters
just to a speak is bexise your hark. i'm not sorry, they peet the thried is the country, and i moaas.

welly
how chen?! 

boy
mush, i'm gonna foll brywhide cartumate d-!

croid and now, we shopped that put of 'he jimmy's popvy party guy in showidnel big... 

louse 2
it's going on, really gonna kyle video the fact is cure. it's trying.

cartman
i have to think of the hoado?

liane
oh, let's in.

kyle
hww you don't keep me where!  you're doing!

butters
hello, don't you sool kyle! i'm neliing that?

cartman
dube it wouldn't you dead. eyee are writtle guy. i stand some of the makes and now lage!

cartman
mimsy, p
lat

# Conclusión




En resumen, las redes neuronales recurrentes (RNN) pueden generar y completar palabras, pero el texto resultante carece de coherencia gramatical. En este caso, el rendimiento se habría podido mejorar entrenando el modelo durante más épocas, pero se presentaron problemas de desconexión en Google Colab. A pesar de sus limitaciones, las RNN pueden ser útiles en contextos creativos o como punto de partida para generar texto más avanzado. Mejorar y refinar estos modelos en el futuro puede llevar a resultados más coherentes y comprensibles.