# ¿Cómo funciona un modelo NLP?

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer




Las redes neuronales necesitan entrenar con números por lo que deberemos convertir el texto en números. Esto se hace con la tokenización

In [2]:
sentences = {
    'I love my dog',
    'I, love my cat'
}

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

Se ordenan por frecuencia. "I", "love" y "my" están 2 veces por lo que aparecerán antes que "dog" o "cat" que solo están 1 vez

In [3]:
tokenizer.word_index

{'i': 1, 'love': 2, 'my': 3, 'cat': 4, 'dog': 5}

In [11]:
sentences = {
    'I love my dog',
    'I, love my cat',
    'You love my dog!'
}
tokenizer = Tokenizer(num_words=100, filters='')
tokenizer.fit_on_texts(sentences)
tokenizer.word_index

{'love': 1, 'my': 2, 'i,': 3, 'cat': 4, 'i': 5, 'dog': 6, 'you': 7, 'dog!': 8}

In [14]:
sentences = [
    'I love my dog',
    'I, love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]
# OOV --> Out of vocabulary. Significa que cuando una palabra no está dentro del vocabulario se le asigna un 1
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
tokenizer.word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

Ya hemos tokenizado las frases

In [15]:
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
pad_sequences(sequences, maxlen=5)
# Se nivelan el tamaño de las frases a 5 palabras
# A las frases con menos de 5 palabras se le añaden 0's a la izquierda
# y las que frases que tienen menos de 5 palabras se les quita los datos de la izquierda para almacenar solo el final de la frase

array([[ 0,  5,  3,  2,  4],
       [ 0,  5,  3,  2,  7],
       [ 0,  6,  3,  2,  4],
       [ 9,  2,  4, 10, 11]])

Ya que anteriormente hemos usado oov_token='\<OOV>', las palabras que no están en el diccionario tienen un valor de 1

In [18]:
test_data = [
    'i really love my dog',
    'my dog loves my manatee'
]

tokenizer.texts_to_sequences(test_data)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

## Vamos a descargar un json para averiguar si son saracásticos los titulares de un noticiero

In [22]:
# %wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

UsageError: Line magic function `%curl` not found.


In [26]:
import json
with open("./sarcasm.json", 'r') as f:
  datastore = json.load(f)

In [30]:
datastore[20000]

{'article_link': 'https://www.theonion.com/pediatricians-announce-2011-newborns-are-ugliest-babies-1819572977',
 'headline': 'pediatricians announce 2011 newborns are ugliest babies in 30 years',
 'is_sarcastic': 1}

In [31]:
sentences, labels, urls = list(), list(), list()

for item in datastore:
  sentences.append(item['headline']),
  labels.append(item['is_sarcastic']),
  urls.append(item['article_link'])

In [32]:
len(sentences)

26709

In [33]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

In [34]:
len(word_index)

29657

In [36]:
sequences = tokenizer.texts_to_sequences(sentences)
# Al no especificar longitud pad_sequences iguala todas las frases a la longitud de la frase más larga
padded = pad_sequences(sequences, padding='post') # Con el padding añadimos los 0's al final de la frase

En sentences tenemos los titulares en formato texto

En padded tenemos los titulares tokenizados

In [37]:
sentences[2]

"mom starting to fear son's web series closest thing she will have to grandchild"

In [42]:
padded[2]

array([  145,   838,     2,   907,  1749,  2093,   582,  4719,   221,
         143,    39,    46,     2, 10736,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0])

## Pruebas con otro dataset

In [44]:
import tensorflow_datasets as tfds

imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

  from .autonotebook import tqdm as notebook_tqdm


[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\Adri\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m


Dl Size...: 100%|██████████| 80/80 [04:19<00:00,  3.24s/ MiB]url]
Dl Completed...: 100%|██████████| 1/1 [04:19<00:00, 259.52s/ url]
                                                                        

[1mDataset imdb_reviews downloaded and prepared to C:\Users\Adri\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m


In [45]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset. This is a dataset for binary sentiment
    classification containing substantially more data than previous benchmark
    datasets. We provide a set of 25,000 highly polar movie reviews for training,
    and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_dir='C:\\Users\\Adri\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=int64, num_classes=2),
        'text': Text(shape=(), dtype=string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_s

Una vez importadas las reseñas, las convertimos en texto ya que inicialmente están en formato de tensor.

Luego dividimos estas reseñas en entrenamiento y test.

In [67]:
import numpy as np
train_data, test_data = imdb['train'], imdb['test']

training_sentences, training_labels = list(), list()
testing_sentences, testing_labels = list(), list()

for sentence, label in train_data:
  training_sentences.append(sentence.numpy().decode('utf8'))
  training_labels.append(label.numpy())

for sentence, label in test_data:
  testing_sentences.append(sentence.numpy().decode('utf8'))
  testing_labels.append(label.numpy())

In [68]:
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [69]:
training_sentences[0], training_labels[0]

("This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
 0)

In [70]:
testing_sentences[0], testing_labels[0]

("There are films that make careers. For George Romero, it was NIGHT OF THE LIVING DEAD; for Kevin Smith, CLERKS; for Robert Rodriguez, EL MARIACHI. Add to that list Onur Tukel's absolutely amazing DING-A-LING-LESS. Flawless film-making, and as assured and as professional as any of the aforementioned movies. I haven't laughed this hard since I saw THE FULL MONTY. (And, even then, I don't think I laughed quite this hard... So to speak.) Tukel's talent is considerable: DING-A-LING-LESS is so chock full of double entendres that one would have to sit down with a copy of this script and do a line-by-line examination of it to fully appreciate the, uh, breadth and width of it. Every shot is beautifully composed (a clear sign of a sure-handed director), and the performances all around are solid (there's none of the over-the-top scenery chewing one might've expected from a film like this). DING-A-LING-LESS is a film whose time has come.",
 1)

Aplicamos Word Embedding

El word embedding es una técnica de deeplearning que permite a las computadoras entender las palabras de manera más matemática y computacional.

Esto permite realizar operaciones matemáticas con palabras, como encontrar la similitud entre palabras, la analogía (por ejemplo, "hombre" está a "mujer" como "rey" está a "reina"), entre otras operaciones.

In [71]:
# Parámetros para la tokenización y el relleno
vocab_size = 10000  # Tamaño del vocabulario: número máximo de palabras únicas
max_length = 120    # Longitud máxima de las secuencias después del relleno/truncado
embedding_dim = 16  # Dimensión del espacio de incrustación para cada palabra
trunc_type = 'post' # Tipo de truncado: 'post' trunca al final, 'pre' al principio
oov_tok = '<OOV>'   # Token para palabras fuera del vocabulario durante la tokenización

# Inicializar el tokenizador con el tamaño de vocabulario definido y el token para palabras fuera del vocabulario
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Ajustar el tokenizador en las oraciones de entrenamiento para desarrollar el índice de palabras
tokenizer.fit_on_texts(training_sentences)

# Obtener el índice de palabras que mapea palabras a su representación entera
word_index = tokenizer.word_index

# Convertir las oraciones de entrenamiento en secuencias de enteros
sequences = tokenizer.texts_to_sequences(training_sentences)

# Rellenar las secuencias para asegurar una longitud uniforme
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Convertir las oraciones de prueba en secuencias usando el mismo tokenizador
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Rellenar las secuencias de prueba
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)


In [72]:
import tensorflow as tf
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(12, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [73]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 120, 16)           160000    
                                                                 
 flatten_2 (Flatten)         (None, 1920)              0         
                                                                 
 dense_4 (Dense)             (None, 12)                23052     
                                                                 
 dense_5 (Dense)             (None, 1)                 13        
                                                                 
Total params: 183065 (715.10 KB)
Trainable params: 183065 (715.10 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [74]:
num_epochs = 10
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x222b88e6b90>

In [75]:
embedding_layer = model.layers[0]
embedding_weights = embedding_layer.get_weights()[0]

In [76]:
reverse_word_index = tokenizer.index_word

In [77]:
import io
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
  word_name = reverse_word_index[word_num]
  word_embedding = embedding_weights[word_num]
  out_m.write(word_name + "\n")
  out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

In [78]:
# try:
#   from google.colab import files
# except ImportError:
#   pass

# else:
#   files.download('vecs.tsv')
#   files.download('meta.tsv')

[Página para visualizar los datos](https://projector.tensorflow.org/)