In [None]:
import os

import gensim.models
from gensim import utils
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import datapath

# 🆘 Clases auxiliares

In [None]:
class EpochLogger(CallbackAny2Vec):
    """Callback that prints the training loss contributed by each epoch.

    At the end of every epoch the value of
    ``model.get_latest_training_loss()`` is read and the value read at the end
    of the previous epoch is subtracted from it, so the printed number is the
    change in the reported loss over that single epoch.

    Attributes:
        epoch: Zero-based index of the epoch that will be reported next.
        loss_previous_step: Loss value read at the end of the previous epoch
            (0 until the first epoch has finished).
    """

    def __init__(self):
        self.epoch = 0
        # Created here rather than lazily in on_epoch_end so the attribute
        # always exists; a baseline of 0 makes the first epoch print the raw
        # loss without needing a special case.
        self.loss_previous_step = 0

    def on_train_begin(self, model):
        """Reset the counters when a training run starts.

        Without this, reusing one logger instance for a second training run
        would number epochs from where the previous run stopped and subtract
        a baseline that belongs to the previous run.
        """
        self.epoch = 0
        self.loss_previous_step = 0

    def on_epoch_end(self, model):
        """Print the loss added during the epoch that just finished.

        Args:
            model: The model being trained; only its
                ``get_latest_training_loss()`` method is used.
        """
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss


# This class must be given the paths of the .txt files that form the corpus.
class MyCorpus:
    """Restartable iterable that yields one tokenised sentence per text line.

    Every line of every file in ``paths`` is passed through
    ``utils.simple_preprocess`` and the resulting list of tokens is yielded.
    A fresh pass over all files starts each time the object is iterated.

    Args:
        paths: An iterable of file paths, or a single path (``str``, ``bytes``
            or any object implementing ``__fspath__``).
    """

    def __init__(self, paths):
        # A lone path would otherwise be iterated element by element
        # (character by character for a string), so wrap it in a list.
        if isinstance(paths, (str, bytes)) or hasattr(paths, "__fspath__"):
            paths = [paths]
        # Materialise the paths: a one-shot iterable such as a generator would
        # be exhausted after the first pass, leaving later passes empty.
        self.paths = list(paths)

    def __iter__(self):
        for path in self.paths:
            # NOTE(review): no explicit encoding is given, so the platform
            # default is used - confirm the corpus files match it.
            with open(path, 'r') as file:
                for line in file:
                    yield utils.simple_preprocess(line)

# 2️⃣ Word2Vec

In [None]:
# Text files that make up the training corpus.
corpus_paths = [
    "../Corpus/Txt/TheFinalEmpire.txt",
    "../Corpus/Txt/TheHeroOfAges.txt",
]
sentences = MyCorpus(corpus_paths)
epoch_logger = EpochLogger()

# Train the embeddings. compute_loss=True is what lets the callback read the
# loss through model.get_latest_training_loss() after each epoch.
model = gensim.models.Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=3,
    negative=5,
    epochs=300,
    compute_loss=True,
    callbacks=[epoch_logger],
)

Loss after epoch 0: 932805.0
Loss after epoch 1: 828965.625
Loss after epoch 2: 776026.875
Loss after epoch 3: 736761.75
Loss after epoch 4: 710022.25
Loss after epoch 5: 627609.5
Loss after epoch 6: 581019.5
Loss after epoch 7: 579391.5
Loss after epoch 8: 562754.5
Loss after epoch 9: 549741.0
Loss after epoch 10: 554954.5
Loss after epoch 11: 540146.0
Loss after epoch 12: 527308.0
Loss after epoch 13: 465570.0
Loss after epoch 14: 463860.0
Loss after epoch 15: 468987.0
Loss after epoch 16: 457845.0
Loss after epoch 17: 463745.0
Loss after epoch 18: 455207.0
Loss after epoch 19: 452476.0
Loss after epoch 20: 451015.0
Loss after epoch 21: 455041.0
Loss after epoch 22: 447654.0
Loss after epoch 23: 453106.0
Loss after epoch 24: 444332.0
Loss after epoch 25: 442712.0
Loss after epoch 26: 442487.0
Loss after epoch 27: 449150.0
Loss after epoch 28: 430486.0
Loss after epoch 29: 440178.0
Loss after epoch 30: 437396.0
Loss after epoch 31: 381919.0
Loss after epoch 32: 359884.0
Loss after epo

In [None]:
# Persist the trained model. The target folder is created first so that a
# long training run is not lost to a missing directory.
MODEL_PATH = "../Embeddings/Word2Vec/Mistborn_model_300epoch"
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
model.save(MODEL_PATH)

# 📋 TensorBoard

In [None]:
import tensorflow as tf
import os
from tensorboard.plugins import projector
import gensim

# Load the model saved earlier. TODO: change the path.
model = gensim.models.Word2Vec.load("../Embeddings/Word2Vec/Mistborn_model_300epoch")

# Prepare the model for visualisation: keep the first MAX_WORDS entries of the
# vocabulary (presumably the most frequent words - gensim orders the
# vocabulary by frequency by default; confirm if that setting was changed).
LOG_DIR = '../Embeddings/Word2Vec/logs/'
MAX_WORDS = 10000  # single source of truth for the label and vector slices
# Create the log folder if needed so the writes below cannot fail on a
# missing directory.
os.makedirs(LOG_DIR, exist_ok=True)
metadata = os.path.join(LOG_DIR, 'metadata.tsv')
vocab_list = model.wv.index_to_key[:MAX_WORDS]

# Write one word per line; line i labels row i of the exported vectors.
# UTF-8 is set explicitly so non-ASCII tokens do not depend on the platform
# default encoding.
with open(metadata, 'w', encoding='utf-8') as metadata_file:
    for word in vocab_list:
        metadata_file.write(f"{word}\n")


# Save the vectors of the same MAX_WORDS words as a TensorFlow checkpoint.
weights = tf.Variable(model.wv.vectors[:MAX_WORDS], dtype=tf.float32, name='word2vec')
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(LOG_DIR, "embedding.ckpt"))

# Configure the projector: point it at the checkpointed variable and at the
# metadata file written above (path given relative to LOG_DIR).
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'

with open(os.path.join(LOG_DIR, 'projector_config.pbtxt'), 'w') as f:
    f.write(str(config))