In [None]:
!pip install datasets
!python -m spacy download es_core_news_sm

## Entrenamos una red neuronal (NN) con una capa de embeddings

In [None]:
from datasets import load_dataset

# Spanish configuration, training split, of the "amazon_reviews_multi" dataset.
# NOTE(review): confirm this dataset id is still downloadable from the Hub.
reviews_es = load_dataset("amazon_reviews_multi", "es", split='train')

# Keep only the reviews rated 1 or 5 stars; every other rating is dropped.
polar_reviews = reviews_es.filter(lambda row: row['stars'] in (1, 5))

# Final form used by the following cells: a list of [review text, label] pairs,
# with label 1 for a 5-star review and 0 for a 1-star review.
my_dataset = [[row['review_body'], int(row['stars'] == 5)] for row in polar_reviews]

In [None]:
from tensorflow.keras.preprocessing import text
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The first element of every [review, label] pair is the raw review text.
texts = [review for review, _ in my_dataset]

# Build the word -> integer index vocabulary from the whole corpus.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(texts)

# Encode each review as a list of word indices, then force every sequence to
# exactly 20 entries: longer reviews are cut at the end and shorter ones are
# padded at the end ('post' for both truncating and padding).
encoded_reviews = tokenizer.texts_to_sequences(texts)
X = pad_sequences(encoded_reviews, maxlen=20, padding='post', truncating='post')

In [None]:
import numpy as np

# The second element of every [review, label] pair is the binary label.
labels = [label for _, label in my_dataset]

# Array form of the labels, used as the training target.
y = np.asarray(labels)

In [None]:
# Single, de-duplicated import block: every name the original cell imported is
# still imported (including Activation, which this cell itself does not use).
from tensorflow.keras.layers import Activation, Dense, Dropout, Embedding, Flatten
from tensorflow.keras.models import Sequential

# Binary classifier whose first layer holds the word embeddings to be learned:
#   Embedding -> Flatten -> Dropout -> Dense(1, sigmoid)
model = Sequential([
    # One 10-dimensional vector per tokenizer word, plus one extra row because
    # index 0 is not assigned to any word (it is the padding value).
    # input_length=20 matches the maxlen used when padding X.
    Embedding(len(tokenizer.word_counts) + 1, 10, input_length=20),
    # Concatenate the 20 word vectors of a review into one flat feature vector.
    Flatten(),
    Dropout(0.7),
    # Single sigmoid unit producing the probability of the positive class.
    Dense(1, activation="sigmoid"),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 2 epochs with mini-batches of 16 samples, holding out 25% of the
# data for validation.
model.fit(
    X,
    y,
    batch_size=16,
    epochs=2,
    validation_split=0.25,
    verbose=2,
)

## Extraemos la capa de embeddings

In [None]:
# Weights of the model's first layer (the Embedding layer); row i is looked up
# below as the vector of the word whose tokenizer index is i.
embeddings = model.layers[0].get_weights()[0]

# Map every vocabulary word to its vector. Row 0 is skipped because index 0 is
# the padding value and has no entry in tokenizer.index_word.
word_embeddings = {
  tokenizer.index_word[position]: row
  for position, row in enumerate(embeddings)
  if position != 0
}

# Sanity check: show the vector learned for one word.
print(word_embeddings['roto'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the vectors of two word pairs.
# NOTE(review): presumably each pair shares the same sentiment polarity
# (two positive words, two negative words) -- confirm the intent.
for first_word, second_word in (('precioso', 'bonito'), ('tarde', 'defectuoso')):
  similarity = cosine_similarity([word_embeddings[first_word]], [word_embeddings[second_word]])
  print(similarity)

In [None]:
# Cosine similarity for the same four words, paired the other way round.
# NOTE(review): presumably these pairs mix opposite sentiment polarities, as a
# contrast with the previous cell -- confirm the intent.
for first_word, second_word in (('precioso', 'defectuoso'), ('tarde', 'bonito')):
  similarity = cosine_similarity([word_embeddings[first_word]], [word_embeddings[second_word]])
  print(similarity)

## Guardamos embeddings en ficheros

In [None]:
# Export the learned embeddings as two line-aligned TSV files:
#   my_embeddings.tsv -- one vector per line, components separated by tabs
#   my_vocabulary.tsv -- the matching word on the same line number
# The `with` block closes both files even if a write fails, and the explicit
# UTF-8 encoding keeps accented characters portable across platforms.
with open('my_embeddings.tsv', 'w', encoding='utf-8') as embeddings_fh, \
     open('my_vocabulary.tsv', 'w', encoding='utf-8') as vocabulary_fh:
  for word, vector in word_embeddings.items():
    embeddings_fh.write('\t'.join(str(v) for v in vector) + '\n')
    vocabulary_fh.write(word + '\n')