In [None]:
!pip install datasets
!python -m spacy download es_core_news_sm

## Descargamos un dataset




In [None]:
from datasets import load_dataset

# Load the "es" configuration of amazon_reviews_multi (train split) and keep
# only the reviews rated with exactly 1 or exactly 5 stars.
my_dataset = (
    load_dataset("amazon_reviews_multi", "es", split='train')
    .filter(lambda review: review['stars'] in [1, 5])
)
# Turn every review into a [text, label] pair: label 1 for 5-star reviews,
# label 0 otherwise (i.e. the 1-star reviews that survived the filter).
my_dataset = [
    [review['review_body'], int(review['stars'] == 5)]
    for review in my_dataset
]

In [None]:
# Sanity check: number of [text, label] pairs, followed by the first pair.
print(len(my_dataset), my_dataset[0], sep='\n')

In [None]:
import random

# Shuffle with a fixed seed so that a fresh "Restart & Run All" always selects
# the same 1000 examples. A dedicated Random instance is used so the global
# random state is left untouched.
random.Random(42).shuffle(my_dataset)
# Keep only the first 1000 shuffled examples for the rest of the notebook.
my_dataset_1k = my_dataset[:1000]
print(len(my_dataset_1k))

## Tokenizamos el texto y lo lematizamos

In [None]:
import spacy

# Load the spaCy pipeline that was downloaded in the first cell.
nlp = spacy.load('es_core_news_sm')

# Raw review bodies (first element of each [text, label] pair), in order.
texts = [example[0] for example in my_dataset_1k]

tokenized_texts = []

# nlp.pipe() processes the texts as a stream and yields one Doc per text in the
# original order, which is faster than calling nlp() once per text.
# Iterating over the Docs directly also means no module-level variable called
# `text` is created here, avoiding a clash with the `text` module that is
# imported from Keras later in the notebook.
for counter, doc in enumerate(nlp.pipe(texts), start=1):
  # '\r' rewrites the same output line so the progress counter updates in place.
  print('\rtokenized texts: {}'.format(str(counter)), end='')
  # Keep the lemma of every token that is neither a stop word nor punctuation.
  tokenized_texts.append(
      [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
  )

In [None]:
# Show how many documents were tokenised, then the first raw review followed
# by the list of lemmas that was kept for it.
print(len(tokenized_texts), texts[0], tokenized_texts[0], sep='\n')

## Representamos el texto como un Bag of Words (BoW)

In [4]:
from tensorflow.keras.preprocessing import text

# Upper bound passed to the Keras Tokenizer as `num_words`.
max_vocabulary = 10000

# Build the word index from the lemma lists produced by spaCy.
tokenizer = text.Tokenizer(num_words=max_vocabulary)
tokenizer.fit_on_texts(tokenized_texts)

# Encode each document as one row of TF-IDF weights (bag of words).
X = tokenizer.texts_to_matrix(
    tokenized_texts,
    mode="tfidf",
)

In [None]:
# Dimensions of the bag-of-words matrix.
print(X.shape)
# Print the first row only. Indexing before .tolist() gives the same output as
# X.tolist()[0] without converting the entire matrix to nested Python lists.
print(X[0].tolist())

## Entrenamos una NN con BoW

In [5]:
import numpy as np

# Each example is a [text, label] pair; collect the labels in dataset order
# and turn them into the NumPy target vector used for training.
labels = [label for _review_text, label in my_dataset_1k]
y = np.array(labels)

In [None]:
# Shape of the target vector, followed by every label as a plain Python list.
print(y.shape, y.tolist(), sep='\n')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout

# Width of the input layer. It is read from the fitted tokenizer (its
# `num_words` setting) instead of repeating the literal 10000, so the model
# input cannot drift out of sync with the bag-of-words encoding.
input_dim = tokenizer.num_words

# Feed-forward binary classifier: one hidden layer of 250 ReLU units with
# dropout, followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(250, input_shape=(input_dim,)))
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Dense(1))
model.add(Activation("sigmoid"))

# Binary cross-entropy matches the single sigmoid output and the 0/1 labels.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

In [None]:
# Train on the bag-of-words matrix for 10 epochs with batches of 16 samples,
# holding out 25% of the data for validation; verbose=2 logs one line per epoch.
model.fit(
    X,
    y,
    batch_size=16,
    epochs=10,
    validation_split=0.25,
    verbose=2,
)

## Predecimos con el modelo entrenado

In [None]:
# Two unseen sentences to classify. They are kept under their own names so
# that `texts` and `tokenized_texts` continue to hold the training corpus:
# the index-sequence section further down encodes `tokenized_texts` and reads
# document 100, which fails if only these two sentences are left in it.
new_texts = ['Este producto no es muy bueno', 'Este producto es muy malo']
# NOTE(review): unlike the training loop, stop words and punctuation are not
# filtered out here - confirm whether that difference is intended.
new_tokenized_texts = [
    [token.lemma_ for token in nlp(new_text)]
    for new_text in new_texts
]
# Encode with the tokenizer fitted on the training data. `X` is rebound on
# purpose because the following cell predicts on it.
X = tokenizer.texts_to_matrix(new_tokenized_texts, mode="tfidf")

print(X.shape)
# Index before .tolist() so only the first row is converted to a Python list.
print(X[0].tolist())

In [None]:
# Sigmoid outputs of the model, one row per input sentence.
scores = model.predict(X)
# Threshold at 0.5: scores strictly above it become class 1, the rest class 0.
# The comparison is vectorised and then flattened into a plain list of ints.
predictions = (scores > 0.5).astype(int).ravel().tolist()
print(scores)
print(predictions)

## Representamos el texto como una secuencia de índices

In [None]:
# Replace every lemma by its integer index in the fitted tokenizer, producing
# one variable-length list of indices per document.
X = tokenizer.texts_to_sequences(tokenized_texts)
first_sequence = X[0]
print(first_sequence)
print(len(first_sequence))
# Reading X[100] requires `tokenized_texts` to contain at least 101 documents.
print(len(X[100]))

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every index sequence to the same fixed length: longer sequences are cut
# at the end and shorter ones are padded at the end ('post' in both cases).
sequence_length = 50
X = pad_sequences(
    X,
    maxlen=sequence_length,
    padding='post',
    truncating='post',
)
# First padded sequence, followed by the shape of the resulting array.
print(X[0], X.shape, sep='\n')
