# Importamos modelo para el español

In [None]:
# Download the spaCy Spanish pipeline `es_core_news_sm`; it is loaded further
# down with spacy.load('es_core_news_sm').
!python -m spacy download es_core_news_sm

# Tokenizamos y lematizamos texto

In [None]:
import spacy

# Three short Spanish sentences used to demo tokenization and lemmatization.
my_texts = [
    'Esta es mi primera oración',
    'Vamos a partir estas cadenas de texto en tokens',
    'Y vamos a lematizar esos tokens',
]
nlp = spacy.load('es_core_news_sm')

# For each sentence, print the lemma and part-of-speech tag of every token
# that is not flagged as a stop word.
for sentence in my_texts:
    for tok in nlp(sentence):
        if tok.is_stop:
            continue
        print(tok.lemma_, tok.pos_)

# Importamos dataset

In [None]:
# Install the `datasets` package; `load_dataset` is imported from it in the
# next cell.
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the Spanish ("es") test split of the Amazon multilingual reviews corpus.
dataset = load_dataset("amazon_reviews_multi", "es", split="test")
# Keep only the extreme ratings (1 and 5 stars); they are mapped to a binary
# label later on.
dataset = dataset.filter(lambda example: example['stars'] in (1, 5))
# Shuffle with a fixed seed so that repeated runs see the examples in the same
# order (an unseeded shuffle gives a different order on every run).
dataset = dataset.shuffle(seed=42)
# Parallel columns: texts[i] is the review body whose rating is stars[i].
texts = dataset['review_body']
stars = dataset['stars']
print(len(stars))

# Tokenizamos y lematizamos texto

In [None]:
# Lemmatize every review: tokenized_texts[i] holds the lemma strings of
# texts[i]. nlp.pipe processes the texts as a batched stream, which spaCy
# documents as more efficient than calling nlp() once per text.
tokenized_texts = []
for counter, doc in enumerate(nlp.pipe(texts), start=1):
    # '\r' together with end='' rewrites the same console line, giving a
    # live progress counter instead of one output line per review.
    print('\rtokenized texts: {}'.format(str(counter)), end='')
    tokenized_texts.append([token.lemma_ for token in doc])

In [None]:
# Show the first raw review followed by its list of lemmas, one per line.
print(texts[0], tokenized_texts[0], sep='\n')

# Creamos Bag of Words (BoW)

In [16]:
import numpy as np
from tensorflow.keras.preprocessing import text

# Build the vocabulary from the lemmatized reviews (num_words=10000 caps the
# vocabulary size used when vectorizing).
tokenizer = text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(tokenized_texts)

# X: one TF-IDF weighted bag-of-words row per review.
X = tokenizer.texts_to_matrix(tokenized_texts, mode="tfidf")
# Y: binary label per review -> 1 for a 5-star rating, 0 otherwise.
Y = np.array([1 if rating == 5 else 0 for rating in stars])

In [None]:
# Show the first feature vector followed by its label, one per line.
print(X[0], Y[0], sep='\n')

# Entrenamos una NN con BoW

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout

# Feed-forward binary classifier over the 10000-dimensional bag-of-words
# vectors: one hidden ReLU layer with dropout, then a single sigmoid output.
model = Sequential([
    Dense(250, input_shape=(10000,)),
    Activation("relu"),
    Dropout(0.2),
    Dense(1),
    Activation("sigmoid"),
])

# Binary cross-entropy matches the single sigmoid output and the 0/1 labels.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
model.summary()

In [None]:
# Train the classifier on the bag-of-words matrix X and binary labels Y.
# Left as a bare expression so the notebook still displays the returned
# training history object.
model.fit(
    X,
    Y,
    batch_size=16,
    epochs=10,
    validation_split=0.25,  # fraction of the samples held out for validation
    verbose=2,
)

In [None]:
# Sanity-check the trained model on two hand-written reviews.
# Dedicated names are used so the training data (`texts`, `tokenized_texts`,
# `X`) is not overwritten: clobbering them would make a later re-run of the
# training cell pair a 2-row X with the full-length Y, and a re-run of the
# bag-of-words cell refit the tokenizer on just these two sentences.
sample_texts = ['Este producto es muy bueno', 'Este producto es muy malo']

# Same preprocessing as for training: lemmatize with spaCy, then vectorize
# with the already-fitted tokenizer in TF-IDF mode.
sample_tokens = [[token.lemma_ for token in nlp(sample)] for sample in sample_texts]
sample_X = tokenizer.texts_to_matrix(sample_tokens, mode="tfidf")

# One sigmoid output per input sentence.
print(model.predict(sample_X))