In [11]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

# Cargar datos con las 50,000 palabras más frecuentes
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=50000)


In [12]:
X_train = sequence.pad_sequences(X_train, maxlen=150)
X_test = sequence.pad_sequences(X_test, maxlen=150)


In [13]:
import tensorflow as tf

# Obtener el índice de palabras del dataset de IMDB
word_index = tf.keras.datasets.imdb.get_word_index()

# Invertir el índice para que puedas decodificar las secuencias numéricas
inverted_word_index = dict(
    (i + 3, word) for (word, i) in word_index.items()
)
inverted_word_index[1] = "[START]"
inverted_word_index[2] = "[OOV]"



In [14]:
import numpy as np

# Definir listas básicas de palabras positivas y negativas
positive_words = set(["good", "great", "excellent", "amazing", "awesome", "fantastic"])
negative_words = set(["bad", "terrible", "horrible", "worst", "awful"])

def extract_features(sequences, word_index):
    features = []
    for sequence in sequences:
        decoded_review = [inverted_word_index.get(i, "") for i in sequence]
        length = len(sequence)  # Longitud de la crítica
        positive_count = sum(1 for word in decoded_review if word in positive_words)
        negative_count = sum(1 for word in decoded_review if word in negative_words)
        total_words = len(decoded_review)
        if total_words > 0:
            pos_neg_ratio = (positive_count - negative_count) / total_words
        else:
            pos_neg_ratio = 0
        features.append([length, pos_neg_ratio])
    return np.array(features)

# Extraer características para el set de entrenamiento y prueba
train_features = extract_features(X_train, word_index)
test_features = extract_features(X_test, word_index)


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)


In [20]:
from tensorflow.keras.layers import Input, Concatenate, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.models import Model

# Entrada de secuencias de longitud 150
input_seq = Input(shape=(150,))
x = Embedding(50000, 128)(input_seq)
x = LSTM(128, return_sequences=True)(x)
x = Dropout(0.3)(x)
x = LSTM(64, return_sequences=False)(x)
x = Dropout(0.3)(x)

# Entrada de características adicionales (2 características)
input_features = Input(shape=(2,))

# Concatenar ambas entradas
concat = Concatenate()([x, input_features])

# Capas densas adicionales
x = Dense(64, activation='relu')(concat)
x = Dropout(0.5)(x)
x = Dense(32, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

# Definir el modelo
model = Model([input_seq, input_features], output)

# Compilar el modelo
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [21]:
# Entrenar el modelo con la nueva longitud de secuencia y características
model.fit([X_train, train_features_scaled], y_train, 
          batch_size=32, 
          epochs=15, 
          validation_data=([X_test, test_features_scaled], y_test), 
          verbose=2)


Epoch 1/15
782/782 - 202s - 259ms/step - accuracy: 0.7449 - loss: 0.5151 - val_accuracy: 0.8049 - val_loss: 0.4288
Epoch 2/15
782/782 - 204s - 261ms/step - accuracy: 0.8656 - loss: 0.3261 - val_accuracy: 0.7731 - val_loss: 0.4906
Epoch 3/15
782/782 - 195s - 249ms/step - accuracy: 0.9154 - loss: 0.2283 - val_accuracy: 0.8565 - val_loss: 0.3430
Epoch 4/15
782/782 - 288s - 369ms/step - accuracy: 0.9578 - loss: 0.1271 - val_accuracy: 0.8475 - val_loss: 0.4049
Epoch 5/15
782/782 - 1109s - 1s/step - accuracy: 0.9762 - loss: 0.0743 - val_accuracy: 0.8354 - val_loss: 0.5464
Epoch 6/15
782/782 - 189s - 241ms/step - accuracy: 0.9832 - loss: 0.0543 - val_accuracy: 0.8440 - val_loss: 0.6007
Epoch 7/15
782/782 - 1366s - 2s/step - accuracy: 0.9919 - loss: 0.0273 - val_accuracy: 0.8447 - val_loss: 0.6779
Epoch 8/15
782/782 - 189s - 242ms/step - accuracy: 0.9927 - loss: 0.0240 - val_accuracy: 0.8400 - val_loss: 0.6954
Epoch 9/15
782/782 - 693s - 886ms/step - accuracy: 0.9950 - loss: 0.0156 - val_accur

<keras.src.callbacks.history.History at 0x314d243d0>

In [22]:
# Evaluar el modelo
loss, accuracy = model.evaluate([X_test, test_features_scaled], y_test, batch_size=32, verbose=2)
print('Pérdida de la prueba:', loss)
print('Exactitud de la prueba:', accuracy)


782/782 - 29s - 37ms/step - accuracy: 0.8395 - loss: 0.8289
Pérdida de la prueba: 0.8288770914077759
Exactitud de la prueba: 0.8394799828529358
