# Cargar datos

In [72]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Seed Python's `random`, NumPy and TensorFlow once, before any stochastic
# step (dataset shuffling, weight initialisation), so that a fresh
# Restart & Run All repeats the same run.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if
# bit-exact reproducibility is required.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Load the train and test splits; both paths are relative to the working directory.
train_data = pd.read_json("train_financial.json")
test_data = pd.read_json("FINANCIAL_TEST_LIMPIO.json")

# Convertir los datos a tf.data.Dataset

In [73]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32,
                  text_column='text', label_column='label', seed=None):
    """Build a batched ``tf.data.Dataset`` of ``(text, label)`` pairs from a DataFrame.

    Args:
        dataframe: pandas DataFrame holding at least ``text_column`` and
            ``label_column``. It is only read, never modified.
        shuffle: when True, shuffle the examples with a buffer as large as
            the frame before batching.
        batch_size: number of examples per batch.
        text_column: name of the column holding the input texts.
        label_column: name of the column holding the targets.
        seed: optional integer forwarded to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(texts, labels)`` batches with
        prefetching enabled.

    Raises:
        KeyError: if either column is missing from ``dataframe``.
    """
    missing = [col for col in (text_column, label_column)
               if col not in dataframe.columns]
    if missing:
        raise KeyError(f"df_to_dataset: missing column(s) {missing}")

    # Read the two columns directly; no copy of the whole frame is needed
    # because nothing is removed from it.
    texts = dataframe[text_column].to_numpy()
    labels = dataframe[label_column].to_numpy()
    ds = tf.data.Dataset.from_tensor_slices((texts, labels))

    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe), seed=seed)
    ds = ds.batch(batch_size)
    # Let the input pipeline prepare upcoming batches while the current one
    # is being consumed.
    return ds.prefetch(tf.data.AUTOTUNE)

# One batch size shared by both splits.
batch_size = 32

# The training split is shuffled; the test split keeps its original row order.
train_dataset = df_to_dataset(dataframe=train_data, shuffle=True, batch_size=batch_size)
test_dataset = df_to_dataset(dataframe=test_data, batch_size=batch_size, shuffle=False)


# Crear y compilar el modelo BiLSTM

In [74]:
# TextVectorization normalises, splits and maps strings to integer ids.
# Its vocabulary (at most 10000 tokens) is learned from the training texts.
train_texts_only = train_dataset.map(lambda text, _label: text)
encoder = tf.keras.layers.TextVectorization(max_tokens=10000)
encoder.adapt(train_texts_only)

# Size the embedding table from the vocabulary the encoder actually learned.
vocab_size = len(encoder.get_vocabulary())

# Layers in forward order: raw strings -> token ids -> embeddings ->
# two stacked bidirectional LSTMs -> dense head producing 3 logits.
layer_stack = [
    encoder,
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True),
    # The first recurrent layer returns the full sequence for the second one to consume.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    # One output unit per class; no activation, so the outputs are logits.
    tf.keras.layers.Dense(3),
]
model = tf.keras.Sequential(layer_stack)

# The loss is configured with from_logits=True to match the activation-free output layer.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Print the model summary.
model.summary()


# Entrenar el modelo

In [75]:
# Train for 10 epochs; the returned object is kept in `history`.
# NOTE(review): the test split is passed as validation data, so the val_*
# figures are not an independent hold-out if they are used to tune anything.
history = model.fit(x=train_dataset, validation_data=test_dataset, epochs=10)


Epoch 1/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 40ms/step - accuracy: 0.5941 - loss: 0.9422 - val_accuracy: 0.7109 - val_loss: 0.6982
Epoch 2/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.7941 - loss: 0.4911 - val_accuracy: 0.7453 - val_loss: 0.6722
Epoch 3/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.9291 - loss: 0.2218 - val_accuracy: 0.7642 - val_loss: 0.8352
Epoch 4/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 39ms/step - accuracy: 0.9597 - loss: 0.1226 - val_accuracy: 0.7536 - val_loss: 0.9319
Epoch 5/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.9737 - loss: 0.0783 - val_accuracy: 0.7476 - val_loss: 1.2313
Epoch 6/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 39ms/step - accuracy: 0.9783 - loss: 0.0663 - val_accuracy: 0.7536 - val_loss: 1.1579
Epoch 7/10
[1m106/10

# Evaluar el modelo

In [76]:
# Score the trained model on the test batches; the result is unpacked as
# (loss, accuracy) and each value is printed on its own line.
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Loss: {loss}", f"Test Accuracy: {accuracy}", sep="\n")


[1m 1/27[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 12ms/step - accuracy: 0.6562 - loss: 2.3323

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7365 - loss: 1.6241
Test Loss: 1.4337142705917358
Test Accuracy: 0.7594786882400513


# Predicción y cálculo de métricas

In [78]:
from sklearn.metrics import classification_report

# Class ids and their display names, in matching order.
# NOTE(review): assumes the dataset encodes 0=Negative, 1=Neutral,
# 2=Positive — confirm against the label column.
class_ids = [0, 1, 2]
class_names = ['Negative', 'Neutral', 'Positive']

# Raw texts and ground-truth labels of the test split as NumPy arrays.
test_texts = test_data['text'].values
test_labels = test_data['label'].values

# The model emits logits (the loss was compiled with from_logits=True);
# softmax turns them into per-class probabilities.
logits = model.predict(test_texts)
predictions = tf.nn.softmax(logits).numpy()

# Most probable class for each example.
predicted_labels = np.argmax(predictions, axis=1)

# Passing `labels` makes the id-to-name mapping explicit and keeps the call
# valid even if a class is absent from both the truth and the predictions.
report = classification_report(
    test_labels,
    predicted_labels,
    labels=class_ids,
    target_names=class_names,
)
print(report)


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step
              precision    recall  f1-score   support

    Negative       0.64      0.61      0.62        97
     Neutral       0.82      0.84      0.83       498
    Positive       0.68      0.65      0.66       249

    accuracy                           0.76       844
   macro avg       0.71      0.70      0.71       844
weighted avg       0.76      0.76      0.76       844

