In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2



In [74]:
train_df = pd.read_csv(r"C:\Users\Day\Documents\LBBYs_CH2-ana\LBBYs_CH2-ana\data\processed\train_limpieza_v1.csv")
test_df = pd.read_csv(r"C:\Users\Day\Documents\LBBYs_CH2-ana\LBBYs_CH2-ana\data\processed\test_limpieza_v1.csv")


In [76]:
MAX_VOCAB = 20000
MAX_LEN = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(train_df['statement'])

X_train_text = tokenizer.texts_to_sequences(train_df['statement'])
X_train_text = pad_sequences(X_train_text, maxlen=MAX_LEN)

X_test_text = tokenizer.texts_to_sequences(test_df['statement'])
X_test_text = pad_sequences(X_test_text, maxlen=MAX_LEN)


In [78]:
cat_columns = ['subject', 'speaker', 'party_affiliation_category_map']
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_cats = encoder.fit_transform(train_df[cat_columns])
X_test_cats = encoder.transform(test_df[cat_columns])


In [80]:
X_train_text, X_val_text, X_train_cats, X_val_cats, y_train, y_val = train_test_split(
    X_train_text, X_train_cats, train_df['label'], test_size=0.2, random_state=42)


In [82]:
class_weights_array = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights_array))


In [84]:
def build_improved_cnn_model_with_features():
    input_text = Input(shape=(MAX_LEN,), name='text_input')
    x = Embedding(input_dim=MAX_VOCAB, output_dim=EMBEDDING_DIM, embeddings_regularizer=l2(1e-6))(input_text)

    convs = []
    for size in [3, 4, 5]:
        c = Conv1D(256, size, activation='relu')(x)
        c = GlobalMaxPooling1D()(c)
        convs.append(c)
    x_text = Concatenate()(convs)
    x_text = Dropout(0.5)(x_text)

    input_cats = Input(shape=(X_train_cats.shape[1],), name='cats_input')
    x_cats = Dense(64, activation='relu', kernel_regularizer=l2(1e-4))(input_cats)
    x_cats = Dropout(0.3)(x_cats)

    x = Concatenate()([x_text, x_cats])
    x = Dense(128, activation='relu', kernel_regularizer=l2(1e-4))(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_text, input_cats], outputs=output)
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [86]:
model = build_improved_cnn_model_with_features()

early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

history = model.fit([X_train_text, X_train_cats],
                    y_train,
                    validation_data=([X_val_text, X_val_cats], y_val),
                    epochs=30,
                    batch_size=32,
                    class_weight=class_weights_dict,
                    callbacks=[early_stopping])


Epoch 1/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 92ms/step - accuracy: 0.4654 - loss: 0.7288 - val_accuracy: 0.6067 - val_loss: 0.7157
Epoch 2/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 93ms/step - accuracy: 0.5162 - loss: 0.7147 - val_accuracy: 0.5028 - val_loss: 0.7168
Epoch 3/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 89ms/step - accuracy: 0.5756 - loss: 0.7059 - val_accuracy: 0.4453 - val_loss: 0.7260
Epoch 4/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 89ms/step - accuracy: 0.5975 - loss: 0.6915 - val_accuracy: 0.6352 - val_loss: 0.6812
Epoch 5/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 88ms/step - accuracy: 0.6752 - loss: 0.6613 - val_accuracy: 0.6128 - val_loss: 0.6754
Epoch 6/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 86ms/step - accuracy: 0.7435 - loss: 0.5979 - val_accuracy: 0.6089 - val_loss: 0.6726
Epoch 7/30
[1m2

In [88]:
y_val_probs = model.predict([X_val_text, X_val_cats])
y_val_pred = (y_val_probs > 0.5).astype(int)

print("Reporte de métricas en VALIDACIÓN:")
print(classification_report(y_val, y_val_pred, digits=2))


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step
Reporte de métricas en VALIDACIÓN:
              precision    recall  f1-score   support

           0       0.44      0.54      0.49       611
           1       0.73      0.64      0.68      1179

    accuracy                           0.61      1790
   macro avg       0.59      0.59      0.59      1790
weighted avg       0.63      0.61      0.62      1790



In [92]:
y_test_probs = model.predict([X_test_text, X_test_cats])
y_test_pred = (y_test_probs > 0.5).astype(int)

predictions = pd.DataFrame({
    'id': test_df['id'],
    'label': y_test_pred.flatten()
})

predictions.to_csv("CNN_05.csv", index=False)
print("Predicciones guardadas en 'predicciones_finales_mejoradas.csv'")


[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
Predicciones guardadas en 'predicciones_finales_mejoradas.csv'
