In [128]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import class_weight
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2



In [129]:
train_df = pd.read_csv(r"C:\Users\Day\Documents\LBBYs_CH2-ana\LBBYs_CH2-ana\data\processed\train_preprocess_v1.csv")
test_df = pd.read_csv(r"C:\Users\Day\Documents\LBBYs_CH2-ana\LBBYs_CH2-ana\data\processed\test_preprocess_v1.csv")


In [130]:
MAX_VOCAB = 20000
MAX_LEN = 100
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(train_df['statement'])

X_train_text = tokenizer.texts_to_sequences(train_df['statement'])
X_train_text = pad_sequences(X_train_text, maxlen=MAX_LEN)

X_test_text = tokenizer.texts_to_sequences(test_df['statement'])
X_test_text = pad_sequences(X_test_text, maxlen=MAX_LEN)


In [131]:
cat_columns = [
    'subject', 'speaker', 'speaker_job', 'state_info',
    'party_affiliation', 'party_affiliation_uni', 'party_affiliation_category_map',
    'processed_subject', 'speaker_type'
]


encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_cats = encoder.fit_transform(train_df[cat_columns])
X_test_cats = encoder.transform(test_df[cat_columns])


In [132]:
print(train_df.columns)


Index(['id', 'label', 'statement', 'subject', 'speaker', 'speaker_job',
       'state_info', 'party_affiliation', 'party_affiliation_uni',
       'party_affiliation_category_map', 'statement_tokens', 'num_tokens',
       'num_sentences', 'pos_info', 'pos_freq', 'lemma_freq', 'tag_freq',
       'entities', 'stopwords', 'statement_tokens_without_stopwords',
       'num_tokens_without_stopwords', 'pos_info_without_stopwords',
       'pos_freq_without_stopwords', 'lemma_freq_without_stopwords',
       'tag_freq_without_stopwords', 'processed_subject', 'speaker_entities',
       'speaker_type', 'speaker_job_tokens', 'state_info_tokens',
       'party_affiliation_tokens'],
      dtype='object')


In [133]:
X_train_text, X_val_text, X_train_cats, X_val_cats, y_train, y_val = train_test_split(
    X_train_text, X_train_cats, train_df['label'], test_size=0.2, random_state=42)


In [134]:
class_weights_array = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = dict(enumerate(class_weights_array))


In [135]:
def build_improved_cnn_model_with_features():
    input_text = Input(shape=(MAX_LEN,), name='text_input')
    x = Embedding(input_dim=MAX_VOCAB, output_dim=EMBEDDING_DIM, embeddings_regularizer=l2(1e-6))(input_text)

    convs = []
    for size in [3, 4, 5]:
        c = Conv1D(256, size, activation='relu')(x)
        c = GlobalMaxPooling1D()(c)
        convs.append(c)
    x_text = Concatenate()(convs)
    x_text = Dropout(0.5)(x_text)

    input_cats = Input(shape=(X_train_cats.shape[1],), name='cats_input')
    x_cats = Dense(64, activation='relu', kernel_regularizer=l2(1e-4))(input_cats)
    x_cats = Dropout(0.3)(x_cats)

    x = Concatenate()([x_text, x_cats])
    x = Dense(128, activation='relu', kernel_regularizer=l2(1e-4))(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_text, input_cats], outputs=output)
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [136]:
model = build_improved_cnn_model_with_features()

early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

history = model.fit([X_train_text, X_train_cats],
                    y_train,
                    validation_data=([X_val_text, X_val_cats], y_val),
                    epochs=30,
                    batch_size=32,
                    class_weight=class_weights_dict,
                    callbacks=[early_stopping])


Epoch 1/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 89ms/step - accuracy: 0.5164 - loss: 0.7239 - val_accuracy: 0.6642 - val_loss: 0.7053
Epoch 2/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 82ms/step - accuracy: 0.5824 - loss: 0.7097 - val_accuracy: 0.5441 - val_loss: 0.7121
Epoch 3/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 83ms/step - accuracy: 0.6028 - loss: 0.6907 - val_accuracy: 0.6106 - val_loss: 0.6815
Epoch 4/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 84ms/step - accuracy: 0.6674 - loss: 0.6531 - val_accuracy: 0.5816 - val_loss: 0.6923
Epoch 5/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 82ms/step - accuracy: 0.7046 - loss: 0.6156 - val_accuracy: 0.6078 - val_loss: 0.6764
Epoch 6/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 82ms/step - accuracy: 0.7730 - loss: 0.5524 - val_accuracy: 0.6101 - val_loss: 0.6783
Epoch 7/30
[1m2

In [137]:
y_val_probs = model.predict([X_val_text, X_val_cats])
y_val_pred = (y_val_probs > 0.5).astype(int)

print("Reporte de métricas en VALIDACIÓN:")
print(classification_report(y_val, y_val_pred, digits=2))


[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step
Reporte de métricas en VALIDACIÓN:
              precision    recall  f1-score   support

           0       0.45      0.61      0.52       611
           1       0.75      0.61      0.67      1179

    accuracy                           0.61      1790
   macro avg       0.60      0.61      0.59      1790
weighted avg       0.65      0.61      0.62      1790



In [151]:
y_test_probs = model.predict([X_test_text, X_test_cats])
y_test_pred = (y_test_probs > 0.5).astype(int)

predictions = pd.DataFrame({
    'id': test_df['id'],
    'label': y_test_pred.flatten()
})

predictions.to_csv("CNN_16.csv", index=False)
print("Predicciones guardadas en 'predicciones_finales_mejoradas.csv'")


[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step
Predicciones guardadas en 'predicciones_finales_mejoradas.csv'
