# Modelo GRU para detección de noticias falsas

### Imports

In [1]:
import nltk
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, SpatialDropout1D, Conv1D, BatchNormalization, MaxPooling1D, Bidirectional, GRU, Dropout, Dense)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, recall_score, f1_score
from textblob import Word

# Fetch the WordNet corpus and the Open Multilingual WordNet 1.4 data through NLTK.
# NOTE(review): presumably needed by textblob's `Word`; neither resource is used in the
# visible part of this notebook — confirm before removing.
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\inesg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\inesg\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Cargar Datos

In [None]:

# Read the preprocessed train and test splits (paths are relative to this notebook).
train_df, test_df = map(pd.read_csv, (
    '../../../../data/processed/train_simp_preprocess_v2.csv',
    '../../../../data/processed/test_simp_preprocess_v2.csv',
))

### Preparar textos y etiquetas 

In [3]:
# Inputs are the statements cast to str; targets are the 'label' column.
texts = train_df.loc[:, 'statement'].astype(str).values
labels = train_df.loc[:, 'label'].values

### Parámetros de tokenización

In [4]:
# Vocabulary size, padded sequence length and embedding width for the baseline models.
max_words, max_len, embedding_dim = 10000, 100, 64

# Second configuration: longer sequences to capture more context, and the GloVe embedding size.
max_len_2, embedding_dim_2 = 150, 100

# Third configuration: the longest padded sequence length.
max_len_3 = 200

### Tokenización

In [5]:
# Fit a vocabulary on the training statements; it is capped via num_words=max_words
# and words outside it are mapped to the '<OOV>' token.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=max_words)
tokenizer.fit_on_texts(texts)

# Convert each statement to a list of integer ids, then pad at the end up to max_len.
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_len)

In [6]:
# Same token sequences, post-padded to the longer max_len_2 length.
padded_sequences_2 = pad_sequences(sequences, padding='post', maxlen=max_len_2)

In [7]:
# Same token sequences, post-padded to the longest length, max_len_3.
padded_sequences_3 = pad_sequences(sequences, padding='post', maxlen=max_len_3)

### Dividir en entrenamiento y validación

In [8]:
# Hold out 20% of the padded training sequences for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences,
    labels,
    random_state=42,
    test_size=0.2,
)


In [9]:
# Balanced class weights computed from the training labels; passed to fit() via class_weight.
train_classes = np.unique(y_train)
class_weights_arr = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
# Key the weights by the actual label values rather than by position, so the mapping
# stays correct even if the labels present in y_train are not exactly 0..n-1.
class_weights = dict(zip(train_classes.tolist(), class_weights_arr))

### Generar entregable

In [10]:
def generate_submission(model, tokenizer, test_df, max_len, threshold, filename,
                        submission_path='C:/Users/inesg/dev/LBBYs_CH2/notebooks/3_summision/'):
    """
    Write a submission CSV file with the columns 'id' and 'label'.

    Parameters:
    - model: trained model; its ``predict`` output is flattened to one probability per row
    - tokenizer: Tokenizer instance already fitted on the training text
    - test_df: DataFrame with at least the columns 'id' and 'statement'
    - max_len: maximum length passed to pad_sequences (should match the length the
      model was trained with)
    - threshold: cut-off used to turn probabilities into labels
      (probability >= threshold -> 1, otherwise 0)
    - filename: name of the CSV file created inside ``submission_path``
    - submission_path: directory where the file is written. The default keeps the
      location that used to be hard-coded in this function; pass another directory
      to run the notebook on a different machine.

    Returns:
    - None. The CSV is written to disk and its path is printed.
    """
    import os  # local import so the cell does not depend on the notebook's import cell

    # Preprocess the test texts exactly like the training texts
    test_texts = test_df['statement'].astype(str).values
    seqs = tokenizer.texts_to_sequences(test_texts)
    X_test = pad_sequences(seqs, maxlen=max_len, padding='post')

    # Predict probabilities and convert them to 0/1 labels
    probs = model.predict(X_test).flatten()
    preds = (probs >= threshold).astype(int)

    # Build the submission DataFrame
    submission_df = pd.DataFrame({
        'id': test_df['id'],
        'label': preds
    })

    # os.path.join tolerates a directory given with or without a trailing separator
    output_file = os.path.join(submission_path, filename)
    submission_df.to_csv(output_file, index=False)
    print(f"Archivo de submission generado: {output_file}")

## Modelo GRU Simple

In [11]:
# Baseline network: trainable embedding -> one bidirectional GRU -> dropout -> sigmoid output.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(Bidirectional(GRU(64)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Stop after 3 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(restore_best_weights=True, patience=3, monitor='val_loss')

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=15,
    batch_size=32,
    callbacks=[early_stopping],
    class_weight=class_weights,
)

# Validation probabilities, one per sample.
val_probs = model.predict(X_val).flatten()

# Sweep thresholds 0.00..0.99 and keep the first one that maximizes the mean of the
# per-class recalls.
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold, best_score = 0.5, 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    recall_0, recall_1 = (recall_score(y_val, val_preds_t, pos_label=cls) for cls in (0, 1))
    avg_recall = (recall_0 + recall_1) / 2
    if avg_recall > best_score:
        best_threshold, best_score = t, avg_recall

print(f"Best threshold: {best_threshold:.2f} with average recall: {best_score:.3f}")

# Final validation report at the selected threshold.
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))




Epoch 1/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 26ms/step - accuracy: 0.4622 - loss: 0.6969 - val_accuracy: 0.5352 - val_loss: 0.7024
Epoch 2/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.6943 - loss: 0.5928 - val_accuracy: 0.5642 - val_loss: 0.7223
Epoch 3/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.7878 - loss: 0.4485 - val_accuracy: 0.5603 - val_loss: 0.8943
Epoch 4/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 29ms/step - accuracy: 0.8589 - loss: 0.3178 - val_accuracy: 0.5816 - val_loss: 0.9702
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
Best threshold: 0.47 with average recall: 0.583
              precision    recall  f1-score   support

           0       0.42      0.61      0.49       611
           1       0.73      0.56      0.63      1179

    accuracy                           0.57      1790
   macro

## Doble Capa GRU Bidireccional

In [12]:
# Two stacked bidirectional GRUs: the first returns the full sequence so the second
# can consume it; then dropout and a sigmoid output.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(Bidirectional(GRU(64, return_sequences=True)))
model.add(Bidirectional(GRU(32)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Stop after 3 epochs without val_loss improvement and restore the best weights.
early_stopping = EarlyStopping(restore_best_weights=True, patience=3, monitor='val_loss')

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=15,
    batch_size=32,
    callbacks=[early_stopping],
    class_weight=class_weights,
)

# Validation probabilities, one per sample.
val_probs = model.predict(X_val).flatten()

# Sweep thresholds 0.00..0.99 and keep the first one that maximizes the mean of the
# per-class recalls.
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold, best_score = 0.5, 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    recall_0, recall_1 = (recall_score(y_val, val_preds_t, pos_label=cls) for cls in (0, 1))
    avg_recall = (recall_0 + recall_1) / 2
    if avg_recall > best_score:
        best_threshold, best_score = t, avg_recall

print(f"Best threshold: {best_threshold:.2f} with average recall: {best_score:.3f}")

# Final validation report at the selected threshold.
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))




Epoch 1/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 52ms/step - accuracy: 0.5171 - loss: 0.6917 - val_accuracy: 0.5508 - val_loss: 0.7002
Epoch 2/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 50ms/step - accuracy: 0.6867 - loss: 0.5940 - val_accuracy: 0.6145 - val_loss: 0.6621
Epoch 3/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 50ms/step - accuracy: 0.8178 - loss: 0.4276 - val_accuracy: 0.6112 - val_loss: 0.7649
Epoch 4/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - accuracy: 0.8879 - loss: 0.2824 - val_accuracy: 0.5749 - val_loss: 1.0059
Epoch 5/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - accuracy: 0.9368 - loss: 0.1855 - val_accuracy: 0.5749 - val_loss: 1.2629
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
Best threshold: 0.52 with average recall: 0.592
              precision    recall  f1-score   support

   

## Embeddings GloVe y GRU Bidireccional Doble

In [23]:
# Re-split using the sequences padded to max_len_2 and recompute balanced class weights.
X_train, X_val, y_train, y_val = train_test_split(
    padded_sequences_2, labels, test_size=0.2, random_state=42
)

class_weights_arr = class_weight.compute_class_weight('balanced', y=y_train, classes=np.unique(y_train))
class_weights = dict(enumerate(class_weights_arr))

# Parse the GloVe file: every line holds a token followed by its vector components.
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.strip().split()
        embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Row i of the matrix belongs to tokenizer id i; ids >= max_words are skipped and
# words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((max_words, embedding_dim_2))
for word, i in tokenizer.word_index.items():
    if i < max_words and word in embedding_index:
        embedding_matrix[i] = embedding_index[word]

# Frozen GloVe embedding -> two stacked bidirectional GRUs -> dense head.
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim_2, input_length=max_len_2,
                    weights=[embedding_matrix], trainable=False))
model.add(Bidirectional(GRU(128, return_sequences=True)))
model.add(Bidirectional(GRU(64)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Early stopping on val_loss (patience 5) plus halving the learning rate after
# 3 stagnant epochs, never going below 1e-6.
early_stopping = EarlyStopping(restore_best_weights=True, patience=5, monitor='val_loss')
reduce_lr = ReduceLROnPlateau(min_lr=1e-6, patience=3, factor=0.5, monitor='val_loss')

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=30,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    class_weight=class_weights,
)

# Validation probabilities, one per sample.
val_probs = model.predict(X_val).flatten()

# Sweep thresholds 0.00..0.99 and keep the first one that maximizes the mean of the
# per-class recalls.
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold, best_score = 0.5, 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    recall_0, recall_1 = (recall_score(y_val, val_preds_t, pos_label=cls) for cls in (0, 1))
    avg_recall = (recall_0 + recall_1) / 2
    if avg_recall > best_score:
        best_threshold, best_score = t, avg_recall

print(f"Best threshold: {best_threshold:.2f} with average recall: {best_score:.3f}")

# Final validation report at the selected threshold.
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))




Epoch 1/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 111ms/step - accuracy: 0.5356 - loss: 0.6987 - val_accuracy: 0.6229 - val_loss: 0.6721 - learning_rate: 0.0010
Epoch 2/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 112ms/step - accuracy: 0.5943 - loss: 0.6755 - val_accuracy: 0.5972 - val_loss: 0.6795 - learning_rate: 0.0010
Epoch 3/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 113ms/step - accuracy: 0.5930 - loss: 0.6688 - val_accuracy: 0.5933 - val_loss: 0.6627 - learning_rate: 0.0010
Epoch 4/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 114ms/step - accuracy: 0.6038 - loss: 0.6609 - val_accuracy: 0.5687 - val_loss: 0.6816 - learning_rate: 0.0010
Epoch 5/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 113ms/step - accuracy: 0.5970 - loss: 0.6530 - val_accuracy: 0.5480 - val_loss: 0.6805 - learning_rate: 0.0010
Epoch 6/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0

## GRU Bidireccional con Secuencias de 150 Tokens y ReduceLROnPlateau

In [14]:
# Re-split using the sequences padded to max_len_2.
X_train, X_val, y_train, y_val = train_test_split(padded_sequences_2, labels, test_size=0.2, random_state=42)

# Balanced class weights, keyed by the actual label values.
train_classes = np.unique(y_train)
class_weights_arr = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weights = dict(zip(train_classes.tolist(), class_weights_arr))

model = Sequential([
    # input_length has to equal the padded width of the training data, which is
    # max_len_2 for padded_sequences_2 (not max_len).
    Embedding(input_dim=max_words, output_dim=embedding_dim_2, input_length=max_len_2),
    Bidirectional(GRU(64)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Stop after 3 stagnant epochs (restoring the best weights); halve the learning rate
# after 2 stagnant epochs, down to a floor of 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)

history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr]
)

val_probs = model.predict(X_val).flatten()

# Sweep thresholds 0.00..0.99 and keep the first one that maximizes the mean of the
# per-class recalls.
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_score = 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    recall_0 = recall_score(y_val, val_preds_t, pos_label=0)
    recall_1 = recall_score(y_val, val_preds_t, pos_label=1)
    avg_recall = (recall_0 + recall_1) / 2
    if avg_recall > best_score:
        best_score = avg_recall
        best_threshold = t

print(f"Best threshold: {best_threshold:.2f} with average recall: {best_score:.3f}")

# Validation report at the selected threshold.
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))



Epoch 1/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 48ms/step - accuracy: 0.5762 - loss: 0.6888 - val_accuracy: 0.5570 - val_loss: 0.6925 - learning_rate: 0.0010
Epoch 2/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.6808 - loss: 0.6012 - val_accuracy: 0.6173 - val_loss: 0.6759 - learning_rate: 0.0010
Epoch 3/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.8025 - loss: 0.4347 - val_accuracy: 0.5799 - val_loss: 0.7955 - learning_rate: 0.0010
Epoch 4/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.8689 - loss: 0.3077 - val_accuracy: 0.5916 - val_loss: 0.8849 - learning_rate: 0.0010
Epoch 5/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.9290 - loss: 0.1979 - val_accuracy: 0.5732 - val_loss: 1.3589 - learning_rate: 5.0000e-04
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

## CNN-GRU Híbrido con Conv1D y MaxPooling

In [15]:
# NOTE: this cell has no split of its own; it reuses X_train / X_val / y_train / y_val
# left in the kernel by the previously executed split cell and only normalizes their
# dtype to int32.
X_train = np.array(X_train).astype(np.int32)
y_train = np.array(y_train).astype(np.int32)
X_val = np.array(X_val).astype(np.int32)
y_val = np.array(y_val).astype(np.int32)

# Balanced class weights, keyed by the actual label values.
train_classes = np.unique(y_train)
class_weights_arr = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weights = dict(zip(train_classes.tolist(), class_weights_arr))

model = Sequential([
    # Take input_length from the data itself: the inherited X_train may be padded to a
    # width other than max_len, and a mismatch makes the declared input shape wrong.
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=X_train.shape[1]),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(GRU(64)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Stop after 3 stagnant epochs (restoring the best weights); halve the learning rate
# after 2 stagnant epochs, down to a floor of 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-6)

history = model.fit(
    X_train, y_train,
    epochs=15,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr]
)

val_probs = model.predict(X_val).flatten()

# Sweep thresholds 0.00..0.99 and keep the first one that maximizes the mean of the
# per-class recalls.
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_score = 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    recall_0 = recall_score(y_val, val_preds_t, pos_label=0)
    recall_1 = recall_score(y_val, val_preds_t, pos_label=1)
    avg_recall = (recall_0 + recall_1) / 2
    if avg_recall > best_score:
        best_score = avg_recall
        best_threshold = t

print(f"Best threshold: {best_threshold:.2f} with average recall: {best_score:.3f}")

# Validation report at the selected threshold.
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))




Epoch 1/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - accuracy: 0.4844 - loss: 0.6941 - val_accuracy: 0.6425 - val_loss: 0.6340 - learning_rate: 0.0010
Epoch 2/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.7017 - loss: 0.5906 - val_accuracy: 0.6073 - val_loss: 0.6726 - learning_rate: 0.0010
Epoch 3/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - accuracy: 0.8646 - loss: 0.3411 - val_accuracy: 0.6011 - val_loss: 0.8491 - learning_rate: 0.0010
Epoch 4/15
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 29ms/step - accuracy: 0.9647 - loss: 0.1142 - val_accuracy: 0.6089 - val_loss: 1.2110 - learning_rate: 5.0000e-04
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
Best threshold: 0.63 with average recall: 0.594
              precision    recall  f1-score   support

           0       0.41      0.73      0.53       611
           1     

## CNN-GRU Avanzado con SpatialDropout1D y BatchNormalization

In [16]:
# Re-split using the sequences padded to max_len_2 and normalize dtypes to int32.
X_train, X_val, y_train, y_val = train_test_split(padded_sequences_2, labels, test_size=0.2, random_state=42)

X_train = np.array(X_train).astype(np.int32)
y_train = np.array(y_train).astype(np.int32)
X_val = np.array(X_val).astype(np.int32)
y_val = np.array(y_val).astype(np.int32)

# Compute balanced class weights (keyed by the actual label values) and then boost class 1
train_classes = np.unique(y_train)
class_weights_arr = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weights = dict(zip(train_classes.tolist(), class_weights_arr))
class_weights[1] *= 1.5  # give class 1 extra weight on top of the balanced value

print(f"Class weights usados: {class_weights}")

# Model: embedding -> spatial dropout -> Conv1D + batch norm + max pooling -> BiGRU -> dense head
model = Sequential([
    # input_length has to equal the padded width of the training data, which is
    # max_len_2 for padded_sequences_2 (not max_len).
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len_2),
    SpatialDropout1D(0.3),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Bidirectional(GRU(64)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Stop after 7 stagnant epochs (restoring the best weights); multiply the learning
# rate by 0.3 after 2 stagnant epochs, down to a floor of 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=2, min_lr=1e-6)

history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr]
)

# Sweep thresholds 0.00..0.99 and keep the first one with the highest macro-averaged F1.
val_probs = model.predict(X_val).flatten()
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, val_preds_t, average='macro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best threshold optimizing macro F1: {best_threshold:.2f} with F1: {best_f1:.3f}")

# Validation report at the selected threshold.
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))


Class weights usados: {0: np.float64(1.4072327044025157), 1: np.float64(1.1633448873483536)}




Epoch 1/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 30ms/step - accuracy: 0.5373 - loss: 0.9053 - val_accuracy: 0.6587 - val_loss: 0.6816 - learning_rate: 1.0000e-04
Epoch 2/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.5748 - loss: 0.8789 - val_accuracy: 0.6587 - val_loss: 0.6728 - learning_rate: 1.0000e-04
Epoch 3/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.6097 - loss: 0.8461 - val_accuracy: 0.6587 - val_loss: 0.6576 - learning_rate: 1.0000e-04
Epoch 4/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 31ms/step - accuracy: 0.6279 - loss: 0.8367 - val_accuracy: 0.6553 - val_loss: 0.6476 - learning_rate: 1.0000e-04
Epoch 5/30
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.6352 - loss: 0.8268 - val_accuracy: 0.6503 - val_loss: 0.6410 - learning_rate: 1.0000e-04
Epoch 6/30
[1m224/224[0m [32m━━━━━━━━━━━━━

## GRU Bidireccional Doble con GloVe Fine-Tuning y SpatialDropout1D

In [17]:
# Re-split using the sequences padded to max_len_3 and normalize dtypes to int32.
X_train, X_val, y_train, y_val = train_test_split(padded_sequences_3, labels, test_size=0.2, random_state=42)

X_train = np.array(X_train).astype(np.int32)
y_train = np.array(y_train).astype(np.int32)
X_val = np.array(X_val).astype(np.int32)
y_val = np.array(y_val).astype(np.int32)

# Balanced class weights, keyed by the actual label values.
train_classes = np.unique(y_train)
class_weights_arr = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weights = dict(zip(train_classes.tolist(), class_weights_arr))

# Load GloVe: every line holds a token followed by its vector components.
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Row i of the matrix belongs to tokenizer id i; ids >= max_words are skipped and
# words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((max_words, embedding_dim_2))
for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

# Model: fine-tuned GloVe embedding -> spatial dropout -> two stacked BiGRUs -> dense head
model = Sequential([
    # input_length has to equal the padded width of the training data, which is
    # max_len_3 for padded_sequences_3 (not max_len).
    Embedding(input_dim=max_words, output_dim=embedding_dim_2, input_length=max_len_3,
              weights=[embedding_matrix], trainable=True),
    SpatialDropout1D(0.3),
    Bidirectional(GRU(256, return_sequences=True)),
    Bidirectional(GRU(128)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Stop after 7 stagnant epochs (restoring the best weights); multiply the learning
# rate by 0.3 after 3 stagnant epochs, down to a floor of 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=3, min_lr=1e-6)

history = model.fit(
    X_train, y_train,
    epochs=40,
    batch_size=32,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr]
)

val_probs = model.predict(X_val).flatten()

# Sweep thresholds 0.00..0.99 and keep the first one with the highest macro-averaged F1.
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, val_preds_t, average='macro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best threshold for macro F1: {best_threshold:.2f} with F1: {best_f1:.3f}")

# Validation report at the selected threshold.
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))




Epoch 1/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 429ms/step - accuracy: 0.4746 - loss: 0.7017 - val_accuracy: 0.6128 - val_loss: 0.6717 - learning_rate: 1.0000e-04
Epoch 2/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 423ms/step - accuracy: 0.5804 - loss: 0.6806 - val_accuracy: 0.5374 - val_loss: 0.6922 - learning_rate: 1.0000e-04
Epoch 3/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 416ms/step - accuracy: 0.5573 - loss: 0.6829 - val_accuracy: 0.5631 - val_loss: 0.6792 - learning_rate: 1.0000e-04
Epoch 4/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 413ms/step - accuracy: 0.5803 - loss: 0.6778 - val_accuracy: 0.6251 - val_loss: 0.6643 - learning_rate: 1.0000e-04
Epoch 5/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 413ms/step - accuracy: 0.5970 - loss: 0.6760 - val_accuracy: 0.6034 - val_loss: 0.6693 - learning_rate: 1.0000e-04
Epoch 6/40
[1m224/224[0m [32m━━

## GRU Profundo con GloVe Fine-Tuning y SpatialDropout1D

In [18]:
# Re-split using the sequences padded to max_len_2.
X_train, X_val, y_train, y_val = train_test_split(padded_sequences_2, labels, test_size=0.2, random_state=42)

# Balanced class weights, keyed by the actual label values.
train_classes = np.unique(y_train)
class_weights_arr = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weights = dict(zip(train_classes.tolist(), class_weights_arr))

# Load GloVe: every line holds a token followed by its vector components.
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Row i of the matrix belongs to tokenizer id i; ids >= max_words are skipped and
# words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((max_words, embedding_dim_2))
for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

# Model: fine-tuned GloVe embedding -> spatial dropout -> one BiGRU -> dense head
model = Sequential([
    # input_length has to equal the padded width of the training data, which is
    # max_len_2 for padded_sequences_2 (not max_len).
    Embedding(input_dim=max_words, output_dim=embedding_dim_2, input_length=max_len_2,
              weights=[embedding_matrix], trainable=True),
    SpatialDropout1D(0.3),
    Bidirectional(GRU(128)),  # a single bidirectional GRU layer
    Dropout(0.3),  # reduced dropout
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=1e-3)  # higher learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Callbacks: stop after 7 stagnant epochs (restoring the best weights); halve the
# learning rate after 3 stagnant epochs, down to a floor of 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

history = model.fit(
    X_train, y_train,
    epochs=40,
    batch_size=64,  # larger batch size
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr]
)

# Evaluate and search for the best threshold
val_probs = model.predict(X_val).flatten()

# Sweep thresholds 0.00..0.99 and keep the first one with the highest macro-averaged F1.
thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, val_preds_t, average='macro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best threshold for macro F1: {best_threshold:.2f} with F1: {best_f1:.3f}")

# Validation report at the selected threshold.
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))




Epoch 1/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 113ms/step - accuracy: 0.5221 - loss: 0.6925 - val_accuracy: 0.5318 - val_loss: 0.6978 - learning_rate: 0.0010
Epoch 2/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 114ms/step - accuracy: 0.5879 - loss: 0.6738 - val_accuracy: 0.5408 - val_loss: 0.6979 - learning_rate: 0.0010
Epoch 3/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 111ms/step - accuracy: 0.5971 - loss: 0.6606 - val_accuracy: 0.5536 - val_loss: 0.7158 - learning_rate: 0.0010
Epoch 4/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 114ms/step - accuracy: 0.6366 - loss: 0.6278 - val_accuracy: 0.6184 - val_loss: 0.6474 - learning_rate: 0.0010
Epoch 5/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 112ms/step - accuracy: 0.6830 - loss: 0.5969 - val_accuracy: 0.6000 - val_loss: 0.6787 - learning_rate: 0.0010
Epoch 6/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0

## GRU con Capacidad Incrementada y EarlyStopping Extendido

In [19]:
# NOTE: this cell has no split of its own; it reuses X_train / X_val / y_train / y_val
# and class_weights left in the kernel by the previously executed cell.

# Load GloVe: every line holds a token followed by its vector components.
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Row i of the matrix belongs to tokenizer id i; ids >= max_words are skipped and
# words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((max_words, embedding_dim_2))
for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

# Simplified model
model = Sequential([
    # Take input_length from the data itself: the inherited X_train may be padded to a
    # width other than max_len, and a mismatch makes the declared input shape wrong.
    Embedding(input_dim=max_words, output_dim=embedding_dim_2, input_length=X_train.shape[1],
              weights=[embedding_matrix], trainable=True),
    SpatialDropout1D(0.3),
    Bidirectional(GRU(128)),  # a single bidirectional GRU layer
    Dropout(0.3),  # regularization
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=1e-3)  # higher learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Callbacks: stop after 7 stagnant epochs (restoring the best weights); halve the
# learning rate after 3 stagnant epochs, down to a floor of 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Training
history = model.fit(
    X_train, y_train,
    epochs=40,
    batch_size=64,  # larger batch size
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr]
)

# Evaluate and search for the threshold that maximizes macro-averaged F1
# (both classes count equally; it is not tuned for class 0 alone).
val_probs = model.predict(X_val).flatten()

thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, val_preds_t, average='macro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best threshold for macro F1: {best_threshold:.2f} with F1: {best_f1:.3f}")
# Prediction with the optimized threshold
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))





Epoch 1/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 121ms/step - accuracy: 0.5458 - loss: 0.6891 - val_accuracy: 0.4933 - val_loss: 0.7032 - learning_rate: 0.0010
Epoch 2/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 120ms/step - accuracy: 0.5715 - loss: 0.6766 - val_accuracy: 0.4922 - val_loss: 0.7139 - learning_rate: 0.0010
Epoch 3/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 117ms/step - accuracy: 0.5935 - loss: 0.6600 - val_accuracy: 0.5827 - val_loss: 0.6744 - learning_rate: 0.0010
Epoch 4/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 117ms/step - accuracy: 0.6378 - loss: 0.6342 - val_accuracy: 0.5894 - val_loss: 0.6593 - learning_rate: 0.0010
Epoch 5/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 118ms/step - accuracy: 0.6612 - loss: 0.6057 - val_accuracy: 0.5508 - val_loss: 0.7279 - learning_rate: 0.0010
Epoch 6/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0

## CNN-GRU con GloVe Fine-Tuning y BatchNormalization

In [20]:
# Re-split using the sequences padded to max_len_3.
X_train, X_val, y_train, y_val = train_test_split(padded_sequences_3, labels, test_size=0.2, random_state=42)

# Compute balanced class weights, keyed by the actual label values
train_classes = np.unique(y_train)
class_weights_arr = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weights = dict(zip(train_classes.tolist(), class_weights_arr))

# Load GloVe: every line holds a token followed by its vector components.
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Row i of the matrix belongs to tokenizer id i; ids >= max_words are skipped and
# words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((max_words, embedding_dim_2))
for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

# Model: fine-tuned GloVe embedding -> spatial dropout -> Conv1D + max pooling +
# batch norm -> BiGRU -> dense head
model = Sequential([
    # input_length has to equal the padded width of the training data, which is
    # max_len_3 for padded_sequences_3 (not max_len).
    Embedding(input_dim=max_words, output_dim=embedding_dim_2, input_length=max_len_3,
              weights=[embedding_matrix], trainable=True),
    SpatialDropout1D(0.3),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    BatchNormalization(),
    Bidirectional(GRU(128)),  # a single bidirectional GRU layer
    Dropout(0.5),  # regularization
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=1e-3)  # higher learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Callbacks: stop after 7 stagnant epochs (restoring the best weights); halve the
# learning rate after 3 stagnant epochs, down to a floor of 1e-6.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Training
history = model.fit(
    X_train, y_train,
    epochs=40,
    batch_size=64,  # larger batch size
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr]
)

# Evaluate and search for the threshold that maximizes macro-averaged F1
# (both classes count equally; it is not tuned for class 0 alone).
val_probs = model.predict(X_val).flatten()

thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, val_preds_t, average='macro')
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best threshold for macro F1: {best_threshold:.2f} with F1: {best_f1:.3f}")

# Prediction with the optimized threshold
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))




Epoch 1/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 87ms/step - accuracy: 0.4954 - loss: 0.7242 - val_accuracy: 0.3609 - val_loss: 0.7065 - learning_rate: 0.0010
Epoch 2/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.5285 - loss: 0.6896 - val_accuracy: 0.5061 - val_loss: 0.6986 - learning_rate: 0.0010
Epoch 3/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.5626 - loss: 0.6855 - val_accuracy: 0.4240 - val_loss: 0.7314 - learning_rate: 0.0010
Epoch 4/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.5822 - loss: 0.6690 - val_accuracy: 0.5760 - val_loss: 0.6769 - learning_rate: 0.0010
Epoch 5/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.5932 - loss: 0.6629 - val_accuracy: 0.5201 - val_loss: 0.6990 - learning_rate: 0.0010
Epoch 6/40
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

## GRU con Contexto Ampliado con Combined_Text

In [21]:
# Experiment: two stacked bidirectional GRU layers on top of GloVe-initialised
# embeddings, followed by macro-F1 threshold tuning on the validation split.
#
# NOTE(review): this cell reuses X_train / X_val / y_train / y_val and
# class_weights defined by the previous experiment cell; it does not build its
# own inputs. The section title mentions Combined_Text, but no such column is
# tokenised here. Confirm which inputs were intended.

# Load GloVe: each line is "<word> <v1> ... <v100>".
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Build the initial embedding matrix; rows without a GloVe vector stay at zero.
embedding_matrix = np.zeros((max_words, embedding_dim_2))
for word, i in tokenizer.word_index.items():
    if i >= max_words:
        continue
    vector = embedding_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector

# The declared sequence length must match the arrays actually fed to fit(), so
# derive it from X_train instead of hard-coding max_len.
seq_len = X_train.shape[1]

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim_2, input_length=seq_len,
              weights=[embedding_matrix], trainable=True),
    SpatialDropout1D(0.4),
    # return_sequences=True so the second GRU receives the full sequence
    Bidirectional(GRU(256, return_sequences=True)),
    Bidirectional(GRU(128)),
    Dropout(0.5),  # heavier dropout to limit overfitting
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Callbacks: stop when val_loss stalls for 7 epochs (restoring the best weights)
# and halve the learning rate after 3 epochs without improvement.
early_stopping = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Training
history = model.fit(
    X_train, y_train,
    epochs=40,
    batch_size=32,  # smaller batch than the previous experiment
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr]
)

# Search for the decision threshold that maximises macro-F1 on the validation set.
# NOTE: the threshold is chosen and reported on the same split that drove early
# stopping, so the figures printed below are optimistic estimates.
val_probs = model.predict(X_val).flatten()

thresholds = np.arange(0.0, 1.0, 0.01)
best_threshold = 0.5
best_f1 = 0

for t in thresholds:
    val_preds_t = (val_probs >= t).astype(int)
    # zero_division=0 yields the same score as the default but without warnings
    # at extreme thresholds where one class is never predicted.
    f1 = f1_score(y_val, val_preds_t, average='macro', zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print(f"Best threshold for macro F1: {best_threshold:.2f} with F1: {best_f1:.3f}")

# Final validation report at the selected threshold
val_preds = (val_probs >= best_threshold).astype(int)
print(classification_report(y_val, val_preds))




Epoch 1/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 470ms/step - accuracy: 0.5203 - loss: 0.7159 - val_accuracy: 0.6145 - val_loss: 0.6643 - learning_rate: 0.0010
Epoch 2/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 448ms/step - accuracy: 0.5694 - loss: 0.6859 - val_accuracy: 0.5648 - val_loss: 0.6866 - learning_rate: 0.0010
Epoch 3/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 451ms/step - accuracy: 0.5934 - loss: 0.6751 - val_accuracy: 0.5670 - val_loss: 0.6959 - learning_rate: 0.0010
Epoch 4/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 443ms/step - accuracy: 0.6176 - loss: 0.6567 - val_accuracy: 0.5916 - val_loss: 0.6589 - learning_rate: 0.0010
Epoch 5/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 443ms/step - accuracy: 0.6740 - loss: 0.6119 - val_accuracy: 0.6022 - val_loss: 0.6495 - learning_rate: 0.0010
Epoch 6/40
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━

## Conclusiones

1. **Evolución de la métrica en validación**  
   - En entrenamientos cortos (15 épocas, LR inicial 1 × 10⁻³), la **accuracy** de validación se estabiliza alrededor de 0.56–0.58, mientras que la **val_loss** comienza a aumentar a partir de la época 3, señal de cierto sobreajuste.  
   - Extender a 30 épocas con reducciones sucesivas de LR apenas mejora la accuracy, pero sí degrada la loss de validación tras la época 7.  
   - En 40 épocas, con un schedule más agresivo (hasta 2.5 × 10⁻⁴), se alcanza la **mejor accuracy** (~0.61) y un **macro-F1** ≈ 0.59 tras ajustar el umbral de decisión al valor óptimo (0.49).  

2. **Selección de umbral y trade-off precisión/recall**  
   - Ajustar el umbral de decisión en función del recall medio mejora la detección de la clase minoritaria (“fake”):  
     - Con threshold ≈ 0.47 → recall clase 0 ≈ 0.61, clase 1 ≈ 0.56 (macro-F1 ≈ 0.56).  
     - Con threshold ≈ 0.63 → recall clase 0 sube (0.73) a costa de la clase 1 (0.46).  
   - Con el umbral óptimo para maximizar macro-F1 (0.49), se logra un buen equilibrio:  
     - **Clase 0**: precision 0.44, recall 0.53 → F1 ≈ 0.48  
     - **Clase 1**: precision 0.73, recall 0.66 → F1 ≈ 0.69  

3. **Sobreajuste y regularización**  
   - El gap entre training accuracy (hasta ~0.85) y validation accuracy (≈ 0.60) tras pocas épocas indica que la red memoriza secuencias de entrenamiento.  
   - Se recomienda incorporar regularización más agresiva (dropout adicional, L2, early stopping) o data augmentation de texto (reemplazo de sinónimos, back-translation).

4. **Posibles vías de mejora**  
   - **Embeddings pre-entrenados** más ricos (FastText o BERT, además del GloVe 100d ya utilizado en los últimos modelos) para enriquecer la semántica frente a embeddings aprendidos desde cero.  
   - **Modelos híbridos o basados en transformadores** (por ejemplo, encoder BERT + capa GRU) que suelen impulsar el rendimiento en clasificación de texto.  
   - **Búsqueda de hiperparámetros** (unidades GRU, tamaño de batch, schedules de LR) mediante grid search o Bayesian Optimization para optimizar la curva de validación.