In [1]:
pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [3]:
pip install spacy



In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m111.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D, Conv1D
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.utils.class_weight import compute_class_weight
import optuna
import logging
import spacy
import gensim.downloader as api

# Configurar logging
logging.basicConfig(level=logging.INFO)

# Descargar recursos NLTK
nltk.download('punkt')  # Cambiado de punkt_tab a punkt
nltk.download('stopwords')
nltk.download('wordnet')

# Configuración de parámetros base
MAX_WORDS = 15000
MAX_LEN = 128
EMBEDDING_DIM = 300  # Cambiado a 300 para coincidir con GloVe

def preprocess_text(text_series):
    """
    Preprocesamiento mejorado con manejo especial de palabras ofensivas.
    """
    stop_words = set(stopwords.words('english')) - {'no', 'not', 'hate', 'against', 'racist', 'abuse', 'toxic'}
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        if not isinstance(text, str):
            text = str(text)
        # Convertir a minúsculas
        text = text.lower()

        # Preservar ciertas palabras compuestas ofensivas
        text = text.replace('son of a bitch', 'sonofabitch')
        text = text.replace('f u c k', 'fuck')
        text = text.replace('b i t c h', 'bitch')

        # Eliminar URLs
        text = re.sub(r'http\S+|www.\S+', '', text)

        # Preservar algunos caracteres especiales que pueden indicar toxicidad
        text = re.sub(r'[^a-zA-Z\s!?*#@$]', '', text)

        # Eliminar espacios extras
        text = re.sub(r'\s+', ' ', text).strip()

        # Lematización y eliminación de stopwords
        words = text.split()
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

        return ' '.join(words)

    return text_series.apply(clean_text)

def prepare_data(df):
    """
    Prepara los datos para el entrenamiento.
    """
    # Preprocesar texto
    print("Iniciando preprocesamiento de texto...")
    processed_texts = preprocess_text(df['Text'])

    # Tokenización y creación de secuencias
    print("Tokenizando textos...")
    tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
    tokenizer.fit_on_texts(processed_texts)
    sequences = tokenizer.texts_to_sequences(processed_texts)
    X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

    # Preparar etiquetas
    target_columns = ['IsToxic', 'IsAbusive', 'IsProvocative', 'IsObscene', 'IsHatespeech', 'IsRacist']
    df['IsOffensive'] = df[target_columns].any(axis=1)
    y = df['IsOffensive'].astype(int)

    # Cargar los embeddings de GloVe
    print("Cargando embeddings de GloVe...")
    glove_model = api.load("glove-wiki-gigaword-300")

    # Crear matriz de embeddings
    embedding_matrix = np.zeros((MAX_WORDS + 1, EMBEDDING_DIM))
    for word, i in tokenizer.word_index.items():
        if i < MAX_WORDS:  # Solo procesamos hasta MAX_WORDS
            try:
                embedding_matrix[i] = glove_model[word]
            except KeyError:
                continue  # Si la palabra no está en GloVe, dejamos el vector en ceros

    return X, y, tokenizer, embedding_matrix

def create_model_tuned(vocab_size, num_labels, params, embedding_matrix):
    """
    Versión mejorada del modelo con parámetros optimizados y embeddings
    """
    model = Sequential([
        Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False),
        Conv1D(params.get('conv_filters', 128), 5, activation='relu'),
        Bidirectional(LSTM(params.get('lstm_units_1', 64), return_sequences=True)),
        Bidirectional(LSTM(params.get('lstm_units_2', 32), return_sequences=True)),
        GlobalMaxPooling1D(),
        Dense(params.get('dense_units_1', 128), activation='relu'),
        Dropout(params.get('dropout_1', 0.5)),
        Dense(params.get('dense_units_2', 64), activation='relu'),
        Dropout(params.get('dropout_2', 0.3)),
        Dense(num_labels, activation='sigmoid')
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=params.get('learning_rate', 0.001))

    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc'),
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall')]
    )
    return model

def objective(trial, X, y, tokenizer, embedding_matrix):
    """
    Función objetivo para Optuna
    """
    params = {
        'conv_filters': trial.suggest_int('conv_filters', 64, 256),
        'lstm_units_1': trial.suggest_int('lstm_units_1', 32, 128),
        'lstm_units_2': trial.suggest_int('lstm_units_2', 16, 64),
        'dense_units_1': trial.suggest_int('dense_units_1', 64, 256),
        'dense_units_2': trial.suggest_int('dense_units_2', 32, 128),
        'dropout_1': trial.suggest_float('dropout_1', 0.2, 0.6),
        'dropout_2': trial.suggest_float('dropout_2', 0.1, 0.4),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64])
    }

    # Validación cruzada para evaluación más robusta
    kf = KFold(n_splits=3, shuffle=True)
    scores = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model = create_model_tuned(MAX_WORDS + 1, 1, params, embedding_matrix)

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )

        history = model.fit(
            X_train, y_train,
            epochs=10,
            batch_size=params['batch_size'],
            validation_data=(X_val, y_val),
            callbacks=[early_stopping],
            verbose=0
        )

        scores.append(history.history['val_accuracy'][-1])

    return np.mean(scores)

def train_with_optimization(df, n_trials=20):
    """
    Pipeline de entrenamiento con optimización de hiperparámetros
    """
    logging.info("Preparando datos...")
    X, y, tokenizer, embedding_matrix = prepare_data(df)

    logging.info("Iniciando optimización de hiperparámetros...")
    study = optuna.create_study(direction='maximize')

    try:
        study.optimize(lambda trial: objective(trial, X, y, tokenizer, embedding_matrix),
                      n_trials=n_trials)

        best_params = study.best_params
        logging.info("Mejores hiperparámetros encontrados:")
        for param, value in best_params.items():
            logging.info(f"{param}: {value}")

        logging.info("Entrenando modelo final...")
        final_model = create_model_tuned(MAX_WORDS + 1, 1, best_params, embedding_matrix)
        history = final_model.fit(
            X, y,
            epochs=15,
            batch_size=best_params.get('batch_size', 32),
            validation_split=0.1,
            callbacks=[EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)]
        )

        return final_model, history

    except Exception as e:
        logging.error(f"Error durante la optimización: {str(e)}")
        default_params = {
            'conv_filters': 128,
            'lstm_units_1': 64,
            'lstm_units_2': 32,
            'dense_units_1': 128,
            'dense_units_2': 64,
            'dropout_1': 0.5,
            'dropout_2': 0.3,
            'learning_rate': 0.001,
            'batch_size': 32
        }
        logging.info("Usando parámetros por defecto debido al error...")
        final_model = create_model_tuned(MAX_WORDS + 1, 1, default_params, embedding_matrix)
        history = final_model.fit(
            X, y,
            epochs=15,
            batch_size=default_params['batch_size'],
            validation_split=0.1,
            callbacks=[EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)]
        )

        return final_model, history

if __name__ == "__main__":
    # Asegúrate de que el archivo CSV existe y tiene las columnas correctas
    df = pd.read_csv('youtoxic_english_1000.csv')

    try:
        logging.info("Iniciando entrenamiento con optimización...")
        final_model, history = train_with_optimization(df, n_trials=20)

        # Guardar el modelo
        final_model.save('final_model.h5')
        logging.info("Entrenamiento completado y modelo guardado.")

    except Exception as e:
        logging.error(f"Error durante el entrenamiento: {str(e)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Iniciando preprocesamiento de texto...
Tokenizando textos...
Cargando embeddings de GloVe...


[I 2024-11-13 20:55:19,321] A new study created in memory with name: no-name-b84abcef-f8c0-494e-a5ff-d1ade0d5aee3
[I 2024-11-13 20:55:55,795] Trial 0 finished with value: 0.6759993433952332 and parameters: {'conv_filters': 64, 'lstm_units_1': 128, 'lstm_units_2': 51, 'dense_units_1': 163, 'dense_units_2': 68, 'dropout_1': 0.3734897454159435, 'dropout_2': 0.15378940697279803, 'learning_rate': 0.0047729130195821425, 'batch_size': 64}. Best is trial 0 with value: 0.6759993433952332.
[I 2024-11-13 20:56:39,157] Trial 1 finished with value: 0.6920093695322672 and parameters: {'conv_filters': 102, 'lstm_units_1': 78, 'lstm_units_2': 53, 'dense_units_1': 137, 'dense_units_2': 48, 'dropout_1': 0.5364619958829588, 'dropout_2': 0.13608967132728858, 'learning_rate': 0.00037042433861917003, 'batch_size': 16}. Best is trial 1 with value: 0.6920093695322672.
[I 2024-11-13 20:57:18,583] Trial 2 finished with value: 0.7009734312693278 and parameters: {'conv_filters': 165, 'lstm_units_1': 53, 'lstm_uni

Epoch 1/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 129ms/step - accuracy: 0.5048 - auc: 0.5227 - loss: 0.6919 - precision: 0.3978 - recall: 0.2383 - val_accuracy: 0.7100 - val_auc: 0.7786 - val_loss: 0.5773 - val_precision: 0.6735 - val_recall: 0.7174
Epoch 2/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.7195 - auc: 0.7438 - loss: 0.5866 - precision: 0.7523 - recall: 0.5656 - val_accuracy: 0.6900 - val_auc: 0.7566 - val_loss: 0.6071 - val_precision: 0.6471 - val_recall: 0.7174
Epoch 3/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.7730 - auc: 0.8546 - loss: 0.4812 - precision: 0.8191 - recall: 0.6586 - val_accuracy: 0.7400 - val_auc: 0.7955 - val_loss: 0.6068 - val_precision: 0.8125 - val_recall: 0.5652
Epoch 4/15
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.8254 - auc: 0.9210 - loss: 0.3818 - precision: 0.8992 - recall: 0.7225 - va

