In [None]:
import numpy as np
import pandas as pd

from dotenv import load_dotenv
import os

import unicodedata
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.metrics import precision_recall_curve, auc, precision_score, recall_score, f1_score, fbeta_score, roc_curve, average_precision_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay
import logging
import json
import re
import string 
import joblib
import warnings
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from mlflow.models.signature import infer_signature
import time
from tqdm import tqdm  # para barra de progreso

warnings.filterwarnings("ignore")

In [2]:
# Logging configuration: records at DEBUG level and above are written to a
# local log file, each prefixed with a timestamp and its severity.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
    filename="errores_entrenamiento.log",
)


In [3]:
# Load environment variables (from a .env file when present) and read the
# dataset path. os.getenv returns None for an unset variable instead of
# raising, so the value is validated here; otherwise the problem would only
# surface later, when the path is handed to pd.read_csv.
try:
    load_dotenv()
    ruta_cst_twcs = os.getenv("customer_support_twitter_twcs")
    if not ruta_cst_twcs:
        raise ValueError(
            "Environment variable 'customer_support_twitter_twcs' is not set or is empty."
        )
    logging.info("Environment variables loaded successfully.")
except Exception as e:
    logging.error(f"Error loading environment variables: {e}")
    # Bare raise keeps the original traceback intact
    raise

In [4]:
# Load the dataset from the path read from the environment and show its shape
try:
    data_cst_twcs = pd.read_csv(ruta_cst_twcs)
    print(data_cst_twcs.shape)
    logging.info("Data loaded successfully.")
except FileNotFoundError:
    logging.error(f"File not found: {ruta_cst_twcs}")
    raise
except Exception as e:
    # Any other read failure (parsing, permissions, invalid path value) is
    # logged as well, matching the error handling of the other cells.
    logging.error(f"Error loading data: {e}")
    raise

(2811774, 7)


In [5]:
# Cast the 'inbound' column to an integer dtype (it is used as the label
# when the data is split)
try:
    data_cst_twcs['inbound'] = data_cst_twcs['inbound'].astype(int)
    logging.info("Data transformed successfully.")
except Exception as exc:
    logging.error(f"Error transforming data: {exc}")
    raise exc

In [6]:
# Download the NLTK resources used by this notebook (tokenizer data and the
# stopword corpus), then build the English stopword list
try:
    for recurso_nltk in ("punkt", "stopwords"):
        nltk.download(recurso_nltk)
    english_stopwords = stopwords.words("english")
except Exception as exc:
    logging.error(f"Error loading stopwords: {exc}")
    raise exc

[nltk_data] Downloading package punkt to /home/alejo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/alejo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Two-stage stratified split: hold out 30% of the rows as the test set, then
# take 30% of the remainder as the validation set. Stratifying on the label
# keeps the class proportions the same in every subset.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        data_cst_twcs['text'],
        data_cst_twcs['inbound'],
        test_size=0.3,
        stratify=data_cst_twcs['inbound'],
        random_state=42,
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train,
        y_train,
        test_size=0.3,
        stratify=y_train,
        random_state=42,
    )
    # Report the size of every subset, features first and labels second
    for etiqueta, subconjunto in (
        ('X_train', X_train),
        ('X_valid', X_valid),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_valid', y_valid),
        ('y_test', y_test),
    ):
        print(f'{etiqueta}: ', subconjunto.shape)
    logging.info("Data split into train, validation and test sets successfully.")
except Exception as exc:
    logging.error(f"Error splitting data: {exc}")
    raise exc

X_train:  (1377768,)
X_valid:  (590473,)
X_test:  (843533,)
y_train:  (1377768,)
y_valid:  (590473,)
y_test:  (843533,)


In [8]:
# Select the MLflow experiment, creating it first when it does not exist yet.
# Existence is checked explicitly instead of treating every exception as
# "already exists", so real failures (e.g. an unreachable tracking backend)
# are logged and raised rather than hidden. The experiment is activated in
# both cases, so runs are recorded under it even right after creation.
try:
    if mlflow.get_experiment_by_name("experimento_catboost") is None:
        mlflow.create_experiment("experimento_catboost")
        print("Experimento creado")
        logging.info("Experiment created successfully.")
    else:
        print("Experimento ya existe")
        logging.info("Experiment already exists, set to existing experiment.")
    mlflow.set_experiment("experimento_catboost")
except Exception as e:
    logging.error(f"Error setting experiment: {e}")
    raise

Experimento ya existe


In [9]:
# Load the most recent version of the registered model
model_name = "modelo_catboost_prueba"
model_version = "latest"  # requests the latest available version
try:
    loaded_model = mlflow.sklearn.load_model(f"models:/{model_name}/{model_version}")
    logging.info(f"Model {model_name} version {model_version} loaded successfully.")
except Exception as exc:
    logging.error(f"Error loading model: {exc}")
    raise exc

In [10]:
# Run the loaded model on the test split and print the predictions
try:
    pred = loaded_model.predict(X_test)
    print(pred)
    logging.info("Prediction made successfully.")
except Exception as exc:
    logging.error(f"Error making prediction: {exc}")
    raise exc

[1 0 1 ... 0 1 1]


In [None]:
def realizar_prediccion_batch(modelo, X, tamaño_lote=1000):
    """
    Run the model over ``X`` in fixed-size batches to limit peak memory use.

    Parameters
    ----------
    modelo : object
        Estimator to call. ``predict_proba`` is used when the object exposes
        it; otherwise ``predict`` is used.
    X : array-like
        Input samples. Must expose ``shape[0]`` and support row slicing.
        Objects with an ``.iloc`` accessor (pandas) are sliced through it so
        that batches are always taken by position.
    tamaño_lote : int, default 1000
        Number of rows per batch. Must be a positive integer.

    Returns
    -------
    numpy.ndarray
        One row per input sample. With ``predict_proba`` the array has the
        shape the model returns per batch (rows x columns). With ``predict``
        it is 1-D and keeps the dtype of the model output (labels are no
        longer coerced to float; text labels are stored as objects). An empty
        1-D array is returned for empty input, because the output width
        cannot be known without calling the model.

    Raises
    ------
    ValueError
        If ``tamaño_lote`` is not a positive integer.
    Exception
        Any error raised while predicting is logged and re-raised.
    """
    try:
        # A non-positive step would make range() fail or yield nothing; the
        # latter would return a result without a single real prediction.
        if not isinstance(tamaño_lote, (int, np.integer)) or tamaño_lote < 1:
            raise ValueError(
                f"tamaño_lote must be a positive integer, got {tamaño_lote!r}"
            )

        inicio = time.time()
        n_muestras = X.shape[0]

        usa_probabilidades = hasattr(modelo, 'predict_proba')
        predecir = modelo.predict_proba if usa_probabilidades else modelo.predict
        es_pandas = hasattr(X, 'iloc')

        # The output buffer is allocated lazily from the first batch, so its
        # width and dtype come from the model itself and no extra probing
        # call is needed.
        predicciones = None

        # Process batch by batch
        for i in tqdm(range(0, n_muestras, tamaño_lote)):
            fin_lote = min(i + tamaño_lote, n_muestras)
            lote = X.iloc[i:fin_lote] if es_pandas else X[i:fin_lote]

            salida = np.asarray(predecir(lote))
            # Some estimators return class predictions as a single column;
            # flatten it so the predict branch always yields a 1-D result.
            if not usa_probabilidades and salida.ndim == 2 and salida.shape[1] == 1:
                salida = salida[:, 0]

            if predicciones is None:
                # Fixed-width string dtypes would truncate longer labels that
                # appear in later batches, so text is stored as objects.
                tipo = object if salida.dtype.kind in 'USO' else salida.dtype
                predicciones = np.empty((n_muestras,) + salida.shape[1:], dtype=tipo)

            predicciones[i:fin_lote] = salida

        if predicciones is None:
            # Empty input: nothing was predicted
            predicciones = np.zeros(0)

        fin = time.time()
        logging.info(f"Predicciones completadas en {fin - inicio:.2f} segundos")

        return predicciones

    except Exception as e:
        logging.error(f"Error durante la predicción: {str(e)}")
        raise

In [None]:
def guardar_resultados(predicciones, datos_original, ruta_salida, incluir_datos_originales=True):
    """
    Build a results table from the predictions and write it to disk.

    Parameters
    ----------
    predicciones : array-like
        Model output. A 2-D input is treated as one column per class and
        produces ``prob_clase_<i>`` columns plus a ``prediccion`` column with
        the index of the largest value in each row. A 1-D input is stored
        directly as ``prediccion``.
        NOTE(review): the argmax index equals the class label only when the
        model's classes are 0..k-1 in column order - confirm for the model used.
    datos_original : pandas.Series or pandas.DataFrame
        Original rows the predictions belong to. Only used when
        ``incluir_datos_originales`` is true.
    ruta_salida : str or os.PathLike
        Destination file. The extension (case-insensitive) selects the format:
        ``.xlsx`` -> Excel, ``.parquet`` -> Parquet, anything else -> CSV.
    incluir_datos_originales : bool, default True
        Whether to prepend the original data to the prediction columns.

    Returns
    -------
    pandas.DataFrame
        The table that was written.

    Raises
    ------
    ValueError
        If the original data and the predictions have different lengths.
    Exception
        Any error raised while building or writing the table is logged and
        re-raised.
    """
    try:
        # Accept plain lists as well as arrays, and path objects as well as strings
        predicciones = np.asarray(predicciones)
        ruta = os.fspath(ruta_salida)

        # Build the prediction columns
        if predicciones.ndim > 1:
            df_pred = pd.DataFrame(
                predicciones,
                columns=[f'prob_clase_{i}' for i in range(predicciones.shape[1])]
            )
            # Add the position of the highest-scoring column
            df_pred['prediccion'] = np.argmax(predicciones, axis=1)
        else:
            df_pred = pd.DataFrame({'prediccion': predicciones})

        # Combine with the original data when requested
        if incluir_datos_originales:
            # concat aligns on the index and pads with NaN, so a length
            # mismatch would silently produce misaligned, partly empty rows.
            if len(datos_original) != len(df_pred):
                raise ValueError(
                    f"Length mismatch: {len(datos_original)} original rows "
                    f"vs {len(df_pred)} predictions"
                )
            resultado = pd.concat([datos_original.reset_index(drop=True), df_pred], axis=1)
        else:
            resultado = df_pred

        # Write in the format implied by the extension; CSV is the fallback
        ruta_minusculas = ruta.lower()
        if ruta_minusculas.endswith('.xlsx'):
            resultado.to_excel(ruta, index=False)
        elif ruta_minusculas.endswith('.parquet'):
            resultado.to_parquet(ruta, index=False)
        else:
            resultado.to_csv(ruta, index=False)

        logging.info(f"Resultados guardados en {ruta_salida}")
        return resultado

    except Exception as e:
        logging.error(f"Error al guardar resultados: {str(e)}")
        raise