## Imports

In [1]:
import re
import os
import glob
import random
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import KernelPCA, TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier
from genetic_algorithm import GeneticAlgorithmCV
from concurrent.futures import ThreadPoolExecutor, as_completed
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType
from skl2onnx._parse import _apply_zipmap, _get_sklearn_operator_name
from onnx.helper import get_attribute_value
from catboost.utils import convert_to_onnx_object
import warnings
warnings.filterwarnings('ignore')
import torch
torch.cuda.empty_cache()

## Functions

In [2]:
# Procesamiento de archivos (modificado para devolver 4 componentes)
def process_files(file_paths):
    """Build the main and meta training sets for every CSV in ``file_paths``.

    For each path the trade direction is obtained from ``extract_action``, the
    CSV is read with pandas, converted to ``(X, y)`` by
    ``create_training_dataset`` and finally split by
    ``meta_label_data_multi_bootstrap_oob`` (100 bootstrap models,
    ``bad_samples_fraction=0.8``).

    Returns:
        A list with one dict per input file, in input order, holding
        ``'direction'``, ``'main'`` (``(X_main, y_main)``) and ``'meta'``
        (``(X_meta, y_meta)``).
    """
    def _build_entry(path):
        # Direction is whatever extract_action finds in the path
        # ('long', 'short' or None when no token is present).
        direction = extract_action(path)
        raw_trades = pd.read_csv(path)

        # Step 1: base features/labels for this direction.
        features, labels = create_training_dataset(raw_trades, direction)

        # Step 2: meta-labeling yields two datasets (main and meta).
        (x_main, y_main), (x_meta, y_meta) = meta_label_data_multi_bootstrap_oob(
            features, labels,
            models_number=100,
            bad_samples_fraction=0.8
        )
        return {
            'direction': direction,
            'main': (x_main, y_main),
            'meta': (x_meta, y_meta),
        }

    return [_build_entry(path) for path in file_paths]

# Helper function
def extract_action(filepath):
    """Return the trade direction ('long' or 'short') encoded in a dataset path.

    The file name is inspected first, so a directory whose name happens to
    contain "long" or "short" (e.g. ``.../short_term/training_dataset_long_1.csv``)
    cannot override the direction stated by the file itself. When the file name
    carries no direction token the whole path is searched instead, which keeps
    the previous behaviour for such paths.

    Args:
        filepath: Path to a dataset file, with '/' or '\\' separators.

    Returns:
        'long' or 'short' (the first token found), or None if neither token
        appears anywhere in the path.
    """
    pattern = r'(long|short)'
    # Split on both separators so Windows-style paths work on any platform.
    filename = re.split(r'[\\/]', filepath)[-1]
    match = re.search(pattern, filename) or re.search(pattern, filepath)
    return match.group(0) if match else None

# Crear datasets de entrenamiento
def create_training_dataset(df, action_type):
    """Turn a raw trades DataFrame into ``(X_train, y_train)`` arrays.

    Steps, each reported on stdout:
      1. drop duplicated rows;
      2. keep only rows whose ``profit`` is non-zero;
      3. add a binary ``target`` column (1 when ``profit > 0``, else 0);
      4. drop rows that contain missing values, if any;
      5. keep every column except the last two (after ``target`` is appended
         these are the last original column and ``target``) as features.

    Args:
        df: DataFrame with at least a ``profit`` column.
        action_type: 'long' selects the "compras" banner; any other value
            prints "ventas".

    Returns:
        ``X_train`` as a float ndarray and ``y_train`` as an int ndarray.
    """
    banner_label = 'compras' if action_type == 'long' else 'ventas'
    print(f"=== Procesando dataset de {banner_label} ===")

    deduplicated = df.drop_duplicates()
    print(f"Total de operaciones después de eliminar duplicados: {len(deduplicated)}")

    # Only trades with a non-zero result carry a win/loss label.
    trades = deduplicated.loc[deduplicated['profit'] != 0].copy()
    print(f"Operaciones con profit != 0: {len(trades)}")

    # Winner (1) when profit is strictly positive, loser (0) otherwise.
    trades['target'] = trades['profit'].gt(0).astype('int64')
    winners = trades['target'].sum()
    losers = len(trades) - winners
    print(f"Operaciones ganadoras: {int(winners)}")
    print(f"Operaciones perdedoras: {int(losers)}")

    # Discard incomplete rows (the counts above are printed before this step).
    missing_mask = trades.isna()
    if missing_mask.values.any():
        print(f"Valores faltantes encontrados: {missing_mask.sum().sum()}")
        trades = trades.dropna()
        print(f"Total de operaciones después de eliminar missings: {len(trades)}")

    # NOTE(review): the positional slice assumes the last original column is
    # not a feature (presumably 'profit') — confirm against the CSV layout.
    selected_columns = trades.columns[:-2].tolist() + ['target']
    training_frame = trades[selected_columns]

    X_train = training_frame.drop(columns='target').values.astype('float')
    y_train = training_frame['target'].values.astype('int')
    print(f"Dataset final preparado: {X_train.shape[0]} operaciones, {X_train.shape[1]} características")
    return X_train, y_train

def sample_random_hparams():
    """
    Draw one random hyper-parameter set for ``CatBoostClassifier``.

    Values come from the module-level ``random`` generator, always in the same
    order (iterations, max_depth, learning_rate, l2_leaf_reg,
    min_data_in_leaf), so seeding ``random`` makes the result reproducible.

    Returns:
        dict of keyword arguments, in the draw order listed above.
    """
    iterations = random.randint(50, 500)
    max_depth = random.randint(3, 10)
    learning_rate = random.uniform(0.1, 0.5)
    l2_leaf_reg = random.uniform(0.0, 1.0)
    min_data_in_leaf = random.randint(1, 10)

    return dict(
        iterations=iterations,
        max_depth=max_depth,
        learning_rate=learning_rate,
        l2_leaf_reg=l2_leaf_reg,
        min_data_in_leaf=min_data_in_leaf,
    )

def meta_label_data_multi_bootstrap_oob(
    X, y,
    models_number=5,
    bad_samples_fraction=0.8
):
    """
    Meta-labeling with bootstrapping and out-of-bag (OOB) evaluation.

    On each of the ``models_number`` iterations:
      (a) a bootstrap sample is drawn (frac=0.8, with replacement);
      (b) the rows left out of that sample form the OOB validation set;
      (c) a CatBoost model with random hyper-parameters is fitted on the
          bootstrap sample (skipped when the OOB set is empty, because a
          model with nothing to predict contributes no information);
      (d) the model predicts the OOB rows only;
      (e) the OOB rows it misclassifies are recorded, split by true class.

    Afterwards, for each class separately, a row is marked as "bad" when its
    error count is greater than ``mean(error counts) * bad_samples_fraction``.

    Neither the bootstrap sampling nor the hyper-parameter draw is seeded
    here, so results vary between runs.

    Args:
        X: Feature matrix accepted by ``pd.DataFrame``.
        y: Binary labels (0/1), one per row of ``X``.
        models_number: Number of bootstrap iterations.
        bad_samples_fraction: Multiplier applied to the mean error count to
            obtain the marking threshold.

    Returns:
        ``((X_main, y_main), (X_meta, y_meta))`` where the *main* pair holds
        only the unmarked rows with their original target, and the *meta* pair
        holds every row with ``meta_labels`` (1 = unmarked, 0 = marked) as
        target. Features are DataFrames, targets are Series.
    """
    df = pd.DataFrame(X)
    df['target'] = y

    # One pd.Index of misclassified OOB rows per iteration, keyed by true class.
    wrong_chunks = {0: [], 1: []}

    for _ in range(models_number):
        # (1) Bootstrap sample used for training.
        train_sample = df.sample(frac=0.8, replace=True)
        # (2) OOB rows: everything the bootstrap sample did not pick.
        val_sample = df.loc[~df.index.isin(train_sample.index)]

        # (3) Drawn on every iteration, including skipped ones, so the
        #     `random` stream advances exactly as it did before this change.
        hparams = sample_random_hparams()

        # (4) Nothing to validate on: do not waste time fitting a model.
        if len(val_sample) == 0:
            continue

        model = CatBoostClassifier(
            task_type="CPU",
            eval_metric='Accuracy',
            verbose=False,
            **hparams
        )
        model.fit(
            train_sample.drop(columns='target'),
            train_sample['target']
        )

        # (5) Predict the OOB rows and threshold the class-1 probability.
        pred_proba = model.predict_proba(val_sample.drop(columns='target'))[:, 1]
        pred_labels = (pred_proba >= 0.5).astype(int)

        # (6) Record the misclassified OOB rows of each true class.
        true_labels = val_sample['target'].to_numpy()
        is_wrong = true_labels != pred_labels
        for cls in (0, 1):
            wrong_chunks[cls].append(val_sample.index[is_wrong & (true_labels == cls)])

    # (7) Rows whose error frequency exceeds the per-class threshold.
    marked_0 = _frequently_misclassified(wrong_chunks[0], bad_samples_fraction)
    marked_1 = _frequently_misclassified(wrong_chunks[1], bad_samples_fraction)
    all_bad = marked_0.union(marked_1)

    # (8) meta_labels: 1 for rows kept in the main set, 0 for marked rows.
    good_mask = ~df.index.isin(all_bad)
    df['meta_labels'] = good_mask.astype('int64')

    # (9) Main set = unmarked rows only; meta set = the full dataset.
    X_main = df.loc[good_mask].drop(columns=['target', 'meta_labels'])
    y_main = df.loc[good_mask, 'target']
    X_meta = df.drop(columns=['target', 'meta_labels'])
    y_meta = df['meta_labels']

    return (X_main, y_main), (X_meta, y_meta)


def _frequently_misclassified(index_chunks, bad_samples_fraction):
    """Return the row labels whose error count exceeds mean count * fraction."""
    if not index_chunks:
        return pd.Index([])
    # Concatenate all per-iteration indexes in one call instead of growing an
    # index step by step inside the training loop.
    if len(index_chunks) > 1:
        merged = index_chunks[0].append(index_chunks[1:])
    else:
        merged = index_chunks[0]
    error_counts = merged.value_counts()
    if len(error_counts) == 0:
        return pd.Index([])
    threshold = error_counts.mean() * bad_samples_fraction
    return error_counts[error_counts > threshold].index

# Train model functions
def train_classifier(X_train, y_train, model_type):
    """Tune and fit a scaler/reducer/CatBoost pipeline with ``GeneticAlgorithmCV``.

    Args:
        X_train: Feature matrix; only ``X_train.shape[1]`` is read here, the
            object itself is handed to the search's ``fit``.
        y_train: Training labels, passed straight to ``fit``.
        model_type: Label used in log messages and forwarded to the search
            (e.g. "long_main").

    Returns:
        The ``best_estimator_`` attribute of the fitted search.

    Raises:
        Any exception raised while building or fitting the search; it is
        logged to stdout and re-raised unchanged.
    """
    feature_count = X_train.shape[1]

    # Cross-validation scheme used to score every candidate.
    folds = StratifiedKFold(n_splits=3)

    # Pipeline skeleton: scaler and reducer start as pass-through placeholders
    # (presumably swapped by GeneticAlgorithmCV through estimator_map).
    booster = CatBoostClassifier(
        task_type="CPU",
        eval_metric='Accuracy',
        verbose=False
    )
    base_pipeline = Pipeline([
        ('scaler', 'passthrough'),
        ('reducer', 'passthrough'),
        ('catboostclassifier', booster),
    ])

    # Concrete estimators behind each categorical choice.
    scaler_choices = {
        'standard': StandardScaler(),
        'robust': RobustScaler(),
        'none': 'passthrough',
    }
    reducer_choices = {
        'kernel_pca_rbf': KernelPCA(kernel='rbf'),
        'kernel_pca_linear': KernelPCA(kernel='linear'),
        'truncated_svd': TruncatedSVD(),
        'none': 'passthrough',
    }
    step_candidates = {'scaler': scaler_choices, 'reducer': reducer_choices}

    # Search space; the 'none' options above are intentionally not offered.
    search_space = {
        'scaler': {'type': 'categorical', 'values': ['standard', 'robust']},
        'reducer': {
            'type': 'categorical',
            'values': ['kernel_pca_rbf', 'kernel_pca_linear', 'truncated_svd'],
        },
        'reducer__n_components': {'type': 'int', 'low': 2, 'high': feature_count - 1},
        'catboostclassifier__iterations': {'type': 'int', 'low': 50, 'high': 500},
        'catboostclassifier__max_depth': {'type': 'int', 'low': 3, 'high': 10},
        'catboostclassifier__learning_rate': {'type': 'float', 'low': 0.1, 'high': 0.5},
        'catboostclassifier__l2_leaf_reg': {'type': 'float', 'low': 0.0, 'high': 1.0},
        'catboostclassifier__min_data_in_leaf': {'type': 'int', 'low': 1, 'high': 10},
    }

    ga_settings = dict(
        model_type=model_type,
        pipeline=base_pipeline,
        param_grid=search_space,
        estimator_map=step_candidates,
        cv=folds,
        pop_size=25,
        generations=5,
        early_stopping_rounds=3,
        crossover_initial=0.9,
        crossover_end=0.1,
        mutation_initial=0.1,
        mutation_end=0.9,
        elitism=True,
        elite_size=5,
        tournament_size=3,
        n_random=5,
        n_jobs=1,
        verbose=True,
    )

    try:
        # Run the genetic search.
        ga_search = GeneticAlgorithmCV(**ga_settings)
        ga_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error en traing model {model_type}: {e}")
        raise

    # Report the winning configuration.
    print("####################################################################")
    print(f"Mejor puntuación de validación para {model_type}: {ga_search.best_score_}")
    print(f"Mejores parámetros encontrados para {model_type}: {ga_search.best_params_full_}")
    print("####################################################################")

    return ga_search.best_estimator_

# ONNX para Pipeline con Catboost
def skl2onnx_parser_castboost_classifier(scope, model, inputs, custom_parsers=None):
    """skl2onnx parser for ``CatBoostClassifier``: declares label and probability outputs.

    Declares a local operator for ``model`` with two outputs, an int64
    ``label`` and a float ``probabilities`` tensor, and wraps them in a ZipMap
    according to the ``zipmap`` option.

    The ``zipmap=False`` option is now honoured: in that case the raw
    ``[label, probabilities]`` outputs are returned. Previously ``no_zipmap``
    was computed but never used, so every option value was routed through
    ``_apply_zipmap``.

    Args:
        scope: skl2onnx scope used to read options and declare the operator
            and its variables.
        model: The estimator being parsed.
        inputs: Input variables of the operator; ``inputs[0].type`` is
            forwarded to ``_apply_zipmap``.
        custom_parsers: Accepted for signature compatibility; unused here.

    Returns:
        The operator outputs when ``zipmap`` is exactly ``False``; otherwise
        whatever ``_apply_zipmap`` returns.
    """
    options = scope.get_options(model, dict(zipmap=True))
    zipmap = options["zipmap"]
    # Only the boolean False disables ZipMap; "columns" is handled downstream.
    no_zipmap = isinstance(zipmap, bool) and not zipmap

    alias = _get_sklearn_operator_name(type(model))
    this_operator = scope.declare_local_operator(alias, model)
    this_operator.inputs = inputs

    label_variable = scope.declare_local_variable("label", Int64TensorType())
    probability_tensor_variable = scope.declare_local_variable("probabilities", FloatTensorType())

    this_operator.outputs.append(label_variable)
    this_operator.outputs.append(probability_tensor_variable)

    if no_zipmap:
        return this_operator.outputs

    return _apply_zipmap(zipmap, scope, model, inputs[0].type, this_operator.outputs)

def skl2onnx_convert_catboost(scope, operator, container):
    """skl2onnx converter: copy CatBoost's own ONNX node into ``container``.

    The wrapped model (``operator.raw_operator``) is exported with
    ``convert_to_onnx_object``; the first node of the resulting graph is then
    re-created in ``container`` with the same op type, domain and attributes,
    wired to this operator's first input and first two outputs.

    Args:
        scope: Unused; part of the converter signature.
        operator: Operator carrying ``raw_operator``, ``inputs`` and ``outputs``.
        container: Target container; only its ``add_node`` is called.
    """
    exported = convert_to_onnx_object(operator.raw_operator)
    first_node = exported.graph.node[0]

    input_names = [operator.inputs[0].full_name]
    output_names = [operator.outputs[0].full_name, operator.outputs[1].full_name]

    # Carry every attribute of the exported node over as a keyword argument.
    node_attributes = {}
    for attribute in first_node.attribute:
        node_attributes[attribute.name] = get_attribute_value(attribute)

    container.add_node(
        first_node.op_type,
        input_names,
        output_names,
        op_domain=first_node.domain,
        **node_attributes
    )

def save_onnx_model(mql5_files_folder, model, X, model_type):
    """Convert a fitted pipeline to ONNX and write it to ``model_<model_type>.onnx``.

    Args:
        mql5_files_folder: Directory that receives the ``.onnx`` file.
        model: Pipeline to convert; its last step gets the ``zipmap`` option.
        X: Training features; only ``X.shape[1]`` is used, to size the input.
        model_type: Suffix for the file name and for log messages.

    Raises:
        Any exception from conversion or file writing; it is logged to stdout
        and re-raised unchanged.
    """
    try:
        # Single float input with a free batch dimension.
        n_features = X.shape[1]
        input_signature = [('input', FloatTensorType([None, n_features]))]

        # Options are keyed by the identity of the pipeline's final estimator.
        final_estimator = model.steps[-1][1]
        onnx_model = convert_sklearn(
            model,
            initial_types=input_signature,
            target_opset={"": 12, "ai.onnx.ml": 2},
            options={id(final_estimator): {'zipmap': True}}
        )

        output_path = os.path.join(mql5_files_folder, f"model_{model_type}.onnx")
        with open(output_path, "wb") as onnx_file:
            onnx_file.write(onnx_model.SerializeToString())

        print(f"Modelo {model_type} ONNX exportado correctamente")

    except Exception as e:
        print(f"Error en exportar el modelo {model_type}: {e}")
        raise

def train_models_parallel(data_list, mql5_files_folder):
    """Train the main and meta model of every dataset in a thread pool and export each to ONNX.

    The CatBoost converter/parser pair is registered with skl2onnx first. Then,
    for each entry of ``data_list``, two ``train_classifier`` jobs are
    submitted (main, then meta) to a 4-worker ``ThreadPoolExecutor``. As jobs
    finish, the resulting model is exported with ``save_onnx_model`` under the
    name ``<direction>_<main|meta>``.

    Args:
        data_list: Dicts with ``'direction'``, ``'main'`` (``(X, y)``) and
            ``'meta'`` (``(X, y)``); the meta labels are cast with
            ``.astype(int)`` before training.
        mql5_files_folder: Output directory forwarded to ``save_onnx_model``.

    Raises:
        The first exception raised by a training job or by the export; it is
        logged to stdout and re-raised.
    """
    # Must be registered before any conversion takes place.
    update_registered_converter(
        CatBoostClassifier,
        "CatBoostClassifier",
        calculate_linear_classifier_output_shapes,
        skl2onnx_convert_catboost,
        parser=skl2onnx_parser_castboost_classifier,
        options={"nocl": [True, False], "zipmap": [True, False, "columns"]}
    )

    # future -> description of the job it belongs to
    job_info = {}

    with ThreadPoolExecutor(max_workers=4) as pool:
        # Step 1: queue every training job (main first, then meta).
        for entry in data_list:
            direction = entry['direction']
            for kind in ('main', 'meta'):
                features, labels = entry[kind]
                if kind == 'meta':
                    labels = labels.astype(int)
                job = pool.submit(
                    train_classifier,
                    features,
                    labels,
                    f"{direction}_{kind}"  # e.g. "long_main"
                )
                job_info[job] = {
                    'type': kind,
                    'direction': direction,
                    'X_train': features
                }

        # Step 2: export each model as soon as its training finishes.
        for job in as_completed(list(job_info)):
            info = job_info[job]
            job_name = f"{info['direction']}_{info['type']}"
            try:
                trained_model = job.result()
                save_onnx_model(mql5_files_folder, trained_model, info['X_train'], job_name)
            except Exception as e:
                print(f"Error crítico en {job_name}: {str(e)}")
                raise

## Train

In [3]:
def main(common_file_folder=None, mql5_files_folder=None):
    """Locate the training CSVs, build the datasets and train/export every model.

    Files named ``training_dataset_long_*.csv`` and
    ``training_dataset_short_*.csv`` are processed when present. Only if
    neither kind exists are files matching the generic
    ``training_dataset_*.csv`` pattern used instead. Matches are processed in
    sorted order so repeated runs handle the files in the same sequence.

    Args:
        common_file_folder: Directory containing the CSV datasets. Defaults to
            the MetaTrader "Common/Files" folder this notebook was written for.
        mql5_files_folder: Directory that receives the exported ONNX models.
            Defaults to the terminal's "MQL5/Files" folder this notebook was
            written for.
    """
    # Paths: fall back to the original machine-specific locations.
    if common_file_folder is None:
        common_file_folder = r"/mnt/c/Users/Administrador/AppData/Roaming/MetaQuotes/Terminal/Common/Files/"
    if mql5_files_folder is None:
        mql5_files_folder = r'/mnt/c/Users/Administrador/AppData/Roaming/MetaQuotes/Terminal/6C3C6A11D1C3791DD4DBF45421BF8028/MQL5/Files/'

    def _find(file_pattern):
        # Escape the folder so glob metacharacters in it are taken literally,
        # and sort because glob returns matches in arbitrary order.
        return sorted(glob.glob(os.path.join(glob.escape(common_file_folder), file_pattern)))

    long_file_paths = _find('training_dataset_long_*.csv')
    short_file_paths = _find('training_dataset_short_*.csv')
    generic_file_paths = _find('training_dataset_*.csv')

    # Build the datasets for every file found.
    full_data = []
    if long_file_paths:
        full_data.extend(process_files(long_file_paths))
    if short_file_paths:
        full_data.extend(process_files(short_file_paths))
    if not long_file_paths and not short_file_paths and generic_file_paths:
        # NOTE(review): these names carry no 'long'/'short' token, so the
        # direction derived from the path is presumably None — confirm that
        # downstream naming ("None_main") is intended.
        full_data.extend(process_files(generic_file_paths))

    # Parallel training and export.
    if len(full_data) > 0:
        train_models_parallel(full_data, mql5_files_folder)
    else:
        print("No se encontraron datasets válidos para entrenar")
if __name__ == "__main__":
    main()

=== Procesando dataset de ventas ===
Total de operaciones después de eliminar duplicados: 8119
Operaciones con profit != 0: 8119
Operaciones ganadoras: 3542
Operaciones perdedoras: 4577
Dataset final preparado: 8119 operaciones, 10 características


Generaciones short_main:   0%|          | 0/5 [00:00<?, ?gen/s]

Generaciones short_meta:   0%|          | 0/5 [00:00<?, ?gen/s]

[1, short_main] Fitness: 0.6388422035480859 | Best Fitness: 0.6388422035480859
[1, short_main] Fitness Improvement: 0.0000 | Diversity: 1.1693
[1, short_main] Normalized Fitness Improvement: 0.0000 | Normalized Diversity: 0.0000
[1, short_main] Crossover Rate: 0.9000 | Mutation Rate: 0.1000
[1, short_meta] Fitness: 0.6500791383975901 | Best Fitness: 0.6500791383975901
[1, short_meta] Fitness Improvement: 0.0000 | Diversity: 1.1664
[1, short_meta] Normalized Fitness Improvement: 0.0000 | Normalized Diversity: 0.0000
[1, short_meta] Crossover Rate: 0.9000 | Mutation Rate: 0.1000
[2, short_main] Fitness: 0.6388422035480859 | Best Fitness: 0.6388422035480859
[2, short_main] Fitness Improvement: 0.0000 | Diversity: 1.1622
[2, short_main] Normalized Fitness Improvement: 0.0000 | Normalized Diversity: 0.0000
[2, short_main] Crossover Rate: 0.5800 | Mutation Rate: 0.1552
[2, short_meta] Fitness: 0.6500791383975901 | Best Fitness: 0.6500791383975901
[2, short_meta] Fitness Improvement: 0.0000 |

##########
