## Imports

In [None]:
import re
import os
import glob
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import KernelPCA, TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from genetic_algorithm import GeneticAlgorithmCV
from concurrent.futures import ThreadPoolExecutor, as_completed
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert import convert_xgboost as convert_xgboost_booster
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType
from skl2onnx._parse import _apply_zipmap, _get_sklearn_operator_name
from onnx.helper import get_attribute_value
from catboost.utils import convert_to_onnx_object
import warnings
warnings.filterwarnings('ignore')
import torch
torch.cuda.empty_cache()

## Functions

In [None]:
# Helper function
def extract_action(filepath):
    """Return the trade direction encoded in a dataset file name.

    Only the last path component is inspected, so a directory whose name
    happens to contain "long" or "short" cannot be mistaken for the
    direction of the file itself.

    Args:
        filepath: Path to a dataset file; both "/" and the backslash are
            accepted as separators.

    Returns:
        "long" or "short" (whichever occurs first in the file name), or
        None when the file name contains neither.
    """
    # Split on either separator so Windows-style paths work on any platform
    filename = re.split(r'[\\/]', filepath)[-1]
    match = re.search(r'(long|short)', filename)
    return match.group(0) if match else None

# Build training datasets
def create_training_dataset(df, action_type):
    """Turn a raw trades table into a feature matrix and a binary target.

    Duplicated rows and rows whose 'profit' is 0 are discarded, a 'target'
    column (1 when profit > 0, else 0) is appended, and rows containing
    missing values are dropped. Progress is reported with print().

    Args:
        df: DataFrame with a 'profit' column.
        action_type: 'long' selects the "compras" wording in the log header;
            any other value selects "ventas".

    Returns:
        (X_train, y_train): a float array of features and an int array of
        targets.
    """
    side = 'compras' if action_type == 'long' else 'ventas'
    print(f"=== Procesando dataset de {side} ===")

    unique_rows = df.drop_duplicates()
    print(f"Total de operaciones después de eliminar duplicados: {len(unique_rows)}")

    # Keep only trades that actually won or lost money
    trades = unique_rows.loc[unique_rows['profit'] != 0].copy()
    print(f"Operaciones con profit != 0: {len(trades)}")

    # Binary label: 1 for a winning trade, 0 otherwise
    trades['target'] = np.where(trades['profit'] > 0, 1, 0)
    winners = trades['target'].sum()
    losers = len(trades) - winners
    print(f"Operaciones ganadoras: {int(winners)}")
    print(f"Operaciones perdedoras: {int(losers)}")

    # Drop rows with missing values, reporting how many cells were missing
    missing_cells = trades.isna().sum().sum()
    if missing_cells > 0:
        print(f"Valores faltantes encontrados: {missing_cells}")
        trades = trades.dropna()
        print(f"Total de operaciones después de eliminar missings: {len(trades)}")

    # Features are every column except the last two. 'target' was appended
    # last, so this also removes the column just before it
    # (presumably 'profit' -- TODO confirm it is always the last CSV column).
    feature_names = list(trades.columns[:-2])
    dataset = trades[feature_names + ['target']]

    X_train = dataset.drop(columns='target').values.astype('float')
    y_train = dataset['target'].values.astype('int')
    print(f"Dataset final preparado: {X_train.shape[0]} operaciones, {X_train.shape[1]} características")
    return X_train, y_train

def meta_label_data_multi_bootstrap_oob(
    X, y,
    models_number=25,
    # Instead of a single value, a list of candidate values can be supplied:
    fractions_to_try=None
):
    """
    Flag samples that out-of-bag (OOB) bootstrap ensembles keep misclassifying.

    For ``models_number`` rounds, a soft-voting ensemble (CatBoost, XGBoost,
    LightGBM) with randomly drawn hyperparameters is fitted on a bootstrap
    sample (drawn with replacement, sized 40-60% of the rows) and evaluated
    on the rows that were not drawn. Per round, the decision threshold is
    tuned on those OOB rows to maximise accuracy. Each row's OOB error ratio
    is then compared with a per-class threshold and rows above it are
    labelled "bad" (``meta_labels == 0``). Every value in
    ``fractions_to_try`` is tried as threshold multiplier; the one giving the
    lowest mean OOB error ratio among the surviving rows is kept.

    Features are robust-scaled only for the internal ensembles. The returned
    feature frames hold the ORIGINAL values of ``X``: a scaler fitted later
    by the caller must learn the statistics of the raw inputs, otherwise a
    pipeline exported with that scaler is trained on scaled data but fed raw
    data at inference time.

    Sampling uses the unseeded ``random`` module and ``random_state=None``,
    so results differ between runs.

    Args:
        X: 2-D array-like of features.
        y: Binary target (0/1) aligned with the rows of ``X``.
        models_number: Number of bootstrap rounds.
        fractions_to_try: Candidate multipliers for the error threshold;
            ``None`` or an empty sequence selects ``[0.5, 0.6, 0.7, 0.8]``.

    Returns:
        ``((X_main, y_main), (X_meta, y_meta))``: ``X_main``/``y_main`` are
        the rows kept as good samples with their target; ``X_meta`` is every
        row and ``y_meta`` is 1 for good rows and 0 for bad ones.
    """
    if fractions_to_try is None or len(fractions_to_try) == 0:
        # Default candidates (also used for an empty list, which would
        # otherwise leave no fraction to select)
        fractions_to_try = [0.5, 0.6, 0.7, 0.8]

    def sample_random_hparams_catboost():
        # Random hyperparameters for CatBoostClassifier
        return {
            'iterations': random.randint(100, 500),
            'max_depth': random.randint(3, 10),
            'learning_rate': random.uniform(0.1, 0.5),
            'l2_leaf_reg': random.uniform(0.0, 1.0),
            'min_data_in_leaf': random.randint(1, 10)
        }

    def sample_random_hparams_xgb():
        # Random hyperparameters for XGBClassifier
        return {
            'n_estimators': random.randint(50, 500),
            'max_depth': random.randint(3, 10),
            'eta': random.uniform(0.1, 0.5),
            'gamma': random.uniform(0.0, 0.5),
            'subsample': random.uniform(0.5, 1.0),
            'colsample_bytree': random.uniform(0.5, 1.0)
        }

    def sample_random_hparams_lgbm():
        # Random hyperparameters for LGBMClassifier
        return {
            'n_estimators': random.randint(50, 500),
            'max_depth': random.randint(3, 10),
            'learning_rate': random.uniform(0.1, 0.5),
            'min_child_samples': random.randint(3, 10)
        }

    def safe_threshold(series, fraction):
        # Threshold = fraction * (75th percentile of the non-zero ratios);
        # falls back to the median with fewer than 5 non-zero values and to
        # 0 when there are none.
        non_zero = series[series > 0]
        if len(non_zero) >= 5:
            return np.percentile(non_zero, 75) * fraction
        elif len(non_zero) > 0:
            return np.median(non_zero) * fraction
        else:
            return 0

    # df keeps the raw feature values (these are what gets returned)
    df = pd.DataFrame(X)
    df['target'] = y

    # df_model is a robust-scaled copy used only to fit the OOB ensembles
    raw_features = df.drop(columns='target')
    scaled_values = RobustScaler().fit_transform(raw_features)
    df_model = pd.DataFrame(scaled_values, index=df.index, columns=raw_features.columns)
    df_model['target'] = df['target']

    # Counters: how often each row was out-of-bag, and how often it was
    # misclassified while out-of-bag (split by its true class)
    oob_counts = pd.Series(0, index=df.index)
    error_counts_0 = pd.Series(0, index=df.index)
    error_counts_1 = pd.Series(0, index=df.index)

    # ===============================
    # Step (1): accumulate the error counts over "models_number" rounds
    # ===============================
    for _ in range(models_number):
        # Bootstrap fraction for this ensemble
        frac_bootstrap = random.uniform(0.4, 0.6)
        train_sample = df_model.sample(frac=frac_bootstrap, replace=True, random_state=None)
        val_sample = df_model.loc[~df_model.index.isin(train_sample.index)]

        # Skip before paying for a fit: nothing to evaluate, or a bootstrap
        # draw without both classes to learn from
        if len(val_sample) == 0 or train_sample['target'].nunique() < 2:
            continue

        ensemble = VotingClassifier(
            estimators=[
                ('catboost', CatBoostClassifier(
                    task_type="CPU", verbose=False, **sample_random_hparams_catboost())),
                ('xgboost', XGBClassifier(verbosity=0, **sample_random_hparams_xgb())),
                ('lightgbm', LGBMClassifier(verbosity=-1, **sample_random_hparams_lgbm()))
            ],
            voting='soft',
            flatten_transform=False,
            n_jobs=1
        )
        ensemble.fit(train_sample.drop(columns='target'), train_sample['target'])

        pred_proba = ensemble.predict_proba(val_sample.drop(columns='target'))[:, 1]
        y_val = val_sample['target'].values

        # Tune the decision threshold on the OOB rows to maximise local
        # accuracy; ties keep the lowest candidate (argmax returns the first)
        if len(np.unique(y_val)) > 1:
            candidates = np.linspace(0, 1, 21)
            accuracies = [
                ((pred_proba >= thr).astype(int) == y_val).mean()
                for thr in candidates
            ]
            best_thr = candidates[int(np.argmax(accuracies))]
        else:
            best_thr = 0.5
        pred_labels = (pred_proba >= best_thr).astype(int)

        # Record OOB appearances and misclassifications per true class
        wrong = pred_labels != y_val
        val_index = val_sample.index
        oob_counts.loc[val_index] += 1
        error_counts_0.loc[val_index[wrong & (y_val == 0)]] += 1
        error_counts_1.loc[val_index[wrong & (y_val == 1)]] += 1

    # ===============================
    # Step (2): error ratio of every sample
    # ===============================
    # Rows that were never out-of-bag get a divisor of 1 (their ratio is 0)
    safe_counts = oob_counts.replace(0, 1)
    to_mark_0 = (error_counts_0 / safe_counts).fillna(0)
    to_mark_1 = (error_counts_1 / safe_counts).fillna(0)

    def bad_sample_index(fraction):
        # Index of the rows whose error ratio exceeds their class threshold
        threshold_0 = safe_threshold(to_mark_0, fraction)
        threshold_1 = safe_threshold(to_mark_1, fraction)
        marked_0 = to_mark_0.index[to_mark_0 > threshold_0]
        marked_1 = to_mark_1.index[to_mark_1 > threshold_1]
        return marked_0.union(marked_1)

    # Error ratio of each row with respect to its own class
    own_error = np.where(df['target'].values == 0, to_mark_0.values, to_mark_1.values)

    # ===============================
    # Step (3): try every 'bad_samples_fraction' and keep the best one
    # ===============================
    best_fraction = None
    best_score = np.inf  # score = mean OOB error of the good rows; lower is better

    for frac in fractions_to_try:
        good_mask = ~df.index.isin(bad_sample_index(frac))
        # With no surviving rows the score defaults to the worst ratio, 1.0
        mean_error_good = own_error[good_mask].mean() if good_mask.any() else 1.0
        if mean_error_good < best_score:
            best_score = mean_error_good
            best_fraction = frac

    # ===============================
    # Step (4): final filtering with the best fraction
    # ===============================
    all_bad = bad_sample_index(best_fraction)

    df['meta_labels'] = 1
    df.loc[all_bad, 'meta_labels'] = 0

    is_good = df['meta_labels'] == 1
    X_main = df.loc[is_good].drop(columns=['target', 'meta_labels'])
    y_main = df.loc[is_good, 'target']

    X_meta = df.drop(columns=['target', 'meta_labels'])
    y_meta = df['meta_labels']

    print(f"Malas muestras filtradas: {len(df) - len(X_main)} (con fraction={best_fraction})")
    return (X_main, y_main), (X_meta, y_meta)

# File processing (modified to return 4 components: X_main, y_main, X_meta, y_meta)
def process_files(file_paths, models_number=100, fractions_to_try=(0.8,)):
    """Load each CSV and build its main and meta training sets.

    Args:
        file_paths: Iterable of CSV paths. The direction of each file is
            taken from its path via extract_action and is None when the
            path contains neither 'long' nor 'short'.
        models_number: Number of bootstrap rounds forwarded to
            meta_label_data_multi_bootstrap_oob (previously hard-coded).
        fractions_to_try: Candidate bad-sample fractions forwarded to
            meta_label_data_multi_bootstrap_oob (previously hard-coded).

    Returns:
        A list with one dict per file: 'direction', 'main' as
        (X_main, y_main) and 'meta' as (X_meta, y_meta).
    """
    processed_data = []
    for file_path in file_paths:
        model_type = extract_action(file_path)  # 'long' or 'short'
        df = pd.read_csv(file_path)
        # (1) Build the base dataset
        X_train, y_train = create_training_dataset(df, model_type)
        # (2) Apply meta-labeling (returns two datasets)
        (X_main, y_main), (X_meta, y_meta) = meta_label_data_multi_bootstrap_oob(
            X_train, y_train,
            models_number=models_number,
            # A fresh list per call, as the callee originally received
            fractions_to_try=list(fractions_to_try)
        )
        processed_data.append({
            'direction': model_type,
            'main': (X_main, y_main),
            'meta': (X_meta, y_meta)
        })
    return processed_data

# Train model functions
def train_classifier(X_train, y_train, model_type):
    """Tune and fit the soft-voting ensemble with GeneticAlgorithmCV.

    Args:
        X_train: Training features.
        y_train: Training labels.
        model_type: Label forwarded to GeneticAlgorithmCV and used in the
            log messages.

    Returns:
        The ``best_estimator_`` found by the search.

    Raises:
        Exception: any error raised while building or fitting the search is
            printed and re-raised unchanged.
    """
    def int_range(low, high):
        # Integer hyperparameter specification
        return {'type': 'int', 'low': low, 'high': high}

    def float_range(low, high):
        # Float hyperparameter specification
        return {'type': 'float', 'low': low, 'high': high}

    # Single-step pipeline wrapping the three gradient-boosting models
    voting = VotingClassifier(
        estimators=[
            ('catboost', CatBoostClassifier(task_type="CPU", verbose=False)),
            ('xgboost', XGBClassifier(verbosity=0)),
            ('lightgbm', LGBMClassifier(verbosity=-1)),
        ],
        voting='soft',
        flatten_transform=False,
        n_jobs=1,
    )
    pipeline = Pipeline([('ensemble', voting)])

    # Hyperparameter space per estimator; flattened below into
    # 'ensemble__<estimator>__<parameter>' keys, in this order
    search_space = {
        'catboost': {
            'iterations': int_range(100, 500),
            'max_depth': int_range(3, 10),
            'learning_rate': float_range(0.1, 0.5),
            'l2_leaf_reg': float_range(0.0, 1.0),
            'min_data_in_leaf': int_range(3, 10),
        },
        'xgboost': {
            'n_estimators': int_range(50, 500),
            'max_depth': int_range(3, 10),
            'eta': float_range(0.1, 0.5),
            'gamma': float_range(0.0, 0.5),
            'subsample': float_range(0.5, 1.0),
            'colsample_bytree': float_range(0.5, 1.0),
        },
        'lightgbm': {
            'n_estimators': int_range(50, 500),
            'max_depth': int_range(3, 10),
            'learning_rate': float_range(0.1, 0.5),
            'min_child_samples': int_range(3, 10),
        },
    }
    param_grid = {
        f"ensemble__{estimator}__{parameter}": spec
        for estimator, parameters in search_space.items()
        for parameter, spec in parameters.items()
    }

    # Genetic-search settings (no estimator_map is supplied)
    ga_settings = dict(
        model_type=model_type,
        pipeline=pipeline,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=3),
        pop_size=25,
        generations=5,
        early_stopping_rounds=3,
        crossover_initial=0.9,
        crossover_end=0.1,
        mutation_initial=0.1,
        mutation_end=0.9,
        elitism=True,
        elite_size=5,
        tournament_size=3,
        n_random=5,
        n_jobs=1,
        verbose=True,
    )
    try:
        # Run the genetic-algorithm search
        ga_search = GeneticAlgorithmCV(**ga_settings)
        ga_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error en traing model {model_type}: {e}")
        raise

    # Report the best score and parameters
    banner = "####################################################################"
    print(banner)
    print(f"Mejor puntuación de validación para {model_type}: {ga_search.best_score_}")
    print(f"Mejores parámetros encontrados para {model_type}: {ga_search.best_params_full_}")
    print(banner)
    return ga_search.best_estimator_

# ONNX conversion for a Pipeline containing CatBoost
def skl2onnx_parser_castboost_classifier(scope, model, inputs, custom_parsers=None):
    """skl2onnx parser that declares the outputs of a CatBoost classifier.

    Declares a local operator for ``model`` with a "label" (int64) and a
    "probabilities" (float) output, then delegates to ``_apply_zipmap`` with
    the model's 'zipmap' option (default True).

    Args:
        scope: skl2onnx scope used to declare the operator and variables.
        model: The classifier being converted.
        inputs: Input variables of the operator.
        custom_parsers: Unused; part of the parser signature.

    Returns:
        Whatever ``_apply_zipmap`` returns for the declared outputs.
    """
    zipmap_option = scope.get_options(model, dict(zipmap=True))["zipmap"]

    operator_alias = _get_sklearn_operator_name(type(model))
    catboost_operator = scope.declare_local_operator(operator_alias, model)
    catboost_operator.inputs = inputs

    # Declare both outputs first, then attach them in label/probability order
    declared_outputs = [
        scope.declare_local_variable("label", Int64TensorType()),
        scope.declare_local_variable("probabilities", FloatTensorType()),
    ]
    for variable in declared_outputs:
        catboost_operator.outputs.append(variable)

    return _apply_zipmap(
        zipmap_option, scope, model, inputs[0].type, catboost_operator.outputs
    )

def skl2onnx_convert_catboost(scope, operator, container):
    """skl2onnx converter that re-emits CatBoost's own ONNX node.

    The wrapped CatBoost model is converted with ``convert_to_onnx_object``
    and the first node of the resulting graph is added to ``container``,
    wired to this operator's first input and first two outputs.

    Args:
        scope: Unused; part of the converter signature.
        operator: Operator whose ``raw_operator`` is the CatBoost model.
        container: Container receiving the node.
    """
    catboost_onnx = convert_to_onnx_object(operator.raw_operator)
    # Only the first graph node is copied
    # (NOTE(review): assumes the exported graph is a single node -- confirm)
    source_node = catboost_onnx.graph.node[0]

    # Copy every attribute of the source node by name
    node_attributes = {}
    for attribute in source_node.attribute:
        node_attributes[attribute.name] = get_attribute_value(attribute)

    input_names = [operator.inputs[0].full_name]
    output_names = [operator.outputs[0].full_name, operator.outputs[1].full_name]

    container.add_node(
        source_node.op_type,
        input_names,
        output_names,
        op_domain=source_node.domain,
        **node_attributes
    )

def save_onnx_model(mql5_files_folder, model, X, model_type):
    """Convert a fitted pipeline to ONNX and write it to disk.

    Registers the CatBoost, XGBoost and LightGBM converters with skl2onnx,
    converts ``model`` and writes ``model_<model_type>.onnx`` into
    ``mql5_files_folder``.

    Args:
        mql5_files_folder: Destination directory.
        model: Pipeline with a step named 'ensemble'.
        X: Training features; only ``X.shape[1]`` is used, to size the input.
        model_type: Suffix of the output file name and log label.

    Raises:
        Exception: any conversion or write error is printed and re-raised.
    """
    # (estimator class, alias, converter, extra registration kwargs)
    registrations = (
        (CatBoostClassifier, "CatBoostClassifier", skl2onnx_convert_catboost,
         {'parser': skl2onnx_parser_castboost_classifier}),
        (XGBClassifier, 'XGBClassifier', convert_xgboost, {}),
        (LGBMClassifier, 'LGBMClassifier', convert_lightgbm, {}),
    )
    for estimator_cls, alias, converter, extra_kwargs in registrations:
        update_registered_converter(
            estimator_cls,
            alias,
            calculate_linear_classifier_output_shapes,
            converter,
            # A fresh options dict per registration
            options={'nocl': [True, False], 'zipmap': [True, False]},
            **extra_kwargs
        )

    try:
        # Input: float tensor with a free batch dimension
        n_features = X.shape[1]
        input_spec = [('input', FloatTensorType([None, n_features]))]

        # Convert the whole pipeline; zipmap is enabled for the ensemble step
        ensemble_step = model.named_steps['ensemble']
        onnx_model = convert_sklearn(
            model,
            initial_types=input_spec,
            target_opset={"": 18, "ai.onnx.ml": 2},
            options={id(ensemble_step): {'zipmap': True}}
        )

        # Serialize to the destination folder
        output_path = os.path.join(mql5_files_folder, f"model_{model_type}.onnx")
        with open(output_path, "wb") as onnx_file:
            onnx_file.write(onnx_model.SerializeToString())

        print(f"Modelo {model_type} ONNX exportado correctamente")

    except Exception as e:
        print(f"Error en exportar el modelo {model_type}: {e}")
        raise

def train_models_parallel(data_list, mql5_files_folder):
    """Train the main and meta models of every dataset and export them.

    For each entry, a RobustScaler is fitted on the 'main' and on the 'meta'
    features and one train_classifier job per set is submitted to a pool of
    4 threads. As jobs finish, the scaler and the tuned 'ensemble' step are
    combined into a pipeline and exported with save_onnx_model.

    Args:
        data_list: Dicts with 'direction', 'main' (X, y) and 'meta' (X, y).
        mql5_files_folder: Directory where the ONNX files are written.

    Raises:
        Exception: the first training or export error is printed and
            re-raised.
    """
    # Per-job bookkeeping, keyed by future
    job_info = {}
    pending_jobs = []

    with ThreadPoolExecutor(max_workers=4) as pool:
        # Step 1: submit every training job (main first, then meta)
        for entry in data_list:
            direction = entry['direction']
            for role in ('main', 'meta'):
                features, labels = entry[role]
                role_scaler = RobustScaler()
                scaled_features = role_scaler.fit_transform(features)
                if role == 'meta':
                    # Meta labels are cast to int before training
                    labels = labels.astype(int)
                job = pool.submit(
                    train_classifier,
                    scaled_features,
                    labels,
                    f"{direction}_{role}"
                )
                job_info[job] = {
                    'type': role,
                    'direction': direction,
                    'X_train': features,
                    'scaler': role_scaler
                }
                pending_jobs.append(job)

        # Step 2: export each model as soon as its training finishes
        for job in as_completed(pending_jobs):
            info = job_info[job]
            try:
                tuned_pipeline = job.result()
                model_type = f"{info['direction']}_{info['type']}"
                # Rebuild the pipeline with the scaler in front of the ensemble
                export_pipeline = Pipeline([
                    ("scaler", info['scaler']),
                    ("ensemble", tuned_pipeline.named_steps['ensemble'])
                ])
                save_onnx_model(mql5_files_folder, export_pipeline, info['X_train'], model_type)

            except Exception as e:
                failed_model = f"{info['direction']}_{info['type']}"
                print(f"Error crítico en {failed_model}: {str(e)}")
                raise

## Train

In [None]:
def main():
    """Find the training CSVs, build the datasets and train/export the models.

    Long and short datasets are processed when present; the generic pattern
    is only used when neither a long nor a short file exists.
    """
    # Paths
    common_file_folder = r"/mnt/c/Users/Administrador/AppData/Roaming/MetaQuotes/Terminal/Common/Files/"
    mql5_files_folder = r'/mnt/c/Users/Administrador/AppData/Roaming/MetaQuotes/Terminal/6C3C6A11D1C3791DD4DBF45421BF8028/MQL5/Files/'

    def find_datasets(file_name_pattern):
        # Files in the common folder matching the given name pattern
        return glob.glob(os.path.join(common_file_folder, file_name_pattern))

    long_paths = find_datasets('training_dataset_long_*.csv')
    short_paths = find_datasets('training_dataset_short_*.csv')
    generic_paths = find_datasets('training_dataset_*.csv')

    # Direction-specific files take precedence over the generic pattern
    if long_paths or short_paths:
        path_groups = [long_paths, short_paths]
    else:
        path_groups = [generic_paths]

    # Process every non-empty group of datasets
    full_data = []
    for paths in path_groups:
        if paths:
            full_data.extend(process_files(paths))

    if not full_data:
        print("No se encontraron datasets válidos para entrenar")
        return

    # Parallel training and export
    train_models_parallel(full_data, mql5_files_folder)


if __name__ == "__main__":
    main()

##########
