## Imports

In [None]:
import re
import os
import glob
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import KernelPCA, TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
import xgboost as xgb
from genetic_algorithm import GeneticAlgorithmCV
from concurrent.futures import ThreadPoolExecutor, wait
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
import warnings
warnings.filterwarnings('ignore')

## Functions

In [None]:
# Helper function
def process_files(file_paths):
    """Load every CSV in *file_paths* and turn it into a training set.

    For each path the trade action is read from the path via
    ``extract_action``, the CSV is loaded with pandas, and the frame is
    converted by ``create_training_dataset``.

    Args:
        file_paths: Iterable of CSV file paths.

    Returns:
        A list with one ``(X_train, y_train, model_type)`` tuple per path, in
        the same order as *file_paths*.
    """
    def _load(path):
        # The action is resolved before the CSV is read, as the dataset
        # builder needs it for its log output.
        action = extract_action(path)
        features, labels = create_training_dataset(pd.read_csv(path), action)
        return features, labels, action

    return [_load(path) for path in file_paths]

# Helper function
def extract_action(filepath):
    """Return the trade action ('buy' or 'sell') encoded in a file name.

    Only the final path component is inspected, so a directory whose name
    happens to contain "buy" or "sell" (for example ``/home/russell/``) can
    no longer be mistaken for the action of the dataset.

    Args:
        filepath: Path of the dataset file (``str`` or ``os.PathLike``).
            Forward slashes and backslashes are both treated as separators.

    Returns:
        ``'buy'`` or ``'sell'`` (whichever appears first in the file name),
        or ``None`` when the file name contains neither.
    """
    # Split on both separator styles: the paths may originate from Windows
    # even when this code runs under a POSIX environment.
    file_name = re.split(r'[\\/]', os.fspath(filepath))[-1]
    match = re.search(r'(buy|sell)', file_name)
    return match.group(0) if match else None

# Build training datasets
def create_training_dataset(df, action_type):
    """Convert a raw trades frame into a feature matrix and binary labels.

    The frame is de-duplicated, rows whose ``profit`` equals 0 are discarded,
    a ``target`` column is added (1 when ``profit > 0``, otherwise 0) and rows
    with missing values are dropped. Progress is reported with ``print``.

    Args:
        df: DataFrame with a ``profit`` column. The feature selection is
            positional: every column except the last one of *df* is used as a
            feature (assumes ``profit`` is the last column - TODO confirm
            against the CSV exporter).
        action_type: ``'buy'`` selects the "compras" wording in the log
            header; any other value selects "ventas".

    Returns:
        Tuple ``(X_train, y_train)`` of ``float32`` NumPy arrays: the feature
        matrix and the 0/1 label vector.
    """
    dataset_label = 'compras' if action_type == 'buy' else 'ventas'
    print(f"=== Procesando dataset de {dataset_label} ===")

    unique_rows = df.drop_duplicates()
    print(f"Total de operaciones después de eliminar duplicados: {len(unique_rows)}")

    # Keep only trades that actually closed with a non-zero result.
    trades = unique_rows.loc[unique_rows['profit'] != 0].copy()
    print(f"Operaciones con profit != 0: {len(trades)}")

    # Label: 1 for a winning trade, 0 for a losing one.
    trades['target'] = (trades['profit'] > 0).astype(int)
    winners = int(trades['target'].sum())
    losers = len(trades) - winners
    print(f"Operaciones ganadoras: {winners}")
    print(f"Operaciones perdedoras: {losers}")

    # Drop rows with missing values, reporting how many cells were affected.
    missing_cells = trades.isna()
    if missing_cells.values.any():
        print(f"Valores faltantes encontrados: {missing_cells.sum().sum()}")
        trades = trades.dropna()
        print(f"Total de operaciones después de eliminar missings: {len(trades)}")

    # 'target' was appended last, so slicing off two columns removes it and
    # the final column of the input frame.
    feature_columns = list(trades.columns[:-2])
    training = trades[feature_columns + ['target']]

    X_train = training.drop(columns='target').values.astype('float32')
    y_train = training['target'].values.astype('float32')
    print(f"Dataset final preparado: {X_train.shape[0]} operaciones, {X_train.shape[1]} características")
    return X_train, y_train

# Model training
def train_classifier(X_train, y_train, model_type):
    """Tune a scaler -> reducer -> XGBoost pipeline with GeneticAlgorithmCV.

    A three-step pipeline is built whose scaler and reducer start as
    'passthrough'; the genetic search picks the concrete scaler/reducer from
    the estimator map and tunes the reducer size and the XGBoost
    hyperparameters, scoring candidates with 3-fold stratified
    cross-validation.

    Args:
        X_train: 2-D feature array; its second dimension bounds the
            ``reducer__n_components`` search range.
        y_train: Label array passed to ``fit``.
        model_type: Identifier forwarded to GeneticAlgorithmCV and used in
            the log messages.

    Returns:
        The ``best_estimator_`` exposed by the fitted search object.

    Raises:
        Exception: Any error raised while building or fitting the search is
            printed and then re-raised unchanged.
    """
    # Small builders for the search-space entries; the key order matches the
    # hand-written dictionaries they replace.
    def categorical(values):
        return {'type': 'categorical', 'values': values}

    def int_range(low, high):
        return {'type': 'int', 'low': low, 'high': high}

    def float_range(low, high):
        return {'type': 'float', 'low': low, 'high': high}

    n_features = X_train.shape[1]

    # Concrete estimators that the categorical 'scaler' / 'reducer' choices
    # map to.
    estimator_map = {
        'scaler': {
            'standard': StandardScaler(),
            'robust': RobustScaler(),
            'none': 'passthrough',
        },
        'reducer': {
            'kernel_pca_rbf': KernelPCA(kernel='rbf'),
            'kernel_pca_linear': KernelPCA(kernel='linear'),
            'truncated_svd': TruncatedSVD(),
            'none': 'passthrough',
        },
    }

    # Hyperparameter search space
    param_grid = {
        'scaler': categorical(['standard', 'robust', 'none']),
        'reducer': categorical(
            ['kernel_pca_rbf', 'kernel_pca_linear', 'truncated_svd', 'none']),
        'reducer__n_components': int_range(2, n_features - 1),
        'xgbclassifier__n_estimators': int_range(50, 500),
        'xgbclassifier__max_depth': int_range(3, 10),
        'xgbclassifier__eta': float_range(0.1, 0.5),
        'xgbclassifier__gamma': float_range(0.0, 0.5),
        'xgbclassifier__subsample': float_range(0.5, 1.0),
        'xgbclassifier__colsample_bytree': float_range(0.5, 1.0),
        'xgbclassifier__colsample_bylevel': float_range(0.5, 1.0),
        'xgbclassifier__colsample_bynode': float_range(0.5, 1.0),
        'xgbclassifier__alpha': float_range(0.0, 1.0),
        'xgbclassifier__lambda': float_range(0.0, 1.0),
        'xgbclassifier__min_child_weight': int_range(1, 10),
        'xgbclassifier__scale_pos_weight': int_range(1, 10),
    }

    # Pipeline skeleton: the first two steps are placeholders for the search.
    classifier = xgb.XGBClassifier(device="cuda", verbosity=0)
    pipeline = Pipeline([
        ('scaler', 'passthrough'),
        ('reducer', 'passthrough'),
        ('xgbclassifier', classifier),
    ])

    # Settings of the genetic search itself
    ga_settings = dict(
        model_type=model_type,
        pipeline=pipeline,
        param_grid=param_grid,
        estimator_map=estimator_map,
        cv=StratifiedKFold(n_splits=3),
        pop_size=45,
        generations=15,
        early_stopping_rounds=3,
        crossover_initial=0.9,
        crossover_end=0.1,
        mutation_initial=0.1,
        mutation_end=0.9,
        elitism=True,
        elite_size=5,
        tournament_size=3,
        n_random=5,
        n_jobs=2,
        verbose=True,
    )

    try:
        ga_search = GeneticAlgorithmCV(**ga_settings)
        ga_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error en traing model {model_type}: {e}")
        raise
    else:
        # Report the winning configuration and hand back the fitted pipeline.
        print("####################################################################")
        print(f"Mejor puntuación de validación para {model_type}: {ga_search.best_score_}")
        print(f"Mejores parámetros encontrados para {model_type}: {ga_search.best_params_full_}")
        print("####################################################################")
        return ga_search.best_estimator_

# Export models to ONNX
def save_onnx_model(mql5_files_folder, model, X, model_type):
    """Convert *model* to ONNX and write it as ``model_<model_type>.onnx``.

    The XGBoost converter is registered with skl2onnx before the conversion
    so that the ``XGBClassifier`` step of the pipeline can be translated.

    Args:
        mql5_files_folder: Directory in which the ``.onnx`` file is created.
        model: Fitted estimator/pipeline handed to ``convert_sklearn``.
        X: Array whose second dimension gives the width of the ONNX input
            tensor (the batch dimension is left dynamic).
        model_type: Identifier used in the ONNX graph name and the file name.

    Raises:
        Exception: Any conversion or write error is printed and re-raised.
    """
    try:
        update_registered_converter(
            xgb.XGBClassifier,
            "XGBClassifier",
            calculate_linear_classifier_output_shapes,
            convert_xgboost,
            options={'nocl': [True, False], 'zipmap': [True, False, 'columns']}
        )
        # One float input named 'input' with a free batch size.
        input_signature = [('input', FloatTensorType([None, X.shape[1]]))]
        model_onnx = convert_sklearn(
            model,
            name=f"pipeline_{model_type}_xgboost",
            initial_types=input_signature,
            target_opset={'': 12, 'ai.onnx.ml': 2},
        )
        output_path = os.path.join(mql5_files_folder, f"model_{model_type}.onnx")
        with open(output_path, 'wb') as onnx_file:
            onnx_file.write(model_onnx.SerializeToString())
    except Exception as e:
        print(f"Error en exportar el modelo {model_type}: {e}")
        raise
    else:
        print(f"Modelo {model_type} ONNX exportado correctamente")

## Train

In [None]:
# Paths: folder holding the exported training CSVs and the MQL5 folder that
# receives the ONNX models
common_file_folder = r"/mnt/c/Users/Administrador/AppData/Roaming/MetaQuotes/Terminal/Common/Files/"
mql5_files_folder = r'/mnt/c/Users/Administrador/AppData/Roaming/MetaQuotes/Terminal/6C3C6A11D1C3791DD4DBF45421BF8028/MQL5/Files/'

# Glob patterns for buy datasets, sell datasets and the generic fallback
buy_file_pattern = os.path.join(common_file_folder, 'training_dataset_buy_*.csv')
sell_file_pattern = os.path.join(common_file_folder, 'training_dataset_sell_*.csv')
generic_file_pattern = os.path.join(common_file_folder, 'training_dataset_*.csv')

# Locate the files
df_buy_file_paths = glob.glob(buy_file_pattern)
df_sell_file_paths = glob.glob(sell_file_pattern)
df_generic_file_paths = glob.glob(generic_file_pattern)

# Collect (X_train, y_train, model_type) tuples. Action-specific files take
# precedence; generic files are only used when neither kind was found.
data_list = []
if df_buy_file_paths or df_sell_file_paths:
    if df_buy_file_paths:
        data_list += process_files(df_buy_file_paths)
    if df_sell_file_paths:
        data_list += process_files(df_sell_file_paths)
elif df_generic_file_paths:
    data_list += process_files(df_generic_file_paths)

# Decide how to train depending on how many datasets were collected
if not data_list:
    print("No se encontraron archivos de datos para entrenar.")
elif len(data_list) > 1:
    # Several datasets: train them concurrently, one worker per dataset
    with ThreadPoolExecutor(max_workers=len(data_list)) as executor:
        print("Entrenando modelos en paralelo...")
        futures = []
        for X_train, y_train, model_type in data_list:
            future = executor.submit(train_classifier, X_train, y_train, model_type)
            futures.append((future, X_train, model_type))
        # Block until every training task has finished
        wait([pending for pending, _, _ in futures])
        # Collect each result and export it to ONNX
        for future, X_train, model_type in futures:
            model = future.result()
            print(f"¡Modelo para {model_type} entrenado!")
            save_onnx_model(mql5_files_folder, model, X_train, model_type)
else:
    # Exactly one dataset: train and export it directly
    X_train, y_train, model_type = data_list[0]
    print(f"Entrenando el modelo clasificador para {model_type}...")
    model = train_classifier(X_train, y_train, model_type)
    print("¡Modelo entrenado!")
    save_onnx_model(mql5_files_folder, model, X_train, model_type)