## Importar librerías

In [None]:
# Quietly uninstall a set of preinstalled packages without prompting
# (presumably to avoid dependency conflicts with the RAPIDS/ONNX stack — TODO confirm).
!pip -q uninstall -y tensorflow-cloud tensorflow tensorflow-transform tensorflow-serving-api witwidget apache-beam google-cloud-aiplatform google-cloud-automl kfp
# Install the ONNX conversion packages that the import cell below relies on.
!pip -q install skl2onnx onnxmltools
# Update cuML through conda, auto-confirming the prompt.
!conda update -y cuml

In [None]:
import os
import glob
import cupy as cp
import cudf
from cuml.metrics import accuracy_score
from cuml.model_selection import StratifiedKFold
from cuml.preprocessing import RobustScaler
from cuml.decomposition import PCA
import xgboost as xgb
from cuml.pipeline import Pipeline
from concurrent.futures import ThreadPoolExecutor, wait
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
import warnings
warnings.filterwarnings('ignore')

## Cargar y preparar datos

In [None]:
def create_training_dataset(df, trade_type, random_state=None):
    """Build a class-balanced, shuffled training frame for one trade direction.

    Rows are de-duplicated, filtered to ``type == trade_type`` with a non-zero
    ``profit``, and split into winners (profit > 0) and losers (profit < 0).
    The larger class is cut down to the size of the smaller one by keeping its
    most extreme rows (largest wins or largest losses).

    Parameters
    ----------
    df : DataFrame
        Must contain ``type`` and ``profit`` columns. The feature columns are
        taken as every column except the last two, so ``type`` and ``profit``
        are assumed to be the last two columns — TODO confirm against the CSV.
    trade_type : int
        Value matched against the ``type`` column; ``1`` is reported as "Buy",
        anything else as "Sell".
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be reproduced.
        The default ``None`` keeps the original non-deterministic shuffle.

    Returns
    -------
    DataFrame or bool
        The feature columns plus a binary ``target`` column (1 = winning
        trade), or ``False`` when one of the two classes is empty.
    """
    df = df.drop_duplicates()
    # Keep only trades of the requested direction that actually closed with a result
    df_trade = df[(df['type'] == trade_type) & (df['profit'] != 0)].copy()
    # Split into winning and losing trades
    df_winning = df_trade[df_trade['profit'] > 0]
    df_losing = df_trade[df_trade['profit'] < 0]
    n_winning = len(df_winning)
    n_losing = len(df_losing)
    print(f"Tipo de operación: {'Buy' if trade_type == 1 else 'Sell'}")
    print(f"Total Ganadoras: {n_winning}")
    print(f"Total Perdedoras: {n_losing}")
    # Both classes are needed to train a classifier
    if n_winning == 0 or n_losing == 0:
        print(f"No hay suficientes datos para {'compras' if trade_type == 1 else 'ventas'} para entrenar el modelo.")
        return False
    # Balance the classes by trimming the majority class to the minority size
    n_samples_per_class = min(n_winning, n_losing)
    if n_winning <= n_losing:
        # Keep every winner and only the losers with the largest losses
        selected_winning = df_winning
        selected_losing = df_losing.sort_values(by='profit', ascending=True).head(n_samples_per_class)
    else:
        # Keep every loser and only the winners with the largest profits
        selected_losing = df_losing
        selected_winning = df_winning.sort_values(by='profit', ascending=False).head(n_samples_per_class)
    print(f"Se seleccionarán {n_samples_per_class} muestras por clase.")
    df_training = cudf.concat([selected_winning, selected_losing], ignore_index=True)
    # Vectorized label: 1 for winners, 0 for losers (no per-row lambda needed)
    df_training['target'] = (df_training['profit'] > 0).astype('int64')
    # Features are every column of the input frame except the last two
    feature_columns = list(df.columns[:-2])
    df_training = df_training[feature_columns + ['target']]
    # Shuffle so the two classes are interleaved
    df_training = df_training.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Drop rows with missing values; this is a no-op when there are none.
    # NOTE(review): dropping after balancing can leave the classes slightly uneven.
    return df_training.dropna()

In [None]:
# Load, clean and prepare the datasets
def load_dataset(df):
    """Build the buy and sell training arrays from the raw trades frame.

    Calls ``create_training_dataset`` for ``trade_type=1`` (buy) and
    ``trade_type=-1`` (sell) and separates the ``target`` column from the
    features.

    Returns
    -------
    tuple
        ``(X_buy, y_buy, X_sell, y_sell)``, each obtained with ``to_cupy()``.

    Raises
    ------
    ValueError
        If either direction lacks winning or losing trades, i.e.
        ``create_training_dataset`` returned ``False``. Previously this
        surfaced as an ``AttributeError`` on a ``bool``.
    """
    prepared = []
    for trade_type, label in ((1, 'compras'), (-1, 'ventas')):
        df_side = create_training_dataset(df, trade_type=trade_type)
        if df_side is False:
            raise ValueError(
                f"No se pudo construir el dataset de {label}: "
                "faltan operaciones ganadoras o perdedoras."
            )
        prepared.append(df_side)
    df_buy, df_sell = prepared
    X_buy_train = df_buy.drop(columns='target')
    y_buy_train = df_buy['target']
    X_sell_train = df_sell.drop(columns='target')
    y_sell_train = df_sell['target']
    return X_buy_train.to_cupy(), y_buy_train.to_cupy(), X_sell_train.to_cupy(), y_sell_train.to_cupy()

In [None]:
# Locate the training CSV. Sorting makes the choice deterministic when several
# files match the pattern (glob returns them in arbitrary order).
file_folder = r"/kaggle/input/training-datasets/"
file_pattern = os.path.join(file_folder, 'training_dataset_*.csv')
df_file_path = sorted(glob.glob(file_pattern))
if not df_file_path:
    # Fail with an explicit message instead of an IndexError on df_file_path[0]
    raise FileNotFoundError(f"No se encontró ningún archivo que coincida con: {file_pattern}")
df = cudf.read_csv(df_file_path[0])
# Split into buy/sell feature matrices and target vectors
X_buy_train, y_buy_train, X_sell_train, y_sell_train = load_dataset(df)
# Number of feature columns; used as the upper bound for the PCA components
n_features = X_buy_train.shape[1]

## Algoritmo genético para encontrar los mejores hiperparámetros

In [None]:
class GeneticAlgorithmCV:
    """Genetic-algorithm hyper-parameter search scored by cross-validation.

    Every individual is a chromosome of genes in ``[0, 1]``, one gene per entry
    of ``param_grid`` (in dict order). ``decode_chromosome`` maps each gene
    linearly onto that parameter's ``[low, high]`` range.

    Parameters
    ----------
    estimator : object
        Must expose ``set_params``, ``fit`` and ``predict``. It is never fitted
        itself; a fresh copy is fitted for every individual and CV fold.
    param_grid : dict
        ``{name: {'type': 'int' | 'float', 'low': ..., 'high': ...}}``.
    cv : object, optional
        Splitter exposing ``split(X, y)``. ``fit`` falls back to
        ``StratifiedKFold(n_splits=5, shuffle=True)`` when this is ``None``.
    scoring : callable
        Called as ``scoring(y_true, y_pred)``; higher values are better.
    pop_size : int
        Number of individuals kept from one generation to the next.
    generations : int
        Maximum number of generations.
    early_stopping_rounds : int
        Stop after this many consecutive generations without a strict
        improvement of the best fitness.
    crossover_initial, crossover_end, mutation_initial, mutation_end : float
        At generation ``g`` the rate is
        ``initial * (end / initial) ** (g / generations)``. The initial values
        are used as divisors and therefore must be non-zero.
    elitism : bool
        Whether the best ``elite_size`` individuals are copied unchanged into
        the next generation.
    elite_size : int
        Number of elites carried over when ``elitism`` is true.
    tournament_size : int
        Number of contenders drawn (with replacement) per tournament.
    n_random : int
        Number of fresh random individuals injected each generation.
    verbose : bool
        Print per-generation progress.

    Attributes
    ----------
    best_params_ : dict or None
        Decoded parameters of the best individual seen by ``fit``.
    best_score_ : float or None
        Mean CV score of that individual.
    best_estimator_ : None
        Reserved; ``fit`` does not populate it.
    """

    def __init__(
        self,
        estimator,
        param_grid,
        cv=None,
        scoring=None,
        pop_size=20,
        generations=10,
        early_stopping_rounds=1,
        crossover_initial=0.1,
        crossover_end=0.9,
        mutation_initial=0.9,
        mutation_end=0.1,
        elitism=True,
        elite_size=3,
        tournament_size=5,
        n_random=5,
        verbose=False
    ):
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv = cv
        self.scoring = scoring
        self.pop_size = pop_size
        self.generations = generations
        self.early_stopping_rounds = early_stopping_rounds
        self.crossover_initial = crossover_initial
        self.crossover_end = crossover_end
        self.mutation_initial = mutation_initial
        self.mutation_end = mutation_end
        self.elitism = elitism
        self.elite_size = elite_size
        self.tournament_size = tournament_size
        self.n_random = n_random
        self.verbose = verbose
        self.best_params_ = None
        self.best_score_ = None
        self.best_estimator_ = None

    def decode_chromosome(self, chromosome):
        """Map a chromosome of genes in [0, 1] to a ``{param: value}`` dict.

        Values are returned as plain Python ``int``/``float`` so they can be
        passed to ``set_params`` and printed without device arrays leaking out.

        Raises
        ------
        ValueError
            If a ``param_grid`` entry declares a type other than 'int'/'float'.
        """
        param_values = {}
        for i, (key, param_info) in enumerate(self.param_grid.items()):
            low = param_info['low']
            high = param_info['high']
            # float() also brings a 0-d device value back to the host
            scaled = float(chromosome[i]) * (high - low) + low
            if param_info['type'] == 'int':
                param_values[key] = int(round(scaled))
            elif param_info['type'] == 'float':
                param_values[key] = scaled
            else:
                raise ValueError(f"Tipo de parámetro no soportado: {param_info['type']}")
        return param_values

    def initialize_population(self):
        """Return a ``(pop_size, n_params)`` array of uniform genes in [0, 1)."""
        chromosome_length = len(self.param_grid)
        return cp.random.uniform(low=0.0, high=1.0, size=(self.pop_size, chromosome_length))

    def evaluate_population(self, population, X_train, y_train):
        """Return the mean cross-validated score of every individual.

        For each chromosome the estimator is copied, configured with the
        decoded parameters, then fitted and scored on every split of
        ``self.cv``. The result is a 1-D array aligned with ``population``.
        """
        # Local import: the notebook never imported a `clone` helper, so the
        # pristine estimator is duplicated with the standard library instead.
        import copy

        fitnesses = []
        for chromosome in population:
            params = self.decode_chromosome(chromosome)
            scores = []
            for train_idx, val_idx in self.cv.split(X_train, y_train):
                X_tr, X_val = X_train[train_idx], X_train[val_idx]
                y_tr, y_val = y_train[train_idx], y_train[val_idx]

                # self.estimator is never fitted, so a deep copy is a fresh,
                # unfitted estimator with the same configuration
                model = copy.deepcopy(self.estimator)
                model.set_params(**params)
                model.fit(X_tr, y_tr)
                y_pred = model.predict(X_val)

                scores.append(float(self.scoring(y_val, y_pred)))
            fitnesses.append(float(cp.mean(cp.array(scores))))
        return cp.array(fitnesses)

    def select_parents(self, population, fitnesses):
        """Tournament selection: return ``len(population)`` parents.

        Each parent is the fittest of ``tournament_size`` individuals drawn
        uniformly with replacement. All tournaments are drawn in one call.
        """
        n_individuals = len(population)
        contenders = cp.random.randint(0, n_individuals, size=(n_individuals, self.tournament_size))
        winner_cols = cp.argmax(fitnesses[contenders], axis=1)
        winners = contenders[cp.arange(n_individuals), winner_cols]
        return population[winners]

    def crossover(self, parents, crossover_rate):
        """Single-point crossover over consecutive parent pairs.

        Returns exactly ``len(parents)`` children. With an odd number of
        parents the last one is paired with the first and the surplus child is
        dropped. Chromosomes with a single gene have no interior cut point and
        are passed through unchanged.
        """
        n_parents = len(parents)
        offspring = []
        for i in range(0, n_parents, 2):
            parent1 = parents[i].copy()
            parent2 = parents[(i + 1) % n_parents].copy()
            n_genes = len(parent1)
            if n_genes > 1 and float(cp.random.rand()) < crossover_rate:
                # int() turns the sampled 0-d array into a usable slice bound
                point = int(cp.random.randint(1, n_genes))
                offspring.append(cp.concatenate((parent1[:point], parent2[point:])))
                offspring.append(cp.concatenate((parent2[:point], parent1[point:])))
            else:
                offspring.append(parent1)
                offspring.append(parent2)
        return cp.vstack(offspring)[:n_parents]

    def mutate(self, offspring, mutation_rate, mutation_scale=0.1):
        """Perturb one random gene of each selected chromosome, in place.

        Each row is mutated with probability ``mutation_rate``: Gaussian noise
        with standard deviation ``mutation_scale`` is added to one gene and the
        result is clipped to [0, 1]. The same (modified) array is returned.
        """
        n_genes = offspring.shape[1]
        for row in range(offspring.shape[0]):
            if float(cp.random.rand()) < mutation_rate:
                gene_idx = int(cp.random.randint(0, n_genes))
                noise = float(cp.random.normal(0, mutation_scale))
                mutated = float(offspring[row, gene_idx]) + noise
                offspring[row, gene_idx] = min(1.0, max(0.0, mutated))
        return offspring

    def generate_random_individuals(self, n_random):
        """Return ``n_random`` fresh chromosomes as a float32 array.

        Values are sampled in parameter space (integers on the integer grid)
        and then normalized back to [0, 1].

        Raises
        ------
        ValueError
            If a ``param_grid`` entry declares a type other than 'int'/'float'.
        """
        chromosome_length = len(self.param_grid)
        random_chromosomes = cp.empty((n_random, chromosome_length), dtype=cp.float32)
        for i, grid in enumerate(self.param_grid.values()):
            low = grid['low']
            high = grid['high']
            if grid['type'] == 'int':
                sampled = cp.random.randint(low, high + 1, size=n_random)
            elif grid['type'] == 'float':
                sampled = cp.random.uniform(low, high, size=n_random)
            else:
                raise ValueError(f"Tipo de parámetro no soportado: {grid['type']}")
            span = high - low
            if span == 0:
                # Degenerate range: every gene decodes to `low`, avoid 0/0
                random_chromosomes[:, i] = 0.0
            else:
                random_chromosomes[:, i] = ((sampled - low) / span).astype(cp.float32)
        return random_chromosomes

    def fit(self, X_train, y_train):
        """Run the genetic search and store the best parameters and score.

        Returns ``self``. ``best_params_`` and ``best_score_`` describe the
        best individual seen in any evaluated generation.

        Raises
        ------
        RuntimeError
            If no generation produced a usable best individual (for example
            ``generations <= 0`` or every fitness was NaN).
        """
        if self.cv is None:
            self.cv = StratifiedKFold(n_splits=5, shuffle=True)
        population = self.initialize_population()
        fitnesses = None  # scores aligned with `population`, when already known
        best_overall_fitness = -float('inf')
        best_overall_chromosome = None
        no_improvement_generations = 0

        for generation in range(self.generations):
            if self.verbose:
                print(f"Generación [{generation+1}]")
            progress = generation / self.generations
            crossover_rate = self.crossover_initial * ((self.crossover_end / self.crossover_initial) ** progress)
            mutation_rate = self.mutation_initial * ((self.mutation_end / self.mutation_initial) ** progress)
            if self.verbose:
                print(f"Crossover Rate: {crossover_rate:.4f}, Mutation Rate: {mutation_rate:.4f}")
            if fitnesses is None:
                fitnesses = self.evaluate_population(population, X_train, y_train)
            current_best_fitness = float(cp.max(fitnesses))
            if self.verbose:
                print(f"Mejor fitness en generación [{generation+1}]: {current_best_fitness}")
            if current_best_fitness > best_overall_fitness:
                best_overall_fitness = current_best_fitness
                best_idx = int(cp.argmax(fitnesses))
                best_overall_chromosome = population[best_idx].copy()
                no_improvement_generations = 0
            else:
                no_improvement_generations += 1
            if no_improvement_generations >= self.early_stopping_rounds:
                if self.verbose:
                    print(f"No hubo mejora en el fitness por {self.early_stopping_rounds} generaciones consecutivas. Deteniendo el algoritmo.")
                    print(f"El mejor fitness: {best_overall_fitness}")
                break
            if generation == self.generations - 1:
                # Nothing reads the next population after the last generation,
                # so skip breeding (and its costly evaluation)
                break
            if self.elitism:
                sorted_indices = cp.argsort(fitnesses)[::-1]
                elites = population[sorted_indices[:self.elite_size]]
            else:
                elites = None
            # Select parents
            parents = self.select_parents(population, fitnesses)
            # Produce offspring through crossover
            offspring = self.crossover(parents, crossover_rate=crossover_rate)
            # Mutate the offspring
            offspring = self.mutate(offspring, mutation_rate=mutation_rate)
            # Inject fresh random individuals to keep diversity
            random_individuals = self.generate_random_individuals(self.n_random)
            offspring = cp.vstack((offspring, random_individuals))
            if elites is not None:
                population = cp.vstack((elites, offspring))
            else:
                population = offspring
            if len(population) > self.pop_size:
                # Trim back to pop_size, keeping the fittest. Their scores are
                # carried into the next generation rather than recomputed.
                fitnesses = self.evaluate_population(population, X_train, y_train)
                sorted_indices = cp.argsort(fitnesses)[::-1][:self.pop_size]
                population = population[sorted_indices]
                fitnesses = fitnesses[sorted_indices]
            else:
                fitnesses = None

        if best_overall_chromosome is None:
            raise RuntimeError(
                "No se obtuvo ningún individuo válido; revise 'generations', "
                "'pop_size' y la función de scoring."
            )
        self.best_params_ = self.decode_chromosome(best_overall_chromosome)
        self.best_score_ = best_overall_fitness  # plain Python float
        return self

## Entrenar los modelos

In [None]:
def train_model_buy(X_train, y_train, param_grid):
    """Search the best pipeline hyper-parameters for the buy model.

    Builds a RobustScaler -> PCA -> XGBClassifier pipeline and tunes it with
    ``GeneticAlgorithmCV`` using 5-fold stratified CV and accuracy.

    Parameters
    ----------
    X_train, y_train : feature matrix and binary target for buy trades.
    param_grid : dict
        Search space in the format expected by ``GeneticAlgorithmCV``.

    Returns
    -------
    dict
        The best parameters found. No final model is fitted here.

    Raises
    ------
    Exception
        Any error raised while building or running the search is printed and
        re-raised unchanged.
    """
    try:
        # The target is binary, so use the binary log-loss metric
        # ('mlogloss' is the multiclass variant).
        # NOTE(review): 'gpu_hist' / 'gpu_predictor' are deprecated in recent
        # XGBoost releases in favour of device='cuda' — confirm installed version.
        estimator = Pipeline([
            ('scaler', RobustScaler()),
            ('dim_reducer', PCA()),
            ('xgb', xgb.XGBClassifier(eval_metric='logloss', tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0))
        ])
        ga_search = GeneticAlgorithmCV(
            estimator=estimator,
            param_grid=param_grid,
            cv=StratifiedKFold(n_splits=5, shuffle=True),
            scoring=accuracy_score,
            pop_size=50,
            generations=15,
            early_stopping_rounds=1,
            crossover_initial=0.1,
            crossover_end=0.9,
            mutation_initial=0.9,
            mutation_end=0.1,
            elitism=True,
            elite_size=3,
            tournament_size=5,
            n_random=5,
            verbose=True
        )
        # Run the genetic search
        ga_search.fit(X_train, y_train)
    except Exception as e:
        print(f"Error en train_model_buy: {e}")
        raise
    # Report and return the best parameters
    best_params = ga_search.best_params_
    print("Mejores parámetros encontrados para compras:", best_params)
    print("Mejor puntuación de validación para compras:", ga_search.best_score_)
    return best_params

In [None]:
def train_model_sell(X_train, y_train, param_grid):
    """Search the best pipeline hyper-parameters for the sell model.

    Builds a RobustScaler -> PCA -> XGBClassifier pipeline and tunes it with
    ``GeneticAlgorithmCV`` using 5-fold stratified CV and accuracy.

    Parameters
    ----------
    X_train, y_train : feature matrix and binary target for sell trades.
    param_grid : dict
        Search space in the format expected by ``GeneticAlgorithmCV``.

    Returns
    -------
    dict
        The best parameters found. No final model is fitted here.

    Raises
    ------
    Exception
        Any error raised while building or running the search is printed and
        re-raised unchanged.
    """
    try:
        # The target is binary, so use the binary log-loss metric
        # ('mlogloss' is the multiclass variant).
        # NOTE(review): 'gpu_hist' / 'gpu_predictor' are deprecated in recent
        # XGBoost releases in favour of device='cuda' — confirm installed version.
        estimator = Pipeline([
            ('scaler', RobustScaler()),
            ('dim_reducer', PCA()),
            ('xgb', xgb.XGBClassifier(eval_metric='logloss', tree_method='gpu_hist', predictor='gpu_predictor', verbosity=0))
        ])
        ga_search = GeneticAlgorithmCV(
            estimator=estimator,
            param_grid=param_grid,
            cv=StratifiedKFold(n_splits=5, shuffle=True),
            scoring=accuracy_score,
            pop_size=50,
            generations=15,
            early_stopping_rounds=1,
            crossover_initial=0.1,
            crossover_end=0.9,
            mutation_initial=0.9,
            mutation_end=0.1,
            elitism=True,
            elite_size=3,
            tournament_size=5,
            n_random=5,
            verbose=True
        )
        # Run the genetic search
        ga_search.fit(X_train, y_train)
    except Exception as e:
        # Messages now name the sell model (they were copied from the buy one)
        print(f"Error en train_model_sell: {e}")
        raise
    # Report and return the best parameters
    best_params = ga_search.best_params_
    print("Mejores parámetros encontrados para ventas:", best_params)
    print("Mejor puntuación de validación para ventas:", ga_search.best_score_)
    return best_params

In [None]:
# Search space for the genetic algorithm: each pipeline parameter gets a type
# and an inclusive [low, high] range that genes in [0, 1] are mapped onto.
# The PCA lower bound is capped at n_features so that low never exceeds high
# when the dataset has fewer than 5 feature columns.
# NOTE(review): PCA components are also limited by the number of rows in each
# training fold — confirm the folds are larger than n_features.
param_grid = {
    'dim_reducer__n_components': {'type': 'int', 'low': min(5, n_features), 'high': n_features},
    'xgb__n_estimators': {'type': 'int', 'low': 50, 'high': 500},
    'xgb__max_depth': {'type': 'int', 'low': 3, 'high': 10},
    'xgb__learning_rate': {'type': 'float', 'low': 0.01, 'high': 0.3},
    'xgb__subsample': {'type': 'float', 'low': 0.6, 'high': 1.0},
    'xgb__colsample_bytree': {'type': 'float', 'low': 0.6, 'high': 1.0},
    'xgb__gamma': {'type': 'float', 'low': 0.0, 'high': 0.5},
    'xgb__min_child_weight': {'type': 'int', 'low': 1, 'high': 10},
    'xgb__reg_alpha': {'type': 'float', 'low': 0.0, 'high': 1.0},
    'xgb__reg_lambda': {'type': 'float', 'low': 0.0, 'high': 1.0}
}

In [None]:
# Run the buy and sell hyper-parameter searches concurrently, one worker thread each.
# NOTE(review): both threads share the same GPU and the module-level cp.random
# state — confirm this is safe and actually faster than running sequentially.
with ThreadPoolExecutor(max_workers=2) as executor:
    # Submit both search tasks
    future_buy = executor.submit(train_model_buy, X_buy_train, y_buy_train, param_grid)
    future_sell = executor.submit(train_model_sell, X_sell_train, y_sell_train, param_grid)
    # Block until both tasks have finished
    futures = [future_buy, future_sell]
    print("Esperando que las tareas finalicen...")
    wait(futures)
    print("¡Todas las tareas han terminado!")
    # Collect the results; result() re-raises any exception raised in the worker
    model_buy_best_params = future_buy.result()
    model_sell_best_params = future_sell.result()

## Exportar modelos a formato ONNX

In [None]:
def save_onnx_models(mql5_files_folder):
    """Convert the buy and sell pipelines to ONNX and write them to disk.

    Registers the XGBoost converter with skl2onnx, converts the module-level
    ``model_buy`` and ``model_sell`` objects, and writes ``model_buy.onnx``
    and ``model_sell.onnx`` into ``mql5_files_folder``.

    NOTE(review): ``model_buy`` and ``model_sell`` are read as globals but are
    not defined in the visible part of this notebook (only the best parameters
    are computed) — confirm they are fitted before this is called.
    NOTE(review): only ``XGBClassifier`` is registered here; confirm the other
    pipeline steps are convertible by ``convert_sklearn``.

    Raises
    ------
    Exception
        Any conversion or write error is printed and re-raised unchanged.
    """
    try:
        update_registered_converter(
            xgb.XGBClassifier,
            "XGBClassifier",
            calculate_linear_classifier_output_shapes,
            convert_xgboost,
            options={'nocl': [True, False], 'zipmap': [True, False, 'columns']}
        )
        model_buy_onnx = convert_sklearn(
            model_buy,
            'pipeline_buy_xgboost',
            [('input', FloatTensorType([None, X_buy_train.shape[1]]))],
            target_opset={'': 12, 'ai.onnx.ml': 2}
        )
        # The sell model's input width comes from its own training matrix
        model_sell_onnx = convert_sklearn(
            model_sell,
            'pipeline_sell_xgboost',
            [('input', FloatTensorType([None, X_sell_train.shape[1]]))],
            target_opset={'': 12, 'ai.onnx.ml': 2}
        )
        with open(os.path.join(mql5_files_folder, "model_buy.onnx"), 'wb') as f:
            f.write(model_buy_onnx.SerializeToString())
        with open(os.path.join(mql5_files_folder, "model_sell.onnx"), 'wb') as f:
            f.write(model_sell_onnx.SerializeToString())
    except Exception as e:
        print(f"Error en exportar los modelos: {e}")
        raise
    print("Modelos ONNX exportados correctamente")

In [None]:
# Export both models as ONNX files into the Kaggle working directory
save_onnx_models(r'/kaggle/working/')