In [1]:
# train.py

import os
import torch
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

from data_loader import load_data_1m
from feature_calculations import (
    resample_data, calculate_MA_data, calculate_ema_bollinger_bands, calculate_rsi,
    calculate_macd, calculate_stochastic_oscillator, calculate_adx, calculate_atr,
    calculate_obv, calculate_williams_r, base_feature_fn, cyclic_encode_fn, log_transform
)
from strategies import BB_fitness_fn, BB_MACD_fitness_fn
from dataset import make_dataset, replace_nan_with_zero
from train_functions import inference, fitness_fn, generation_valid, generation_test

from Prescriptor import Prescriptor
from Evolution.crossover import UniformCrossover, WeightedSumCrossover, DifferentialEvolutionOperator
from Evolution.mutation import MultiplyNormalMutation, MultiplyUniformMutation, AddNormalMutation, AddUniformMutation, ChainMutation, FlipSignMutation
from Evolution.mutation import RandomValueMutation
from Evolution.selection import RouletteSelection, TournamentSelection, ParetoLexsortSelection
from Evolution import Evolution

In [2]:
# # Load Data
# data_1m = load_data_1m('/root/daily/bit/data/1min_bitusdt_test.pkl')
# data_1m = data_1m.iloc[:200000]

# # Resample data to 1D
# data_1d = resample_data(data_1m, '1D')
# data_1d['Close time'] = data_1d.index
# data_1d = data_1d.reset_index(drop=True)

# # Apply Feature Calculations
# # For 1D Data
# data_1d, ma_cols_1d, ma_cols_rel_1d = calculate_MA_data(data_1d, 60, 'EMA', '_1d')
# data_1d, bb_cols_1d, bb_cols_rel_1d = calculate_ema_bollinger_bands(data_1d, 60, extra_str='_1d')
# data_1d, rsi_cols_1d = calculate_rsi(data_1d, window=20, extra_str='_1d')
# data_1d, macd_cols_1d = calculate_macd(data_1d, 20, 120, 60, extra_str='_1d')
# data_1d, stoch_cols_1d = calculate_stochastic_oscillator(data_1d, 60, 20, extra_str='_1d')
# data_1d, adx_cols_1d = calculate_adx(data_1d, 60, extra_str='_1d')
# data_1d, atr_cols_1d = calculate_atr(data_1d, 60, extra_str='_1d')
# data_1d, obv_cols_1d = calculate_obv(data_1d, extra_str='_1d')
# data_1d, will_cols_1d = calculate_williams_r(data_1d, 60, extra_str='_1d')
# data_1d, base_feature_1d = base_feature_fn(data_1d, extra_str='_1d')
# data_1d, cyclice_encoding_1d = cyclic_encode_fn(data_1d, 'Close time', 'day_of_year')

# # For 1M Data
# data_1m, ma_cols, ma_cols_rel = calculate_MA_data(data_1m, 240, 'EMA')
# data_1m, bb_cols, bb_cols_rel = calculate_ema_bollinger_bands(data_1m, 240)
# data_1m, rsi_cols = calculate_rsi(data_1m, window=60)
# data_1m, macd_cols = calculate_macd(data_1m, 60, 600, 240)
# data_1m, stoch_cols = calculate_stochastic_oscillator(data_1m, 240, 60)
# data_1m, adx_cols = calculate_adx(data_1m, 240)
# data_1m, atr_cols = calculate_atr(data_1m, 240)
# data_1m, obv_cols = calculate_obv(data_1m)
# data_1m, will_cols = calculate_williams_r(data_1m, 240)
# data_1m, base_feature = base_feature_fn(data_1m)
# data_1m, cyclice_encoding = cyclic_encode_fn(data_1m, 'Open time')

# data_1m, short_ma_cols, short_ma_cols_rel = calculate_MA_data(data_1m, 60, 'EMA')
# data_1m, long_ma_cols, long_ma_cols_rel = calculate_MA_data(data_1m, 180, 'EMA')

# # Prepare Feature Columns
# drop_column = [
#     'Open time', 'Close time', 'Quote asset volume', 'Ignore',
#     'Number of trades', 'Taker buy base asset volume', 'Taker buy quote asset volume'
# ]
# feature_column = (
#     ma_cols_rel + bb_cols_rel + rsi_cols + macd_cols + stoch_cols +
#     adx_cols + will_cols + base_feature + cyclice_encoding  # Excluding obv and atr
# )
# feature_column_1d = (
#     ma_cols_rel_1d + bb_cols_rel_1d + rsi_cols_1d + macd_cols_1d + stoch_cols_1d +
#     adx_cols_1d + will_cols_1d + base_feature_1d + cyclice_encoding_1d
# )


# # Apply Log Transform
# for feature in feature_column:
#     data_1m[feature] = log_transform(data_1m[feature])

# for feature in feature_column_1d:
#     data_1d[feature] = log_transform(data_1d[feature])

# data_1d['%D_20__1d'] = 0
# data_1d['ADX_60__1d'] = 0


In [3]:
import random
import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
import warnings
import multiprocessing  # Added for multiprocessing support

# Suppress warnings
warnings.filterwarnings("ignore")

# 1. Data preparation
def load_adult_data():
    """Download the UCI Adult dataset and return train+test as one DataFrame.

    Both files are fetched over HTTPS from the UCI archive, so network access
    is required. Fields equal to ``"?"`` are read as missing values and every
    row containing a missing value is dropped from the combined frame.

    Returns:
        pandas.DataFrame: the rows of ``adult.data`` followed by the rows of
        ``adult.test``, with the 15 named columns below. Rows with missing
        values are removed; the index is not renumbered after that removal.
    """
    column_names = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
    ]
    base_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
    # Options shared by both files. The separator is a regular expression, so
    # it is written as a raw string: in a plain string "\s" is an invalid
    # escape sequence that Python warns about. A regex separator needs the
    # python parsing engine.
    read_options = dict(
        names=column_names,
        sep=r",\s",
        na_values="?",
        engine="python",
    )
    train_data = pd.read_csv(base_url + "adult.data", **read_options)
    # The first line of adult.test is skipped rather than parsed as a record.
    test_data = pd.read_csv(base_url + "adult.test", skiprows=1, **read_options)

    combined = pd.concat([train_data, test_data], ignore_index=True)
    # Drop incomplete rows without mutating the intermediate frame in place.
    return combined.dropna()

# Fetch and combine the Adult train/test files (reads from the network).
data = load_adult_data()

# Feature preprocessing
def preprocess_data(df):
    """Return an integer-encoded copy of ``df`` plus the fitted encoders.

    Every object-dtype column other than ``"income"`` is replaced by the
    output of its own ``LabelEncoder``. The ``"income"`` column becomes 1 when
    the label contains ``">50K"`` and 0 otherwise. The input frame is not
    modified.

    Args:
        df: DataFrame containing an ``"income"`` column of string labels.

    Returns:
        tuple: ``(encoded_df, label_encoders)`` where ``label_encoders`` maps
        each encoded column name to the ``LabelEncoder`` fitted on it.
    """
    encoded = df.copy()

    # Categorical feature columns; the target is handled separately below.
    categorical = [
        name
        for name in encoded.select_dtypes(include=["object"]).columns
        if name != "income"
    ]

    label_encoders = {}
    for name in categorical:
        encoder = LabelEncoder()
        encoded[name] = encoder.fit_transform(encoded[name])
        label_encoders[name] = encoder

    # Substring match, so labels with a trailing "." are also recognised.
    encoded["income"] = encoded["income"].apply(
        lambda label: 1 if ">50K" in label else 0
    )
    return encoded, label_encoders

# Encode the raw frame.
# NOTE(review): this rebinds `data`, so the statement is not idempotent --
# running it a second time without reloading feeds already-encoded integer
# income values into the `">50K" in x` check inside preprocess_data.
data, label_encoders = preprocess_data(data)

# Split features from the target
X = data.drop("income", axis=1).values
y = data["income"].values

# Train/validation split (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the maximum tree depth and the number of nodes
MAX_DEPTH = 3  # maximum tree depth
NUM_NODES = 2**MAX_DEPTH - 1  # node count of a complete binary tree

# Number of feature columns available to split on
NUM_FEATURES = X_train.shape[1]

# 2. Genetic algorithm setup

# DEAP setup: a single-objective fitness with weight +1.0, and an individual
# type that is a plain list carrying that fitness.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Each tree node is a dict with the keys is_leaf, feature, threshold and
# class_label. Child links are not stored: predict() reaches the children of
# node i at list positions 2*i + 1 and 2*i + 2.
def init_individual():
    """Build one random tree of ``NUM_NODES`` node dicts as a DEAP individual.

    Every node first receives a random split (feature index plus a threshold
    drawn uniformly between that feature's min and max in ``X_train``). The
    nodes on the last level are then marked as leaves and given a random 0/1
    class label; they keep the split fields drawn in the first pass.

    Returns:
        creator.Individual: list of dicts with keys ``is_leaf``, ``feature``,
        ``threshold`` and ``class_label``.
    """
    # Index of the first node on the deepest level of the complete tree.
    first_leaf = 2**(MAX_DEPTH-1) - 1

    nodes = []
    while len(nodes) < NUM_NODES:
        split_feature = random.randint(0, NUM_FEATURES - 1)
        column = X_train[:, split_feature]
        nodes.append({
            'is_leaf': False,
            'feature': split_feature,
            'threshold': random.uniform(np.min(column), np.max(column)),
            'class_label': None,  # internal nodes carry no class label
        })

    # Second pass: turn the deepest level into leaves with a random class.
    for leaf in nodes[first_leaf:]:
        leaf['is_leaf'] = True
        leaf['class_label'] = random.randint(0,1)

    return creator.Individual(nodes)

# toolbox.individual() builds one random tree; toolbox.population(n=...)
# repeats that call n times and collects the results in a list.
toolbox.register("individual", init_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# 3. Fitness function definition

def predict(individual, X):
    """Classify every row of ``X`` with the tree encoded by ``individual``.

    The tree is stored as a flat list: the root is at position 0 and the
    children of position ``i`` are at ``2*i + 1`` (taken when the sample's
    feature value is ``<=`` the threshold) and ``2*i + 2`` (otherwise).

    Args:
        individual: sequence of node dicts with keys ``is_leaf``, ``feature``,
            ``threshold`` and ``class_label``.
        X: iterable of samples, each indexable by feature position.

    Returns:
        numpy.ndarray: the ``class_label`` of the leaf reached by each sample.
    """
    def classify(row):
        position = 0
        current = individual[position]
        # Walk down from the root until a leaf is reached.
        while not current['is_leaf']:
            goes_left = row[current['feature']] <= current['threshold']
            position = 2*position + (1 if goes_left else 2)
            current = individual[position]
        return current['class_label']

    return np.array([classify(row) for row in X])

def eval_individual(individual):
    """Return the training-set accuracy of ``individual`` as a 1-tuple.

    The result is a tuple because it is assigned to a single-objective DEAP
    fitness. Evaluation is best-effort: if predicting or scoring raises an
    ordinary exception, the individual gets fitness ``0.0`` instead of
    aborting the run.

    Args:
        individual: tree encoding accepted by ``predict``.

    Returns:
        tuple: ``(accuracy,)`` on ``X_train``/``y_train``, or ``(0.0,)`` when
        evaluation fails.
    """
    try:
        preds = predict(individual, X_train)
        accuracy = accuracy_score(y_train, preds)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still stop the run.
        return (0.0,)
    return (accuracy,)

# Fitness of an individual = its accuracy on the training split.
toolbox.register("evaluate", eval_individual)

# 4. Genetic operator definitions

def mutate_individual(individual, indpb=0.1):
    """Mutate the nodes of ``individual`` in place, each with probability ``indpb``.

    A selected internal node gets a new random feature and a threshold drawn
    uniformly between that feature's min and max in ``X_train``. A selected
    leaf gets a new random 0/1 class label.

    Args:
        individual: list of node dicts; modified in place.
        indpb: independent per-node mutation probability.

    Returns:
        tuple: ``(individual,)``, the same object wrapped in a 1-tuple.
    """
    for gene in individual:
        # One uniform draw per node decides whether it is mutated at all.
        if not random.random() < indpb:
            continue

        if gene['is_leaf']:
            # Leaf: redraw the class label.
            gene['class_label'] = random.randint(0,1)
            continue

        # Internal node: redraw the feature, then a threshold in its range.
        chosen = random.randint(0, NUM_FEATURES - 1)
        gene['feature'] = chosen
        values = X_train[:, chosen]
        gene['threshold'] = random.uniform(np.min(values), np.max(values))

    return (individual,)

def crossover_individual(ind1, ind2):
    """One-point crossover: swap the tails of two individuals in place.

    A cut position is drawn uniformly from ``1 .. NUM_NODES - 1`` and every
    node from that position onward is exchanged between the two lists.

    Args:
        ind1: first parent; modified in place.
        ind2: second parent; modified in place.

    Returns:
        tuple: ``(ind1, ind2)``, the same two objects after the swap.
    """
    cut = random.randint(1, NUM_NODES - 1)

    # Copy both tails before writing so neither assignment sees the other's result.
    tail_of_first = ind1[cut:]
    tail_of_second = ind2[cut:]
    ind1[cut:] = tail_of_second
    ind2[cut:] = tail_of_first

    return ind1, ind2

# Variation and selection operators used by the evolutionary loop.
toolbox.register("mate", crossover_individual)
# indpb=0.2 here overrides mutate_individual's own default of 0.1.
toolbox.register("mutate", mutate_individual, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)

# 5. Run the genetic algorithm

def main(pop_size=1000, n_generations=40, cxpb=0.7, mutpb=0.2, seed=42):
    """Evolve decision trees with ``algorithms.eaSimple`` and report the best.

    Fitness evaluation is distributed over a ``multiprocessing.Pool``. The
    pool is always shut down, including when evolution raises, and the
    toolbox's ``map`` is reset to the built-in ``map`` afterwards so the
    toolbox is not left pointing at a closed pool.

    Args:
        pop_size: number of individuals in the population.
        n_generations: number of generations to run.
        cxpb: probability of mating two individuals.
        mutpb: probability of mutating an individual.
        seed: seed for Python's ``random`` module in this process.

    Returns:
        The best individual in the hall of fame. Its node listing and its
        train/validation accuracy are printed as a side effect.
    """
    random.seed(seed)

    # Set up the multiprocessing pool used to evaluate individuals.
    pool = multiprocessing.Pool()
    toolbox.register("map", pool.map)
    try:
        pop = toolbox.population(n=pop_size)  # population size
        hof = tools.HallOfFame(3)             # keeps the three best individuals

        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("avg", np.mean)
        stats.register("std", np.std)
        stats.register("min", np.min)
        stats.register("max", np.max)

        # Run the evolutionary loop; only the hall of fame is used afterwards.
        algorithms.eaSimple(pop, toolbox, cxpb=cxpb, mutpb=mutpb,
                            ngen=n_generations, stats=stats, halloffame=hof,
                            verbose=True)
    except BaseException:
        # Stop the workers immediately on any failure or interrupt.
        pool.terminate()
        raise
    else:
        pool.close()  # no more work will be submitted
    finally:
        pool.join()
        # Do not leave the toolbox bound to a pool that is no longer usable.
        toolbox.register("map", map)

    # Print the best individual node by node.
    best_ind = hof[0]
    print("\n최적의 개체:")
    for idx, node in enumerate(best_ind):
        print(f"Node {idx}: ", end="")
        if node['is_leaf']:
            print(f"Leaf - Class {node['class_label']}")
        else:
            print(f"Feature {node['feature']}, Threshold {node['threshold']:.4f}")

    # Score the best individual on the training and validation splits.
    y_train_pred = predict(best_ind, X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print(f"\n훈련 정확도: {train_accuracy:.4f}")

    y_val_pred = predict(best_ind, X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"검증 정확도: {val_accuracy:.4f}")
    return best_ind

# Run the evolution and keep the winner in a module-level name so that later
# cells can inspect it.
if __name__ == "__main__":
    best_ind = main()

gen	nevals	avg     	std     	min     	max     
0  	1000  	0.499252	0.205475	0.206098	0.792852
1  	733   	0.626232	0.17421 	0.227382	0.792852
2  	749   	0.688017	0.142853	0.226304	0.794814
3  	762   	0.710813	0.121954	0.231418	0.794814
4  	766   	0.71267 	0.124404	0.212649	0.794814
5  	750   	0.714827	0.13049 	0.207148	0.794814
6  	788   	0.709127	0.143   	0.207148	0.794814
7  	753   	0.708937	0.152495	0.206153	0.794814
8  	753   	0.725424	0.141952	0.206153	0.79487 
9  	771   	0.731427	0.14613 	0.205269	0.79487 
10 	759   	0.754427	0.122725	0.207148	0.79487 
11 	793   	0.76491 	0.11009 	0.212483	0.79487 
12 	761   	0.777221	0.0828684	0.2464  	0.79498 
13 	750   	0.772105	0.0992694	0.246565	0.79498 
14 	758   	0.775327	0.0889921	0.247395	0.79498 
15 	750   	0.773538	0.0950278	0.207065	0.79498 
16 	745   	0.782535	0.0692106	0.207065	0.79498 
17 	769   	0.773823	0.0921864	0.207065	0.79498 
18 	781   	0.779012	0.0802623	0.207065	0.79498 
19 	806   	0.774102	0.0939536	0.246234	0.79498 
20 	7

In [6]:
best_ind

[{'is_leaf': False,
  'feature': 0,
  'threshold': 18.691268298300894,
  'class_label': None},
 {'is_leaf': False,
  'feature': 6,
  'threshold': 1.889247251226342,
  'class_label': None},
 {'is_leaf': False,
  'feature': 10,
  'threshold': 7069.788496818701,
  'class_label': None},
 {'is_leaf': True,
  'feature': 2,
  'threshold': 91603.19528872528,
  'class_label': 0},
 {'is_leaf': True,
  'feature': 5,
  'threshold': 2.517084030937275,
  'class_label': 0},
 {'is_leaf': True,
  'feature': 5,
  'threshold': 0.5998278575662777,
  'class_label': 0},
 {'is_leaf': True,
  'feature': 11,
  'threshold': 4195.892689346308,
  'class_label': 1}]