In [1]:
from typing import TypeVar, Tuple
from alphagen.data.expression import *

from alphagen_qlib.stock_data import StockData
from alphagen_generic.features import *

import numpy as np
from lightgbm import Booster
from alphagen.utils.pytorch_utils import normalize_by_day
from alphagen.data.calculator import AlphaCalculator
from alphagen.utils.correlation import batch_pearsonr

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.utils import column_or_1d
import lightgbm as lgb
import sklearn.datasets
import sklearn.metrics
import optuna

_T = TypeVar("_T")

In [2]:
def make_ensemble_alpha(exprs: List[Expression], model: Booster) -> Tensor:
    """Run the booster on the given expressions and return the result as a tensor.

    Args:
        exprs: Expressions whose evaluated values form the model's input features.
        model: LightGBM booster that combines them.

    Returns:
        The array produced by ``predict(model, exprs)``, wrapped with
        ``torch.from_numpy`` and moved to ``data.device`` (``data`` is the
        notebook-level global dataset).
    """
    prediction = predict(model, exprs)
    return torch.from_numpy(prediction).to(data.device)

def predict(model: Booster, exprs: List[Expression]) -> np.ndarray:
    """Predict with ``model`` on the alphas of ``exprs``.

    Each expression is evaluated through ``_calc_alpha``; the results are
    stacked along a new last axis (one feature column per expression), all
    leading axes are flattened into rows, and the model's flat prediction is
    reshaped by ``unstack``.
    """
    alphas = [_calc_alpha(e) for e in exprs]
    features = torch.stack(alphas, dim=-1).cpu().numpy()
    n_features = features.shape[-1]
    flat_features = features.reshape(-1, n_features)
    return unstack(model.predict(flat_features))

def unstack(value: np.ndarray) -> np.ndarray:
    """Reshape ``value`` to ``(data.n_days, data.n_stocks)`` using the global ``data``."""
    grid_shape = (data.n_days, data.n_stocks)
    return value.reshape(grid_shape)

def _calc_alpha(expr: Expression) -> Tensor:
    """Evaluate ``expr`` on the global ``data`` and pass the result through ``normalize_by_day``."""
    raw_value = expr.evaluate(data)
    return normalize_by_day(raw_value)

def _calc_ICs(value1: Tensor, value2: Tensor) -> Tensor:
    return batch_pearsonr(value1, value2)

def _calc_IC(value1: Tensor, value2: Tensor) -> float:
    return batch_pearsonr(value1, value2).mean().item()

def _calc_IR(value1: Tensor, value2: Tensor) -> float:
    ICs = _calc_ICs(value1, value2)
    IC_mean = ICs.mean().item()
    IC_std = ICs.std().item()
    epsilon = 1e-10  # 防止除以零的小值
    IR = IC_mean / (IC_std - epsilon)
    return IR

def test_ensemble(exprs: List[Expression], model: Booster, calculator: AlphaCalculator) -> Tuple[float, float]:
    """Score the ensemble against ``calculator.target_value``.

    Returns the ``(IC, IR)`` pair computed by ``calc_pool_all_ret``.
    """
    target_value = calculator.target_value
    return calc_pool_all_ret(exprs, target_value, model)

def calc_pool_all_ret(exprs: List[Expression], target: Tensor, model: Booster) -> Tuple[float, float]:
    """Compute ``(IC, IR)`` of the ensemble alpha against ``target``.

    The ensemble alpha comes from ``make_ensemble_alpha``; everything is
    evaluated under ``torch.no_grad()``.
    """
    with torch.no_grad():
        combined = make_ensemble_alpha(exprs, model)
        ic = _calc_IC(combined, target)
        ir = _calc_IR(combined, target)
    return ic, ir

In [None]:
from alphagen_qlib.utils import load_alpha_pool_by_path, load_dt_model_by_path
from alphagen_qlib.calculator import QLibStockDataCalculator

# Saved artifacts loaded further down: the alpha pool (JSON) and the tree model file.
POOL_PATH = 'model/51200_steps_pool.json'
DT_PATH = 'model/51200_steps_dt.txt'

# Notebook-level dataset. unstack, _calc_alpha and make_ensemble_alpha read it as a
# global, so this cell must run before any of those functions is called.
data = StockData(instrument='csi500',
                 start_time='2024-06-01',
                 end_time='2024-07-01',
                 max_future_days=1,
                 )

close = Feature(FeatureType.CLOSE)
# NOTE(review): presumably the one-day forward return of the close price
# (Ref with offset -1 looking one day ahead) — confirm against alphagen's Ref semantics.
target = Ref(close, -1) / close - 1

[46160:MainThread](2024-08-08 18:53:39,316) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[46160:MainThread](2024-08-08 18:53:39,868) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[46160:MainThread](2024-08-08 18:53:39,869) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': WindowsPath('C:/Users/liush/.qlib/qlib_data/cn_data_rolling')}


In [None]:
# Calculator built from the global dataset and target expression; its target_value is
# what test_ensemble scores the ensemble against.
calculator = QLibStockDataCalculator(data=data, target=target)
# Only the first element returned by the pool loader (the expressions) is used here.
exprs, _ = load_alpha_pool_by_path(POOL_PATH)
booster = load_dt_model_by_path(DT_PATH)

ensemble_alpha = make_ensemble_alpha(exprs, booster)
# NOTE(review): df is not referenced again in the visible cells — confirm it is still needed.
df = data.make_dataframe(ensemble_alpha)

# Prints the (IC, IR) pair of the loaded model.
print(test_ensemble(exprs, booster, calculator))

In [None]:
# def get_data(exprs: Expression, target_value: Tensor) -> Tuple[Tensor, Tensor]:
#     X = torch.stack([_calc_alpha(expr) for expr in exprs], dim=-1).cpu().numpy()
#     X = X.reshape(-1, X.shape[-1])
#     y = column_or_1d(target_value.cpu().numpy().reshape(-1, 1))
#     return X, y
# 
# def objective(trial):
#     data, target = get_data(exprs, calculator.target_value)
#     train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
#     dtrain = lgb.Dataset(train_x, label=train_y)
# 
#     param = {
#         "objective": "regression",
#         "metric": "huber",
#         "verbosity": -1,
#         "boosting_type": "dart",
#         #"drop_rate":0.45,
#         #"max_drop":30,
#         "skip_drop":0.65,
#         #"max_bin":65,
#         #"bagging_fraction": 0.9,
#         #"bagging_freq": 4,
#         #"feature_fraction": 0.45,
#         #"min_split_gain": 0.67,
#         "min_child_samples": trial.suggest_int("min_child_samples", 100, 5000),
#     }
# 
#     gbm = lgb.train(param, dtrain)
#     preds = gbm.predict(valid_x)
#     pred_labels = np.rint(preds)
#     accuracy = sklearn.metrics.mean_absolute_error(valid_y, pred_labels)
#     return accuracy
# 
# def study():
#     study = optuna.create_study(direction="minimize")
#     study.optimize(objective, n_trials=25)
# 
#     print("Number of finished trials: {}".format(len(study.trials)))
# 
#     print("Best trial:")
#     trial = study.best_trial
# 
#     print("  Value: {}".format(trial.value))
# 
#     print("  Params: ")
#     for key, value in trial.params.items():
#         print("    {}: {}".format(key, value))
#         
# study()

In [None]:
def train_lgbm(exprs: List[Expression], pretrained: Booster, target_value: Tensor) -> Booster:
    """Train a LightGBM regressor on the expressions' alphas with K-fold validation.

    The feature matrix has one column per expression (the ``_calc_alpha`` values,
    clipped to ``[-3, 3]``) and one row per entry of the flattened alpha tensor;
    the label is the flattened ``target_value``. A separate model is trained on
    each of 5 shuffled folds and the one with the lowest validation L1 score is
    returned.

    Args:
        exprs: Expressions evaluated via ``_calc_alpha`` to build the features.
        pretrained: Currently unused — every fold is trained from scratch with
            ``init_model=None``. Kept so existing callers keep working.
        target_value: Regression target; flattened to a 1-D label vector.

    Returns:
        The booster with the lowest validation L1 score across the folds.
    """
    n_splits = 5
    threshold = 3  # features are clipped to [-threshold, threshold]

    X = torch.stack([_calc_alpha(expr) for expr in exprs], dim=-1).cpu().numpy()
    X = X.reshape(-1, X.shape[-1])
    y = column_or_1d(target_value.cpu().numpy().reshape(-1, 1))

    X = np.clip(X, -threshold, threshold)

    print("\n\n")

    # NOTE(review): shuffled K-fold mixes rows from all dates into every fold —
    # confirm this is acceptable for time-ordered data.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Identical for every fold, so built once outside the loop.
    params = {
        'objective': 'regression',  # adjust to the actual task
        'num_leaves': 31,
        'max_depth': 6,
        #'learning_rate': 0.12,
        # Given as a list rather than the single string 'l1, l2', whose embedded
        # space is not a safe way to name several metrics.
        'metric': ['l1', 'l2'],
        "boosting": 'dart',
        "lambda_l1": 9,
        "lambda_l2": 2,
        "skip_drop":0.65,
        #"max_bin":65,
        "bagging_fraction": 0.9,
        "bagging_freq": 5,
        "feature_fraction": 0.8879,
        #"min_gain_to_split": 0.67,
        #"extra_trees": True,
        #"min_data_in_leaf": 1000,
        "verbosity": 1,
    }

    best_model = None
    best_score = float('inf')

    for train_index, valid_index in kf.split(X):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        # init_model=None: each fold starts from scratch; `pretrained` is not continued.
        model = lgb.train(
            params,
            train_data,
            num_boost_round=100,
            valid_sets=[valid_data],
            init_model=None
        )

        score = model.best_score['valid_0']['l1']

        # Training error. RMSE is taken as sqrt(MSE) instead of the
        # `squared=False` keyword, which newer scikit-learn releases dropped.
        y_train_pred = model.predict(X_train)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

        # Validation error.
        y_test_pred = model.predict(X_valid)
        test_rmse = np.sqrt(mean_squared_error(y_valid, y_test_pred))

        print('\n')
        print('\n')
        print(f'Train RMSE: {train_rmse}')
        print(f'Test RMSE: {test_rmse}')
        print('\n')
        print('\n')

        # Keep a model only when it beats the best validation L1 so far; the
        # first fold is always accepted so a model is returned in every case.
        if best_model is None or score < best_score:
            best_score = score
            best_model = model

    return best_model


In [None]:
# Rebinds `booster` to the newly trained model, then prints its (IC, IR).
# NOTE(review): the score is computed on the same `data` the model was trained on,
# so it is an in-sample figure.
booster = train_lgbm(exprs, booster, calculator.target_value)
print(test_ensemble(exprs, booster, calculator))

In [None]:
# Save the current `booster` (the retrained model, when the cells are run in order) to disk.
booster.save_model('model/boostered_7.txt')

In [None]:
# Show the booster's feature importances; feature i is the alpha of exprs[i],
# since the feature matrix stacks the expressions along its last axis.
booster.feature_importance()