In [1]:
from typing import Optional, TypeVar, Callable, Optional, Tuple
from alphagen.data.expression import *

from alphagen_qlib.stock_data import StockData
from alphagen_generic.features import *

import numpy as np
from lightgbm import Booster
from alphagen.utils.pytorch_utils import normalize_by_day
from alphagen.data.calculator import AlphaCalculator
from alphagen.utils.correlation import batch_pearsonr, batch_spearmanr

_T = TypeVar("_T")

In [2]:
def make_ensemble_alpha(exprs: List[Expression], model: Booster) -> Tensor:
    """Build the ensemble alpha for `exprs` using the fitted `model`.

    Calls `predict(model, exprs)` to obtain a NumPy array, wraps it in a torch
    tensor and moves it to `data.device`.

    Note: reads the module-level `data` object, which must be defined before
    this function is called.

    Args:
        exprs: Alpha expressions fed to the model as features.
        model: Booster used by `predict` to produce the ensemble values.

    Returns:
        The model predictions as a tensor on `data.device`.
    """
    predictions = predict(model, exprs)
    return torch.from_numpy(predictions).to(data.device)

def predict(model: Booster, exprs: List[Expression]) -> np.ndarray:
    """Run `model` on the alphas computed from `exprs`.

    Each expression is evaluated with `_calc_alpha`, the results are stacked
    along a new last axis, and the stack is flattened to two dimensions (one
    column per expression) before being passed to `model.predict`. The flat
    prediction vector is reshaped back by `unstack`.

    Args:
        model: Booster whose `predict` method is invoked on the feature matrix.
        exprs: Alpha expressions; one feature column is produced per expression.

    Returns:
        The predictions reshaped by `unstack`.
    """
    alpha_columns = [_calc_alpha(expr) for expr in exprs]
    features = torch.stack(alpha_columns, dim=-1).cpu().numpy()
    n_features = features.shape[-1]
    flat_features = features.reshape(-1, n_features)
    return unstack(model.predict(flat_features))

def unstack(value: np.ndarray) -> np.ndarray:
    """Reshape `value` to `(data.n_days, data.n_stocks)`.

    The target shape comes from the module-level `data` object, so `value`
    must hold exactly `data.n_days * data.n_stocks` elements.
    """
    target_shape = (data.n_days, data.n_stocks)
    return value.reshape(target_shape)

def _calc_alpha(expr: Expression) -> Tensor:
    """Evaluate `expr` on the module-level `data` and pass the result through `normalize_by_day`."""
    raw_values = expr.evaluate(data)
    return normalize_by_day(raw_values)

def _calc_ICs(value1: Tensor, value2: Tensor) -> Tensor:
    """Return the `batch_pearsonr` correlations between `value1` and `value2`.

    Presumably one correlation per day (row) -- TODO confirm against
    `alphagen.utils.correlation.batch_pearsonr`.
    """
    correlations = batch_pearsonr(value1, value2)
    return correlations

def _calc_IC(value1: Tensor, value2: Tensor) -> float:
    """Return the mean of the `batch_pearsonr` correlations as a Python float."""
    correlations = batch_pearsonr(value1, value2)
    mean_correlation = correlations.mean()
    return mean_correlation.item()

def _calc_IR(value1: Tensor, value2: Tensor) -> float:
    """Return the information ratio: mean of the ICs divided by their std.

    The ICs are obtained from `_calc_ICs(value1, value2)`.

    Args:
        value1: First tensor passed to `_calc_ICs`.
        value2: Second tensor passed to `_calc_ICs`.

    Returns:
        `mean(ICs) / (std(ICs) + epsilon)` as a Python float.
    """
    ICs = _calc_ICs(value1, value2)
    IC_mean = ICs.mean().item()
    IC_std = ICs.std().item()
    # Small value that keeps the denominator away from zero. It has to be ADDED
    # to the (non-negative) std: subtracting it would make the denominator
    # negative when the std is 0 and exactly zero when the std equals epsilon.
    epsilon = 1e-10
    return IC_mean / (IC_std + epsilon)

def test_ensemble(exprs: List[Expression], model: Booster, calculator: AlphaCalculator) -> Tuple[float, float]:
    """Score the ensemble of `exprs` and `model` against `calculator.target_value`.

    Returns:
        The `(IC, IR)` pair produced by `calc_pool_all_ret`.
    """
    target_value = calculator.target_value
    return calc_pool_all_ret(exprs, target_value, model)

def calc_pool_all_ret(exprs: List[Expression], target: Tensor, model: Booster) -> Tuple[float, float]:
    """Compute the IC and IR of the ensemble alpha against `target`.

    The ensemble alpha is built by `make_ensemble_alpha(exprs, model)`; all
    work happens with autograd disabled.

    Returns:
        A tuple `(IC, IR)` from `_calc_IC` and `_calc_IR` respectively.
    """
    with torch.no_grad():
        ensemble_value = make_ensemble_alpha(exprs, model)
        ic = _calc_IC(ensemble_value, target)
        ir = _calc_IR(ensemble_value, target)
    return ic, ir

In [3]:
from alphagen_qlib.utils import load_alpha_pool_by_path, load_dt_model_by_path
from alphagen_qlib.calculator import QLibStockDataCalculator

# Locations of the saved alpha pool (JSON) and of the saved model that is
# loaded below via `load_dt_model_by_path`.
POOL_PATH = 'model/18432_steps_pool.json'
DT_PATH = 'model/18432_steps_dt.txt'

# Module-level dataset. The helper functions defined earlier in the notebook
# (`make_ensemble_alpha`, `unstack`, `_calc_alpha`) read this global `data`,
# so this cell must run before any of them is called.
data = StockData(instrument='csi500',
                 start_time='2016-06-01',
                 end_time='2022-06-01')

# Prediction target built from the close price.
# NOTE(review): presumably the 20-day forward return (Ref with a negative
# offset looking ahead) -- confirm against alphagen's `Ref` semantics.
close = Feature(FeatureType.CLOSE)
target = Ref(close, -20) / close - 1

# Calculator exposing `target_value`, plus the saved expressions and model.
# The second value returned by `load_alpha_pool_by_path` is discarded.
calculator = QLibStockDataCalculator(data=data, target=target)
exprs, _ = load_alpha_pool_by_path(POOL_PATH)
booster = load_dt_model_by_path(DT_PATH)

# Ensemble predictions of the loaded model, also converted by
# `data.make_dataframe`. `df` is not used by any other cell visible here.
ensemble_alpha = make_ensemble_alpha(exprs, booster)
df = data.make_dataframe(ensemble_alpha)

# Baseline (IC, IR) of the loaded model before any further training.
print(test_ensemble(exprs, booster, calculator))

[36736:MainThread](2024-07-31 18:01:04,699) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[36736:MainThread](2024-07-31 18:01:05,255) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[36736:MainThread](2024-07-31 18:01:05,257) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': WindowsPath('C:/Users/liush/.qlib/qlib_data/cn_data_rolling')}


(0.04110304208418048, 0.3091372140997289)


In [12]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.utils import column_or_1d
import lightgbm as lgb

def train_lgbm(
    exprs: List[Expression],
    pretrained: Booster,
    target_value: Tensor,
    n_splits: int = 2,
    threshold: float = 3,
    num_boost_round: int = 100,
    random_state: int = 42,
) -> Booster:
    """Continue training `pretrained` on the alphas of `exprs` with K-fold validation.

    One feature column is computed per expression with `_calc_alpha`; features
    are clamped to `[-threshold, threshold]`. For every fold a model is trained
    starting from `pretrained` (`init_model`), and the model with the lowest
    validation L1 score is returned.

    NOTE(review): `predict` feeds unclamped features to the model while
    training clamps them -- confirm this asymmetry is intended.

    Args:
        exprs: Alpha expressions used as features.
        pretrained: Booster that every fold continues training from.
        target_value: Regression target; flattened to one label per feature row.
        n_splits: Number of shuffled K-fold splits.
        threshold: Absolute bound applied to every feature value.
        num_boost_round: Number of additional boosting iterations per fold.
        random_state: Seed for the K-fold shuffle.

    Returns:
        The fold model with the lowest validation L1 score.
    """
    metric = 'l1'

    features = torch.stack([_calc_alpha(expr) for expr in exprs], dim=-1).cpu().numpy()
    features = features.reshape(-1, features.shape[-1])
    labels = target_value.cpu().numpy().reshape(-1)

    # Clamp extreme feature values; NaNs (if any) pass through unchanged.
    features = np.clip(features, -threshold, threshold)

    print("\n\n")

    # Identical for every fold, so built once outside the loop.
    params = {
        'objective': 'regression',  # adjust to the actual task
        'learning_rate': 0.01,
        'num_leaves': 31,
        'metric': metric,  # adjust to the actual task
    }

    def _rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
        # Plain NumPy RMSE; avoids `mean_squared_error(squared=False)`, which
        # newer scikit-learn releases no longer accept.
        return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    best_model = None
    best_score = float('inf')

    for train_index, valid_index in kf.split(features):
        X_train, X_valid = features[train_index], features[valid_index]
        y_train, y_valid = labels[train_index], labels[valid_index]

        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

        # Continue training from the previously trained model.
        model = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round,  # additional boosting iterations
            valid_sets=[valid_data],
            init_model=pretrained,  # start from the pretrained booster
        )

        score = model.best_score['valid_0'][metric]

        # Training and validation errors, reported for diagnostics only.
        train_rmse = _rmse(y_train, model.predict(X_train))
        test_rmse = _rmse(y_valid, model.predict(X_valid))

        print('\n')
        print('\n')
        print(f'Train RMSE: {train_rmse}')
        print(f'Test RMSE: {test_rmse}')
        print('\n')
        print('\n')

        # Keep only the best fold. The first fold is always accepted so that a
        # model is returned even if no score compares below infinity.
        if best_model is None or score < best_score:
            best_score = score
            best_model = model

    return best_model

In [13]:
# Continue training the loaded model and re-evaluate its (IC, IR).
# NOTE(review): `booster` is rebound to the result of training that started
# from `booster`, so re-running this cell trains on top of the previous run's
# output instead of the model loaded from disk (the cell is not idempotent).
booster = train_lgbm(exprs, booster, calculator.target_value)
print(test_ensemble(exprs, booster, calculator))




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.264147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51510
[LightGBM] [Info] Number of data points in the train set: 558705, number of used features: 202




Train RMSE: 0.770786456530388
Test RMSE: 0.7728961864060868




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.285636 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51510
[LightGBM] [Info] Number of data points in the train set: 558705, number of used features: 202




Train RMSE: 0.7703777354020531
Test RMSE: 0.7747928378267063




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.238492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51510
[LightGBM] [Info] Number of data points in the train set: 558706, number of use

In [14]:
# Write the current `booster` to disk; the returned Booster is echoed as the cell output.
booster.save_model('model/boostered.txt')

<lightgbm.basic.Booster at 0x1d9f00c6af0>

In [15]:
# Show the importance array of the current `booster`, using LightGBM's default importance type.
booster.feature_importance()

array([ 90,  53,  21,  44,  14, 106, 188,  23,  74,  25,  12,  16,  17,
        69,   3,  12,  18,  98,  15,  67,   2,  15,  15,  14,   9,  13,
        25,  29,  83,  67,  13,  19,   4,  26,  10,  36,  24,  23,  13,
        17,  51,  10,   6,  25,   8,  11,   7,  43,  34,   6,   6,  57,
         8,  38,  17,   8,   9,  12,  30,  14,   3,  20,  19,  13,   3,
        16,  50,   7,   6,  60,   1,  39,  26,  19, 184,  25,  12,   9,
        89,  90,  27,  17,   6,  17,  13,   4,  27,  26,  57,   8,  38,
        13,  56,  27,  11,  95,   9,  22,  27,   9,  37, 116,  10,  10,
        30,  42,   9,   6,  18,  76,  19,  16,  51,  37,  28, 102,  12,
        51,  12,   6,   7,  77,  75,   9,  17,  17,  41,  47,   4,  12,
        38,   9,  14,  24,  18,  25,  94,  29,  90,  64,  16, 132,  26,
         9,  11,  15,  24,  41,  88,  12,  19,  39,  63,  10,  75,  26,
        10,  77,  10,   4,  18,   5,  36,  33, 101,   9,  20,   2,  35,
         6,   1,   6,   9,   5,  59,  18,  11,  15,  38,   6,   