In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import optuna
from itertools import combinations
import shutup

shutup.please()

In [4]:
data = pd.read_csv('D:\PycharmProjects\QuantInterview\Dataset.csv', dtype={'feature1': np.int8, 'feature2': np.int8, 'feature3': np.int8, 'index_return': str})
data['index_return'] = data['index_return'].str.rstrip('%').astype('float') / 100.0
data

Unnamed: 0,day,feature1,feature2,feature3,index_return
0,1,9,59,72,
1,2,5,17,64,0.0010
2,3,20,38,69,0.0002
3,4,14,15,66,-0.0029
4,5,14,18,61,-0.0115
...,...,...,...,...,...
700,701,3,14,20,-0.0140
701,702,18,21,46,0.0092
702,703,2,2,42,-0.0010
703,704,2,5,41,-0.0049


In [5]:
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024 ** 2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        print(f"Decreased by {decrease:.2f}%")

    return df

# 根据已有特征构建出新的特征

由于根据现有feature无法得知具体是什么feature，因此尽可能多地根据给到地3个feature构建新的feature，并用机器学习模型预测

In [6]:
# 先构建各个feature的和与差，积与熵
ori_features = ['feature1', 'feature2', 'feature3']
for c in combinations(ori_features, 2):
    data[f"{c[0]}_{c[1]}_sum"] = data[c[0]] + data[c[1]]  # 和
    data[f"{c[0]}_{c[1]}_difference"] = data[c[0]] - data[c[1]]  # 差
    data[f"{c[0]}_{c[1]}_product"] = data[c[0]] * data[c[1]]  # 积
    data[f"{c[0]}_{c[1]}_quotient"] = data[c[0]] / data[c[1]]  # 商
data

Unnamed: 0,day,feature1,feature2,feature3,index_return,feature1_feature2_sum,feature1_feature2_difference,feature1_feature2_product,feature1_feature2_quotient,feature1_feature3_sum,feature1_feature3_difference,feature1_feature3_product,feature1_feature3_quotient,feature2_feature3_sum,feature2_feature3_difference,feature2_feature3_product,feature2_feature3_quotient
0,1,9,59,72,,68,-50,19,0.152542,81,-63,-120,0.125000,-125,-13,-104,0.819444
1,2,5,17,64,0.0010,22,-12,85,0.294118,69,-59,64,0.078125,81,-47,64,0.265625
2,3,20,38,69,0.0002,58,-18,-8,0.526316,89,-49,100,0.289855,107,-31,62,0.550725
3,4,14,15,66,-0.0029,29,-1,-46,0.933333,80,-52,-100,0.212121,81,-51,-34,0.227273
4,5,14,18,61,-0.0115,32,-4,-4,0.777778,75,-47,86,0.229508,79,-43,74,0.295082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,701,3,14,20,-0.0140,17,-11,42,0.214286,23,-17,60,0.150000,34,-6,24,0.700000
701,702,18,21,46,0.0092,39,-3,122,0.857143,64,-28,60,0.391304,67,-25,-58,0.456522
702,703,2,2,42,-0.0010,4,0,4,1.000000,44,-40,84,0.047619,44,-40,84,0.047619
703,704,2,5,41,-0.0049,7,-3,10,0.400000,43,-39,82,0.048780,46,-36,-51,0.121951


In [7]:
# 下面计算移动平均指标
feature_cols = list(data.columns)
feature_cols.remove('day')
feature_cols.remove('index_return')

periods = [5, 10, 20]  # 短线操作中常用的均线参数
for ft in feature_cols:
    for period in periods:
        data[f'{ft}_{period}_mean'] = data[ft].rolling(period).mean()  # 每个feature的移动平均
        data[f'{ft}_{period}_bias'] = data[ft] / data[f'{ft}_{period}_mean'] - 1  # 每个feature的乖离度
        try:
            data[f'{ft}_{period}_vol'] = np.sqrt(data[ft].rolling(period).var())  # 每个feature的波动率
        except TypeError:
            print(data[ft].rolling(period))
            break
    for c in combinations(periods, 2):
        data[f'{ft}_{c[0]}_{c[1]}'] = data[ft].rolling(c[0]).mean() / data[ft].rolling(c[1]).mean()  # 每个feature的移动平均之比
data

Unnamed: 0,day,feature1,feature2,feature3,index_return,feature1_feature2_sum,feature1_feature2_difference,feature1_feature2_product,feature1_feature2_quotient,feature1_feature3_sum,...,feature2_feature3_quotient_5_vol,feature2_feature3_quotient_10_mean,feature2_feature3_quotient_10_bias,feature2_feature3_quotient_10_vol,feature2_feature3_quotient_20_mean,feature2_feature3_quotient_20_bias,feature2_feature3_quotient_20_vol,feature2_feature3_quotient_5_10,feature2_feature3_quotient_5_20,feature2_feature3_quotient_10_20
0,1,9,59,72,,68,-50,19,0.152542,81,...,,,,,,,,,,
1,2,5,17,64,0.0010,22,-12,85,0.294118,69,...,,,,,,,,,,
2,3,20,38,69,0.0002,58,-18,-8,0.526316,89,...,,,,,,,,,,
3,4,14,15,66,-0.0029,29,-1,-46,0.933333,80,...,,,,,,,,,,
4,5,14,18,61,-0.0115,32,-4,-4,0.777778,75,...,0.251271,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,701,3,14,20,-0.0140,17,-11,42,0.214286,23,...,0.144241,0.516044,0.356474,0.132196,0.525114,0.333044,0.165271,1.148872,1.129028,0.982727
701,702,18,21,46,0.0092,39,-3,122,0.857143,64,...,0.147655,0.510716,-0.106114,0.133543,0.535120,-0.146879,0.153807,1.154709,1.102049,0.954395
702,703,2,2,42,-0.0010,4,0,4,1.000000,44,...,0.287898,0.473478,-0.899427,0.198008,0.513890,-0.907336,0.188367,1.063893,0.980229,0.921361
703,704,2,5,41,-0.0049,7,-3,10,0.400000,43,...,0.333390,0.449958,-0.728972,0.225431,0.503320,-0.757707,0.204288,0.940882,0.841130,0.893980


In [8]:
data = reduce_mem_usage(data, verbose=1)
data

Memory usage of dataframe is 1.00 MB
Memory usage after optimization is: 0.51 MB
Decreased by 49.39%


Unnamed: 0,day,feature1,feature2,feature3,index_return,feature1_feature2_sum,feature1_feature2_difference,feature1_feature2_product,feature1_feature2_quotient,feature1_feature3_sum,...,feature2_feature3_quotient_5_vol,feature2_feature3_quotient_10_mean,feature2_feature3_quotient_10_bias,feature2_feature3_quotient_10_vol,feature2_feature3_quotient_20_mean,feature2_feature3_quotient_20_bias,feature2_feature3_quotient_20_vol,feature2_feature3_quotient_5_10,feature2_feature3_quotient_5_20,feature2_feature3_quotient_10_20
0,1,9,59,72,,68,-50,19,0.152542,81,...,,,,,,,,,,
1,2,5,17,64,0.0010,22,-12,85,0.294118,69,...,,,,,,,,,,
2,3,20,38,69,0.0002,58,-18,-8,0.526316,89,...,,,,,,,,,,
3,4,14,15,66,-0.0029,29,-1,-46,0.933333,80,...,,,,,,,,,,
4,5,14,18,61,-0.0115,32,-4,-4,0.777778,75,...,0.251271,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,701,3,14,20,-0.0140,17,-11,42,0.214286,23,...,0.144241,0.516044,0.356474,0.132196,0.525114,0.333044,0.165271,1.148872,1.129027,0.982727
701,702,18,21,46,0.0092,39,-3,122,0.857143,64,...,0.147655,0.510716,-0.106114,0.133543,0.535120,-0.146879,0.153807,1.154709,1.102049,0.954395
702,703,2,2,42,-0.0010,4,0,4,1.000000,44,...,0.287898,0.473478,-0.899427,0.198008,0.513889,-0.907336,0.188367,1.063893,0.980229,0.921361
703,704,2,5,41,-0.0049,7,-3,10,0.400000,43,...,0.333390,0.449958,-0.728972,0.225431,0.503320,-0.757707,0.204288,0.940882,0.841130,0.893980


## 缩尾（4%）并标准化

In [9]:
from sklearn.preprocessing import StandardScaler

feature_cols = list(data.columns)
feature_cols.remove('day')
feature_cols.remove('index_return')

scaler = StandardScaler()
data[feature_cols + ['index_return']] = data[feature_cols + ['index_return']].clip(lower=data[feature_cols + ['index_return']].quantile(0.04), upper=data[feature_cols + ['index_return']].quantile(0.96), axis=1)
data[feature_cols + ['index_return']] = pd.DataFrame(scaler.fit_transform(data[feature_cols + ['index_return']]), columns=[feature_cols + ['index_return']])
data

Unnamed: 0,day,feature1,feature2,feature3,index_return,feature1_feature2_sum,feature1_feature2_difference,feature1_feature2_product,feature1_feature2_quotient,feature1_feature3_sum,...,feature2_feature3_quotient_5_vol,feature2_feature3_quotient_10_mean,feature2_feature3_quotient_10_bias,feature2_feature3_quotient_10_vol,feature2_feature3_quotient_20_mean,feature2_feature3_quotient_20_bias,feature2_feature3_quotient_20_vol,feature2_feature3_quotient_5_10,feature2_feature3_quotient_5_20,feature2_feature3_quotient_10_20
0,1,-0.466642,1.317203,1.357251,,0.942208,-1.920112,0.181151,-0.650135,0.527213,...,,,,,,,,,,
1,2,-0.615909,-0.452524,0.865544,0.117669,0.008084,-0.358175,1.200372,-0.525788,0.040429,...,,,,,,,,,,
2,3,-0.056160,0.432339,1.172861,0.038288,0.739137,-0.748659,-0.235803,-0.321847,0.851735,...,,,,,,,,,,
3,4,-0.280059,-0.536797,0.988470,-0.269313,0.150233,0.357713,-0.822627,0.035639,0.486647,...,,,,,,,,,,
4,5,-0.280059,-0.410388,0.681154,-1.122658,0.211154,0.162471,-0.174032,-0.100986,0.283821,...,-0.053922,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,701,-0.690542,-0.578933,-1.838845,-1.370723,-0.093452,-0.293094,0.536334,-0.595905,-1.622747,...,-0.339185,-0.239254,0.494242,-0.476520,-0.237888,0.289437,-0.397931,0.432678,0.141660,-0.059595
701,702,-0.130793,-0.283979,-0.240797,0.931324,0.353303,0.227552,1.617326,-0.031279,-0.162397,...,-0.330087,-0.245885,-0.236152,-0.473627,-0.225353,-0.264600,-0.409356,0.450037,0.097192,-0.132994
702,703,-0.727858,-1.084569,-0.486650,-0.080783,-0.357443,0.422794,-0.050490,0.094193,-0.973702,...,0.043699,-0.292229,-1.455313,-0.335132,-0.251949,-1.142495,-0.374912,0.179975,-0.103598,-0.218575
703,704,-0.727858,-0.958160,-0.548114,-0.467765,-0.296522,0.227552,0.042166,-0.432791,-1.014268,...,0.164947,-0.321500,-1.219603,-0.276219,-0.265190,-0.969758,-0.359044,-0.185826,-0.332869,-0.289509


# 使用LightGBM进行表格数据预测并使用Shapley解释特征重要性

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = data[feature_cols]
y = data['index_return']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train)

# 使用LightGBM，用optuna调参
def objective(trial):
    ...

    # 2. Suggest values of the hyperparameters using a trial object.
    param = {
        'objective': 'regression',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'feature_pre_filter': False
    }

    gbm = lgb.train(param, train_data)
    preds = gbm.predict(X_test)
    loss = mean_squared_error(y_test, preds)
    return loss


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-12-14 01:10:20,250] A new study created in memory with name: no-name-878913a4-7c81-4c45-ac1f-378a2d7c84ef
[I 2023-12-14 01:10:20,362] Trial 0 finished with value: 0.7198320394813236 and parameters: {'lambda_l1': 7.867547422534093e-08, 'lambda_l2': 0.0002959490414679992, 'num_leaves': 84, 'feature_fraction': 0.574679673890646, 'bagging_fraction': 0.9341937446121666, 'bagging_freq': 4, 'min_child_samples': 31}. Best is trial 0 with value: 0.7198320394813236.
[I 2023-12-14 01:10:20,431] Trial 1 finished with value: 0.8236554583449395 and parameters: {'lambda_l1': 0.008771999550090979, 'lambda_l2': 1.0249355784384713, 'num_leaves': 44, 'feature_fraction': 0.8811455071979257, 'bagging_fraction': 0.522369982880337, 'bagging_freq': 5, 'min_child_samples': 34}. Best is trial 0 with value: 0.7198320394813236.
[I 2023-12-14 01:10:20,466] Trial 2 finished with value: 0.6772608322737856 and parameters: {'lambda_l1': 6.9837113297283056, 'lambda_l2': 6.701226626295953e-06, 'num_leaves': 85, 

Number of finished trials: 100
Best trial:
  Value: 0.647437194658588
  Params: 
    lambda_l1: 8.159068587025143
    lambda_l2: 0.002023672178083324
    num_leaves: 6
    feature_fraction: 0.5535793492575833
    bagging_fraction: 0.4367578543726366
    bagging_freq: 6
    min_child_samples: 17


In [12]:
from sklearn.metrics import r2_score
import shap

bst_params = {
        'objective': 'regression',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': 0.6311636859918583,
        'lambda_l2': 2.9691410198069135e-08,
        'num_leaves': 2,
        'feature_fraction': 0.8717773875737789,
        'bagging_fraction': 0.9678736282682716,
        'bagging_freq': 6,
        'min_child_samples': 38,
        'feature_pre_filter': False
    }
gbm = lgb.train(bst_params, train_data)
gbm_explainer = shap.TreeExplainer(gbm)
preds = gbm.predict(X_test)
r2 = r2_score(y_test, preds)
gbm_shap = gbm_explainer.shap_values(X)
print(r2)

0.17369407202672293


In [19]:
gbm_shap = pd.DataFrame(gbm_shap)
gbm_shap.columns = X.columns
gbm_shap

Unnamed: 0,feature1,feature2,feature3,feature1_feature2_sum,feature1_feature2_difference,feature1_feature2_product,feature1_feature2_quotient,feature1_feature3_sum,feature1_feature3_difference,feature1_feature3_product,...,feature2_feature3_quotient_5_vol,feature2_feature3_quotient_10_mean,feature2_feature3_quotient_10_bias,feature2_feature3_quotient_10_vol,feature2_feature3_quotient_20_mean,feature2_feature3_quotient_20_bias,feature2_feature3_quotient_20_vol,feature2_feature3_quotient_5_10,feature2_feature3_quotient_5_20,feature2_feature3_quotient_10_20
0,0.0,0.0,0.065891,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,-0.010586,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.065891,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,-0.010586,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,-0.010586,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,0.0,0.0,-0.010586,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
701,0.0,0.0,-0.010586,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
702,0.0,0.0,-0.010586,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
703,0.0,0.0,-0.010586,0.0,0.0,0.0,0.0,0.0,0.0,-0.001996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
feature_importance = gbm_shap.mean(axis=0)
feature_importance

feature1                              0.000000
feature2                              0.000000
feature3                             -0.000064
feature1_feature2_sum                 0.000000
feature1_feature2_difference          0.000000
                                        ...   
feature2_feature3_quotient_20_bias    0.000000
feature2_feature3_quotient_20_vol     0.000000
feature2_feature3_quotient_5_10       0.000000
feature2_feature3_quotient_5_20       0.000000
feature2_feature3_quotient_10_20      0.000000
Length: 195, dtype: float64

In [26]:
feature_importance[feature_importance != 0].sort_values()

feature2_5_bias                       -0.004964
feature1_feature2_quotient_10_vol     -0.000704
feature2_feature3_product_5_mean      -0.000383
feature2_feature3_sum_5_bias          -0.000303
feature1_feature3_product_5_20        -0.000243
feature2_feature3_difference_5_bias   -0.000235
feature3_5_20                         -0.000154
feature1_feature2_sum_5_20            -0.000121
feature1_feature3_product             -0.000092
feature3_5_10                         -0.000066
feature3                              -0.000064
feature2_feature3_product_10_bias     -0.000040
feature1_feature2_product_20_bias     -0.000033
feature1_feature2_product_5_bias      -0.000021
feature1_feature2_sum_5_bias          -0.000004
feature1_feature3_product_10_bias      0.000014
feature1_feature3_sum_5_bias           0.000026
feature1_feature2_product_5_20         0.000032
feature2_5_vol                         0.000038
feature2_feature3_product_5_10         0.000066
feature2_feature3_difference_5_20      0