In [1]:
import lightgbm as lgb
import SharedArray as sa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# x_raw = sa.attach("X_905P_22")
x_name = "X_968P_22"
# x_raw = np.load(f"/home/public2/share/Features/{x_name}.npy")
x_raw = sa.attach(x_name)
x_app = np.load("/home/ray/workspace/trading-opt/src/thres8.npy")

In [3]:
raw_shape = x_raw.shape
app_shape = x_app.shape
n_raw_feat = raw_shape[-1]
n_new_feat = app_shape[-1]
print(f"raw_shape: {raw_shape}")
print(f"app_shape: {app_shape}")
print(f"n_raw_feat: {n_raw_feat}")
print(f"n_new_feat: {n_new_feat}")

raw_shape: (971, 16, 6000, 968)
app_shape: (11664, 6000, 175)
n_raw_feat: 968
n_new_feat: 175


In [4]:
x_raw = x_raw.reshape(-1, 16, 6000, n_raw_feat)
x_app = x_app.reshape(-1, 16, 6000, n_new_feat)

In [5]:
num_days = x_app.shape[0]
x_raw = x_raw[:num_days, ...]

In [6]:
x = np.concatenate((x_raw, x_app), axis=-1)

In [7]:
# tmp = sa.create("X968_with_thres8", x.shape, dtype=np.float32)
# tmp[...] = x

In [8]:
x = x.reshape(-1, x.shape[-1])

In [9]:
x_new_shape = x.shape
print(f"x_new_shape: {x_new_shape}")

x_new_shape: (69984000, 1143)


In [10]:
# y1d = sa.attach('yd').reshape(-1,1)
y1d = np.load("/home/public2/share/Ret/RET1D2022.npy").reshape(-1, 1)
y1d = y1d[: x.shape[0]]

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y1d, test_size=0.2, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

In [12]:
params = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "metric": "rmse",
    "num_leaves": 31,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "lambda_l1": 0,
    "lambda_l2": 1,
    "learning_rate": 0.1,
    "n_estimators": 100,
    "early_stopping_rounds": 10,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
    "verbosity": -1,
    "num_threads": 60,
}

raw_gbm = lgb.train(
    params,
    lgb.Dataset(x_train, y_train),
    valid_sets=[lgb.Dataset(x_val, y_val)],
    early_stopping_rounds=100,
)



[1]	valid_0's rmse: 0.0181747
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 0.0181661
[3]	valid_0's rmse: 0.0181591
[4]	valid_0's rmse: 0.0181532
[5]	valid_0's rmse: 0.0181474
[6]	valid_0's rmse: 0.0181424
[7]	valid_0's rmse: 0.0181379
[8]	valid_0's rmse: 0.018134
[9]	valid_0's rmse: 0.0181302
[10]	valid_0's rmse: 0.0181272
[11]	valid_0's rmse: 0.0181237
[12]	valid_0's rmse: 0.0181205
[13]	valid_0's rmse: 0.018118
[14]	valid_0's rmse: 0.0181149
[15]	valid_0's rmse: 0.0181126
[16]	valid_0's rmse: 0.0181106
[17]	valid_0's rmse: 0.0181084
[18]	valid_0's rmse: 0.0181065
[19]	valid_0's rmse: 0.0181043
[20]	valid_0's rmse: 0.0181022
[21]	valid_0's rmse: 0.0181007
[22]	valid_0's rmse: 0.0180991
[23]	valid_0's rmse: 0.0180975
[24]	valid_0's rmse: 0.0180955
[25]	valid_0's rmse: 0.0180944
[26]	valid_0's rmse: 0.0180927
[27]	valid_0's rmse: 0.0180911
[28]	valid_0's rmse: 0.01809
[29]	valid_0's rmse: 0.0180888
[30]	valid_0's rmse: 0.0180873
[31]	valid_0's rmse: 0

In [13]:
y_pred = raw_gbm.predict(x_test).reshape(-1, 1)
nan_mask = np.isnan(y_test)
y_pred, y_test = y_pred[~nan_mask], y_test[~nan_mask]
ic = np.corrcoef(y_pred, y_test)[0, 1]

In [14]:
print("ic", ic)

ic 0.1515701084973196


In [15]:
feature_importance = raw_gbm.feature_importance()
sorted_idx = np.argsort(feature_importance)[::-1][:n_raw_feat]
from_first = sum(sorted_idx < n_raw_feat)
from_second = n_raw_feat - from_first
print(f"Number of features from the first set: {from_first}")
print(f"Number of features from the second set: {from_second}")

Number of features from the first set: 806
Number of features from the second set: 162


In [16]:
feature_names = list(range(x_raw.shape[-1]))
feature_names += ["new" + str(i) for i in list(range(x_app.shape[-1]))]
selected_features = [feature_names[i] for i in sorted_idx]
with open("selected_features.txt", "w") as f:
    for feature in selected_features:
        f.write(f"{feature}\n")

In [20]:
# sa.delete('X_905P_22_slender')
x_slender = x[:, sorted_idx]
try:
    tmp = sa.create(f"{x_name}_s", shape=x_slender.shape, dtype=np.float32)
except:
    tmp = sa.attach(f"{x_name}_s")
tmp[:] = x_slender

In [18]:
tmp.shape

(69984000, 994)

In [None]:
df = pd.DataFrame(x_slender, columns=selected_features)

In [None]:
df.filter(regex="new")

Unnamed: 0,new58,new43,new98,new85,new89,new133,new21,new170,new27,new46,...,new49,new48,new47,new45,new41,new40,new39,new38,new36,new34
0,-0.014200,-0.000606,0.008816,0.030771,0.238857,0.000000,0.016339,-0.002888,-0.020941,-2.009041e-03,...,0.000000,0.028612,-0.006253,-0.001242,-0.010156,-0.018599,0.000000,0.000000,-0.105486,0.014903
1,-0.001290,0.002522,0.029180,0.002525,0.263869,0.000000,-0.000068,0.024026,-0.007409,-2.063192e-03,...,0.000000,0.037989,-0.006337,-0.008568,-0.004264,0.019492,0.000000,0.000000,0.075008,0.012003
2,0.001319,-0.002248,0.042531,-0.010423,0.017877,0.000000,0.004034,0.002500,-0.017577,-1.166928e-03,...,0.000000,0.037750,-0.004947,-0.000652,0.012644,-0.017890,0.000000,0.000000,-0.106578,0.002509
3,-0.007484,-0.005031,0.007624,0.015401,0.250299,0.000000,0.002245,0.023575,-0.003406,-1.798611e-03,...,0.000000,0.033266,-0.005927,0.013306,0.009857,0.020203,0.000000,0.000000,0.062251,0.007801
4,-0.026710,-0.011265,0.004868,-0.004803,0.249901,0.000000,-0.041427,0.016519,0.002608,-1.403016e-03,...,0.000000,0.037820,-0.005313,-0.008761,-0.044794,-0.017701,0.000000,0.000000,-0.008598,0.005069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69983995,0.000903,0.002789,0.023102,0.013770,-0.084291,0.004876,0.010352,0.000325,-0.010204,-1.168908e-10,...,0.008405,0.040385,0.013181,0.005293,-0.009468,0.022431,-0.009982,-0.007902,0.005293,0.011841
69983996,0.000903,0.002789,0.023102,0.013770,-0.084291,0.004876,0.010352,0.000325,-0.010204,-1.168908e-10,...,0.008405,0.040385,0.013181,0.005293,-0.009468,0.022431,-0.009982,-0.007902,0.005293,0.011841
69983997,0.000903,0.002789,0.023102,0.013770,-0.084291,0.004876,0.010352,0.000325,-0.010204,-1.168908e-10,...,0.008405,0.040385,0.013181,0.005293,-0.009468,0.022431,-0.009982,-0.007902,0.005293,0.011841
69983998,0.000903,0.002789,0.023102,0.013770,-0.084291,0.004876,0.010352,0.000325,-0.010204,-1.168908e-10,...,0.008405,0.040385,0.013181,0.005293,-0.009468,0.022431,-0.009982,-0.007902,0.005293,0.011841
