In [11]:
import tushare as ts
import pandas as pd
import numpy as np
import os
import time
import lightgbm as lgb
import pickle
from datetime import date, timedelta
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn import metrics
import akshare as ak
import warnings
from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage
from pypfopt.efficient_frontier import EfficientFrontier
def log_diff(df):
    """Return the row-over-row difference of the natural log of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame ordered oldest row first.

    Returns
    -------
    pandas.DataFrame
        ``log(df[t]) - log(df[t-1])`` for every row except the first, which
        has no predecessor and is dropped. The original index labels of the
        remaining rows are kept.

    Notes
    -----
    Exact zeros are replaced by 0.01 before taking the logarithm so the result
    stays finite. The replacement is done on a copy, so the caller's frame is
    left untouched.
    """
    # replace() without inplace returns a new frame instead of mutating the argument.
    safe = df.replace(0, 0.01)
    logged = np.log(safe)
    # Drop the first row by position so the function works with any index,
    # not only a 0-based integer one.
    return (logged - logged.shift(1)).iloc[1:, :]

def minicut(df, train_period=5, pred_period=1):
    '''
    Build the single feature row used for inference from the most recent data.

    The row is the latest observation followed by the ``train_period - 1``
    observations before it, concatenated column-wise (newest first).

    train_period: number of most recent rows packed into the feature row
                  (default: the last 5 trading days).
    pred_period: how many days ahead the model predicts (default: next day).
                 Kept for interface compatibility; the inference window always
                 ends on the latest row, so it does not change which rows are
                 used. For the default value 1 the output is unchanged from
                 the previous implementation.

    Returns a one-row DataFrame, or an empty DataFrame when ``df`` has fewer
    than ``train_period`` complete rows.
    '''
    if train_period < 1:
        raise ValueError('train_period must be at least 1')
    # Lags 1 .. train_period-1; the unshifted frame supplies lag 0.
    lagged = [df.shift(lag) for lag in range(1, train_period)]
    window = pd.concat([df] + lagged, axis=1).dropna()
    return window.iloc[-1:, :]

In [12]:
# Score every HS300 constituent with the base model plus the residual ("mini") model.
hs300 = pd.read_csv('../data/index/hs300.csv')

# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
# The context managers close the file handles once the models are loaded.
with open('model/{}.pickle'.format('hs300'), 'rb') as model_file:
    model = pickle.load(model_file).booster_
with open('model/{}.pickle'.format('minihs300'), 'rb') as model_file:
    minimodel = pickle.load(model_file).booster_

# p maps predicted value -> stock code; later cells sort its keys to rank stocks.
# NOTE(review): two stocks with exactly the same prediction overwrite each other.
p = {}
for code in tqdm(hs300['成分券代码']):
    # Left-pad to six digits (leading zeros are lost if the CSV column is parsed as integers).
    code = str(code).zfill(6)
    single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230701", end_date='20301231', adjust="hfq")
    X = minicut(log_diff(single_df.drop(columns=['日期','涨跌幅','涨跌额','振幅'])), train_period=5, pred_period=1)
    p[model.predict(X)[0]+minimodel.predict(X)[0]] = code
# Top ten as (code, predicted simple return in percent).
[(p[key],(np.exp(key)-1)*100) for key in sorted(p, reverse=True)[:10]]


100%|██████████| 300/300 [04:12<00:00,  1.19it/s]


[('603486', 2.459647238933149),
 ('300751', 0.5981188609459176),
 ('300957', 0.5545719660316362),
 ('688036', 0.44325114018366474),
 ('300979', 0.375458679615015),
 ('002271', 0.36151002154261036),
 ('600809', 0.33992022076900774),
 ('002920', 0.31543498956105864),
 ('002230', 0.3107539264995074),
 ('601166', 0.2852927609463096)]

In [13]:
# Build a max-Sharpe portfolio from the ten stocks with the highest prediction.
top_keys = sorted(p, reverse=True)[:10]
top_10 = [p[key] for key in top_keys]

# Collect one date-indexed closing-price series per stock and concatenate once.
# Indexing by date keeps the rows aligned even when a stock has missing trading
# days; aligning on each frame's own row numbers would silently mix dates.
closes = []
for code in tqdm(top_10):
    single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230701", end_date='20301231', adjust="hfq")
    closes.append(single_df.set_index('日期')['收盘'].rename(code))
# Keep the ten most recent dates across all selected stocks.
portfolio = pd.concat(closes, axis=1).sort_index().iloc[-10:, :]

# Compound the predicted log-return into the expected return passed to the optimiser.
# NOTE(review): the exponent is 25.2; a trading year is usually taken as 252 days.
# Confirm whether the smaller value is a deliberate damping or a typo.
mu = np.exp(top_keys)**(25.2) - 1
S = CovarianceShrinkage(portfolio).ledoit_wolf()

ef = EfficientFrontier(mu, S)
weights = ef.max_sharpe()

cleaned_weights = ef.clean_weights()
print(dict(cleaned_weights))

ef.portfolio_performance(verbose=True)

100%|██████████| 10/10 [00:09<00:00,  1.10it/s]

{'603486': 0.6346, '300751': 0.0, '300957': 0.01965, '688036': 0.01188, '300979': 0.0, '002271': 0.0, '600809': 0.11431, '002920': 0.07358, '002230': 0.08026, '601166': 0.06563}
Expected annual return: 56.8%
Annual volatility: 34.2%
Sharpe Ratio: 1.60





(0.5681034452922598, 0.3424487269926508, 1.6005416346722834)

In [14]:
# Show the cleaned portfolio weights as a plain {ticker: weight} mapping.
{ticker: weight for ticker, weight in cleaned_weights.items()}

{'603486': 0.6346,
 '300751': 0.0,
 '300957': 0.01965,
 '688036': 0.01188,
 '300979': 0.0,
 '002271': 0.0,
 '600809': 0.11431,
 '002920': 0.07358,
 '002230': 0.08026,
 '601166': 0.06563}

In [15]:
# Score every ZZ1000 constituent with the base model plus the residual ("mini") model.
zz1000 = pd.read_csv('../data/index/zz1000.csv')

# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
with open('model/{}.pickle'.format('zz1000'), 'rb') as model_file:
    model = pickle.load(model_file).booster_
with open('model/{}.pickle'.format('minizz1000'), 'rb') as model_file:
    minimodel = pickle.load(model_file).booster_

# p maps predicted value -> stock code; later cells sort its keys to rank stocks.
p = {}
# Stocks that could not be scored, with the reason, so failures stay visible.
failed_codes = {}
for code in tqdm(zz1000['成分券代码']):
    # Left-pad to six digits (leading zeros are lost if the CSV column is parsed as integers).
    code = str(code).zfill(6)
    try:
        single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230101", end_date='20301231', adjust="hfq")
        X = minicut(log_diff(single_df.drop(columns=['日期','涨跌幅','涨跌额','振幅'])), train_period=5, pred_period=1)
        p[model.predict(X)[0]+minimodel.predict(X)[0]] = code
    except Exception as exc:
        # Best effort: skip this stock but remember why. Catching Exception
        # (not a bare except) lets KeyboardInterrupt stop the loop.
        failed_codes[code] = repr(exc)
if failed_codes:
    warnings.warn('{} of {} stocks were skipped: {}'.format(len(failed_codes), len(zz1000), sorted(failed_codes)))
# Ten lowest predictions as (code, predicted simple return in percent).
[(p[key],(np.exp(key)-1)*100) for key in sorted(p, reverse=False)[:10]]

 24%|██▍       | 244/1000 [03:26<10:50,  1.16it/s]

In [None]:
# single_df = ak.stock_zh_a_hist(symbol='002943', period="daily", start_date="20230101", end_date='20301231', adjust="hfq")
# X = minicut(log_diff(single_df.drop(columns=['日期','涨跌幅','涨跌额','振幅'])), train_period=5, pred_period=1)
# model.predict(X)[0]+minimodel.predict(X)[0]

In [None]:
# Ten highest predictions as (stock code, predicted simple return in percent).
[(p[log_ret], 100 * (np.exp(log_ret) - 1)) for log_ret in sorted(p, reverse=True)[:10]]

[('000737', 4.36519783301208),
 ('000982', 3.070667107703362),
 ('600740', 2.1625703131062046),
 ('300738', 2.0777417604566306),
 ('002023', 2.071304583614597),
 ('002085', 1.9919182860365758),
 ('000758', 1.766003508992009),
 ('300527', 1.722822737643015),
 ('002378', 1.699421522899991),
 ('300666', 1.6018612208760885)]

In [None]:
# Build a max-Sharpe portfolio from the ten stocks with the highest prediction.
top_keys = sorted(p, reverse=True)[:10]
top_10 = [p[key] for key in top_keys]

# Collect one date-indexed closing-price series per stock and concatenate once.
# Indexing by date keeps the rows aligned even when a stock has missing trading
# days; aligning on each frame's own row numbers would silently mix dates.
closes = []
for code in tqdm(top_10):
    single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230701", end_date='20301231', adjust="hfq")
    closes.append(single_df.set_index('日期')['收盘'].rename(code))
# Keep the ten most recent dates across all selected stocks.
portfolio = pd.concat(closes, axis=1).sort_index().iloc[-10:, :]

# Compound the predicted log-return into the expected return passed to the optimiser.
# NOTE(review): the exponent is 25.2; a trading year is usually taken as 252 days.
# Confirm whether the smaller value is a deliberate damping or a typo.
mu = np.exp(top_keys)**(25.2) - 1
S = CovarianceShrinkage(portfolio).ledoit_wolf()

ef = EfficientFrontier(mu, S)
weights = ef.max_sharpe()

cleaned_weights = ef.clean_weights()
print(dict(cleaned_weights))

ef.portfolio_performance(verbose=True)

100%|██████████| 10/10 [00:08<00:00,  1.21it/s]

{'000737': 0.30179, '000982': 0.27649, '600740': 0.18742, '300738': 0.0, '002023': 0.0, '002085': 0.0, '000758': 0.0, '300527': 0.19806, '002378': 0.0, '300666': 0.03624}
Expected annual return: 115.8%
Annual volatility: 56.8%
Sharpe Ratio: 2.00





(1.1582822735332248, 0.5682952043731014, 2.0029770879183966)

In [None]:
# Show the cleaned portfolio weights as a plain {ticker: weight} mapping.
{ticker: weight for ticker, weight in cleaned_weights.items()}

{'000737': 0.30179,
 '000982': 0.27649,
 '600740': 0.18742,
 '300738': 0.0,
 '002023': 0.0,
 '002085': 0.0,
 '000758': 0.0,
 '300527': 0.19806,
 '002378': 0.0,
 '300666': 0.03624}

# Fine Tune

In [None]:
# Fit the base regressor on a random 80/20 split of the prepared feature table.
X = pd.read_csv('../data/X.csv', index_col=0)
# NOTE(review): `y` is not defined in this cell; it must already exist in the
# kernel (targets aligned row-for-row with X) or this line raises NameError.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = lgb.LGBMRegressor(learning_rate=0.01, metric='l1', n_estimators=10000, num_leaves=2048, min_data_in_leaf=128, bagging_fraction=0.8, feature_fraction=0.05, lambda_l1=1, lambda_l2=1, verbose=0 ,num_iterations=10000, early_stopping_round=100)
# Early stopping is already configured through `early_stopping_round` in the
# constructor. The `early_stopping_rounds` argument of fit() was removed in
# LightGBM 4.0, so it is not passed here.
model.fit(X_train,y_train,eval_metric='l2',eval_set=[(X_test,y_test)])
model.best_score_

FileNotFoundError: [Errno 2] No such file or directory: '../data/X.csv'

In [None]:
def LGB_bayesian(
    num_leaves, # int
    min_data_in_leaf, # int
    learning_rate,
    min_sum_hessian_in_leaf, # int
    feature_fraction,
    lambda_l1,
    lambda_l2,
    min_gain_to_split,
    max_depth):
    """Objective for Bayesian optimisation: negative validation MAE of a LightGBM model.

    Every argument is a LightGBM hyper-parameter of the same name, proposed by
    the optimiser as a float. ``num_leaves``, ``min_data_in_leaf`` and
    ``max_depth`` are truncated to integers before use.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Trains on the training split with early stopping on the validation split,
    then returns minus the mean absolute error on the validation split, so
    that maximising the return value minimises the error.
    """
    # The optimiser proposes floats; LightGBM needs these three as integers.
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    param = {
        'num_leaves': num_leaves,
        'max_bin': 256,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': learning_rate,
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'min_gain_to_split': min_gain_to_split,
        'max_depth': max_depth,
        'save_binary': True,
        # The targets are continuous and the score below is a regression
        # error, so train a regressor rather than a binary classifier.
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'verbose': -1,
        'metric': 'mse',
        'boost_from_average': False,
    }

    xg_train = lgb.Dataset(X_train, label=y_train)
    xg_valid = lgb.Dataset(X_test, label=y_test)
    num_round = 10000
    clf = lgb.train(param, xg_train, num_round, valid_sets = [xg_valid], callbacks=[lgb.callback.early_stopping(100),lgb.callback.log_evaluation(0)])
    # Score on the held-out split; scoring on the training rows would reward
    # hyper-parameters that merely memorise the training data.
    predictions = clf.predict(X_test, num_iteration=clf.best_iteration)
    score = metrics.mean_absolute_error(y_test, predictions)
    return -score

In [None]:
# Search space for the Bayesian optimiser: (lower, upper) bound per hyper-parameter.
bounds_LGB = dict(
    num_leaves=(16, 4096),
    min_data_in_leaf=(10, 200),
    learning_rate=(0.01, 0.5),
    min_sum_hessian_in_leaf=(0.00001, 0.01),
    feature_fraction=(0.05, 0.8),
    lambda_l1=(0, 1.0),
    lambda_l2=(0, 1.0),
    min_gain_to_split=(0, 1.0),
    max_depth=(2, 64),
)



In [None]:
from bayes_opt import BayesianOptimization

# Optimiser that maximises LGB_bayesian inside bounds_LGB, seeded with random_state=13.
LGB_BO = BayesianOptimization(
    f=LGB_bayesian,
    pbounds=bounds_LGB,
    random_state=13,
)

In [None]:
# Number of random evaluations before the Gaussian-process-guided search starts.
init_points = 5

# Number of guided optimisation steps after the random ones.
n_iter = 10000

print('-' * 130)

LGB_BO.set_gp_params(alpha=1e-4, n_restarts_optimizer=2)

# Run the search inside the catch_warnings block: the 'ignore' filter only
# applies while the block is active, so the call has to live inside it.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(init_points=init_points, n_iter=n_iter)

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_da... | min_ga... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------


NameError: name 'X_train' is not defined

In [None]:
# Load and print the object stored in temp.pickle.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# The context manager closes the file handle after reading.
with open('temp.pickle', 'rb') as temp_file:
    temp = pickle.load(temp_file)
print(temp)

{'feature_fraction': 0.05, 'lambda_l1': 1.0, 'lambda_l2': 1.0, 'learning_rate': 0.5, 'max_depth': 2.0, 'min_data_in_leaf': 114.30214038193319, 'min_gain_to_split': 0.0, 'min_sum_hessian_in_leaf': 1e-05, 'num_leaves': 3139.445350499245}


In [None]:
# Persist the current `model` object as the zz1000 model file.
filename = 'model/{}.pickle'.format('zz1000')
# The context manager flushes and closes the file, so the pickle is fully
# written to disk before any later cell reads it back.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Score every ZZ500 constituent with the base model plus the residual ("mini") model.
zz500 = pd.read_csv('../data/index/zz500.csv')

# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
# The context managers close the file handles once the models are loaded.
with open('model/{}.pickle'.format('zz500'), 'rb') as model_file:
    model = pickle.load(model_file)
with open('model/{}.pickle'.format('minizz500'), 'rb') as model_file:
    minimodel = pickle.load(model_file)

# p maps predicted value -> stock code; later cells sort its keys to rank stocks.
# NOTE(review): two stocks with exactly the same prediction overwrite each other.
p = {}
for code in tqdm(zz500['成分券代码']):
    # Left-pad to six digits (leading zeros are lost if the CSV column is parsed as integers).
    code = str(code).zfill(6)
    single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230101", end_date='20301231', adjust="hfq")
    X = minicut(log_diff(single_df.drop(columns=['日期','涨跌幅','涨跌额','振幅'])), train_period=5, pred_period=1)
    p[model.predict(X)[0]+minimodel.predict(X)[0]] = code
# Top ten as (code, predicted simple return in percent).
[(p[key],(np.exp(key)-1)*100) for key in sorted(p, reverse=True)[:10]]

100%|██████████| 500/500 [03:16<00:00,  2.55it/s]


[('601828', 8.658341958138749),
 ('002195', 3.6648852435535284),
 ('300118', 1.780139030878214),
 ('002568', 1.5823300096940107),
 ('600521', 1.0904057877058282),
 ('002373', 0.8838936969239208),
 ('300604', 0.8351706887218313),
 ('002056', 0.6752962225225589),
 ('002518', 0.6568423236717136),
 ('003035', 0.6277091520410449)]

In [None]:
# Score every ZZ800 constituent with the base model only (no residual model here).
zz800 = pd.read_csv('../data/index/zz800.csv')

# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
# The context manager closes the file handle once the model is loaded.
with open('model/{}.pickle'.format('zz800'), 'rb') as model_file:
    model = pickle.load(model_file)

# p maps predicted value -> stock code; later cells sort its keys to rank stocks.
# NOTE(review): two stocks with exactly the same prediction overwrite each other.
p = {}
for code in tqdm(zz800['成分券代码']):
    # Left-pad to six digits (leading zeros are lost if the CSV column is parsed as integers).
    code = str(code).zfill(6)
    single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230101", end_date='20301231', adjust="hfq")
    X = minicut(log_diff(single_df.drop(columns=['日期','涨跌幅','涨跌额','振幅'])), train_period=5, pred_period=1)
    p[model.predict(X)[0]] = code
# Top ten as (code, predicted simple return in percent).
[(p[key],(np.exp(key)-1)*100) for key in sorted(p, reverse=True)[:10]]

100%|██████████| 800/800 [04:40<00:00,  2.85it/s]


[('601828', 6.712831244922413),
 ('002195', 3.8209760118292024),
 ('300118', 2.495748701315481),
 ('002568', 1.881895118872312),
 ('600803', 1.2339374346034937),
 ('600521', 1.1193173372157306),
 ('603833', 1.084197087967076),
 ('000001', 1.061428940422715),
 ('600153', 0.9041229187523925),
 ('000938', 0.8471935243878459)]

In [None]:
def log_diff(df):
    """Return the row-over-row difference of the natural log of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame ordered oldest row first.

    Returns
    -------
    pandas.DataFrame
        ``log(df[t]) - log(df[t-1])`` for every row except the first, which
        has no predecessor and is dropped. The original index labels of the
        remaining rows are kept.

    Notes
    -----
    Exact zeros are replaced by 0.01 before taking the logarithm so the result
    stays finite. The replacement is done on a copy, so the caller's frame is
    left untouched.
    """
    # replace() without inplace returns a new frame instead of mutating the argument.
    safe = df.replace(0, 0.01)
    logged = np.log(safe)
    # Drop the first row by position so the function works with any index,
    # not only a 0-based integer one.
    return (logged - logged.shift(1)).iloc[1:, :]

def cut(df, train_period=5, pred_period=1):
    '''
    Turn a time-ordered frame into supervised (features, target) pairs.

    train_period: number of lagged rows packed into each feature row
                  (default: the previous 5 trading days).
    pred_period: how many rows ahead the target lies from the newest lagged
                 row (default: the next day).

    Returns (train, pred): ``train`` holds the lagged copies of every column,
    ``pred`` is the unshifted '收盘' column of the same rows. Rows with any
    missing lag are dropped.
    '''
    width = df.shape[1]
    last_lag = pred_period + train_period
    lagged = pd.concat([df.shift(lag) for lag in range(pred_period, last_lag)], axis=1)
    stacked = pd.concat([df, lagged], axis=1).dropna()
    # The first `width` columns are the unshifted frame; the rest are the lags.
    features = stacked.iloc[:, width:]
    # The column name repeats once per lag; position 0 is the unshifted one.
    target = stacked.loc[:, '收盘'].iloc[:, 0]
    return features, target

# Train the residual ("mini") model for one index, then rank its constituents.
index_name = 'zz1000'
index = pd.read_csv('../data/index/{}.csv'.format(index_name))

# 1. Build the training set: collect per-stock pieces and concatenate once
#    (growing a DataFrame with concat inside the loop copies it every iteration).
X_parts = []
y_parts = []
for code in tqdm(index['成分券代码']):
    # Left-pad to six digits (leading zeros are lost if the CSV column is parsed as integers).
    code = str(code).zfill(6)
    single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230101", end_date='20301231', adjust="hfq")
    X, y = cut(log_diff(single_df.drop(columns=['日期','涨跌幅','涨跌额','振幅'])), train_period=5, pred_period=1)
    X_parts.append(X)
    y_parts.append(y)
X_train = pd.concat(X_parts)
# Single-column frame labelled 0, matching the layout the code below indexes with iloc[:, 0].
y_train = pd.concat(y_parts).to_frame(name=0)

X_train.to_csv('../data/X_minitrain_{}.csv'.format(index_name),encoding='utf-8-sig',index=False)        # save locally
y_train.to_csv('../data/y_minitrain_{}.csv'.format(index_name),encoding='utf-8-sig',index=False)        # save locally

# 2. Residuals of the base model become the target of the mini model.
# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
with open('model/{}.pickle'.format(index_name), 'rb') as model_file:
    model = pickle.load(model_file).booster_
error = y_train.iloc[:,0].to_numpy() - model.predict(X_train)

X_train, X_test, y_train, y_test = train_test_split(X_train, error, test_size=0.2)
minimodel = lgb.LGBMRegressor(learning_rate=0.01, metric='l1', n_estimators=10000, num_leaves=256, min_data_in_leaf=128, bagging_fraction=0.8, feature_fraction=0.05, lambda_l1=1, lambda_l2=1, verbose=0 ,num_iterations=10000, early_stopping_round=100)

minimodel.fit(X_train.to_numpy(),y_train,eval_metric='l2',eval_set=[(X_test.to_numpy(),y_test)])

# 3. Persist the mini model; the context manager flushes and closes the file.
filename = 'model/{}.pickle'.format('mini'+index_name)
with open(filename, 'wb') as model_file:
    pickle.dump(minimodel, model_file)

# 4. Score every constituent with base + mini model.
# p maps predicted value -> stock code; later cells sort its keys to rank stocks.
p = {}
for code in tqdm(index['成分券代码']):
    code = str(code).zfill(6)
    single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230701", end_date='20301231', adjust="hfq")
    X = minicut(log_diff(single_df.drop(columns=['日期','涨跌幅','涨跌额','振幅'])), train_period=5, pred_period=1)
    p[model.predict(X)[0]+minimodel.predict(X)[0]] = code
print([(p[key],(np.exp(key)-1)*100) for key in sorted(p, reverse=True)[:10]])
print([(p[key],(np.exp(key)-1)*100) for key in sorted(p, reverse=False)[:10]])

100%|██████████| 1000/1000 [13:23<00:00,  1.24it/s]




  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Ten highest predictions as (stock code, predicted simple return in percent).
[(p[log_ret], 100 * (np.exp(log_ret) - 1)) for log_ret in sorted(p, reverse=True)[:10]]


[('600732', 2.26655343104758),
 ('300769', 1.3859937893807794),
 ('300763', 1.272225811148453),
 ('688223', 0.6330727800115676),
 ('601857', 0.6033910311123325),
 ('600050', 0.5193635860434531),
 ('605117', 0.4964181721238914),
 ('002142', 0.48073201981562175),
 ('600061', 0.44783249701136896),
 ('601186', 0.36974304086756504)]

In [None]:
# Ten lowest predictions as (stock code, predicted simple return in percent).
[(p[log_ret], 100 * (np.exp(log_ret) - 1)) for log_ret in sorted(p)[:10]]

[('603363', -5.099436806331914),
 ('000818', -3.643702344626565),
 ('601068', -3.365114333429786),
 ('002467', -2.817589924408226),
 ('300459', -2.508346498000402),
 ('000025', -2.4563694239053913),
 ('002217', -2.4436759584447576),
 ('002212', -2.3082375093583463),
 ('603896', -2.0951838979606507),
 ('000881', -1.9571184961328925)]

In [None]:
# Score every ZZ1000 constituent with the base model plus the residual ("mini") model.
zz1000 = pd.read_csv('../data/index/zz1000.csv')

# NOTE: pickle.load can execute arbitrary code; only load model files you created yourself.
# The context managers close the file handles once the models are loaded.
with open('model/{}.pickle'.format('zz1000'), 'rb') as model_file:
    model = pickle.load(model_file)
with open('model/{}.pickle'.format('minizz1000'), 'rb') as model_file:
    minimodel = pickle.load(model_file)

# p maps predicted value -> stock code; later cells sort its keys to rank stocks.
# NOTE(review): two stocks with exactly the same prediction overwrite each other.
p = {}
for code in tqdm(zz1000['成分券代码']):
    # Left-pad to six digits (leading zeros are lost if the CSV column is parsed as integers).
    code = str(code).zfill(6)
    single_df = ak.stock_zh_a_hist(symbol=code, period="daily", start_date="20230701", end_date='20301231', adjust="hfq")
    X = minicut(log_diff(single_df.drop(columns=['日期','涨跌幅','涨跌额','振幅'])), train_period=5, pred_period=1)
    p[model.predict(X)[0]+minimodel.predict(X)[0]] = code
# Top ten as (code, predicted simple return in percent).
[(p[key],(np.exp(key)-1)*100) for key in sorted(p, reverse=True)[:10]]

100%|██████████| 1000/1000 [04:15<00:00,  3.92it/s]


[('300476', 2.5873593149894125),
 ('605376', 1.5606206901105146),
 ('300502', 1.4886075846494817),
 ('300634', 1.407877051623596),
 ('002310', 1.3058865563513322),
 ('603888', 1.277941192485943),
 ('300394', 1.2359950024037003),
 ('002400', 1.2092929811944986),
 ('002261', 1.1525919937411366),
 ('002036', 1.0862606925834184)]

In [None]:
# Ten lowest predictions as (stock code, predicted simple return in percent).
[(p[log_ret], 100 * (np.exp(log_ret) - 1)) for log_ret in sorted(p)[:10]]

[('002777', -3.59983041172397),
 ('300579', -1.739553532231386),
 ('003012', -1.4069864091808215),
 ('003021', -1.2689713325634289),
 ('603005', -1.1668052549846042),
 ('301308', -1.154989936596873),
 ('300457', -1.03215318015073),
 ('300602', -0.9368449304953952),
 ('600335', -0.9312848801049989),
 ('000811', -0.8944601516572059)]