In [1]:
%matplotlib inline
import datetime as dt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from alphamind.api import *
from PyFin.api import *
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

import os

# Global plotting style for every figure in this notebook.
plt.style.use('fivethirtyeight')

# Database connection for all alphamind data fetches below.
# The connection string is taken from the ALPHA_MIND_DB_URI environment
# variable when it is set, so credentials do not have to live in the notebook.
# NOTE(review): the fallback literal still embeds a database password and is
# kept only so existing runs keep working — move it out of the notebook and
# rotate the password if this file has been shared.
engine = SqlEngine(os.environ.get('ALPHA_MIND_DB_URI',
                                  'postgres+psycopg2://postgres:we083826@localhost/alpha'))

In [2]:
# Universe code and the benchmark id used throughout the notebook.
u_name, benchmark = 'zz500', 905

# The universe is built from the single universe code above.
universe = Universe(u_name, [u_name])

# Factor coverage table fetched through the engine; filtered in the next cell.
factor_coverage = engine.fetch_factor_coverage()

In [3]:
# Row filters on the coverage table: 'uqer' source, the chosen universe,
# and trade dates from 2012-01-01 onwards.
is_uqer = factor_coverage.source == 'uqer'
in_universe = factor_coverage.universe == u_name
since_2012 = factor_coverage.trade_date >= '2012-01-01'

flitered_coverage = factor_coverage[is_uqer & in_universe & since_2012]

# Mean coverage per factor over the filtered rows.
coverage_report = flitered_coverage.groupby(['factor'])['coverage'].mean()

In [4]:
# NOTE(review): this cell repeats the preceding cell's computation verbatim;
# it recomputes the same two variables and could be deleted.
coverage_mask = (
    (factor_coverage.source == 'uqer')
    & (factor_coverage.universe == u_name)
    & (factor_coverage.trade_date >= '2012-01-01')
)
flitered_coverage = factor_coverage[coverage_mask]

# Mean coverage per factor over the filtered rows.
coverage_report = flitered_coverage.groupby(['factor'])['coverage'].mean()

In [5]:
# Names of the factors whose mean coverage is at least 0.98.
alpha_factors = coverage_report[coverage_report >= 0.98].index.tolist()

# Replace the name list with a {name: expression} mapping. Each expression is
# CSRank(DIFF(f) / (ABS(f) + 1e-4)); the 1e-4 keeps the denominator away from
# zero when ABS(f) is zero.
alpha_factors = dict(
    (name, CSRank(DIFF(name) / (ABS(name) + 1e-4)))
    for name in alpha_factors
)

In [6]:
# Number of factor expressions that passed the coverage filter.
len(alpha_factors)

211

In [7]:
# --- data sampling -----------------------------------------------------------
start_date, end_date = '2012-01-01', '2017-10-25'
frequency = '2w'
batch = 8  # passed to fetch_data_package as both `batch` and `warm_start`

# --- portfolio construction --------------------------------------------------
method = 'risk_neutral'
portfolio_risk_neutralize = ['SIZE']
# Risk columns handed to fetch_data_package as `neutralized_risk`.
neutralize_risk = industry_styles + portfolio_risk_neutralize
# Multipliers applied to the benchmark's industry exposure to form the lower
# and upper industry bounds in the back-test loop; both 1. pins them together.
industry_lower = industry_upper = 1.

In [8]:
# Fetch the full data package used by the rest of the notebook. The recorded
# log below shows this took roughly a minute and a half when it was last run.
data_package = fetch_data_package(
    engine,
    # what to load
    alpha_factors=alpha_factors,
    universe=universe,
    benchmark=benchmark,
    # when / how often
    start_date=start_date,
    end_date=end_date,
    frequency=frequency,
    # training window
    batch=batch,
    warm_start=batch,
    # processing applied around risk neutralisation
    neutralized_risk=neutralize_risk,
    pre_process=[winsorize_normal],
    post_process=[winsorize_normal],
)

2017-10-27 18:07:43,652 - ALPHA_MIND - INFO - Starting data package fetching ...
2017-10-27 18:09:02,266 - ALPHA_MIND - INFO - Loading data is finished
2017-10-27 18:09:12,239 - ALPHA_MIND - INFO - Data processing is finished


In [9]:
# Pull the train / predict inputs and targets out of the data package.
# They are indexed by reference date further down (e.g. train_x[ref_date]).
train_x, train_y = data_package['train']['x'], data_package['train']['y']
predict_x, predict_y = data_package['predict']['x'], data_package['predict']['y']

# Feature names stored under 'x_names' in the data package.
features = data_package['x_names']

In [10]:
def plot_model_importance(model, features, top_n=10):
    """Draw a horizontal bar chart of a model's most important features.

    Parameters
    ----------
    model : object
        A fitted estimator exposing ``feature_importances_``. If the object
        has no such attribute but has ``best_estimator_`` (as a fitted
        search wrapper such as ``GridSearchCV`` does), the importances of
        ``best_estimator_`` are used instead.
    features : sequence of str
        Feature names, in the same order as ``feature_importances_``.
    top_n : int, default 10
        How many of the highest-importance features to show. When there are
        fewer than ``top_n`` features, all of them are shown.

    Returns
    -------
    None
        The chart is drawn on a new 12x6 matplotlib figure.

    Notes
    -----
    The selected bars keep the features' original relative order; they are
    not sorted by importance.
    """
    # A search wrapper keeps the fitted model on `best_estimator_`.
    if hasattr(model, 'feature_importances_'):
        estimator = model
    else:
        estimator = getattr(model, 'best_estimator_', model)

    features = np.asarray(features)
    importances = np.asarray(estimator.feature_importances_)

    # Double argsort turns importances into ranks (0 = least important), so
    # the mask below selects the `top_n` highest-ranked features in place.
    ranks = importances.argsort().argsort()
    keep = ranks >= len(features) - top_n

    shown_names = features[keep]
    shown_importances = importances[keep]
    positions = np.arange(len(shown_names))

    plt.figure(figsize=(12, 6))
    plt.barh(positions, shown_importances, align='center')
    plt.yticks(positions, shown_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

## 0. Train Score on a specific date
------------------------------------

In [19]:
# NOTE(review): set_start_method is imported here but not called in any
# visible cell — confirm whether it is still needed.
from multiprocessing import set_start_method

# Reference date: the second-to-last key of train_x.
ref_date = list(train_x.keys())[-2]

# Training and prediction samples for that date, with targets flattened to 1-D.
sample_train_x, sample_train_y = train_x[ref_date], train_y[ref_date].flatten()
sample_test_x, sample_test_y = predict_x[ref_date], predict_y[ref_date].flatten()

In [None]:
# NOTE(review): `param` is defined but never passed to XGBRegressor or
# GridSearchCV below, so neither the objective nor 'gpu_hist' takes effect.
# Confirm whether it was meant to be wired in or can be dropped.
param = dict(objective='reg:linear', tree_method='gpu_hist')

# Hyper-parameter grid: 2 x 2 = 4 candidate settings.
param_grid = dict(max_depth=[3, 10], min_child_weight=[1, 7])

# 3-fold grid search over a 50-tree XGBRegressor, run with 8 parallel jobs.
base_regressor = XGBRegressor(n_estimators=50)
gsearch = GridSearchCV(estimator=base_regressor, param_grid=param_grid, cv=3, n_jobs=8)

# `model` holds whatever gsearch.fit returns (presumably the fitted search
# object itself — confirm against the scikit-learn version in use).
model = gsearch.fit(sample_train_x, sample_train_y)

In [31]:
# Score the fitted grid search on the prediction sample for ref_date.
# NOTE(review): the metric is whatever GridSearchCV.score defaults to for this
# estimator (presumably R^2 for a regressor — confirm); the recorded value
# below is negative.
gsearch.score(sample_test_x, sample_test_y)

-0.24730901084637047

## 1. Train and test accuracy trend (XGBRegressor)
----------

In [32]:
# Back-test switches. `method` repeats the value already set in the config cell.
portfolio_industry_neutralize = True
method = 'risk_neutral'

# One back-test step per training date, in chronological order.
dates = sorted(train_x.keys())
final_res = np.zeros(len(dates))  # one result slot per date, filled by the loop
accuray_table = pd.DataFrame(columns=['train', 'test'])  # per-date train / test scores

# Settlement frame and the arrays derived from it; the loop slices all of
# them with a per-date boolean mask built from settlement.trade_date.
settlement = data_package['settlement']
industry_dummies = pd.get_dummies(settlement['industry'].values)
risk_styles = settlement[portfolio_risk_neutralize].values
total_risks = settlement[neutralize_risk].values

In [33]:
# Walk-forward loop: for every reference date, re-run the grid search on that
# date's training sample, record train/test scores, build the portfolio
# constraints, and store the resulting portfolio figure in final_res[i].
# The recorded log below shows several minutes per date; the run that
# produced it was interrupted (KeyboardInterrupt) after a handful of dates.
for i, ref_date in enumerate(dates):
    # --- training ------------------------------------------------------------
    sample_train_x = train_x[ref_date]
    sample_train_y = train_y[ref_date].flatten()

    # Full grid search is refit from scratch on every date.
    gsearch.fit(sample_train_x, sample_train_y)
    accuray_table.loc[ref_date, 'train'] =  gsearch.score(sample_train_x, sample_train_y)
    alpha_logger.info('trade_date: {0} training finished'.format(ref_date))
    
    # --- prediction and portfolio construction -------------------------------
    
    sample_test_x = predict_x[ref_date]
    sample_test_y = predict_y[ref_date].flatten()
    
    cons = Constraints()
    # Boolean mask selecting this date's rows of the settlement frame.
    index = settlement.trade_date == ref_date
    benchmark_w = settlement[index]['weight'].values
    realized_r = settlement[index]['dx'].values
    industry_names = settlement[index]['industry'].values
    is_tradable = settlement[index]['isOpen'].values

    # 'total' exposure is a column of ones; both bounds equal the benchmark's
    # total weight, so the portfolio's total weight is pinned to it.
    cons.add_exposure(['total'], np.ones((len(is_tradable), 1)))
    cons.set_constraints('total', benchmark_w.sum(), benchmark_w.sum())

    if portfolio_industry_neutralize:
        # One dummy column per industry for this date's rows.
        ind_exp = industry_dummies[index]

        risk_tags = ind_exp.columns
        cons.add_exposure(risk_tags, ind_exp.values)
        # Benchmark weight in each industry.
        benchmark_exp = benchmark_w @ ind_exp.values

        # Bounds are the benchmark exposure scaled by industry_lower / industry_upper.
        for k, name in enumerate(risk_tags):
            cons.set_constraints(name, benchmark_exp[k]*industry_lower, benchmark_exp[k]*industry_upper)

    # Truthiness of the list: the branch runs when at least one style is listed.
    if portfolio_risk_neutralize:
        risk_exp = risk_styles[index]

        risk_tags = np.array(portfolio_risk_neutralize)
        cons.add_exposure(risk_tags, risk_exp)

        # Each listed style exposure is pinned to the benchmark's (lower == upper).
        benchmark_exp = benchmark_w @ risk_exp
        for k, name in enumerate(risk_tags):
            cons.set_constraints(name, benchmark_exp[k], benchmark_exp[k])

    # NOTE(review): risk_table is assigned but not used again inside this loop.
    risk_table = total_risks[index]
    
    # Model predictions for the prediction sample, passed as the first argument below.
    y = gsearch.predict(sample_test_x)
    accuray_table.loc[ref_date, 'test'] = gsearch.score(sample_test_x, sample_test_y)

    # Overrides the fetched isOpen flags: every row is marked tradable.
    is_tradable[:] = True
    weights, analysis = er_portfolio_analysis(y.astype(float),
                                              industry_names,
                                              realized_r,
                                              constraints=cons,
                                              detail_analysis=True,
                                              benchmark=benchmark_w,
                                              is_tradable=is_tradable,
                                              method=method)
    
    # Total 'er' figure from the analysis, scaled by the benchmark's total weight.
    final_res[i] = analysis['er']['total'] / benchmark_w.sum()
    alpha_logger.info('trade_date: {0} predicting finished'.format(ref_date))
    

2017-10-27 19:02:12,689 - ALPHA_MIND - INFO - trade_date: 2012-05-02 00:00:00 training finished
2017-10-27 19:02:12,843 - ALPHA_MIND - INFO - trade_date: 2012-05-02 00:00:00 predicting finished
2017-10-27 19:06:49,629 - ALPHA_MIND - INFO - trade_date: 2012-05-16 00:00:00 training finished
2017-10-27 19:06:49,793 - ALPHA_MIND - INFO - trade_date: 2012-05-16 00:00:00 predicting finished
2017-10-27 19:11:09,707 - ALPHA_MIND - INFO - trade_date: 2012-05-30 00:00:00 training finished
2017-10-27 19:11:09,852 - ALPHA_MIND - INFO - trade_date: 2012-05-30 00:00:00 predicting finished
2017-10-27 19:15:35,107 - ALPHA_MIND - INFO - trade_date: 2012-06-13 00:00:00 training finished
2017-10-27 19:15:35,247 - ALPHA_MIND - INFO - trade_date: 2012-06-13 00:00:00 predicting finished
2017-10-27 19:20:18,175 - ALPHA_MIND - INFO - trade_date: 2012-06-27 00:00:00 training finished
2017-10-27 19:20:18,322 - ALPHA_MIND - INFO - trade_date: 2012-06-27 00:00:00 predicting finished
2017-10-27 19:24:50,547 - ALPH

KeyboardInterrupt: 

In [None]:
# final_res[i] is labelled with the date one step after dates[i]: dates[1:]
# plus one extra `frequency` step past the last date on the 'china.sse'
# calendar (presumably the date the period's result is realised — confirm).
last_date = advanceDateByCalendar('china.sse', dates[-1], frequency)

# Build the series already sorted instead of sorting it in place afterwards,
# so re-running the cell never depends on a previously mutated object.
df = pd.Series(final_res, index=dates[1:] + [last_date]).sort_index()
df['2012-01-01':].cumsum().plot(figsize=(12, 6))

# `model` is what GridSearchCV.fit returned, i.e. the search wrapper, so its
# class name would read "GridSearchCV". Report the wrapped estimator's class
# instead; objects without an `estimator` attribute are used as they are.
base_model = getattr(model, 'estimator', model)
plt.title('Prod factors model {1} ({0})'.format(method, base_model.__class__.__name__))

In [None]:
# Summarise the per-date train / test scores collected by the back-test loop:
# mean, median and standard deviation of each column.
accuray_table.aggregate([np.mean, np.median, np.std])