In [1]:
%matplotlib inline
import datetime as dt
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from alphamind.api import *
from PyFin.api import *
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

plt.style.use('fivethirtyeight')

# The connection string (which embeds the database password) is read from the
# environment so that credentials are never stored in the notebook, its
# rendered outputs, or its version history.
db_uri = os.environ.get('ALPHA_MIND_DB_URI')
if not db_uri:
    raise RuntimeError('Set the ALPHA_MIND_DB_URI environment variable, e.g. '
                       'postgres+psycopg2://<user>:<password>@<host>/alpha')
engine = SqlEngine(db_uri)

In [2]:
# Universe name (also used as the universe's single component below) and the
# benchmark code that is later passed to fetch_data_package.
# NOTE(review): 905 is presumably the index code matching 'zz500' - confirm.
u_name = 'zz500'
benchmark = 905
universe = Universe(u_name, [u_name])
# Coverage table; the next cell filters it on its `source`, `universe` and
# `trade_date` columns and averages `coverage` per `factor`.
factor_coverage = engine.fetch_factor_coverage()

In [3]:
# Keep only the coverage rows that come from the 'uqer' source, belong to the
# selected universe and are dated 2012-01-01 or later.
from_uqer = factor_coverage.source == 'uqer'
in_universe = factor_coverage.universe == u_name
in_window = factor_coverage.trade_date >= '2012-01-01'
flitered_coverage = factor_coverage[from_uqer & in_universe & in_window]

# Mean coverage of every factor over the retained rows.
coverage_by_factor = flitered_coverage.groupby(['factor'])['coverage']
coverage_report = coverage_by_factor.mean()

In [4]:
# Names of the risk factors whose `type` is 'style'.
risk_meta = engine.fetch_risk_meta()
style_rows = risk_meta[risk_meta.type == 'style']
risk_factors = style_rows.factor.tolist()

In [5]:
def _change_relative_to_price(factor_name):
    """Build the expression used as the alpha factor for `factor_name`.

    The expression is DIFF(factor) / (ABS(factor) + 1e-4), divided by
    DIFF('closePrice') / (LAST('closePrice') + 1e-4). A fresh operator tree is
    created on every call, so no operator instance is shared between factors.
    """
    factor_change = DIFF(factor_name) / (ABS(factor_name) + 1e-4)
    price_change = DIFF('closePrice') / (LAST('closePrice') + 1e-4)
    return factor_change / price_change


# Factors whose mean coverage is at least 0.99.
well_covered = coverage_report[coverage_report >= 0.99].index.tolist()

# One expression per well-covered factor; 'SIZE' is left out of the alpha set.
alpha_factors = {
    name: _change_relative_to_price(name)
    for name in well_covered
    if name != 'SIZE'
}

In [6]:
# Number of alpha factor expressions that were built.
len(alpha_factors)

202

In [7]:
# --- sampling window and schedule (all passed to fetch_data_package) ---
start_date = '2012-01-01'
end_date = '2017-10-25'
frequency = '2w'
batch = 8  # also reused as the `warm_start` argument of fetch_data_package

# --- portfolio construction ---
method = 'risk_neutral'
# Multipliers applied to the benchmark's industry exposure to obtain the lower
# and upper constraint bounds; 1.0 / 1.0 pins each exposure to the benchmark.
industry_lower = 1.0
industry_upper = 1.0

# --- risk factors ---
portfolio_risk_neutralize = ['SIZE']
neutralize_risk = industry_styles + portfolio_risk_neutralize

In [8]:
# Keyword options for the data fetch, collected in one place so they can be
# inspected before the (slow) call is made.
package_options = dict(
    alpha_factors=alpha_factors,
    start_date=start_date,
    end_date=end_date,
    frequency=frequency,
    universe=universe,
    benchmark=benchmark,
    batch=batch,
    neutralized_risk=neutralize_risk,
    pre_process=[winsorize_normal],
    post_process=[winsorize_normal],
    warm_start=batch,
)

data_package = fetch_data_package(engine, **package_options)

2017-10-30 15:59:06,085 - ALPHA_MIND - INFO - Starting data package fetching ...
  dropna=False)
  dropna=False)
2017-10-30 16:00:44,156 - ALPHA_MIND - INFO - Loading data is finished
2017-10-30 16:01:02,368 - ALPHA_MIND - INFO - Data processing is finished


In [9]:
# Split the package into its training and prediction parts; each part holds
# an 'x' and a 'y' entry that the later cells index by date.
train_part = data_package['train']
predict_part = data_package['predict']

train_x, train_y = train_part['x'], train_part['y']
predict_x, predict_y = predict_part['x'], predict_part['y']

# Names of the feature columns.
features = data_package['x_names']

In [10]:
def plot_model_importance(model, features, top_n=10):
    """Draw a horizontal bar chart of the model's largest feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``. A search wrapper
        that lacks that attribute but exposes a fitted ``best_estimator_``
        (as the GridSearchCV used in this notebook does) is unwrapped first.
    features : sequence of str
        Feature names, one per entry of ``feature_importances_``.
    top_n : int, default 10
        Number of most important features to show. All features are shown
        when there are fewer than ``top_n``.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1, or if the number of names differs from
        the number of importances.
    """
    if top_n < 1:
        raise ValueError('top_n must be at least 1, got {0}'.format(top_n))

    # A search wrapper has no importances of its own; use the model it selected.
    if not hasattr(model, 'feature_importances_') and hasattr(model, 'best_estimator_'):
        model = model.best_estimator_

    features = np.array(features)
    features_importance = np.asarray(model.feature_importances_)
    n_features = len(features)
    if len(features_importance) != n_features:
        raise ValueError('got {0} feature names for {1} importances'.format(
            n_features, len(features_importance)))

    # Double argsort turns importances into ranks (0 = least important), so the
    # mask keeps the top_n features while preserving their original order.
    order = features_importance.argsort().argsort()
    is_top = order >= n_features - top_n
    features = features[is_top]
    features_importance = features_importance[is_top]
    n_features = len(features)

    plt.figure(figsize=(12, 6))
    plt.barh(range(n_features), features_importance, align='center')
    plt.yticks(np.arange(n_features), features)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

## 0. Train and test scores on a specific date
------------------------------------

In [11]:
# The reference date is the last key of train_x (in the mapping's own order).
ref_date = tuple(train_x.keys())[-1]

# Training sample for that date; the target is flattened to one dimension.
sample_train_x, sample_train_y = train_x[ref_date], train_y[ref_date].flatten()

# Matching prediction sample for the same date.
sample_test_x, sample_test_y = predict_x[ref_date], predict_y[ref_date].flatten()

In [12]:
%%time

# Hyper-parameter values explored by the grid search (3 x 3 combinations).
param_grid = dict(
    max_depth=[3, 5, 10],
    min_child_weight=[1, 5, 10],
)

# Settings of the base regressor. `max_depth` and `min_child_weight` are also
# listed in `param_grid`, so the values given here are replaced for every
# candidate the search evaluates.
xgb_settings = dict(
    max_depth=5,
    min_child_weight=20,
    n_estimators=1000,
    subsample=0.5,
    colsample_bytree=0.05,
    reg_alpha=1.,
    reg_lambda=1.,
)
inner_model = XGBRegressor(**xgb_settings)

# 5-fold cross-validated search using every available core.
model = GridSearchCV(estimator=inner_model,
                     param_grid=param_grid,
                     cv=5,
                     n_jobs=-1)

model.fit(sample_train_x, sample_train_y)

Wall time: 1min 22s


In [13]:
# In-sample check: score of the fitted grid search on the very sample it was
# trained on.
# NOTE(review): no `scoring` is passed to GridSearchCV, so this is presumably
# the regressor's default score (R^2) - confirm.
model.score(sample_train_x, sample_train_y)

0.55783631412419776

In [14]:
# Out-of-sample check: the same score on the prediction sample of the
# reference date. In the recorded run this is negative while the in-sample
# score is positive.
model.score(sample_test_x, sample_test_y)

-0.44341398453487768

In [15]:
# Importances of the estimator selected by the grid search, one value per
# input column.
# NOTE(review): entries presumably follow the order of `features` - confirm.
model.best_estimator_.feature_importances_

array([ 0.0043753 ,  0.00372711,  0.0043753 ,  0.00453735,  0.00340301,
        0.00405121,  0.00599579,  0.00275482,  0.0046994 ,  0.00550964,
        0.00648193,  0.00518555,  0.0046994 ,  0.00615784,  0.00696808,
        0.00550964,  0.00988495,  0.00567169,  0.00307892,  0.0043753 ,
        0.00534759,  0.00356506,  0.00534759,  0.00518555,  0.0050235 ,
        0.00388916,  0.00486145,  0.00324097,  0.00307892,  0.00405121,
        0.00421326,  0.00794037,  0.00761627,  0.00324097,  0.00615784,
        0.00518555,  0.00405121,  0.00388916,  0.00291687,  0.00388916,
        0.00307892,  0.00631988,  0.00388916,  0.0050235 ,  0.00453735,
        0.00275482,  0.00518555,  0.00226868,  0.00340301,  0.00421326,
        0.00324097,  0.00388916,  0.00453735,  0.00696808,  0.0046994 ,
        0.00372711,  0.00421326,  0.00761627,  0.0050235 ,  0.00453735,
        0.00777832,  0.00599579,  0.00550964,  0.00372711,  0.00664398,
        0.00421326,  0.00534759,  0.00534759,  0.00567169,  0.00

## 1. Train and test accuracy trend (XGBRegressor)
----------

In [None]:
# Dates to walk through, in ascending order.
dates = sorted(train_x.keys())

# Per-date outputs filled in by the loop in the next cell.
accuray_table = pd.DataFrame(columns=['train', 'test'])
final_res = np.zeros(len(dates))

# Portfolio construction switches.
method = 'risk_neutral'
portfolio_industry_neutralize = True

# Settlement table and the arrays derived from it; the loop slices all of them
# with the same per-date row mask.
settlement = data_package['settlement']
industry_dummies = pd.get_dummies(settlement['industry'].values)
risk_styles = settlement[portfolio_risk_neutralize].values
total_risks = settlement[neutralize_risk].values

In [None]:
# Walk forward through the dates: refit the model on each date's training
# sample, record train/test scores, then build a constrained portfolio from
# the predictions and store its result relative to the benchmark weight sum.
for i, ref_date in enumerate(dates):
    # ---- fit and score in-sample ----
    sample_train_x = train_x[ref_date]
    sample_train_y = train_y[ref_date].flatten()

    model.fit(sample_train_x, sample_train_y)
    accuray_table.loc[ref_date, 'train'] = model.score(sample_train_x, sample_train_y)
    alpha_logger.info('trade_date: {0} training finished'.format(ref_date))

    # ---- out-of-sample inputs ----
    sample_test_x = predict_x[ref_date]
    sample_test_y = predict_y[ref_date].flatten()

    # Rows of the settlement table that belong to this date, taken once and
    # reused for every column read below.
    index = settlement.trade_date == ref_date
    settlement_slice = settlement[index]
    benchmark_w = settlement_slice['weight'].values
    realized_r = settlement_slice['dx'].values
    industry_names = settlement_slice['industry'].values
    is_tradable = settlement_slice['isOpen'].values

    # ---- constraints ----
    # The portfolio's total weight must equal the benchmark's total weight.
    cons = Constraints()
    total_weight = benchmark_w.sum()
    cons.add_exposure(['total'], np.ones((len(is_tradable), 1)))
    cons.set_constraints('total', total_weight, total_weight)

    if portfolio_industry_neutralize:
        # Bound each industry exposure by the benchmark's exposure scaled with
        # industry_lower / industry_upper.
        ind_exp = industry_dummies[index]
        risk_tags = ind_exp.columns
        cons.add_exposure(risk_tags, ind_exp.values)

        benchmark_exp = benchmark_w @ ind_exp.values
        for name, exposure in zip(risk_tags, benchmark_exp):
            cons.set_constraints(name, exposure * industry_lower, exposure * industry_upper)

    if portfolio_risk_neutralize:
        # Pin each listed style exposure to the benchmark's exposure.
        risk_exp = risk_styles[index]
        risk_tags = np.array(portfolio_risk_neutralize)
        cons.add_exposure(risk_tags, risk_exp)

        benchmark_exp = benchmark_w @ risk_exp
        for name, exposure in zip(risk_tags, benchmark_exp):
            cons.set_constraints(name, exposure, exposure)

    # Not read again inside this loop.
    risk_table = total_risks[index]

    # ---- predict and score out-of-sample ----
    y = model.predict(sample_test_x)
    accuray_table.loc[ref_date, 'test'] = model.score(sample_test_x, sample_test_y)

    # Every name is treated as tradable, overriding the isOpen flags read above.
    is_tradable[:] = True
    weights, analysis = er_portfolio_analysis(
        y.astype(float),
        industry_names,
        realized_r,
        constraints=cons,
        detail_analysis=True,
        benchmark=benchmark_w,
        is_tradable=is_tradable,
        method=method,
    )

    final_res[i] = analysis['er']['total'] / benchmark_w.sum()
    alpha_logger.info('trade_date: {0} predicting finished'.format(ref_date))
    

2017-10-30 16:03:43,756 - ALPHA_MIND - INFO - trade_date: 2012-01-30 00:00:00 training finished
2017-10-30 16:03:44,031 - ALPHA_MIND - INFO - trade_date: 2012-01-30 00:00:00 predicting finished
2017-10-30 16:05:05,734 - ALPHA_MIND - INFO - trade_date: 2012-02-08 00:00:00 training finished
2017-10-30 16:05:05,990 - ALPHA_MIND - INFO - trade_date: 2012-02-08 00:00:00 predicting finished
2017-10-30 16:06:15,716 - ALPHA_MIND - INFO - trade_date: 2012-02-22 00:00:00 training finished
2017-10-30 16:06:16,232 - ALPHA_MIND - INFO - trade_date: 2012-02-22 00:00:00 predicting finished
2017-10-30 16:07:26,617 - ALPHA_MIND - INFO - trade_date: 2012-03-07 00:00:00 training finished
2017-10-30 16:07:26,842 - ALPHA_MIND - INFO - trade_date: 2012-03-07 00:00:00 predicting finished
2017-10-30 16:08:34,769 - ALPHA_MIND - INFO - trade_date: 2012-03-21 00:00:00 training finished
2017-10-30 16:08:35,300 - ALPHA_MIND - INFO - trade_date: 2012-03-21 00:00:00 predicting finished
2017-10-30 16:09:42,546 - ALPH

2017-10-30 16:51:59,771 - ALPHA_MIND - INFO - trade_date: 2013-09-04 00:00:00 predicting finished
2017-10-30 16:53:06,232 - ALPHA_MIND - INFO - trade_date: 2013-09-18 00:00:00 training finished
2017-10-30 16:53:06,504 - ALPHA_MIND - INFO - trade_date: 2013-09-18 00:00:00 predicting finished
2017-10-30 16:54:17,100 - ALPHA_MIND - INFO - trade_date: 2013-10-08 00:00:00 training finished
2017-10-30 16:54:17,349 - ALPHA_MIND - INFO - trade_date: 2013-10-08 00:00:00 predicting finished
2017-10-30 16:55:22,653 - ALPHA_MIND - INFO - trade_date: 2013-10-16 00:00:00 training finished
2017-10-30 16:55:22,904 - ALPHA_MIND - INFO - trade_date: 2013-10-16 00:00:00 predicting finished
2017-10-30 16:56:31,367 - ALPHA_MIND - INFO - trade_date: 2013-10-30 00:00:00 training finished
2017-10-30 16:56:31,650 - ALPHA_MIND - INFO - trade_date: 2013-10-30 00:00:00 predicting finished
2017-10-30 16:57:39,492 - ALPHA_MIND - INFO - trade_date: 2013-11-13 00:00:00 training finished
2017-10-30 16:57:39,751 - ALPH

2017-10-30 17:41:02,621 - ALPHA_MIND - INFO - trade_date: 2015-04-29 00:00:00 training finished
2017-10-30 17:41:02,945 - ALPHA_MIND - INFO - trade_date: 2015-04-29 00:00:00 predicting finished
2017-10-30 17:42:10,267 - ALPHA_MIND - INFO - trade_date: 2015-05-13 00:00:00 training finished
2017-10-30 17:42:10,804 - ALPHA_MIND - INFO - trade_date: 2015-05-13 00:00:00 predicting finished
2017-10-30 17:43:20,729 - ALPHA_MIND - INFO - trade_date: 2015-05-27 00:00:00 training finished
2017-10-30 17:43:20,960 - ALPHA_MIND - INFO - trade_date: 2015-05-27 00:00:00 predicting finished
2017-10-30 17:44:32,601 - ALPHA_MIND - INFO - trade_date: 2015-06-10 00:00:00 training finished
2017-10-30 17:44:32,920 - ALPHA_MIND - INFO - trade_date: 2015-06-10 00:00:00 predicting finished
2017-10-30 17:45:42,142 - ALPHA_MIND - INFO - trade_date: 2015-06-24 00:00:00 training finished
2017-10-30 17:45:42,498 - ALPHA_MIND - INFO - trade_date: 2015-06-24 00:00:00 predicting finished
2017-10-30 17:46:51,160 - ALPH

In [None]:
# Date one `frequency` step after the final date on the 'china.sse' calendar.
last_date = advanceDateByCalendar('china.sse', dates[-1], frequency)

# The result computed at dates[i] is indexed by the following date
# (last_date for the final one).
df = pd.Series(final_res, index=dates[1:] + [last_date]).sort_index()

# Report the underlying model in the title: a search wrapper such as
# GridSearchCV exposes it as `.estimator`; a plain model is used as is.
base_model = getattr(model, 'estimator', model)

ax = df[start_date:].cumsum().plot(figsize=(12, 6))
ax.set_title('Prod factors model {1} ({0})'.format(method, type(base_model).__name__));

In [None]:
# Summary statistics of the per-date train/test scores.
# The table is filled cell by cell from an empty frame, so its columns may be
# object-typed; cast to float before aggregating. String names are used rather
# than numpy callables, which pandas deprecates inside `agg`; 'std' is the
# pandas sample standard deviation.
accuray_table.astype(float).agg(['mean', 'median', 'std'])