In [22]:
%matplotlib inline
import datetime as dt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from alphamind.api import *
from PyFin.api import *
import xgboost as xgb
from sklearn.model_selection import train_test_split

plt.style.use('fivethirtyeight')
engine = SqlEngine('postgres+psycopg2://postgres:A12345678!@10.63.6.220/alpha')

In [2]:
u_name = 'zz500'
benchmark = 905
universe = Universe(u_name, [u_name])
factor_coverage = engine.fetch_factor_coverage()

In [3]:
flitered_coverage = factor_coverage[((factor_coverage.source == 'uqer'))
                                    & (factor_coverage.universe == u_name) 
                                    & (factor_coverage.trade_date >= '2012-01-01')]
coverage_report = flitered_coverage.groupby(['factor'])['coverage'].mean()

In [4]:
alpha_factors = coverage_report[coverage_report >= 0.99].index.tolist()

alpha_factors = {
    f: DIFF(f) / (ABS(f) + 1e-4) / (DIFF('closePrice') / (LAST('closePrice') + 1e-4))
        for f in alpha_factors if f != 'SIZE'
}

In [5]:
alpha_factors.__len__()

202

In [6]:
frequency = '2w'
batch = 8
start_date = '2012-01-01'
end_date = '2017-10-25'
method = 'risk_neutral'
portfolio_risk_neutralize = ['SIZE']
neutralize_risk = industry_styles + portfolio_risk_neutralize
industry_lower = 1.
industry_upper = 1.

In [7]:
data_package = fetch_data_package(engine,
                                  alpha_factors=alpha_factors,
                                  start_date=start_date,
                                  end_date=end_date,
                                  frequency=frequency,
                                  universe=universe,
                                  benchmark=benchmark,
                                  batch=batch,
                                  neutralized_risk=neutralize_risk,
                                  pre_process=[winsorize_normal],
                                  post_process=[winsorize_normal],
                                  warm_start=batch)

2017-10-30 17:03:41,507 - ALPHA_MIND - INFO - Starting data package fetching ...
  dropna=False)
  dropna=False)
2017-10-30 17:06:05,198 - ALPHA_MIND - INFO - Loading data is finished
2017-10-30 17:06:59,024 - ALPHA_MIND - INFO - Data processing is finished


In [8]:
train_x = data_package['train']['x']
train_y = data_package['train']['y']

predict_x = data_package['predict']['x']
predict_y = data_package['predict']['y']

features = data_package['x_names']

In [9]:
def plot_model_importance(model, features):
    features = np.array(features)
    n_features = len(features)
    features_importance = model.feature_importances_
    order = features_importance.argsort().argsort()
    features = features[order >= n_features - 10]
    features_importance = features_importance[order >= n_features - 10]
    n_features = len(features)
    plt.figure(figsize=(12, 6))
    plt.barh(range(n_features), features_importance, align='center')
    plt.yticks(np.arange(n_features), features)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

## 0. Train Score on a specific date
------------------------------------

In [10]:
ref_date = list(train_x.keys())[-1]
sample_train_x = train_x[ref_date]
sample_train_y = train_y[ref_date].flatten()

sample_test_x = predict_x[ref_date]
sample_test_y = predict_y[ref_date].flatten()

In [31]:
%%time

param = {'objective': 'reg:linear',
         'tree_method': 'hist'}

X_train, X_val, y_train, y_val = train_test_split(sample_train_x, sample_train_y, test_size=0.25, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
num_round = 500
model = xgb.train(param, dtrain, num_round, evals=[(dval, 'val')], early_stopping_rounds=10)

[0]	val-rmse:0.358247
Will train until val-rmse hasn't improved in 20 rounds.
[1]	val-rmse:0.255458
[2]	val-rmse:0.184941
[3]	val-rmse:0.137607
[4]	val-rmse:0.106497
[5]	val-rmse:0.086864
[6]	val-rmse:0.07489
[7]	val-rmse:0.068031
[8]	val-rmse:0.064358
[9]	val-rmse:0.062044
[10]	val-rmse:0.0612
[11]	val-rmse:0.060778
[12]	val-rmse:0.060721
[13]	val-rmse:0.060811
[14]	val-rmse:0.060485
[15]	val-rmse:0.060291
[16]	val-rmse:0.060126
[17]	val-rmse:0.060461
[18]	val-rmse:0.060616
[19]	val-rmse:0.060645
[20]	val-rmse:0.060813
[21]	val-rmse:0.06065
[22]	val-rmse:0.06056
[23]	val-rmse:0.060573
[24]	val-rmse:0.060879
[25]	val-rmse:0.06104
[26]	val-rmse:0.061136
[27]	val-rmse:0.0612
[28]	val-rmse:0.061146
[29]	val-rmse:0.061263
[30]	val-rmse:0.061236
[31]	val-rmse:0.061272
[32]	val-rmse:0.061228
[33]	val-rmse:0.061271
[34]	val-rmse:0.061277
[35]	val-rmse:0.061287
[36]	val-rmse:0.061349
Stopping. Best iteration:
[16]	val-rmse:0.060126

Wall time: 50.6 s


In [32]:
dpredict = xgb.DMatrix(sample_test_x, label=sample_test_y)
model.predict(dpredict)

array([-0.01240957, -0.01542193,  0.003759  , -0.0370059 , -0.01399595,
       -0.02125567, -0.03098136, -0.02801037,  0.00610146,  0.00834453,
        0.00155056, -0.01593572, -0.05426192, -0.05738616, -0.03181559,
        0.02134934,  0.00575542, -0.0022133 , -0.02373421, -0.04187602,
       -0.01939499,  0.01149094, -0.00942582, -0.01294637,  0.03033528,
        0.0812315 , -0.03541517, -0.02579325, -0.02382213, -0.03661305,
       -0.00801933, -0.00755411, -0.01823515, -0.0351119 , -0.03827125,
       -0.06906605, -0.00657129, -0.02492863, -0.00713164, -0.02199388,
       -0.01912141,  0.0357779 , -0.02171987, -0.00683856, -0.01550424,
        0.0036301 , -0.00993472,  0.00213516,  0.05597222, -0.03993398,
       -0.00298005, -0.02027029, -0.02534062, -0.02602386, -0.00823247,
        0.04025003, -0.03899878, -0.00370234, -0.01509917, -0.02637047,
       -0.02993429,  0.03641999, -0.00127167,  0.02978992,  0.0345099 ,
       -0.01929975, -0.009942  ,  0.02140123, -0.0099209 ,  0.01

In [34]:
model.eval(dpredict)

'[0]\teval-rmse:0.034598'