In [1]:
%matplotlib inline
import xgboost as xgb
import numpy as np
from alphamind.api import *
from PyFin.api import *
from sklearn.model_selection import train_test_split

engine = SqlEngine()

In [23]:
start_date = '2012-01-01'
end_date = '2018-01-05'

features = ['roe_q',
            'ep_q',
            'DivP',
            'cfinc1_q',
            'EBIT',
            'EARNYILD',
            'EPIBS']

freq = '10b'
batch = 16
universe = Universe('custom', ['zz500', 'hs300'])
benchmark = 905
neutralized_risk = ['SIZE'] + industry_styles
horizon = map_freq(freq)

In [24]:
%%time
factor_data = fetch_data_package(engine,
                                 features,
                                 start_date,
                                 end_date,
                                 '5b',
                                 universe,
                                 benchmark,
                                 batch=batch,
                                 warm_start=batch,
                                 neutralized_risk=neutralized_risk, 
                                 pre_process=[winsorize_normal, standardize],
                                 post_process=[winsorize_normal, standardize])

2018-01-11 15:12:44,105 - ALPHA_MIND - INFO - Starting data package fetching ...
2018-01-11 15:12:53,578 - ALPHA_MIND - INFO - factor data loading finished
2018-01-11 15:13:03,880 - ALPHA_MIND - INFO - return data loading finished
2018-01-11 15:13:05,384 - ALPHA_MIND - INFO - industry data loading finished
2018-01-11 15:13:06,178 - ALPHA_MIND - INFO - benchmark data loading finished
2018-01-11 15:13:17,845 - ALPHA_MIND - INFO - risk data loading finished
2018-01-11 15:13:21,266 - ALPHA_MIND - INFO - data merging finished
2018-01-11 15:13:23,371 - ALPHA_MIND - INFO - Loading data is finished
2018-01-11 15:13:33,174 - ALPHA_MIND - INFO - Data processing is finished


Wall time: 49.1 s


In [25]:
features = factor_data['x_names']

train_x = factor_data['train']['x']
train_y = factor_data['train']['y']
train_risk = factor_data['train']['risk']
ref_dates = sorted(train_x.keys())

predict_x = factor_data['predict']['x']
predict_y = factor_data['predict']['y']
predict_risk = factor_data['predict']['risk']
settlement = factor_data['settlement']

In [26]:
for key, val in train_y.items():
    train_y[key] = np.where(val > 0., 1, 0)
    
for key, val in predict_y.items():
    predict_y[key] = np.where(val > 0., 1, 0)

## Logistic Regression
--------------

In [27]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    
    model = LogisticRegression(fit_intercept=False, features=features)
    model.fit(x, y)
    train_scores.append(model.score(x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 5.34 s


In [28]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.541013986745
0.51932344036


## Random Forest Classifier
-----------

In [29]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    
    model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)
    model.fit(x, y)
    train_scores.append(model.score(x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 15min 34s


In [30]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.557563825608
0.553974775005


## XGBoost Classifier
---------

In [31]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    
    model = XGBClassifier(n_estimators=1000,
                         learning_rate=0.02,
                         max_depth=3,
                         n_jobs=-1,
                         subsample=0.25,
                         colsample_bytree=0.5)
    model.fit(x, y)
    train_scores.append(model.score(x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 13min 40s


In [32]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.642946015759
0.537550683184


## Native XGBoost Classifier
---------------

In [33]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    
    x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)
    
    dtrain = xgb.DMatrix(x_train, y_train)
    deval = xgb.DMatrix(x_eval, y_eval)
    param = {'silent': 1,
             'objective': 'binary:logistic',
             'max_depth': 3,
             'eta': 0.01,
             'boost': 'dart',
             'tree_method': 'hist',
             'subsample': 0.25,
             'colsample_bytree': 0.5}
    num_round = 2000
    model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)
    
    y_train_predict = model.predict(dtrain)
    label = dtrain.get_label()
    train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))

    train_scores.append(train_score)
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    dtest = xgb.DMatrix(p_x, p_y)
    
    y_test_predict = model.predict(dtest)
    p_label = dtest.get_label()
    test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))
    predict_scores.append(test_score)

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 1min 6s


In [34]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.567225761699
0.550997907465


## Logistic Regression with More Features
-----------------

In [35]:
def cross_product(x, y):
    n, m = x.shape
    res = []
    
    for j in range(m):
        res.append(x[:, [j]] * y)
        
    return np.concatenate(res, axis=1)

In [36]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)
    
    model = LogisticRegression(fit_intercept=False, features=features)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 36.1 s


In [37]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.568125478425
0.517523115163


## Random Forest Classifier with More Features
-----------

In [38]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)
    
    model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 14min 40s


In [39]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.549090142483
0.559944504146


## XGBoost Classifier with More Features
---------

In [40]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)
    
    model = XGBClassifier(n_estimators=500,
                         learning_rate=0.02,
                         max_depth=3,
                         n_jobs=-1,
                         subsample=0.25,
                         colsample_bytree=0.1)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 12min 25s


In [41]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.59375573895
0.55230987889


## Native XGBoost Classifier with More Features
---------------

In [42]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)
    
    x_train, x_eval, y_train, y_eval = train_test_split(new_x, y, test_size=0.33, random_state=42)
    
    dtrain = xgb.DMatrix(x_train, y_train)
    deval = xgb.DMatrix(x_eval, y_eval)
    param = {'silent': 1,
             'objective': 'binary:logistic',
             'max_depth': 3,
             'eta': 0.01,
             'booster': 'dart',
             'tree_method': 'hist',
             'subsample': 0.25,
             'colsample_bytree': 0.5}
    num_round = 2000
    model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)
    
    y_train_predict = model.predict(dtrain)
    label = dtrain.get_label()
    train_score = np.sum((y_train_predict > 0.5) == label) / float(len(label))

    train_scores.append(train_score)
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    dtest = xgb.DMatrix(new_p_x, p_y)
    
    y_test_predict = model.predict(dtest)
    p_label = dtest.get_label()
    test_score = np.sum((y_test_predict > 0.5) == p_label) / float(len(p_label))
    predict_scores.append(test_score)

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 5min 23s


In [43]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.560057712549
0.552663472836
