In [1]:
%matplotlib inline
import numpy as np
from alphamind.api import *
from PyFin.api import *

engine = SqlEngine()

In [2]:
start_date = '2012-01-01'
end_date = '2017-12-31'

features = ['roe_q',
            'ep_q',
            'DivP',
            'cfinc1_q',
            'EBIT',
            'EARNYILD',
            'EPIBS']

freq = '5b'
batch = 16
universe = Universe('custom', ['zz500', 'hs300'])
benchmark = 905
neutralized_risk = ['SIZE'] + industry_styles
horizon = map_freq(freq)

In [3]:
%%time
factor_data = fetch_data_package(engine,
                                 features,
                                 start_date,
                                 end_date,
                                 '5b',
                                 universe,
                                 benchmark,
                                 batch=batch,
                                 warm_start=batch,
                                 neutralized_risk=neutralized_risk, 
                                 pre_process=[winsorize_normal, standardize],
                                 post_process=[winsorize_normal, standardize])

2018-01-08 16:54:05,618 - ALPHA_MIND - INFO - Starting data package fetching ...
2018-01-08 16:54:15,904 - ALPHA_MIND - INFO - factor data loading finished
2018-01-08 16:54:26,575 - ALPHA_MIND - INFO - return data loading finished
2018-01-08 16:54:27,944 - ALPHA_MIND - INFO - industry data loading finished
2018-01-08 16:54:28,634 - ALPHA_MIND - INFO - benchmark data loading finished
2018-01-08 16:54:41,966 - ALPHA_MIND - INFO - risk data loading finished
2018-01-08 16:54:45,557 - ALPHA_MIND - INFO - data merging finished
2018-01-08 16:54:48,150 - ALPHA_MIND - INFO - Loading data is finished
2018-01-08 16:54:59,541 - ALPHA_MIND - INFO - Data processing is finished


Wall time: 53.9 s


In [4]:
features = factor_data['x_names']

train_x = factor_data['train']['x']
train_y = factor_data['train']['y']
train_risk = factor_data['train']['risk']
ref_dates = sorted(train_x.keys())

predict_x = factor_data['predict']['x']
predict_y = factor_data['predict']['y']
predict_risk = factor_data['predict']['risk']
settlement = factor_data['settlement']

In [5]:
for key, val in train_y.items():
    train_y[key] = np.where(val > 0., 1, 0)
    
for key, val in predict_y.items():
    predict_y[key] = np.where(val > 0., 1, 0)

## Logistic Regression
--------------

In [6]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    
    model = LogisticRegression(fit_intercept=False, features=features)
    model.fit(x, y)
    train_scores.append(model.score(x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 6.92 s


In [7]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.54106394519
0.519270440032


## Random Forest Classifier
-----------

In [21]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    
    model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)
    model.fit(x, y)
    train_scores.append(model.score(x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 17min 2s


In [22]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.557667621301
0.554107283453


## XGBoost Classifier
---------

In [10]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    
    model = XGBClassifier(n_estimators=500,
                         learning_rate=0.02,
                         max_depth=3,
                         n_jobs=-1,
                         subsample=0.25,
                         colsample_bytree=0.5)
    model.fit(x, y)
    train_scores.append(model.score(x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 4min 33s


In [11]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.612408578757
0.543523900352


## Logistic Regression with More Features
-----------------

In [12]:
def cross_product(x, y):
    n, m = x.shape
    res = []
    
    for j in range(m):
        res.append(x[:, [j]] * y)
        
    return np.concatenate(res, axis=1)

In [13]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)
    
    model = LogisticRegression(fit_intercept=False, features=features)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 26.7 s


In [14]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.568151341668
0.517317353974


## Random Forest Classifier with More Features
-----------

In [19]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)
    
    model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', max_depth=3, n_jobs=-1)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 12min 2s


In [20]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.549010335268
0.56003282178


## XGBoost Classifier with More Features
---------

In [17]:
%%time
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)
    
    model = XGBClassifier(n_estimators=500,
                         learning_rate=0.02,
                         max_depth=3,
                         n_jobs=-1,
                         subsample=0.25,
                         colsample_bytree=0.1)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))
    
    p_x = predict_x[date]
    p_y = predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 17min 2s


In [18]:
print(np.mean(train_scores))
print(np.mean(predict_scores))

0.593739136739
0.552533996977
