In [8]:
%matplotlib inline
import numpy as np
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from alphamind.api import *
from PyFin.api import *

# Database handle that is passed to fetch_data_package further down.
# NOTE(review): built with no arguments, so the connection target comes from
# SqlEngine's defaults — confirm this points at the intended database.
engine = SqlEngine()

In [52]:
# ---- Experiment configuration ------------------------------------------
# Sample window handed to the data fetch.
start_date, end_date = '2012-01-01', '2018-01-05'

# Raw factor names requested for the feature matrix.
features = [
    'roe_q', 'ep_q', 'DivP', 'cfinc1_q',
    'EBIT', 'EARNYILD', 'EPIBS',
]

# Rebalance frequency string and the horizon derived from it.
# NOTE(review): '5b' presumably means five business days — confirm.
freq = '5b'
horizon = map_freq(freq)

# Used below for both `batch` and `warm_start` of the data fetch.
batch = 32

# Stock universe, benchmark code and the risk factors to neutralize against.
# NOTE(review): 905 looks like the zz500 index code — confirm.
universe = Universe('custom', ['zz500', 'hs300'])
benchmark = 905
neutralized_risk = ['SIZE'] + industry_styles

In [53]:
%%time
# Pull the full modelling data package (features, returns, risk, benchmark)
# for the configured window and universe.
# The frequency is taken from `freq` (config cell) instead of a repeated
# '5b' literal so it cannot drift away from `horizon = map_freq(freq)`.
# NOTE(review): `warm_start=batch` reuses the batch size as the warm-up
# length — confirm that is the intended coupling.
factor_data = fetch_data_package(engine,
                                 features,
                                 start_date,
                                 end_date,
                                 freq,
                                 universe,
                                 benchmark,
                                 batch=batch,
                                 warm_start=batch,
                                 neutralized_risk=neutralized_risk,
                                 pre_process=[winsorize_normal, standardize],
                                 post_process=[winsorize_normal, standardize])

2018-01-10 14:56:47,595 - ALPHA_MIND - INFO - Starting data package fetching ...
2018-01-10 14:56:54,781 - ALPHA_MIND - INFO - factor data loading finished
2018-01-10 14:57:03,949 - ALPHA_MIND - INFO - return data loading finished
2018-01-10 14:57:05,113 - ALPHA_MIND - INFO - industry data loading finished
2018-01-10 14:57:05,828 - ALPHA_MIND - INFO - benchmark data loading finished
2018-01-10 14:57:15,662 - ALPHA_MIND - INFO - risk data loading finished
2018-01-10 14:57:17,773 - ALPHA_MIND - INFO - data merging finished
2018-01-10 14:57:19,490 - ALPHA_MIND - INFO - Loading data is finished
2018-01-10 14:57:35,324 - ALPHA_MIND - INFO - Data processing is finished


Wall time: 47.7 s


In [54]:
# Replace the requested factor list with the column names reported by the
# data package itself.
features = factor_data['x_names']

# Training-side mappings (features, targets, risk exposures); each exposes
# .keys(), which the model loops below iterate over.
train_x, train_y, train_risk = (factor_data['train'][key] for key in ('x', 'y', 'risk'))
ref_dates = sorted(train_x.keys())

# Matching prediction-side mappings plus the settlement table.
predict_x, predict_y, predict_risk = (factor_data['predict'][key] for key in ('x', 'y', 'risk'))
settlement = factor_data['settlement']

## Linear Regression
------------------

In [66]:
%%time
# Baseline: fit a fresh linear model (no intercept) on every training date,
# then record model.score on that date's training and predict samples.
train_dates = list(train_x.keys())
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    # Lightweight progress log: echo every 15th date.
    if not i % 15:
        print(date)

    x, y = train_x[date], train_y[date]
    model = LinearRegression(fit_intercept=False, features=features)
    model.fit(x, y)
    train_scores.append(model.score(x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 1.26 s


In [67]:
# Average score across dates: training sample first, predict sample second.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

0.0107609007052
-0.480548329833


## Lasso Regression
---------

In [60]:
%%time
# Same per-date protocol as the linear baseline, but with an L1-penalised
# model (alpha=0.01, no intercept).
train_dates = list(train_x.keys())
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    # Echo every 15th date as a progress marker.
    if not i % 15:
        print(date)

    x, y = train_x[date], train_y[date]
    model = LassoRegression(alpha=0.01, fit_intercept=False, features=features)
    model.fit(x, y)
    train_scores.append(model.score(x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 1.58 s


In [61]:
# Mean lasso score over all dates: training line, then predict line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

0.00875291615929
-0.475440026


## Linear Regression with More Features
----------

In [9]:
def cross_product(x, y):
    """Build row-wise interaction features between two 2-D arrays.

    For every column ``j`` of ``x`` the whole of ``y`` is scaled row by row
    with ``x[:, j]``; the scaled copies are laid side by side, so the result
    holds ``x.shape[1] * y.shape[1]`` columns ordered as
    ``(x0*y0, x0*y1, ..., x1*y0, ...)``.

    Parameters
    ----------
    x : np.ndarray
        2-D array of shape ``(n, m)``.
    y : np.ndarray
        Array with ``n`` rows that broadcasts against an ``(n, 1)`` column.

    Returns
    -------
    np.ndarray
        Array with ``n`` rows holding the column-wise products.

    Raises
    ------
    ValueError
        If ``x`` is not 2-D (its shape cannot be unpacked into two values).
    """
    n, m = x.shape
    if m == 0:
        # np.concatenate refuses an empty list of blocks; an input without
        # columns simply yields an output without columns.
        return np.empty((n, 0), dtype=np.result_type(x, y))

    # x[:, [j]] keeps the column 2-D so it broadcasts across y's columns.
    return np.concatenate([x[:, [j]] * y for j in range(m)], axis=1)

In [34]:
%%time
# Linear regression on interaction features: every factor column is
# multiplied with every risk column (first risk column dropped) before
# fitting. All training dates are evaluated; an earlier debugging slice
# (`train_dates[:1]`) limited the run to the first date only, which made the
# averaged scores incomparable with the other model sections.
# NOTE(review): `features` still names only the original factor columns while
# the model is fitted on the wider interaction matrix — confirm the model
# treats `features` as metadata only.
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    # Progress marker every 15th date.
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)

    model = LinearRegression(fit_intercept=False, features=features)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))

    # Build the predict-side interactions the same way as the training ones.
    p_x = predict_x[date]
    p_y = predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

2017-01-03 00:00:00


NameError: name 'cross_product' is not defined

In [11]:
# Mean score of the interaction-feature linear model: training, then predict.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

0.0291928676769
-0.24146254373


## Lasso Regression with More Features
----------------------

In [12]:
%%time
# Lasso (alpha=0.01, no intercept) on factor-by-risk interaction features,
# refitted on every training date and scored on both samples.
train_dates = list(train_x.keys())
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    # Echo every 15th date as a progress marker.
    if not i % 15:
        print(date)

    x, y = train_x[date], train_y[date]
    # Drop the first risk column before forming the interactions.
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)

    model = LassoRegression(alpha=0.01, fit_intercept=False, features=features)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

2017-01-03 00:00:00
2017-04-27 00:00:00
2017-08-15 00:00:00
2017-12-05 00:00:00
Wall time: 4.78 s


In [13]:
# Mean score of the interaction-feature lasso model: training, then predict.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

0.000355789142204
-0.200552889618


## Random Forest Regressor
---------------

In [14]:
%%time
# Random forest (500 shallow trees, sqrt feature sampling) refitted per date.
# NOTE(review): no random seed is passed, so scores may differ between runs —
# consider fixing one if reproducibility matters.
train_dates = list(train_x.keys())
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    # Echo every 15th date as a progress marker.
    if not i % 15:
        print(date)

    x, y = train_x[date], train_y[date]
    model = RandomForestRegressor(
        n_estimators=500,
        max_features='sqrt',
        max_depth=3,
        n_jobs=-1,
    )
    model.fit(x, y)
    train_scores.append(model.score(x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2017-01-03 00:00:00
2017-04-27 00:00:00
2017-08-15 00:00:00
2017-12-05 00:00:00
Wall time: 1min 18s


In [15]:
# Mean random-forest score over all dates: training line, then predict line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

0.0137863030105
-0.197952235791


## XGBoost Regressor
------------

In [16]:
%%time
# Gradient-boosted trees through the XGBRegressor wrapper, refitted per date
# and scored on both the training and the predict sample.
train_dates = list(train_x.keys())
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    # Echo every 15th date as a progress marker.
    if not i % 15:
        print(date)

    x, y = train_x[date], train_y[date]
    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.02,
        max_depth=3,
        n_jobs=-1,
        subsample=0.25,
        colsample_bytree=0.5,
    )
    model.fit(x, y)
    train_scores.append(model.score(x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

2017-01-03 00:00:00
2017-04-27 00:00:00
2017-08-15 00:00:00
2017-12-05 00:00:00
Wall time: 1min 32s


In [17]:
# Mean XGBRegressor score over all dates: training line, then predict line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

0.0575499865219
-0.209037365429


## Native XGBoost Regressor
---------------

In [68]:
%%time
# Native xgboost API with early stopping: per date, hold out 33% of the
# training sample as an evaluation set, train up to `num_round` rounds, and
# record R^2 on the fitted part of the training sample and on the predict
# sample.
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

# Booster settings do not depend on the date, so they are built once.
param = {'silent': 1,
         'objective': 'reg:linear',
         'max_depth': 3,
         'eta': 0.005,
         # The key was previously misspelled as 'boost', which xgboost does
         # not recognise; 'booster' is the documented parameter name.
         'booster': 'gbtree',
         'tree_method': 'hist',
         'subsample': 0.1,
         'colsample_bytree': 0.25}
num_round = 2000

for i, date in enumerate(train_dates):
    # Progress marker every 15th date.
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]

    # Fixed random_state keeps the train/eval split reproducible.
    x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.33, random_state=42)

    dtrain = xgb.DMatrix(x_train, y_train)
    deval = xgb.DMatrix(x_eval, y_eval)
    # Training halts once the eval metric has not improved for 50 rounds.
    # NOTE(review): depending on the xgboost version, predict() below may use
    # every trained tree rather than the best iteration — confirm.
    model = xgb.train(param, dtrain, num_round, evals=[(deval, 'eval')], early_stopping_rounds=50, verbose_eval=False)

    # The training score covers only the rows the booster was fitted on.
    y_train_predict = model.predict(dtrain)
    train_scores.append(r2_score(y_train, y_train_predict, multioutput='uniform_average'))

    p_x = predict_x[date]
    p_y = predict_y[date]
    dtest = xgb.DMatrix(p_x, p_y)

    y_test_predict = model.predict(dtest)
    predict_scores.append(r2_score(p_y, y_test_predict, multioutput='uniform_average'))

2012-02-15 00:00:00
2012-06-06 00:00:00
2012-09-20 00:00:00
2013-01-15 00:00:00
2013-05-14 00:00:00
2013-08-30 00:00:00
2013-12-24 00:00:00
2014-04-17 00:00:00
2014-08-05 00:00:00
2014-11-26 00:00:00
2015-03-20 00:00:00
2015-07-08 00:00:00
2015-10-30 00:00:00
2016-02-22 00:00:00
2016-06-08 00:00:00
2016-09-27 00:00:00
2017-01-18 00:00:00
2017-05-15 00:00:00
2017-08-30 00:00:00
2017-12-20 00:00:00
Wall time: 6min 57s


In [69]:
# Mean R^2 of the native xgboost models: training line, then predict line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

0.0158347715471
-0.477095380466
