In [None]:
%matplotlib inline
import os

import numpy as np
from alphamind.api import *
from PyFin.api import *

# Database handle used by the data-fetching cell below.
# The connection URL can be supplied through the ALPHA_DB_URI environment
# variable so credentials do not have to live in the notebook; the original
# local URL is kept as the fallback so existing setups keep working.
# NOTE(review): the fallback still embeds a plaintext password - move it out of
# the notebook (env var / secrets store) and rotate it if this file is shared.
# The dialect is spelled `postgresql` because the `postgres` alias is no longer
# accepted by SQLAlchemy >= 1.4 (assumes SqlEngine hands the URL to SQLAlchemy
# - TODO confirm).
engine = SqlEngine(os.environ.get('ALPHA_DB_URI',
                                  'postgresql+psycopg2://postgres:we083826@localhost/alpha'))

In [None]:
# --- experiment configuration -------------------------------------------

# Sample window handed to the data loader.
start_date, end_date = '2012-01-01', '2017-12-31'

# Factor names requested from the database.
features = [
    'roe_q', 'ep_q', 'DivP', 'cfinc1_q',
    'EBIT', 'EARNYILD', 'EPIBS',
]

# Sampling frequency (presumably 5 business days - confirm) and the horizon
# derived from it.
freq = '5b'
horizon = map_freq(freq)

# Used below for both `batch` and `warm_start` of the data loader.
batch = 16

# Benchmark code (NOTE(review): 905 looks like an index id - confirm which one).
benchmark = 905
universe = Universe('custom', ['zz500', 'hs300'])

# Risk factors passed as `neutralized_risk`: SIZE plus every industry style.
neutralized_risk = ['SIZE', *industry_styles]

In [None]:
%%time
# Load the per-date training / prediction package for the configured factors.
# The frequency is taken from `freq` (not a second '5b' literal) so that it can
# never drift away from `horizon = map_freq(freq)` defined in the config cell.
# `batch` is deliberately used for both `batch` and `warm_start`.
factor_data = fetch_data_package(engine,
                                 features,
                                 start_date,
                                 end_date,
                                 freq,
                                 universe,
                                 benchmark,
                                 batch=batch,
                                 warm_start=batch,
                                 neutralized_risk=neutralized_risk,
                                 pre_process=[winsorize_normal, standardize],
                                 post_process=[winsorize_normal, standardize])

In [None]:
# From here on `features` holds the column names reported by the data package
# (it replaces the request list from the config cell).
features = factor_data['x_names']

# Training side: features, targets and risk exposures, each looked up by date
# in the cells below.
train_x, train_y, train_risk = (factor_data['train'][key] for key in ('x', 'y', 'risk'))

# Chronologically sorted list of the training dates.
ref_dates = list(train_x.keys())
ref_dates.sort()

# Prediction side, same layout as the training side.
predict_x, predict_y, predict_risk = (factor_data['predict'][key] for key in ('x', 'y', 'risk'))
settlement = factor_data['settlement']

## Linear Regression
------------------

In [None]:
%%time
# Fit a fresh LinearRegression (no intercept) for every training date and
# record its score on the training sample and on the prediction sample of the
# same date.
train_dates = [*train_x.keys()]
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    if not i % 15:
        # progress marker: echo every 15th date
        print(date)

    x, y = train_x[date], train_y[date]
    model = LinearRegression(fit_intercept=False, features=features)
    model.fit(x, y)
    train_scores.append(model.score(x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

In [None]:
# Mean training score, then mean prediction score, one per line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

## Lasso Regression
------------

In [None]:
%%time
# Same per-date evaluation as the linear section, but with a LassoRegression
# (alpha=0.01, no intercept) refitted on every training date.
train_dates = [*train_x.keys()]
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    if not i % 15:
        # progress marker: echo every 15th date
        print(date)

    x, y = train_x[date], train_y[date]
    model = LassoRegression(alpha=0.01, fit_intercept=False, features=features)
    model.fit(x, y)
    train_scores.append(model.score(x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

In [None]:
# Mean training score, then mean prediction score, one per line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

## Linear Regression with More Features
----------

In [None]:
def cross_product(x, y):
    """Build all pairwise interaction columns between ``x`` and ``y``.

    Each column of ``x`` is multiplied element-wise with ``y`` (NumPy
    broadcasting) and the products are laid out side by side: with ``k``
    columns in ``y``, output column ``j * k + l`` equals ``x[:, j] * y[:, l]``.

    The product is computed in a single broadcast operation instead of a
    Python loop over columns. An ``x`` with zero columns now yields an empty
    ``(rows, 0)`` array rather than the "need at least one array to
    concatenate" error.

    Parameters
    ----------
    x : array-like, shape (n, m)
        Must be two-dimensional.
    y : array-like, at most two-dimensional
        Anything that broadcasts against an ``(n, 1)`` column, typically
        shape ``(n, k)``.

    Returns
    -------
    numpy.ndarray
        Two-dimensional array of shape ``(rows, m * k)``.

    Raises
    ------
    ValueError
        If ``x`` is not two-dimensional or the shapes do not broadcast.
    """
    x = np.asarray(x)
    if x.ndim != 2:
        raise ValueError('x must be 2-dimensional, got shape {0}'.format(x.shape))

    # atleast_2d keeps scalar / 1-D `y` working exactly as plain broadcasting
    # against a single column of `x` would.
    y = np.atleast_2d(y)

    # (rows, m, 1) * (rows, 1, k) -> (rows, m, k)
    stacked = x[:, :, np.newaxis] * y[:, np.newaxis, :]

    # Explicit target shape (no -1) so zero-sized inputs reshape cleanly.
    rows, m, k = stacked.shape
    return stacked.reshape(rows, m * k)

In [None]:
%%time
# Per-date LinearRegression on interaction features: every factor column is
# crossed with the risk columns (all but the first one) via cross_product.
# NOTE(review): `features` still lists the original factor names although
# `new_x` has one column per (factor, risk) pair - confirm the model does not
# rely on the names matching the columns.
train_dates = [*train_x.keys()]
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    if not i % 15:
        # progress marker: echo every 15th date
        print(date)

    x, y = train_x[date], train_y[date]
    # first risk column is skipped (presumably SIZE - confirm column order)
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)

    model = LinearRegression(fit_intercept=False, features=features)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

In [None]:
# Mean training score, then mean prediction score, one per line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

## Lasso Regression with More Features
----------------------

In [None]:
%%time
# Per-date LassoRegression (alpha=0.01) on interaction features: every factor
# column is crossed with the risk columns (all but the first one).
# NOTE(review): `features` still lists the original factor names although
# `new_x` has one column per (factor, risk) pair - confirm the model does not
# rely on the names matching the columns.
train_dates = [*train_x.keys()]
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    if not i % 15:
        # progress marker: echo every 15th date
        print(date)

    x, y = train_x[date], train_y[date]
    # first risk column is skipped (presumably SIZE - confirm column order)
    risk = train_risk[date][:, 1:]
    new_x = cross_product(x, risk)

    model = LassoRegression(alpha=0.01, fit_intercept=False, features=features)
    model.fit(new_x, y)
    train_scores.append(model.score(new_x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    p_risk = predict_risk[date][:, 1:]
    new_p_x = cross_product(p_x, p_risk)
    predict_scores.append(model.score(new_p_x, p_y))

In [None]:
# Mean training score, then mean prediction score, one per line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

## Random Forest Regressor
---------------

In [None]:
%%time
# Per-date evaluation of a RandomForestRegressor (500 trees, depth 3,
# sqrt feature sub-sampling, all cores) on the raw factor columns.
# NOTE(review): no random seed is passed, so scores may vary between runs.
train_dates = [*train_x.keys()]
train_scores, predict_scores = [], []

for i, date in enumerate(train_dates):
    if not i % 15:
        # progress marker: echo every 15th date
        print(date)

    x, y = train_x[date], train_y[date]
    model = RandomForestRegressor(n_estimators=500,
                                  max_features='sqrt',
                                  max_depth=3,
                                  n_jobs=-1)
    model.fit(x, y)
    train_scores.append(model.score(x, y))

    p_x, p_y = predict_x[date], predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

In [None]:
# Mean training score, then mean prediction score, one per line.
print(np.mean(train_scores), np.mean(predict_scores), sep='\n')

## XGBoost Regressor
------------

In [None]:
%%time
# Per-date evaluation of an XGBRegressor (500 boosting rounds, depth 3,
# all cores) on the raw factor columns; scores are collected for the training
# sample and for the prediction sample of the same date.
train_dates = list(train_x.keys())
train_scores = []
predict_scores = []

for i, date in enumerate(train_dates):
    # progress marker: echo every 15th date
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    # `max_features='sqrt'` was carried over from the random-forest cell; it is
    # not an XGBoost parameter (column sub-sampling there is `colsample_by*`),
    # so it is dropped instead of being forwarded as an unused keyword.
    model = XGBRegressor(n_estimators=500, max_depth=3, n_jobs=-1)
    model.fit(x, y)
    train_scores.append(model.score(x, y))

    p_x = predict_x[date]
    p_y = predict_y[date]
    predict_scores.append(model.score(p_x, p_y))

In [None]:
%%time
# Second XGBoost pass whose results are collected in `new_train_scores` /
# `new_predict_scores`. These lists are created here: the cell used to reset
# `train_scores` / `predict_scores` but append to the never-defined `new_*`
# names, which raised NameError on the first iteration. `train_scores` and
# `predict_scores` from the previous cell are left untouched.
train_dates = list(train_x.keys())
new_train_scores = []
new_predict_scores = []

for i, date in enumerate(train_dates):
    # progress marker: echo every 15th date
    if i % 15 == 0:
        print(date)
    x = train_x[date]
    y = train_y[date]
    # `max_features` is a random-forest option, not an XGBoost parameter, so it
    # is not passed here.
    model = XGBRegressor(n_estimators=500, max_depth=3, n_jobs=-1)
    model.fit(x, y)
    new_train_scores.append(model.score(x, y))

    p_x = predict_x[date]
    p_y = predict_y[date]
    new_predict_scores.append(model.score(p_x, p_y))