In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
from lightgbm import LGBMClassifier
from sklearn.metrics import matthews_corrcoef as matt
from hyperopt import hp
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin

In [None]:
"""
Reading the data. Data has stock tickers in the first row and dates in the first column. Only trading days are used.
"""
# The "Date" column becomes the index; the remaining columns are iterated as tickers later on.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
df = pd.read_excel('C:/Users/FS_Askar_A/midcap.xlsx', index_col="Date")

In [None]:
def create_returns(dtf, tick, lead=5, lags=(1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 50), tr=0.025):
    """
    Calculates forward and lagged returns for a stock. Also creates target column
    for returns over some threshold.
    
    Parameters:
    -----------
    dtf - data with one price column per stock
    tick - ticker of stock (column name in dtf)
    lead - forward return lead, in rows
    lags - iterable of lags (in rows) for the lagged returns
    tr - threshold value for target variable
    
    Returns:
    --------
    New DataFrame for one stock with its price, forward return ('fwd'), lagged
    returns ('ret<lag>') and target column ('ycol', 1 where fwd >= tr else 0).
    Rows with any missing value are dropped; dtf itself is not modified.
    """
    # Explicit copy: columns are added below, and doing that on the bare
    # selection dtf[[tick]] triggers pandas' SettingWithCopyWarning.
    out = dtf[[tick]].copy()
    price = out[tick]

    out['fwd'] = price.shift(-lead) / price - 1
    for lag in lags:
        out['ret' + str(lag)] = price / price.shift(lag) - 1

    # A NaN forward return compares as False here; those rows are removed by dropna().
    out['ycol'] = np.where(out['fwd'] >= tr, 1, 0)
    return out.dropna()


def my_tss(dtf, split, train_size=1000, test_size=100):
    """
    My take on time series split. It separates dataframe into "past" and 
    "future" from a splitting point. Also removes extra columns. 
    
    Parameters:
    dtf - dataframe of a stock; its first column, 'fwd' and 'ycol' are not features
    split - splitting point in trading days (row position)
    train_size=1000 - size of the training sample
    test_size=100 - size of the testing sample
    
    Returns (in this order):
    trainx - training features, rows [split - train_size, split)
    testx - testing features, rows [split, split + test_size)
    trainy - training target column
    testy - testing target column
    
    Attention!
    This function does not validate its input. If train_size is larger than
    split, the training window is clamped to start at row 0 (a negative start
    would be read by iloc as "counted from the end"). The testing sample may
    still be short or empty when split is close to the end of dtf.
    """
    train_start = max(split - train_size, 0)
    test_end = split + test_size

    # Build the feature view once instead of once per sample.
    features = dtf.drop(columns=[dtf.columns[0], 'fwd', 'ycol'])
    target = dtf['ycol']

    trainx = features.iloc[train_start:split]
    testx = features.iloc[split:test_end]
    trainy = target.iloc[train_start:split]
    testy = target.iloc[split:test_end]
    return trainx, testx, trainy, testy


def integerize(d):
    """
    Casts the integer-valued hyperparameters of a dictionary to int.
    Hyperopt hands such values over as floats (2.0 instead of 2), which
    is compensated for here.
    
    Parameters:
    d - dictionary of hyperparameters
    
    Returns:
    d - the same dictionary object, modified in place, with integers where required
    """
    integer_valued = frozenset((
        'train_size',
        'test_size',
        'num_leaves',
        'max_depth',
        'n_estimators',
        'min_child_samples',
        'upto',
        'ticks_to_use',
    ))

    # Only values are replaced, so iterating over the live items view is safe.
    for name, value in d.items():
        if name in integer_valued:
            d[name] = int(value)

    return d


def calc_ret(results, lead=5, portf=100):
    """
    Calculation of return from predicted data.
    
    Parameters:
    -----------
    results - list of dataframes that are stacked on top of each other. Every
                column of the stacked frame enters the row-wise average (NaN
                skipped), so the frames should hold per-stock returns only.
    lead=5 - number of staggered sub-portfolios; row i compounds sub-portfolio i % lead
    portf=100 - starting capital, split equally between the sub-portfolios
    
    Returns:
    --------
    ===negative=== return for the whole period, across all the dataframes,
                i.e. 1 - final_value / portf.
                Return is negative because hyperopt minimizes a function.
    """
    
    bigdf = pd.concat(results)
    # Rows without any value count as "no position": growth factor 1.
    daily_growth = bigdf.mean(axis=1).fillna(0) + 1
    
    subportf = [portf / lead] * lead
    for day_no, growth in enumerate(daily_growth):
        subportf[day_no % lead] *= growth
        
    return 1 - np.sum(subportf) / portf


def check_params(params, upto, stock_num=20):
    """
    Predicts on out-of-sample and out-of-cross-validation data
    using optimized hyperparameters. 
    
    Parameters:
    -------
    params - optimized hyperparameters, shared by all stocks
    upto - maximum split point for cross-validations,
            here it is incremented by 100 to not include the
            last validation sample (100 points in size)
    stock_num=20 - number of leading columns (tickers) of the global
            price frame `df` to predict for
            
    Returns:
    --------
    dataframe with one column of predicted-buy forward returns per stock and
                an 'avg' column, which is 1 + the average 5-day forward
                return for the day.
    """
    params = integerize(params)

    # Same procedure as check_paramsEF, with one parameter set for every stock,
    # so the per-stock fitting logic lives in a single place.
    shared = {tick: params for tick in df.columns[:stock_num]}
    return check_paramsEF(shared, upto)


def check_paramsEF(tick_params, upto):
    """
    Predicts on out-of-sample and out-of-cross-validation data
    using optimized hyperparameters for every ticker separately. 
    
    Parameters:
    -------
    tick_params - dict {ticker: dict of optimized hyperparameters}
    upto - maximum split point for cross-validations,
            here it is incremented by 100 to not include the
            last validation sample (100 points in size)
            
    Returns:
    --------
    dataframe with one column per ticker (forward return where the model
                predicted class 1, NaN otherwise) and an 'avg' column, which is
                1 + the average 5-day forward return for the day.
                Rows follow the test window of the first ticker that has one;
                tickers with an empty test window get an all-NaN column.
    """
    
    bigdf = pd.DataFrame()
    
    for tick, params in tick_params.items():
        
        params = integerize(params)
        stock = tickers_dfs[tick]
        
        trainx, testx, trainy, testy = my_tss(stock, upto+100, 
                                              train_size=params['train_size'])
        
        if testx.shape[0] == 0:
            # Nothing to predict for this ticker: keep its column, without values.
            bigdf[tick] = pd.Series(dtype=float)
            continue
        
        my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                     max_depth=params['max_depth'],
                                     learning_rate=params['learning_rate'],
                                     n_estimators=params['n_estimators'],
                                     min_child_samples=params['min_child_samples'])
        
        # Class-0 samples get weight params['sw'], class-1 samples weight 1.
        sw = np.where(trainy==0, params['sw'], 1)

        my_model.fit(trainx, trainy, sample_weight=sw)
        
        # Work on an explicit copy and keep predictions in their own Series, so
        # neither the cached ticker frame nor the my_tss slices are written to.
        tempdf = stock[['fwd']].copy()
        tempdf['pred'] = pd.Series(my_model.predict(testx), index=testx.index)
        tempdf = tempdf.dropna()  # keeps the test-window rows only
        bigdf[tick] = tempdf['fwd'].where(tempdf['pred'] == 1)
        
    bigdf['avg'] = bigdf.mean(axis=1).fillna(0)+1
        
    return bigdf


def get_est_series(dtf, com=0, prices=None, lead=5, portf=100, ref_row=1600):
    """
    Calculates total return over several out-of-cross-validation samples.
    
    Parameters:
    -----------
    dtf - dataframe with 'avg' column and one column per ticker; a non-NaN
            cell means "position taken in that ticker on that row"
    com=0 - value of commissions for ==one== side of a trade in USD/share
            (subtracted from the exit price and added to the entry price)
    prices=None - price frame with one column per ticker; the notebook-level
            `df` is used when omitted
    lead=5 - holding period in rows; also the number of staggered sub-portfolios
    portf=100 - starting capital, split equally between the sub-portfolios
    ref_row=1600 - row position of `prices` at which a ticker must have a
            non-negative (i.e. non-missing) price to be included
    
    Returns:
    portf_series - series with portfolio value at a point in time
    """
    if prices is None:
        prices = df
    
    comdf = dtf.drop(columns=['avg'])
    # Iterate over the ticker columns themselves instead of assuming that
    # 'avg' is the last column of dtf.
    for tick in list(comdf.columns):
        if prices[tick].iloc[ref_row] >= 0:
            # Column assignment aligns the price-based return to comdf's index.
            comdf[tick] = (prices[tick].shift(-lead)-com) / (prices[tick]+com)-1
            comdf[tick] = np.where(dtf[tick].notna(), comdf[tick], np.nan)
        else:
            comdf[tick] = np.nan
    comdf['avg'] = comdf.mean(axis=1).fillna(0)+1
    
    sub_value = portf / lead
    subportf = [sub_value] * lead
    history = []
    for day_no, growth in enumerate(comdf['avg']):
        subportf[day_no % lead] *= growth
        history.append(subportf.copy())
    
    portf_series = pd.DataFrame(history, index=comdf.index)
    # A row's growth is reported `lead` rows later; the first `lead` rows
    # therefore show the untouched starting capital.
    portf_series = portf_series.shift(lead)
    portf_series = portf_series.fillna(sub_value).sum(axis=1)
    
    return portf_series


def get_res(params):
    """
    Mean Matthews correlation of an LGBMClassifier over all tickers and splits.
    
    Parameters:
    -----------
    params - keyword arguments passed to LGBMClassifier
    
    Returns:
    --------
    average of the matt scores obtained for every ticker in `df` at the
    split points 1000, 1100, ..., 2100 (default train/test sizes of my_tss).
    """

    def fold_score(tick, split):
        # One fit/predict round for a single ticker and split point.
        trainx, testx, trainy, testy = my_tss(tickers_dfs[tick], split)
        classifier = LGBMClassifier(**params)
        classifier.fit(trainx, trainy)
        return matt(testy, classifier.predict(testx))

    scores = [
        fold_score(tick, split)
        for tick in df.columns
        for split in range(1000, 2200, 100)
    ]

    return np.mean(scores)


In [None]:
%%time

# Feature/target frame of every ticker, built once with the create_returns
# defaults and keyed by ticker; the functions above read this dict as a global.
tickers_dfs = {}

for tick in tqdm_notebook(df.columns):
    tickers_dfs[tick] = create_returns(df, tick)

In [None]:
def analyze_upto_modelD(params):
    """
    Cross-validation function that validates hyperparameters on the
    five 100-row test windows that end at some maximum split point and
    yields total return as output (negative for hyperopt purposes).
    
    Parameters:
    -------
    params - dictionary with hyperparameters from hyperopt
                and 'upto' key, that limits cross-validation in time,
                and 'ticks_to_use' key, the number of leading tickers of `df`
                
    Returns:
    --------
    negative total return over the cross-validation windows, as computed by
    calc_ret (the loss that hyperopt minimizes)
    """
    
    params = integerize(params)
    
    results = []
    
    my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                         max_depth=params['max_depth'],
                                         learning_rate=params['learning_rate'],
                                         n_estimators=params['n_estimators'],
                                         min_child_samples=params['min_child_samples'])
    
    for split in range(params['upto']-500, params['upto'], 100):
        
        # Holds ticker columns only: calc_ret averages over every column, so
        # helper columns such as 'pred' or 'fwd' must not be stored here.
        fold = pd.DataFrame()
        
        for tick in df.columns[:params['ticks_to_use']]:
            stock = tickers_dfs[tick]
            trainx, testx, trainy, testy = my_tss(stock, split, 
                                                      train_size=params['train_size'])
            
            # Class-0 samples get weight params['sw'], class-1 samples weight 1.
            sw = np.where(trainy==0, params['sw'], 1)
            
            my_model.fit(trainx, trainy, sample_weight=sw)
            bought = pd.Series(my_model.predict(testx), index=testx.index) == 1
            # Forward return where class 1 was predicted, NaN otherwise. The
            # assignment aligns it to the rows of the first ticker of the fold.
            fold[tick] = stock['fwd'].reindex(testx.index).where(bought)
        
        results.append(fold)
    
    # calc_ret already returns the negated return; negating it again would
    # make hyperopt search for the lowest return.
    return calc_ret(results)

def bayes_opt_modelD(upto, ticks_to_use=20, eval_n=100, rstate=None):
    """
    Hyperopt's Bayesian optimization.
    
    Parameters:
    -------
    upto - maximum split point for cross-validations
    ticks_to_use=20 - number of leading tickers of `df` used in the cross-validation
    eval_n=100 - number of evaluations to do
    rstate=None - random state forwarded to hyperopt's fmin to make the search
                repeatable; None keeps hyperopt's own default.
                NOTE(review): the accepted type depends on the installed
                hyperopt version - confirm before passing one.
    
    Returns:
    tpe_best - best found hyperparameters, as returned by fmin.
                NOTE(review): fmin reports hp.choice parameters ('upto' here)
                by index rather than by value - confirm against the hyperopt
                docs before reading 'upto' from the result.
    """
    
    
    space_index = {
        'num_leaves': hp.quniform('num_leaves', 10, 50, 1),
        'max_depth': hp.quniform('max_depth', 3, 8, 1),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.05), np.log(0.2)),
        'n_estimators': hp.quniform('n_estimators', 32, 512, 8),
        'min_child_samples': hp.quniform('min_child_samples', 10, 50, 5),
        'sw': hp.uniform('sw', 0.4, 0.7),
        'train_size': hp.quniform('train_size', 400, 1000, 100),
        # Constants that analyze_upto_modelD reads from the sampled dictionary.
        'upto': hp.choice('upto', [upto]),
        'ticks_to_use': ticks_to_use
    }
    
    tpe_algo = tpe.suggest
    tpe_trials = Trials()
    tpe_best = fmin(fn=analyze_upto_modelD, space=space_index, algo=tpe_algo, 
                    trials=tpe_trials, max_evals=eval_n, verbose=True,
                    rstate=rstate)
    
    return tpe_best


In [None]:
"""
Calculating the overall out-of-sample return for Model D20
First 20 stocks only.
"""

# Walk forward over upto = 1500, 1600, ..., 2000: optimise the hyperparameters
# on the cross-validation windows before `upto`, then let check_params predict
# the 100 rows that start at upto+100 with a model fitted on the rows before them.
modelD20_returns = []
for upto in tqdm_notebook(range(1500, 2100, 100)):

    opt_params = bayes_opt_modelD(upto)
    modelD20_returns.append(check_params(opt_params, upto))

# Stack the per-window frames and turn them into a portfolio value series.
modelD20_ret = get_est_series(pd.concat(modelD20_returns))

# The printed figure is the final portfolio value minus the starting value of 100.
print("Average return for Model D20:", \
      round(modelD20_ret.iloc[-1]-100, 2), '%')

In [None]:
"""
Calculating the overall out-of-sample return for Model D317
All 317 stocks.
"""

# Same walk-forward loop as for Model D20, but the hyperparameter search and
# the prediction both use the first 317 columns of df.
modelD317_returns = []
for upto in tqdm_notebook(range(1500, 2100, 100)):

    opt_params = bayes_opt_modelD(upto, ticks_to_use=317)
    modelD317_returns.append(check_params(opt_params, upto, stock_num=317))

# Stack the per-window frames and turn them into a portfolio value series.
modelD317_ret = get_est_series(pd.concat(modelD317_returns))

# The printed figure is the final portfolio value minus the starting value of 100.
print("Average return for Model D317:", \
      round(modelD317_ret.iloc[-1]-100, 2), '%')

In [None]:
# Stack the per-window prediction frames of Model D317 into one frame.
bigdf_D317 = pd.concat(modelD317_returns)
"""
Reading indices file. MDY ETF price must be there
"""
# NOTE(review): relative path - resolved against the notebook's working directory.
inddf = pd.read_excel('indices.xlsx', index_col="Date")
compare = pd.DataFrame()
# The first column assigned fixes the index of `compare`; later columns are aligned to it.
compare['No commissions'] = get_est_series(bigdf_D317, 0)
compare['0.005 commissions'] = get_est_series(bigdf_D317, 0.005)
compare['MDY ETF'] = inddf['mdy']
# Rebase the ETF to 100 at the first row, the starting value of the portfolio series.
compare['MDY ETF'] = compare['MDY ETF']/compare['MDY ETF'].iloc[0]*100
compare.tail()

In [None]:
def new_est_series(dtf, com=0):
    """
    Calculates total return over several out-of-cross-validation samples,
    with returns taken from the prices in `new_data`.
    
    Parameters:
    -----------
    dtf - dataframe with 'avg' column and one column per ticker; a non-NaN
            cell means "position taken in that ticker on that row"
    com=0 - value of commissions for ==one== side of a trade in USD/share
            (subtracted from the exit price and added to the entry price)
    
    Returns:
    portf_series - series with portfolio value at a point in time
    """
    lead = 5      # holding period in rows and number of staggered sub-portfolios
    portf = 100   # starting capital
    
    comdf = dtf.drop(columns=['avg'])
    # Iterate over the ticker columns themselves instead of assuming that
    # 'avg' is the last column of dtf.
    for tick in list(comdf.columns):
        # NOTE(review): this guard still reads the old price frame `df`, while
        # the returns below come from `new_data` - confirm that is intended
        # (a ticker missing from `df` raises KeyError here).
        if df[tick].iloc[1600] >= 0:
            # Column assignment aligns the price-based return to comdf's index.
            comdf[tick] = (new_data[tick].shift(-lead)-com) / (new_data[tick]+com)-1
            comdf[tick] = np.where(dtf[tick].notna(), comdf[tick], np.nan)
        else:
            comdf[tick] = np.nan
    comdf['avg'] = comdf.mean(axis=1).fillna(0)+1
    
    sub_value = portf / lead
    subportf = [sub_value] * lead
    history = []
    for day_no, growth in enumerate(comdf['avg']):
        subportf[day_no % lead] *= growth
        history.append(subportf.copy())
    
    portf_series = pd.DataFrame(history, index=comdf.index)
    # A row's growth is reported `lead` rows later; the first `lead` rows
    # therefore show the untouched starting capital.
    portf_series = portf_series.shift(lead)
    portf_series = portf_series.fillna(sub_value).sum(axis=1)
    
    return portf_series

In [None]:
"""
Creating new data.
"""

# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
new_data = pd.read_excel('C:/Users/FS_Askar_A/long indices/midcapl.xlsx', index_col="Date")

# Feature/target frame of every ticker of the new data, keyed by ticker
# (same construction as tickers_dfs for the old data).
new_dfs = {}

for tick in tqdm_notebook(new_data.columns):
    new_dfs[tick] = create_returns(new_data, tick)

In [None]:
"""
This does one round of cross-validation to get optimized hyperparameters that worked for old data. 
They are then used by the model that predicts on the new data.
"""
# Number of rows in the processed frame of the first old ticker; used as the
# `upto` limit here and as the train/test split position in the next cell.
final_old_data = tickers_dfs[df.columns[0]].shape[0]
opt_params = bayes_opt_modelD(final_old_data, ticks_to_use=317)

In [None]:
# Fit one model per ticker on the rows before `final_old_data` and predict
# every row from that position on (up to the my_tss default test size).
params = integerize(opt_params)
bigdf = pd.DataFrame()

for tick in tqdm_notebook(new_data.columns):

    trainx, testx, trainy, testy = my_tss(new_dfs[tick], final_old_data, train_size=params['train_size'])
    
    # Tickers without rows after the split have nothing to predict.
    if testx.shape[0] > 0:
        my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                     max_depth=params['max_depth'],
                                     learning_rate=params['learning_rate'],
                                     n_estimators=params['n_estimators'],
                                     min_child_samples=params['min_child_samples'])

        # Class-0 samples get weight params['sw'], class-1 samples weight 1.
        sw = np.where(trainy==0, params['sw'], 1)

        my_model.fit(trainx, trainy, sample_weight=sw)

        # Work on an explicit copy and keep predictions in their own Series, so
        # neither the cached ticker frame nor the my_tss slices are written to.
        tempdf = new_dfs[tick][['fwd']].copy()
        tempdf['pred'] = pd.Series(my_model.predict(testx), index=testx.index)
        tempdf = tempdf.dropna()  # keeps the test-window rows only
        # Forward return where class 1 was predicted, NaN otherwise.
        bigdf[tick] = tempdf['fwd'].where(tempdf['pred'] == 1)

bigdf['avg'] = bigdf.mean(axis=1).fillna(0)+1

# NOTE(review): relative path - resolved against the notebook's working directory.
inddf = pd.read_excel('indices.xlsx', index_col="Date")
compare = pd.DataFrame()
# The first column assigned fixes the index of `compare`; later columns are aligned to it.
compare['No commissions'] = new_est_series(bigdf, 0)
compare['0.005 commissions'] = new_est_series(bigdf, 0.005)
compare['MDY ETF'] = inddf['mdy']
# Rebase the ETF to 100 at the first row, the starting value of the portfolio series.
compare['MDY ETF'] = compare['MDY ETF']/compare['MDY ETF'].iloc[0]*100
compare.tail()