In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings("ignore")
from lightgbm import LGBMClassifier
from sklearn.metrics import matthews_corrcoef as matt
from hyperopt import hp
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin

In [2]:
"""
Reading the data. Data has stock tickers in the first row and dates in the first column. Only trading days are used.
"""
# `df` is the price table every model below iterates over (one column per ticker);
# the "Date" column of the sheet becomes the index.
df = pd.read_excel('midcap.xlsx', index_col="Date")

### The following methods are used throughout all models

In [28]:
def create_returns(dtf, tick, lead=5, lags=[1,2,3,4,5,7,10,15,20,30,50], tr=0.025):
    """
    Calculates forward and lagged returns for a stock. Also creates target column
    for returns over some threshold.
    
    Parameters:
    -----------
    dtf - data with all stock prices (one column per ticker)
    tick - ticker of stock 
    lead - forward return lead (in rows)
    lags - lagged returns (in rows); the default list is only iterated, never mutated
    tr - threshold value for target variable
    
    Returns:
    --------
    New DataFrame for one stock with its price, forward return ('fwd'), lagged
    returns ('ret<lag>') and target column ('ycol', 1 where fwd >= tr else 0).
    Rows containing any NaN (warm-up rows of the longest lag, the last `lead`
    rows, missing prices) are dropped. The input frame is left untouched.
    """
    # Explicit copy: the new columns are added to an independent frame instead of
    # an implicit slice of `dtf` (which raises SettingWithCopyWarning).
    out = dtf[[tick]].copy()
    price = out[tick]

    out['fwd'] = price.shift(-lead) / price - 1
    for lag in lags:
        out['ret' + str(lag)] = price / price.shift(lag) - 1

    # Rows with a NaN forward return get 0 here but are removed by dropna() below.
    out['ycol'] = np.where(out['fwd'] >= tr, 1, 0)
    return out.dropna()


def my_tss(dtf, split, train_size=1000, test_size=100):
    """
    My take on time series split. It separates dataframe into "past" and 
    "future" from a splitting point. Also removes extra columns. 
    
    Parameters:
    dtf - dataframe of a stock (first column is the price; must contain 'fwd' and 'ycol')
    split - splitting point, as a positional row number
    train_size=1000 - size of the training sample
    test_size=100 - size of the testing sample
    
    Returns (in this order):
    trainx - training features
    testx - testing features
    trainy - training target column
    testy - testing target column
    
    Attention!
    This function stays deliberately light because it is called thousands of
    times: it does not check whether the testing sample has any rows. If
    train_size exceeds split, the training window is shortened so it starts at
    row 0.
    """
    # Clamp at 0: a negative start would be read by iloc as "count from the end",
    # which silently returns an empty or wrong training window.
    train_start = max(split - train_size, 0)
    test_end = split + test_size

    # Build the feature frame once: everything except price, forward return and target.
    features = dtf.drop(columns=[dtf.columns[0], 'fwd', 'ycol'])
    target = dtf['ycol']

    trainx = features.iloc[train_start:split]
    testx = features.iloc[split:test_end]
    trainy = target.iloc[train_start:split]
    testy = target.iloc[split:test_end]
    return trainx, testx, trainy, testy


def integerize(d):
    """
    Casts the integer-valued hyperparameters of `d` to int, in place.
    Hyperopt hands such values over as floats (2.0 instead of 2).
    
    Parameters:
    d - dictionary of hyperparameters
    
    Returns:
    d - the same dictionary object, with integers where required
    """
    whole_number_keys = ('train_size', 'test_size', 'num_leaves', 'max_depth',
                         'n_estimators', 'min_child_samples', 'upto')

    for key in d:
        if key not in whole_number_keys:
            continue
        d[key] = int(d[key])

    return d


def calc_ret(results, lead=5, portf=100):
    """
    Calculation of return from predicted data.
    
    Parameters:
    -----------
    results - list of dataframes that are concatenated row-wise. Each row's
                return is the mean across ALL columns of the frame (NaN rows
                count as a 0 return), so the frames should hold per-stock
                forward returns only.
    lead=5 - holding period in rows; the portfolio is split into `lead`
                equal sub-portfolios that are compounded in rotation
    portf=100 - starting portfolio value
    
    Returns:
    --------
    ===negative=== return for the whole period, across all the dataframes. 
                Return is negative because hyperopt minimizes a function.
    """
    bigdf = pd.concat(results)
    growth = bigdf.mean(axis=1).fillna(0) + 1

    subportf = [portf / lead for _ in range(lead)]
    # Row k compounds sub-portfolio k % lead (previously a literal 5 that had
    # to be kept in sync with `lead` by hand).
    for day_no, factor in enumerate(growth):
        subportf[day_no % lead] = subportf[day_no % lead] * factor

    return 1 - np.sum(subportf) / portf

### Calculating returns for each stock

In [8]:
%%time

# One feature/target frame per ticker, keyed by ticker symbol.
tickers_dfs = {
    tick: create_returns(df, tick)
    for tick in tqdm_notebook(df.columns)
}

HBox(children=(IntProgress(value=0, max=317), HTML(value='')))


Wall time: 2min 41s


### Model A

In [20]:
"""
Model A317 - all 317 stocks
"""

modelA317_results = {}

for tick in tqdm_notebook(df.columns):

    stock_frame = tickers_dfs[tick]
    fold_scores = []

    # Walk forward through the split points with default LightGBM settings.
    for split in range(1000, 2200, 100):
        trainx, testx, trainy, testy = my_tss(stock_frame, split)
        my_model = LGBMClassifier().fit(trainx, trainy)
        fold_scores.append(matt(testy, my_model.predict(testx)))

    # Per-stock score = mean Matthews correlation over its folds.
    modelA317_results[tick] = np.mean(fold_scores)

modelA317_results = pd.Series(modelA317_results)

print("Average matt score across all stocks:", round(modelA317_results.mean(), 3))
print("Min matt score across all stocks:", round(modelA317_results.min(), 3))
print("Max matt score across all stocks:", round(modelA317_results.max(), 3))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))


Average matt score across all stocks: 0.022
Min matt score across all stocks: -0.007
Max matt score across all stocks: 0.063


In [None]:
"""
Model A20 - first 20 stocks
"""

modelA20_results = {}

for tick in tqdm_notebook(df.columns[:20]):

    stock_frame = tickers_dfs[tick]
    fold_scores = []

    # Same walk-forward evaluation as Model A317, restricted to the first 20 columns.
    for split in range(1000, 2200, 100):
        trainx, testx, trainy, testy = my_tss(stock_frame, split)
        my_model = LGBMClassifier().fit(trainx, trainy)
        fold_scores.append(matt(testy, my_model.predict(testx)))

    modelA20_results[tick] = np.mean(fold_scores)

modelA20_results = pd.Series(modelA20_results)

print("Average matt score for the first 20 stocks:", round(modelA20_results.mean(), 3))

### Sensitivity to hyperparameters change

In [22]:
def get_res(params):
    """
    Average Matthews score over every stock in `df` and every split point
    in range(1000, 2200, 100), using LGBMClassifier(**params).

    Parameters:
    params - dictionary of LGBMClassifier keyword arguments

    Returns:
    mean of the per-(stock, split) matt scores
    """

    def fold_score(stock_frame, split):
        # Fit a fresh classifier on the "past", score it on the "future".
        trainx, testx, trainy, testy = my_tss(stock_frame, split)
        classifier = LGBMClassifier(**params)
        classifier.fit(trainx, trainy)
        return matt(testy, classifier.predict(testx))

    scores = [
        fold_score(tickers_dfs[tick], split)
        for tick in df.columns
        for split in range(1000, 2200, 100)
    ]
    return np.mean(scores)

# One-at-a-time sensitivity check: each entry overrides a single LightGBM default.
params_changes = {
    'num_leaves': 4,
    'max_depth': 5,
    'learning_rate': 0.01,
    'n_estimators': 20,
    'min_child_samples': 10
}

for k, v in params_changes.items():
    changed_score = round(get_res({k: v}), 3)
    print("Parameter", k, "changed value to", v,
          "Average matt score changed to", changed_score)


Parameter num_leaves changed value to 4 Average matt score changed to 0.044
Parameter max_depth changed value to 5 Average matt score changed to 0.015
Parameter learning_rate changed value to 0.01 Average matt score changed to 0.004
Parameter n_estimators changed value to 20 Average matt score changed to 0.014
Parameter min_child_samples changed value to 10 Average matt score changed to 0.013


### Model B

In [35]:
def analyze_upto_modelB(params):
    """
    Hyperopt objective for Model B: walk-forward validation of one
    hyperparameter set over the five split points that precede
    params['upto'], on the first 20 stocks.
    
    Params:
    -------
    params - dictionary with hyperparameters from hyperopt
                and 'upto' key, that limits cross-validation in time
                
    Returns:
    --------
    negative average matt score for the cross-validation
    (negative because hyperopt minimizes)
    """

    params = integerize(params)

    booster_keys = ('num_leaves', 'max_depth', 'learning_rate',
                    'n_estimators', 'min_child_samples')
    # One estimator object, re-fitted for every (split, stock) pair.
    my_model = LGBMClassifier(**{key: params[key] for key in booster_keys})

    last_split = params['upto']
    fold_scores = []

    for split in range(last_split-500, last_split, 100):
        for tick in df.columns[:20]:
            trainx, testx, trainy, testy = my_tss(tickers_dfs[tick], split)
            my_model.fit(trainx, trainy)
            fold_scores.append(matt(testy, my_model.predict(testx)))

    return -np.mean(fold_scores)

def bayes_opt_modelB(upto, eval_n=100):
    """
    Runs hyperopt's TPE search over the Model B hyperparameter space.
    
    Params:
    -------
    upto - maximum split point for cross-validations
    eval_n=100 - number of evaluations to do
    
    Returns:
    the dictionary returned by fmin for the best trial found
    """

    search_space = {
        'num_leaves': hp.quniform('num_leaves', 10, 50, 1),
        'max_depth': hp.quniform('max_depth', 3, 8, 1),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.05), np.log(0.2)),
        'n_estimators': hp.quniform('n_estimators', 32, 512, 8),
        'min_child_samples': hp.quniform('min_child_samples', 10, 50, 5),
        # Single-option choice: a way to pass the constant `upto` to the objective.
        'upto': hp.choice('upto', [upto])
    }

    return fmin(
        fn=analyze_upto_modelB,
        space=search_space,
        algo=tpe.suggest,
        trials=Trials(),
        max_evals=eval_n,
        verbose=True,
    )

def predict_params_modelB(params, upto):
    """
    Predicts on out-of-sample and out-of-cross-validation data
    using optimized hyperparameters, on the first 20 stocks.
    
    Params:
    -------
    params - optimized hyperparameters
    upto - maximum split point for cross-validations,
            here it is incremented by 100 to not include the
            last validation sample (100 points in size)
            
    Returns:
    --------
    average matt score result
    """
    
    params = integerize(params)
    results = []
    
    for tick in df.columns[:20]:
        
        # (The unused per-ticker `tempdf` slice that used to be built here was removed.)
        trainx, testx, trainy, testy = my_tss(tickers_dfs[tick], upto+100)
        
        my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                     max_depth=params['max_depth'],
                                     learning_rate=params['learning_rate'],
                                     n_estimators=params['n_estimators'],
                                     min_child_samples=params['min_child_samples'])
                    
        my_model.fit(trainx, trainy)
        predy = my_model.predict(testx)
        results.append(matt(testy, predy))
        
    return np.mean(results)

In [32]:
"""
Calculating the overall matt score for Model B20
First 20 stocks only.
"""

modelB20_results = []
for upto in tqdm_notebook(range(1500, 2100, 100)):
    # Tune up to `upto`, then score the tuned parameters on the following window.
    modelB20_results.append(predict_params_modelB(bayes_opt_modelB(upto), upto))

print("Average score for Model B20:", round(np.mean(modelB20_results), 3))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

### Model C

In [39]:
def analyze_upto_modelC(params):
    """
    Hyperopt objective for Model C: like Model B, but the training window
    length ('train_size') and the weight of class-0 samples ('sw') are
    also tuned. Validates over the five split points that precede
    params['upto'], on the first 20 stocks.
    
    Params:
    -------
    params - dictionary with hyperparameters from hyperopt
                and 'upto' key, that limits cross-validation in time
                
    Returns:
    --------
    negative average matt score for the cross-validation
    (negative because hyperopt minimizes)
    """

    params = integerize(params)

    booster_keys = ('num_leaves', 'max_depth', 'learning_rate',
                    'n_estimators', 'min_child_samples')
    # One estimator object, re-fitted for every (split, stock) pair.
    my_model = LGBMClassifier(**{key: params[key] for key in booster_keys})

    last_split = params['upto']
    fold_scores = []

    for split in range(last_split-500, last_split, 100):
        for tick in df.columns[:20]:
            trainx, testx, trainy, testy = my_tss(tickers_dfs[tick], split,
                                                  train_size=params['train_size'])

            # Class-0 rows are weighted by params['sw'], class-1 rows by 1.
            sw = np.where(trainy==0, params['sw'], 1)

            my_model.fit(trainx, trainy, sample_weight=sw)
            fold_scores.append(matt(testy, my_model.predict(testx)))

    return -np.mean(fold_scores)

def bayes_opt_modelC(upto, eval_n=100):
    """
    Runs hyperopt's TPE search over the Model C hyperparameter space.
    
    Params:
    -------
    upto - maximum split point for cross-validations
    eval_n=100 - number of evaluations to do
    
    Returns:
    the dictionary returned by fmin for the best trial found
    """

    search_space = {
        'num_leaves': hp.quniform('num_leaves', 10, 50, 1),
        'max_depth': hp.quniform('max_depth', 3, 8, 1),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.05), np.log(0.2)),
        'n_estimators': hp.quniform('n_estimators', 32, 512, 8),
        'min_child_samples': hp.quniform('min_child_samples', 10, 50, 5),
        'sw': hp.uniform('sw', 0.4, 0.7),
        'train_size': hp.quniform('train_size', 400, 1000, 100),
        # Single-option choice: a way to pass the constant `upto` to the objective.
        'upto': hp.choice('upto', [upto])
    }

    return fmin(
        fn=analyze_upto_modelC,
        space=search_space,
        algo=tpe.suggest,
        trials=Trials(),
        max_evals=eval_n,
        verbose=True,
    )

def predict_params_modelC(params, upto):
    """
    Predicts on out-of-sample and out-of-cross-validation data
    using optimized hyperparameters, on the first 20 stocks.
    
    Params:
    -------
    params - optimized hyperparameters (must include 'train_size' and 'sw')
    upto - maximum split point for cross-validations,
            here it is incremented by 100 to not include the
            last validation sample (100 points in size)
            
    Returns:
    --------
    average matt score result
    """
    
    params = integerize(params)
    results = []
    
    for tick in df.columns[:20]:
        
        my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                     max_depth=params['max_depth'],
                                     learning_rate=params['learning_rate'],
                                     n_estimators=params['n_estimators'],
                                     min_child_samples=params['min_child_samples'])
        
        # Split at upto+100. This used to read a name `split` that is not defined
        # in this function, so it either raised NameError or silently picked up
        # whatever value a previously-run cell left in the global `split`.
        trainx, testx, trainy, testy = my_tss(tickers_dfs[tick], upto+100, 
                                              train_size=params['train_size'])
            
        # Class-0 rows are weighted by params['sw'], class-1 rows by 1.
        sw = np.where(trainy==0, params['sw'], 1)

        my_model.fit(trainx, trainy, sample_weight=sw)
        predy = my_model.predict(testx)
        results.append(matt(testy, predy))
        
    return np.mean(results)

In [40]:
def check_params(params, upto, stock_num=20):
    """
    Predicts on out-of-sample and out-of-cross-validation data
    using optimized hyperparameters. 
    
    Params:
    -------
    params - optimized hyperparameters (must include 'train_size' and 'sw')
    upto - maximum split point for cross-validations,
            here it is incremented by 100 to not include the
            last validation sample (100 points in size)
    stock_num=20 - number of leading columns of `df` to evaluate
            
    Returns:
    --------
    dataframe with one column per stock holding the forward return on rows
    where the model predicted 1 (NaN elsewhere), plus an 'avg' column:
    1 + the row mean of those returns (rows with no prediction give 1).
    Rows are those of the first stock's test window; later stocks are
    aligned to that index.
    """
    params = integerize(params)
    bigdf = pd.DataFrame()
    
    for tick in df.columns[:stock_num]:
        
        stock_df = tickers_dfs[tick]
        trainx, testx, trainy, testy = my_tss(stock_df, upto+100, 
                                              train_size=params['train_size'])
        
        my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                     max_depth=params['max_depth'],
                                     learning_rate=params['learning_rate'],
                                     n_estimators=params['n_estimators'],
                                     min_child_samples=params['min_child_samples'])
        
        # Class-0 rows are weighted by params['sw'], class-1 rows by 1.
        sw = np.where(trainy==0, params['sw'], 1)
            
        my_model.fit(trainx, trainy, sample_weight=sw)
        
        # Keep predictions in their own Series instead of writing a 'pred' column
        # into testx / a slice of the cached stock frame.
        pred = pd.Series(my_model.predict(testx), index=testx.index)
        fwd_test = stock_df['fwd'].loc[testx.index]
        bigdf[tick] = fwd_test.where(pred == 1)
        
    bigdf['avg'] = bigdf.mean(axis=1).fillna(0)+1
        
    return bigdf

def get_est_series(dtf, com=0, prices=None):
    """
    Calculates total return over several out-of-cross-validation samples.
    
    Parameters:
    -----------
    dtf - dataframe with one column per stock (non-NaN where a trade is taken)
            and an 'avg' column
    com=0 - value of commissions for ==one== side of a trade in USD/share
    prices=None - price table with one column per stock; when omitted the
            notebook-level `tdf` is used
    
    Returns:
    portf_series - series with portfolio value at a point in time
    """
    if prices is None:
        prices = tdf  # notebook-level price table

    lead = 5      # holding period in rows
    portf = 100   # starting portfolio value

    # Work from the `dtf` argument (this used to read the global `bigdf`).
    comdf = dtf.drop(columns=['avg'])
    for tick in comdf.columns:
        # Keeps only stocks whose price at row 1600 is non-negative and non-NaN
        # (NaN >= 0 is False) - presumably an "already listed" filter; TODO confirm.
        if prices[tick].iloc[1600] >= 0:
            # Forward return net of commission on both the buy and the sell side.
            net_ret = (prices[tick].shift(-lead)-com) / (prices[tick]+com)-1
            comdf[tick] = net_ret.reindex(comdf.index).where(dtf[tick].notna())
        else:
            comdf[tick] = np.nan
    comdf['avg'] = comdf.mean(axis=1).fillna(0)+1
    
    portf_series = []
    subportf = [portf/lead for _ in range(lead)]
    for day_no, factor in enumerate(comdf['avg']):
        subportf[day_no % lead] = subportf[day_no % lead]*factor
        portf_series.append(subportf.copy())
    
    portf_series = pd.DataFrame(portf_series, index=comdf.index)
    # A return is only realised `lead` rows after entry, so shift the values
    # forward and fill the first rows with the initial sub-portfolio value.
    portf_series = portf_series.shift(lead)
    portf_series = portf_series.fillna(portf/lead).sum(axis=1)
    
    return portf_series

In [None]:
"""
Calculating the overall matt score for Model C20
First 20 stocks only.
"""

modelC20_matt = []
for upto in tqdm_notebook(range(1500, 2100, 100)):
    # Tune up to `upto` (only 3 evaluations here), then score on the following window.
    tuned_params = bayes_opt_modelC(upto, eval_n=3)
    modelC20_matt.append(predict_params_modelC(tuned_params, upto))

print("Average score for Model C20:", round(np.mean(modelC20_matt), 3))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

In [None]:
from time import time
bayes_step = 1 

def analyze_upto(params):
    """
    Hyperopt objective that optimizes portfolio return instead of matt score.

    For each of the five split points preceding params['upto'] it fits the
    model per stock (all columns of `df`), keeps the forward return of every
    row predicted as 1, and passes the per-split frames to calc_ret.

    Params:
    -------
    params - dictionary with hyperparameters from hyperopt, including
                'train_size', 'sw' and 'upto'

    Returns:
    --------
    value of calc_ret for the collected frames (negative return, because
    hyperopt minimizes). Also prints progress and increments the global
    `bayes_step` counter.
    """
    global bayes_step
    t0 = time()
    
    params = integerize(params)
    
    results = []
    
    my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                         max_depth=params['max_depth'],
                                         learning_rate=params['learning_rate'],
                                         n_estimators=params['n_estimators'],
                                         min_child_samples=params['min_child_samples'])
    
    for split in range(params['upto']-500, params['upto'], 100):
        
        # Holds ONLY per-stock return columns. The helper 'pred'/'fwd' columns
        # used to be left in this frame, so calc_ret averaged them in with the
        # stock returns and distorted the objective.
        fold_returns = pd.DataFrame()
        
        for tick in df.columns:
            stock_df = tickers_dfs[tick]
            trainx, testx, trainy, testy = my_tss(stock_df, split, 
                                                  train_size=params['train_size'], 
                                                  test_size=100)
            
            # Class-0 rows are weighted by params['sw'], class-1 rows by 1.
            sw = np.where(trainy==0, params['sw'], 1)
            
            my_model.fit(trainx, trainy, sample_weight=sw)
            # Predictions live in their own Series; testx is not modified.
            pred = pd.Series(my_model.predict(testx), index=testx.index)
            fold_returns[tick] = stock_df['fwd'].loc[testx.index].where(pred == 1)
        
        results.append(fold_returns)
    
    res = calc_ret(results)
    print("Step #:", bayes_step, round(-res, 4), 'Time elapsed: '+str(round(time()-t0,2)))
    bayes_step+=1
    
    return res

In [None]:
def bayes_opt(upto, eval_n=100):
    """
    Runs hyperopt's TPE search with analyze_upto as the objective.

    Params:
    -------
    upto - maximum split point for cross-validations
    eval_n=100 - number of evaluations to do

    Returns:
    (tpe_best, tpe_trials) - the dictionary returned by fmin and the Trials
    object holding every evaluation
    """
    search_space = {
        'num_leaves': hp.quniform('num_leaves', 10, 50, 1),
        'max_depth': hp.quniform('max_depth', 3, 8, 1),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.05), np.log(0.2)),
        'n_estimators': hp.quniform('n_estimators', 32, 512, 8),
        'min_child_samples': hp.quniform('min_child_samples', 10, 50, 5),
        'sw': hp.uniform('sw', 0.4, 0.7),
        'train_size': hp.quniform('train_size', 400, 1000, 100),
        # Single-option choice: a way to pass the constant `upto` to the objective.
        'upto': hp.choice('upto', [upto])
    }
    history = Trials()
    best = fmin(
        fn=analyze_upto,
        space=search_space,
        algo=tpe.suggest,
        trials=history,
        max_evals=eval_n,
        verbose=True,
    )
    return best, history

In [None]:
def check_params(params, upto, stock_num=None):
    """
    Predicts on out-of-sample and out-of-cross-validation data
    using optimized hyperparameters.

    Params:
    -------
    params - optimized hyperparameters (must include 'train_size' and 'sw')
    upto - maximum split point for cross-validations; the split used here
            is upto+100
    stock_num=None - number of leading columns of `df` to evaluate;
            None (default) evaluates every column

    Returns:
    --------
    dataframe with one column per stock holding the forward return on rows
    where the model predicted 1 (NaN elsewhere), plus an 'avg' column:
    1 + the row mean of those returns (rows with no prediction give 1).
    Rows are those of the first stock's test window; later stocks are
    aligned to that index.
    """
    
    params = integerize(params)
    bigdf = pd.DataFrame()
    
    for tick in df.columns[:stock_num]:
        
        stock_df = tickers_dfs[tick]
        trainx, testx, trainy, testy = my_tss(stock_df, upto+100, 
                                              train_size=params['train_size'], 
                                                  test_size=100)
        my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                     max_depth=params['max_depth'],
                                     learning_rate=params['learning_rate'],
                                     n_estimators=params['n_estimators'],
                                     min_child_samples=params['min_child_samples'])
        
        # Class-0 rows are weighted by params['sw'], class-1 rows by 1.
        sw = np.where(trainy==0, params['sw'], 1)
            
        my_model.fit(trainx, trainy, sample_weight=sw)
        # Keep predictions in their own Series instead of writing a 'pred' column
        # into testx / a slice of the cached stock frame.
        pred = pd.Series(my_model.predict(testx), index=testx.index)
        fwd_test = stock_df['fwd'].loc[testx.index]
        bigdf[tick] = fwd_test.where(pred == 1)
        
    bigdf['avg'] = bigdf.mean(axis=1).fillna(0)+1
        
    return bigdf

In [None]:
# Second price table, read from the 'long indices' folder; get_est_series and the
# out-of-sample cells below read prices from `tdf`.
tdf = pd.read_excel('long indices/midcapl.xlsx', index_col="Date")

In [None]:
%%time

# One feature/target frame per ticker of `tdf`, keyed by ticker symbol.
tickers_tdfs = {
    tick: create_returns(tdf, tick)
    for tick in tqdm_notebook(tdf.columns)
}

In [None]:
# Compare the (rows, columns) of the 'exel' frame built from each of the two price files.
tickers_dfs['exel'].shape, tickers_tdfs['exel'].shape

In [None]:
# Reset the step counter that analyze_upto prints, then run 100 evaluations with
# the cross-validation limit set to the row count of the 'exel' frame from `df`.
bayes_step = 1 
opt_params, opt_trials = bayes_opt(tickers_dfs['exel'].shape[0], eval_n=100)

In [None]:
# Rank of the trial to use after sorting by loss, ascending (0 = best, 1 = second best).
frombest = 1

# Rank the trials straight from `opt_trials` so this cell does not depend on
# `opt_trails1`, which is only built in a later cell.
ranked_trials = sorted(opt_trials.trials, key=lambda trial: trial['result']['loss'])
chosen_trial = ranked_trials[frombest]
# hyperopt stores each sampled value as a one-element list.
opt_params = {name: vals[0] for name, vals in chosen_trial['misc']['vals'].items()}
print(chosen_trial['result']['loss'])
params = integerize(opt_params)
bigdf = pd.DataFrame()

# Split right after the last row that the `df`-based 'exel' frame covers.
split_point = tickers_dfs['exel'].shape[0]

for tick in tqdm_notebook(tdf.columns):

    stock_df = tickers_tdfs[tick]
    trainx, testx, trainy, testy = my_tss(stock_df, split_point, 
                                          train_size=params['train_size'], 
                                              test_size=100)
    # Stocks with no rows after the split point are skipped.
    if testx.shape[0] > 0:
        my_model = LGBMClassifier(num_leaves=params['num_leaves'],
                                     max_depth=params['max_depth'],
                                     learning_rate=params['learning_rate'],
                                     n_estimators=params['n_estimators'],
                                     min_child_samples=params['min_child_samples'])

        # Class-0 rows are weighted by params['sw'], class-1 rows by 1.
        sw = np.where(trainy==0, params['sw'], 1)

        my_model.fit(trainx, trainy, sample_weight=sw)
        # Predictions live in their own Series; testx and the cached stock
        # frame are not modified.
        pred = pd.Series(my_model.predict(testx), index=testx.index)
        bigdf[tick] = stock_df['fwd'].loc[testx.index].where(pred == 1)

bigdf['avg'] = bigdf.mean(axis=1).fillna(0)+1



In [None]:
inddf = pd.read_excel('indices.xlsx', index_col="Date")
compare = pd.DataFrame()
# The first assignment fixes the index of `compare`; later columns are aligned to it.
compare['No commissions'] = get_est_series(bigdf, 0)
compare['0.005 commissions'] = get_est_series(bigdf, 0.005)
# shift(-0) is a no-op; the 'mdy' column is taken as is.
compare['MDY ETF'] = inddf['mdy'].shift(-0)
# Rebase the benchmark so its first row equals 100, the portfolio's starting value.
compare['MDY ETF'] = compare['MDY ETF']/compare['MDY ETF'].iloc[0]*100
compare.to_excel("model5OST.xlsx")

In [None]:
# Last rows of the comparison table (final portfolio values vs. the benchmark).
compare.tail()

In [None]:
def convert_trials(trial_dict):
    """
    Unwraps a mapping of name -> sequence into name -> first element.

    Parameters:
    trial_dict - dictionary whose values are indexable (e.g. one-element lists)

    Returns:
    new dictionary with the same keys and each value replaced by value[0]
    """
    return {name: values[0] for name, values in trial_dict.items()}

# trial number -> [loss, sampled hyperparameters]. Iterates over however many
# trials were actually run instead of assuming exactly 100.
opt_trails1 = {}
for x, trial in enumerate(opt_trials.trials):
    opt_trails1[x] = [trial['result']['loss'], convert_trials(trial['misc']['vals'])]
    

In [None]:
# Hyperparameters of the second-best trial: rows are sorted by column 0 (the loss),
# ascending, and .iloc[1][1] picks the parameter dictionary of the second row.
pd.DataFrame(opt_trails1).T.sort_values(0).iloc[1][1]

In [None]:
allres = []
for upto in tqdm_notebook(range(1500, 2100, 100)):

    # bayes_opt returns (best_params, trials); check_params needs only the
    # parameter dictionary, so the tuple must be unpacked first.
    opt_params, upto_trials = bayes_opt(upto, eval_n=100)
    allres.append(check_params(opt_params, upto))

In [None]:
#bigdf = pd.read_excel("model5return317.xlsx", index_col="Date")

In [None]:
def get_est_series(dtf, com=0, prices=None):
    """
    Calculates total return over several out-of-cross-validation samples.
    
    Parameters:
    -----------
    dtf - dataframe with one column per stock (non-NaN where a trade is taken)
            and an 'avg' column
    com=0 - value of commissions for ==one== side of a trade in USD/share
    prices=None - price table with one column per stock; when omitted the
            notebook-level `tdf` is used
    
    Returns:
    portf_series - series with portfolio value at a point in time
    """
    if prices is None:
        prices = tdf  # notebook-level price table

    lead = 5      # holding period in rows
    portf = 100   # starting portfolio value

    # Work from the `dtf` argument (this used to read the global `bigdf`).
    comdf = dtf.drop(columns=['avg'])
    for tick in comdf.columns:
        # Keeps only stocks whose price at row 1600 is non-negative and non-NaN
        # (NaN >= 0 is False) - presumably an "already listed" filter; TODO confirm.
        if prices[tick].iloc[1600] >= 0:
            # Forward return net of commission on both the buy and the sell side.
            net_ret = (prices[tick].shift(-lead)-com) / (prices[tick]+com)-1
            comdf[tick] = net_ret.reindex(comdf.index).where(dtf[tick].notna())
        else:
            comdf[tick] = np.nan
    comdf['avg'] = comdf.mean(axis=1).fillna(0)+1
    
    portf_series = []
    subportf = [portf/lead for _ in range(lead)]
    for day_no, factor in enumerate(comdf['avg']):
        subportf[day_no % lead] = subportf[day_no % lead]*factor
        portf_series.append(subportf.copy())
    
    portf_series = pd.DataFrame(portf_series, index=comdf.index)
    # A return is only realised `lead` rows after entry, so shift the values
    # forward and fill the first rows with the initial sub-portfolio value.
    portf_series = portf_series.shift(lead)
    portf_series = portf_series.fillna(portf/lead).sum(axis=1)
    
    return portf_series

In [None]:
# Benchmark table; its 'mdy' column is used in the comparison below.
inddf = pd.read_excel('indices.xlsx', index_col="Date")

In [None]:
compare = pd.DataFrame()
# The first assignment fixes the index of `compare`; later columns are aligned to it.
compare['No commissions'] = get_est_series(bigdf, 0)
compare['0.005 commissions'] = get_est_series(bigdf, 0.005)
# 'B-E' presumably stands for break-even: the per-share commission at which the
# strategy roughly matches the benchmark - TODO confirm.
compare['B-E commissions'] = get_est_series(bigdf, 0.023)
# shift(-0) is a no-op; the 'mdy' column is taken as is.
compare['MDY ETF'] = inddf['mdy'].shift(-0)
# Rebase the benchmark so its first row equals 100, the portfolio's starting value.
compare['MDY ETF'] = compare['MDY ETF']/compare['MDY ETF'].iloc[0]*100
compare.to_excel("model5compare317return.xlsx")

In [None]:
# Displays the whole comparison table.
compare

In [None]:
# Recompute the 'B-E commissions' column with a 0.024 commission and show the ratio
# of its final value to the benchmark's final value.
compare['B-E commissions'] = get_est_series(bigdf, 0.024)
compare['B-E commissions'].iloc[-1]/compare['MDY ETF'].iloc[-1]

In [None]:
# Growth of each column from the starting value 100 to its last row, raised to 252/600.
# NOTE(review): looks like an annualisation assuming 600 rows in the sample and
# 252 trading days per year - confirm the 600.
(compare.iloc[-1]/100)**(252/600)-1

In [None]:
# Average number of non-NaN columns per row of `bigdf`. If `bigdf` still carries the
# 'avg' column (which is never NaN), it is included in the count.
bigdf.count(axis=1).mean()

In [None]:
#bigdf.to_excel('model5return317.xlsx')

In [None]:
# Persist the current `bigdf` (per-stock predicted returns) to disk.
bigdf.to_excel('model5return20.xlsx')