In [None]:
# TODO UPDATE THIS OLD MESSY CODE WITH NEW PANEL

In [284]:
# Import packages
import pandas as pd
import pickle
import numpy as np
from openpyxl import load_workbook

In [None]:
def calcGeomAvg(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Compute the geometric mean of a vector of simple returns, optionally annualized.

    Args:
        returns (np.array): simple returns observed at a single, arbitrary frequency.
        annualized (bool): if True, compound the per-period figure up to a yearly one.
        periods_in_year (int): number of observations per year; required when annualizing.

    Returns:
        (float): the geometric average return, per period or per year.

    Raises:
        TypeError: if `returns` is not a NumPy array.
        ValueError: if `annualized` is True but `periods_in_year` is missing.
    """
    # validate inputs up front (type first, then the annualization argument)
    if not isinstance(returns, np.ndarray):
        raise TypeError("Input 'returns' must be a NumPy array")
    if annualized and periods_in_year is None:
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")

    # n-th root of the compounded gross return, expressed as a simple return
    per_period_avg = np.prod(1 + returns) ** (1 / np.size(returns)) - 1
    if not annualized:
        return per_period_avg

    # compound the per-period growth over one year
    return (1 + per_period_avg) ** periods_in_year - 1

def calcTSAvgReturn(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float:
    """ Calculate the time series mean return of a vector of simple returns with option to annualize.

    The annualized figure is the arithmetic mean scaled by `periods_in_year` and
    floored at -1, since a simple return cannot be lower than a total loss.

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year;
                               required when `annualized` is True.

    Returns:
        (float): scalar time series mean return.

    Raises:
        ValueError: if `annualized` is True but `periods_in_year` is not provided.
    """
    # fail early with a clear message instead of an opaque TypeError on None * float
    if annualized and periods_in_year is None:
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")

    mean_ret_at_given_freq = np.mean(returns)
    if not annualized:
        return mean_ret_at_given_freq

    # linear scaling of the mean; floor at a 100% loss
    mean_ret = periods_in_year*mean_ret_at_given_freq
    if mean_ret < -1:
        return -1.
    return mean_ret

def calcTotalReturn(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float:
    """ Calculate the total return of a vector of simple returns with option to annualize.

    The annualized figure raises the compounded gross return to the power
    `periods_in_year / len(returns)`, i.e. it rescales the holding period to one year.

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year;
                               required when `annualized` is True.

    Returns:
        (float): scalar total return.

    Raises:
        ValueError: if `annualized` is True and either `periods_in_year` is not
                    provided or `returns` is empty.
    """
    if annualized:
        # fail early with clear messages instead of TypeError / ZeroDivisionError below
        if periods_in_year is None:
            raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")
        if len(returns) == 0:
            raise ValueError("Input 'returns' must contain at least one observation to annualize")

    # compound the gross returns over the whole sample
    total_return = np.prod(1+returns)-1
    if not annualized:
        return total_return

    return (total_return+1)**(periods_in_year/len(returns))-1

def calcSD(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Calculate the standard deviation of a vector of simple returns with option to annualize.

    Uses `np.std` with its default arguments (population standard deviation, ddof=0).
    The annualized figure scales the per-period value by sqrt(periods_in_year).

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year;
                               required when `annualized` is True.

    Returns:
        (float): scalar standard deviation.

    Raises:
        ValueError: if `annualized` is True but `periods_in_year` is not provided.
    """
    # fail early with a clear message instead of an opaque TypeError from np.sqrt(None)
    if annualized and periods_in_year is None:
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")

    sd_at_given_freq = np.std(returns)
    if not annualized:
        return sd_at_given_freq

    # square-root-of-time scaling
    return np.sqrt(periods_in_year)*sd_at_given_freq

def calcSharpe(returns: np.array,
    periods_in_year: int,
    risk_free_returns: np.array=None) -> float:
    """ Compute the annualized Sharpe ratio for a vector of simple returns.

    The ratio is the annualized mean return (from calcTSAvgReturn) divided by the
    annualized standard deviation (from calcSD). When risk free returns are supplied
    both statistics are computed on the excess returns.

    Args:
        returns (np.array): simple returns observed at a single, arbitrary frequency.
        periods_in_year (int): number of observations of that frequency per year.
        risk_free_returns (np.array): optional risk free simple returns to subtract
                                      element-wise from `returns`.

    Returns:
        (float): scalar annualized Sharpe ratio.
    """
    # work with excess returns only when a risk free series was given
    excess_returns = returns if risk_free_returns is None else returns - risk_free_returns

    annual_mean = calcTSAvgReturn(excess_returns, annualized=True, periods_in_year=periods_in_year)
    annual_sd = calcSD(excess_returns, annualized=True, periods_in_year=periods_in_year)
    return annual_mean / annual_sd

def calcMaxDrawdown(returns: np.array) -> float:
    ''' Calculate maximum drawdown for a vector of returns of any frequency.

    The portfolio is taken to start at a value of 1, and that starting value counts
    as a peak. A series that opens with losses therefore reports them as drawdown
    (e.g. returns of [-0.5, 0.1] give -0.5) instead of being measured only from the
    first post-loss value.
    
    Args:
        returns (np.array): vector of simple returns.
    
    Returns:
        max_drawdown (float): maximum drawdown in simple return units over this period;
                              0 when the portfolio never falls below a previous peak.

    Raises:
        ValueError: if `returns` is empty.
    '''
    if len(returns) == 0:
        raise ValueError("Input 'returns' must contain at least one observation")

    # portfolio value through time, starting from an initial value of 1
    cumulative_ret=(returns+1).cumprod()

    # running historic maximum of the portfolio value; clip at 1 so the initial
    # capital is also treated as a peak
    running_peak=pd.Series(cumulative_ret).expanding(min_periods=1).max().clip(lower=1.)

    # worst ratio of current value to historic peak
    dd=np.min(cumulative_ret/running_peak)

    # return simple return of max drawdown
    return dd-1

def calcMaxOneWeekLoss(returns: np.array, periods_in_week: int) -> float:
    ''' Find the worst compounded return over any rolling window that spans one week.
    
    Args:
        returns (np.array): simple returns observed at a single, arbitrary frequency.
        periods_in_week (int): how many observations of that frequency make up a week;
                               used as the rolling window length.
    
    Returns:
        max_loss (float): smallest compounded simple return across all full windows.
    '''
    # gross returns so that a window can be compounded with a product
    gross_returns = 1 + pd.Series(returns)

    # compounded gross return of every full window (leading partial windows are NaN)
    compounded_by_window = gross_returns.rolling(periods_in_week).apply(np.prod)

    # worst window, converted back to a simple return
    return compounded_by_window.min() - 1

def calcTransactionCosts(positions: np.array,
    tc_to_open: float=0.0005,
    tc_to_close: float=0.0005,
    tc_to_open_margin: float=0.00004,
    tc_margin_per_hr: float=0.00001) -> np.array:
    ''' Calculate a vector of transaction costs which are positive numbers in return units.

    Cost rules applied per observation:
      - unchanged position: only the margin carry fee, and only if |position| > 1.
      - changed position: a closing fee for the previous position (only if it was
        non-zero, a flat book has nothing to close) plus the fee to open the new one.
      - opening fee: 0 when flat, `tc_to_open` when |position| <= 1, and
        `tc_to_open + tc_to_open_margin + tc_margin_per_hr` when 1 < |position| <= 5.
      - the final observation is additionally charged a closing fee if it is non-zero.
    
    Args:
        positions (np.array): vector of positions, where positive is long and above 1, in absolute
                              value terms, is a leveraged position.
        tc_to_open (float): fee, in return terms, to open a position.
        tc_to_close (float): fee, in return terms, to close a position.
        tc_to_open_margin (float): extra fee, in return terms, to open a leveraged position.
        tc_margin_per_hr (float): fee, in return terms, to carry a leveraged position for
                                  one period.

    Returns:
        tc (np.array): vector of transaction costs in return terms; empty if `positions`
                       is empty.

    Raises:
        ValueError: if a newly entered position lies outside [-5, 5].
    '''
    # default fees, in return terms, are from kraken for trading two spot pairs, on margin

    # initial tc array
    n_obs = len(positions)
    tc = np.zeros(n_obs)
    if n_obs == 0:
        return tc

    def _costToOpen(position, label):
        # fee to enter `position` from a flat book; `label` names it in the error message
        if position == 0:
            return 0.
        if -1 <= position <= 1:
            return tc_to_open
        if -5 <= position <= 5:
            return tc_to_open+tc_to_open_margin+tc_margin_per_hr
        raise ValueError(label+' is not a valid position.')

    # set first tc: nothing is held beforehand, so only opening fees apply
    tc[0] = _costToOpen(positions[0], 'first position')

    # set remaining tc's
    for i in range(1, n_obs):
        prev_position = positions[i-1]
        current_position = positions[i]

        if current_position == prev_position:
            # holding: leveraged positions keep paying the margin carry fee
            if np.abs(current_position)>1:
                tc[i] = tc_margin_per_hr
            continue

        # only pay to close when something was actually held in the previous period
        cost_to_exit = tc_to_close if prev_position != 0 else 0.
        tc[i] = cost_to_exit+_costToOpen(current_position, 'position '+str(i))

    # adjust last tc element for closing position
    if np.abs(positions[-1])>0:
        tc[-1] += tc_to_close

    return tc

In [None]:
def labelPortfolioWeights(df):
    """ Assign tertile-based portfolio weights to each asset within each date.

    Within every date the rows are ranked by `yhat` (ascending) and split into
    tertiles by fractional rank: the bottom tertile gets a raw weight of 0, the
    middle 1/6 and the top 5/6. Raw weights are then scaled by 3 / (assets on that
    date), so the weights of a date sum to 1 when its asset count splits evenly
    into three.

    The input frame is not modified and the global NumPy random state is left
    untouched.

    Args:
        df (pd.DataFrame): must expose `date` and `yhat` for sorting and grouping.

    Returns:
        (pd.DataFrame): copy of `df` sorted by `date` and `yhat` with a `prtfl_wght`
                        column added; any columns named `rand`, `ranking`, `counts` or
                        `total_assets_per_week` are dropped.

    Raises:
        AssertionError: if the weights of any date do not sum to roughly 1.
    """
    # sort_values returns a new frame, so the caller's frame is never written to
    df = df.sort_values(by=['date', 'yhat'])

    # fractional rank of each asset within its date, in [0, 1)
    by_date = df.groupby('date')
    assets_per_date = by_date['yhat'].transform('size').values
    ranking = by_date.cumcount().values / assets_per_date

    # raw tertile weights, then scale so an evenly split date sums to 1
    tertile_wght = np.select([ranking < 1/3, ranking < 2/3], [0., 1/6], default=5/6)
    df['prtfl_wght'] = 3*tertile_wght/assets_per_date

    # clean up working columns if the input carried them from an earlier run
    df = df.drop(columns=['rand', 'ranking', 'counts', 'total_assets_per_week'],
                 errors='ignore')

    # confirm portfolio weights roughly sum to 1 for each date; raised explicitly so
    # the check still runs under `python -O`
    wght_sums = df.groupby('date')['prtfl_wght'].sum()
    if not np.isclose(wght_sums, 1, rtol=1e-2, atol=1e-2).all():
        raise AssertionError('portfolio weights do not sum to 1 for every date.')

    return df


In [None]:
# TODO MOVE ALL STANDARD ASSET PRICING FUNCS TO A PY SCRIPT THAT I IMPORT AND KEEP IN MY ASSET PRICING TOOLS FOLDER / REPO

In [285]:
def formFFQuintiles(test_df):
    """ Label each row with a quintile derived from its return and market cap tertiles.

    The quintile is looked up from the pair (`tertile_r_t_2`, `tertile_mcap_t_1`):
    a higher return tertile and a lower market cap tertile map to a higher quintile.
    Rows whose pair is not in the table are left untouched.

    Args:
        test_df (pd.DataFrame): frame with `tertile_r_t_2` and `tertile_mcap_t_1` columns;
                                it is modified in place.

    Returns:
        (pd.DataFrame): the same `test_df` object with the `quintile` column set.
    """
    # (return tertile, market cap tertile, resulting quintile)
    quintile_by_tertiles = (
        (3, 1, 5),
        (3, 2, 4),
        (2, 1, 4),
        (3, 3, 3),
        (2, 2, 3),
        (1, 1, 3),
        (2, 3, 2),
        (1, 2, 2),
        (1, 3, 1),
    )

    for ret_tertile, mcap_tertile, quintile in quintile_by_tertiles:
        in_cell = ((test_df['tertile_r_t_2'] == ret_tertile) &
                   (test_df['tertile_mcap_t_1'] == mcap_tertile))
        test_df.loc[in_cell, 'quintile'] = quintile

    return test_df

In [286]:
def formFFPortfolioResults(quintile_df, sheet_name,
                           output_path='../4-output/portfolio_results.xlsx'):
    """ Summarize value-weighted quintile returns and write them to an Excel sheet.

    For every (date, quintile) the market-cap-weighted return is computed, then for
    each quintile 1-5 the table reports:
      - Real: 52nd root of the compounded gross return, minus 1
              (presumably assumes about 52 weekly observations - TODO confirm).
      - Std:  standard deviation of the per-date returns (np.std default, ddof=0).
      - SR:   sqrt(52) * mean / std of the per-date returns.

    Args:
        quintile_df (pd.DataFrame): frame with `date`, `quintile`, `mcap_t_1` and `r_t`.
                                    It is modified in place: `mcap_sum`, `weight` and
                                    `quintile_r_t` columns are added.
        sheet_name (str): worksheet to write; an existing sheet of that name is replaced.
        output_path (str): existing workbook to append to.

    Raises:
        FileNotFoundError: if `output_path` does not exist.
    """
    group_keys = ['date', 'quintile']

    # Calculate value-weighted average returns for each quintile
    quintile_df['mcap_sum']     = quintile_df.groupby(group_keys)['mcap_t_1'].transform('sum')
    quintile_df['weight']       = quintile_df.mcap_t_1 / quintile_df.mcap_sum
    quintile_df['quintile_r_t'] = quintile_df.weight * quintile_df.r_t
    quintile_df['quintile_r_t'] = quintile_df.groupby(group_keys)['quintile_r_t'].transform('sum')

    # One row per (date, quintile). Taken per group rather than via drop_duplicates on
    # (quintile, return), which would merge distinct dates that share the same return.
    results_df = quintile_df.groupby(group_keys)['quintile_r_t'].first().reset_index()

    # Form the output table
    rows = []
    for quintile in [1, 2, 3, 4, 5]:
        weekly_returns = results_df.loc[results_df.quintile == quintile, 'quintile_r_t'].values
        weekly_std = np.std(weekly_returns)
        rows.append({'quintile': quintile,
                     'Real': np.prod(weekly_returns+1)**(1/52)-1,
                     'Std': weekly_std,
                     'SR': np.sqrt(52)*np.mean(weekly_returns)/weekly_std})
    output_df = pd.DataFrame(rows, columns=['quintile', 'Real', 'Std', 'SR'])

    # Output to Excel without overwriting the file: append mode keeps the other sheets,
    # and the context manager saves and closes the workbook.
    with pd.ExcelWriter(output_path, engine='openpyxl', mode='a',
                        if_sheet_exists='replace') as writer:
        output_df.to_excel(writer, sheet_name=sheet_name)


In [287]:
def formQuintiles(test_df): 
    """ Sort assets by predicted return within each date and label them with quintiles.

    Each row receives a fractional rank, (1-based position within its date) divided
    by (number of rows on that date), and the quintile is the 0.2-wide bucket that
    rank falls into: rank <= 0.2 is quintile 1, ..., rank > 0.8 is quintile 5.

    Args:
        test_df (pd.DataFrame): must expose `date` and `y_hat_t` for sorting and grouping.
                                The input frame is not modified.

    Returns:
        (pd.DataFrame): copy of `test_df` sorted by `date` and `y_hat_t` with `ranking`,
                        `counts`, `coins_per_week` and `quintile` columns added.
    """
    test_df = test_df.sort_values(by=['date', 'y_hat_t'])
    test_df['ranking'] = test_df.groupby(['date']).cumcount()+1
    test_df['counts'] = 1

    # transform broadcasts the per-date count back onto every row, so the result does
    # not depend on the frame's index happening to be the date
    test_df['coins_per_week'] = test_df.groupby(['date']).counts.transform('sum')
    test_df['ranking'] = test_df.ranking / test_df.coins_per_week

    # bucket the fractional rank; a rank that matches no condition stays NaN
    rank = test_df['ranking']
    test_df['quintile'] = np.select(
        [rank <= 0.2, rank <= 0.4, rank <= 0.6, rank <= 0.8, rank > 0.8],
        [1, 2, 3, 4, 5],
        default=np.nan)
    
    return test_df

In [359]:
def formPortfolioResults(quintile_df, sheet_name,
                         output_path='../4-output/portfolio_results.xlsx'):
    """ Summarize equal-weighted quintile returns and write them to an Excel sheet.

    For every (date, quintile) the mean predicted return (`y_hat_t`) and mean realized
    return (`r_t`) are computed, then for each quintile 1-5 the table reports:
      - Pred: 52nd root of the compounded gross predicted return, minus 1.
      - Real: 52nd root of the compounded gross realized return, minus 1
              (presumably assumes about 52 weekly observations - TODO confirm).
      - Std:  standard deviation of the per-date realized returns (np.std default, ddof=0).
      - SR:   sqrt(52) * mean / std of the per-date realized returns.

    Args:
        quintile_df (pd.DataFrame): frame with `date`, `quintile`, `y_hat_t` and `r_t`.
                                    It is modified in place: `quintile_y_hat_t` and
                                    `quintile_r_t` columns are added.
        sheet_name (str): worksheet to write; an existing sheet of that name is replaced.
        output_path (str): existing workbook to append to.

    Raises:
        FileNotFoundError: if `output_path` does not exist.
    """
    group_keys = ['date', 'quintile']

    # Calculate equal-weighted average returns for each quintile
    quintile_df['quintile_y_hat_t'] = quintile_df.groupby(group_keys)['y_hat_t'].transform('mean')
    quintile_df['quintile_r_t'] = quintile_df.groupby(group_keys)['r_t'].transform('mean')

    # One row per (date, quintile). Taken per group rather than via drop_duplicates
    # without the date, which would merge distinct dates that share the same values.
    results_df = (quintile_df.groupby(group_keys)[['quintile_y_hat_t', 'quintile_r_t']]
                  .first()
                  .reset_index())

    # Form the output table
    rows = []
    for quintile in [1, 2, 3, 4, 5]:
        in_quintile = results_df.quintile == quintile
        pred_returns = results_df.loc[in_quintile, 'quintile_y_hat_t'].values
        weekly_returns = results_df.loc[in_quintile, 'quintile_r_t'].values
        weekly_std = np.std(weekly_returns)
        rows.append({'quintile': quintile,
                     'Pred': np.prod(pred_returns+1)**(1/52)-1,
                     'Real': np.prod(weekly_returns+1)**(1/52)-1,
                     'Std': weekly_std,
                     'SR': np.sqrt(52)*np.mean(weekly_returns)/weekly_std})
    output_df = pd.DataFrame(rows, columns=['quintile', 'Pred', 'Real', 'Std', 'SR'])

    # Output to Excel without overwriting the file: append mode keeps the other sheets,
    # and the context manager saves and closes the workbook.
    with pd.ExcelWriter(output_path, engine='openpyxl', mode='a',
                        if_sheet_exists='replace') as writer:
        output_df.to_excel(writer, sheet_name=sheet_name)

In [412]:
def outputBenchmarks(test_df, output_path='../4-output/portfolio_results.xlsx'):
    """ Summarize equal- and market-cap-weighted benchmark returns and write them to Excel.

    Per-date portfolio returns are built two ways, the plain mean of `r_t` and the
    `mcap_t_1`-weighted mean of `r_t`, and for each the table reports:
      - Real: 52nd root of the compounded gross return, minus 1
              (presumably assumes about 52 weekly observations - TODO confirm).
      - Std:  standard deviation of the per-date returns (np.std default, ddof=0).
      - SR:   sqrt(52) * mean / std of the per-date returns.
    The table is written to the `benchmarks` sheet, replacing it if it exists.

    Args:
        test_df (pd.DataFrame): frame with `date`, `r_t` and `mcap_t_1`. It is modified
                                in place: `mcap_sum`, `weight` and `mcap_r_t` columns
                                are added.
        output_path (str): existing workbook to append to.

    Raises:
        FileNotFoundError: if `output_path` does not exist.
    """
    def _summarize(weekly_returns):
        # (Real, Std, SR) for one vector of per-date returns
        weekly_std = np.std(weekly_returns)
        real = np.prod(weekly_returns+1)**(1/52)-1
        sharpe = np.sqrt(52)*np.mean(weekly_returns)/weekly_std
        return real, weekly_std, sharpe

    # Calculate equal weighted return statistics
    eql_wght_weekly_returns = test_df.groupby('date').r_t.mean().values
    eql_wght_real, eql_wght_std, eql_wght_sr = _summarize(eql_wght_weekly_returns)

    # Calculate mcap weighted return statistics
    test_df['mcap_sum'] = test_df.groupby(['date'])['mcap_t_1'].transform('sum')
    test_df['weight']   = test_df.mcap_t_1 / test_df.mcap_sum
    test_df['mcap_r_t'] = test_df.weight * test_df.r_t
    test_df['mcap_r_t'] = test_df.groupby(['date'])['mcap_r_t'].transform('sum')

    # One value per date. Taken per group rather than via drop_duplicates on the return
    # alone, which would merge distinct dates that share the same return.
    mcap_wght_weekly_returns = test_df.groupby(['date'])['mcap_r_t'].first().values
    mcap_wght_real, mcap_wght_std, mcap_wght_sr = _summarize(mcap_wght_weekly_returns)

    # Form output dataframe
    output_df = pd.DataFrame(data={'weights': ['equal', 'mcap'],
                                   'Real': [eql_wght_real, mcap_wght_real],
                                   'Std': [eql_wght_std, mcap_wght_std],
                                   'SR': [eql_wght_sr, mcap_wght_sr]})

    # Output to Excel without overwriting the file: append mode keeps the other sheets,
    # and the context manager saves and closes the workbook.
    with pd.ExcelWriter(output_path, engine='openpyxl', mode='a',
                        if_sheet_exists='replace') as writer:
        output_df.to_excel(writer, sheet_name='benchmarks')

In [413]:
# Load the pickled FF bundle: (yhats, test frame, returns frame)
# NOTE: pickle.load executes arbitrary code from the file; only use trusted local files.
pik_ff = '../3-data/clean/ff-rankings-returns.pkl' 
with open(pik_ff, "rb") as f:
    ff_data_in = pickle.load(f)
ff_yhats_df, ff_test_df, ff_return_df = ff_data_in

# Load the pickled autoencoder (CA) bundle: parallel lists with one entry per model
pik_ca = '../3-data/clean/autoencoders-yhats-returns.pkl' 
with open(pik_ca, "rb") as f:
    ca_data_in = pickle.load(f)
opt_hps_list, test_dfs_list, returns_dfs_list = ca_data_in

# Import other benchmarks

# TODO: CMC 200
# TODO: BTC
# TODO: ETH
# TODO: S&P 500

# FF results: work on a copy because the quintile helpers add columns in place
test_df = ff_test_df.copy()
quintile_df = formFFQuintiles(test_df)
formFFPortfolioResults(quintile_df, sheet_name = 'raw_ff')

# Autoencoder results: one sheet per model, named after its optimal hyperparameters
for i, test_df in enumerate(test_dfs_list):
    opt_hps = opt_hps_list[i]
    num_hidden_layer = opt_hps['number_hidden_layer']
    num_factor = opt_hps['number_factor']
    sheet_name = ''.join(['raw_autoencoder-hl_', str(num_hidden_layer),
                          '-fac_', str(num_factor)])

    quintile_df = formQuintiles(test_df)
    formPortfolioResults(quintile_df, sheet_name)

# Benchmarks built from the FF test universe
outputBenchmarks(ff_test_df)

# TODO: ENSURE I REPORT THE RETURN AND SHARPE AND OTHER METRICS FOR ALL BENCHMARKS:
# -FF, CA, CMC 200, EQUAL WEIGHTS FROM SAME UNIVERSE, MCAP WEIGHTS FROM SAME UNIVERSE, BTC, ETH, S&P 500

# Report out for OOS: (maybe do some of these in a separate script?)
# --return weighted by mcap and equal weights
# --sharpe for both equal and mcap weights
# --source of excess return e.g. distri of each week-asset holding return with naming top returns asset-weeks
# --max DD
# --fees
# --min/Q1-Q3/max portfolio weight each week plotted
# --number of transactions per week
# --portfolio turnover
# --portfolio return per month
# --fees per week/month/overall