In [1]:
import pandas as pd
import numpy as np

In [2]:
def calcGeomAvg(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Calculate the geometric average of a vector of simple returns.

    The per-period geometric average is prod(1 + r)**(1 / N) - 1, where N is
    the number of elements in `returns`. If the compounded gross return is
    negative the fractional power is undefined, so the result is floored at
    -1.0 (a total loss).

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year.
            Required when `annualized` is True.

    Returns:
        (float): scalar geometric average; NaN if `returns` is empty.

    Raises:
        ValueError: if `annualized` is True and `periods_in_year` is None.
    """
    if annualized and periods_in_year is None:
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")

    n_periods = np.size(returns)
    if n_periods == 0:
        # Nothing to average; NaN matches what np.mean / np.std give for
        # empty input instead of failing with a ZeroDivisionError below.
        return float('nan')

    total_return = np.prod(1 + returns)
    if total_return < 0:
        # A negative gross return has no real N-th root.
        return -1.0

    geom_avg_at_given_freq = total_return ** (1 / n_periods) - 1
    if annualized:
        # Compound the per-period average over one year.
        return (geom_avg_at_given_freq + 1) ** periods_in_year - 1
    return geom_avg_at_given_freq

def calcArithAvg(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float:
    """ Calculate the time series mean return of a vector of simple returns with option to annualize.

    Annualization multiplies the per-period mean by `periods_in_year`; the
    annualized value is floored at -1.0.

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year.
            Required when `annualized` is True.

    Returns:
        (float): scalar time series mean return.

    Raises:
        ValueError: if `annualized` is True and `periods_in_year` is None.
    """
    if annualized and periods_in_year is None:
        # Fail with a clear message instead of a TypeError from None * float.
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")

    mean_ret_at_given_freq = np.mean(returns)
    if not annualized:
        return mean_ret_at_given_freq

    mean_ret = periods_in_year * mean_ret_at_given_freq
    if mean_ret < -1:
        # Floor the scaled mean at a total loss.
        return -1.
    return mean_ret

def calcSD(returns: np.array,
    annualized: bool=False,
    periods_in_year: int=None) -> float: 
    """ Calculate the standard deviation of a vector of simple returns with option to annualize.

    Uses np.std with its default arguments. Annualization scales the
    per-period value by sqrt(periods_in_year).

    Args:
        returns (np.array): vector of a simple returns at any frequency.
        annualized (bool): whether to annualize the statistic.
        periods_in_year (int): how many periods of the given frequency are in a year.
            Required when `annualized` is True.

    Returns:
        (float): scalar standard deviation.

    Raises:
        ValueError: if `annualized` is True and `periods_in_year` is None.
    """
    if annualized and periods_in_year is None:
        # Fail with a clear message instead of a TypeError from np.sqrt(None).
        raise ValueError("Input 'periods_in_year' must be provided if 'annualized' is True")

    sd_at_given_freq = np.std(returns)
    if not annualized:
        return sd_at_given_freq
    return np.sqrt(periods_in_year) * sd_at_given_freq

In [3]:
def formPortfolioSortResultsTable(df: pd.DataFrame, rhs_col: str, lhs_col: str, ts_avg_method: str, annualized: bool, periods_in_year: int, random_state: int=None) -> pd.DataFrame:
    """ Build a univariate tertile portfolio-sort table for one characteristic.

    Within each date, assets are ranked on `rhs_col` and split into three
    tertiles. The cross-sectional mean of `lhs_col` is taken per date and
    tertile, and those series are then averaged over time (per year and over
    the full sample), together with the top-minus-bottom (3-1) spread.

    Args:
        df (pd.DataFrame): panel with columns 'date', 'asset', `lhs_col` and
            `rhs_col`. 'date' must be datetime-like (its `.dt.year` is used).
        rhs_col (str): column to sort assets on.
        lhs_col (str): return column to average within each tertile.
        ts_avg_method (str): 'arithmetic' or 'geometric' time series average.
        annualized (bool): whether to annualize the time series averages.
        periods_in_year (int): how many periods of the panel frequency are in a year.
        random_state (int): optional seed for the row shuffle that is applied
            before sorting; None keeps the shuffle unseeded.

    Returns:
        (pd.DataFrame): one row per year plus rows 'all' and 't_stat'; one
            column per tertile plus '3-1' and 'rhs_col' (holding `rhs_col`).

    Raises:
        ValueError: if `ts_avg_method` is not 'arithmetic' or 'geometric'.
    """
    # Check for valid input (raise rather than assert so the check survives python -O)
    if ts_avg_method not in ('arithmetic', 'geometric'):
        raise ValueError("Incorrect input for the ts_avg_method.")

    # Pick the time series averaging function once instead of branching at every use
    calc_avg = calcGeomAvg if ts_avg_method == 'geometric' else calcArithAvg

    def ts_avg(x):
        return calc_avg(x, annualized=annualized, periods_in_year=periods_in_year)

    def t_stat(x):
        # sqrt(T) * mean / sd at the native frequency. The same formula is used
        # for the tertile columns and the 3-1 column so the 't_stat' row is
        # comparable across columns and never mixes in annualization.
        return (np.sqrt(len(x))*calcArithAvg(x, annualized=False)
                    / calcSD(x, annualized=False))

    # Form relevant df
    t_df = df[['date', 'asset', lhs_col, rhs_col]].copy()

    # Randomly sort all rows of the dataframe (before the sort and the
    # rank(method='first') below, which breaks ties by row order)
    t_df = t_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Sort the dataframe by 'date' and rhs_col column
    t_df = t_df.sort_values(['date', rhs_col])

    # Form tertile
    t_df['rank_within_date'] = t_df.groupby('date')[rhs_col].rank(method='first')
    t_df['rank_ratio'] = t_df.groupby('date')['rank_within_date'].transform(lambda x: x / x.max())
    t_df['tertile'] = 1+pd.cut(t_df['rank_ratio'], bins=[0, 1/3, 2/3, 1], labels=False, include_lowest=True)
    t_df = t_df.drop(columns=['rank_within_date', 'rank_ratio'])

    # Calculate the average return for each tertile within each date
    daily_avg_returns_df = t_df.groupby(['date', 'tertile'])[lhs_col].mean().reset_index()

    # Calculate the time series average of each tertile's average returns
    tertile_avg_returns = daily_avg_returns_df.groupby('tertile')[lhs_col].apply(ts_avg)

    # Calculate the time series average for each year
    daily_avg_returns_df['year'] = daily_avg_returns_df['date'].dt.year
    yearly_avg_returns = daily_avg_returns_df.groupby(['year', 'tertile'])[lhs_col].apply(ts_avg).unstack(level=1)

    # Calculate the t statistics for the overall period
    t_stats = daily_avg_returns_df.groupby('tertile')[lhs_col].apply(t_stat)

    # Calculate the time series average of the difference between the top and bottom tertile's average returns
    diff_daily_avg_returns_df = daily_avg_returns_df.pivot_table(index='date', columns='tertile', values=lhs_col)
    diff_daily_avg_returns_df['year'] = diff_daily_avg_returns_df.index.year
    diff_daily_avg_returns_df['top_bottom_diff'] = diff_daily_avg_returns_df[3] - diff_daily_avg_returns_df[1]
    top_bottom_diff_average = ts_avg(diff_daily_avg_returns_df['top_bottom_diff'])

    # Calculate the yearly top_bottom_diff
    yearly_diff_avg_returns = diff_daily_avg_returns_df.groupby('year')['top_bottom_diff'].apply(ts_avg)

    # Calculate the overall t stat for the top minus bottom portfolio
    t_stat_3_1 = t_stat(diff_daily_avg_returns_df['top_bottom_diff'])

    # Combine results
    results = yearly_avg_returns.copy()
    results.loc['all'] = tertile_avg_returns
    results.loc['t_stat'] = t_stats
    results['3-1'] = yearly_diff_avg_returns
    results.loc['all', '3-1'] = top_bottom_diff_average
    results.loc['t_stat', '3-1'] = t_stat_3_1
    results['rhs_col'] = rhs_col

    return results

In [21]:
if __name__ == "__main__":
    from pathlib import Path

    # set args
    PANEL_IN_FP     = '../data/clean/panel_weekly.pkl' 
    OUT_FP          = '../output/classic_fm/univariate_factor_analysis.xlsx'
    PERIODS_IN_YEAR = 52
    TS_AVG_METHOD   = 'arithmetic'
    LHS_COL         = 'r_ex_tp7'
    ANNUALIZED      = False

    # import
    # NOTE(review): read_pickle executes arbitrary code from the file; only
    # load pickles produced by this project.
    df = pd.read_pickle(PANEL_IN_FP)

    # TODO: temporary winsorization of the LHS column at its 1st/99th percentiles.
    # The lower tail is clamped to p1 and the upper tail to p99.
    p1 = df[LHS_COL].quantile(0.01)
    p99 = df[LHS_COL].quantile(0.99)
    df.loc[df[LHS_COL] < p1, LHS_COL] = p1 
    df.loc[df[LHS_COL] > p99, LHS_COL] = p99 

    # drop columns not needed in weekly panel
    macro_cols = [col for col in df.columns if 'macro_' in col]
    df = df.drop(macro_cols, axis=1)
    static_cols =['char_industry_asset_mgmt',
        'char_industry_cex',
        'char_industry_cloud_compute',
        'char_industry_currency',
        'char_industry_data_mgmt',
        'char_industry_dex',
        'char_industry_gaming',
        'char_industry_infra',
        'char_industry_interop',
        'char_industry_lending',
        'char_industry_media',
        'char_industry_other_defi',
        'char_industry_smart_contract',
        'char_asset_usage_access',
        'char_asset_usage_discount',
        'char_asset_usage_dividends',
        'char_asset_usage_payments',
        'char_asset_usage_vote',
        'char_asset_usage_work',
        'char_pow',
        'char_pos',
        'char_ico_price',
        'char_ico']
    df = df.drop(static_cols, axis=1)

    # every remaining column except the identifiers and the LHS is a sort variable
    rhs_cols = list(df.columns.values)
    rhs_cols.remove('date')
    rhs_cols.remove('asset')
    rhs_cols.remove(LHS_COL)

    # Form results: collect the per-characteristic tables and concatenate once,
    # rather than re-copying the accumulated frame on every iteration
    result_tables = [
        formPortfolioSortResultsTable(df, rhs_col, LHS_COL, TS_AVG_METHOD, ANNUALIZED, PERIODS_IN_YEAR)
        for rhs_col in rhs_cols
    ]
    results_df = pd.concat(result_tables) if result_tables else pd.DataFrame()

    # Save results
    # Append mode (and if_sheet_exists) only works on an existing workbook, so
    # fall back to plain write mode the first time the file is created.
    if Path(OUT_FP).exists():
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(OUT_FP, engine='openpyxl', **writer_kwargs) as writer: 
        results_df.to_excel(writer,sheet_name='raw_univariate')

