In [1]:
# Add the local quant_tools code directory to sys.path so that `tools.QuantTools` can be imported
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# IMPORT PACKAGES
from tools import QuantTools
from typing import Dict, List
import pandas as pd
import numpy as np
import pickle


In [2]:
def subsetToAssetUniverse(df: pd.DataFrame, asset_universe_dict: Dict[str, List[str]]) -> pd.DataFrame:
    """
    Subset a DataFrame based on a dictionary of asset universes.

    For every key in `asset_universe_dict`, rows whose date falls in the key's
    (year, month) are kept only if their asset is in that key's list. Rows in
    months that no key refers to are kept unchanged.

    The input DataFrame is never modified; a new DataFrame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame. Must contain columns "date" and "asset".
        A non-datetime "date" column is converted with `pd.to_datetime`
        in the returned frame only.
    asset_universe_dict : Dict[str, List[str]]
        A dictionary where keys are dates in 'YYYY-MM-DD' format and values are lists of asset names.
        Only the year and month parts of each key are used.

    Returns
    -------
    pd.DataFrame
        The subsetted DataFrame (original index labels are preserved).

    Raises
    ------
    ValueError
        If `df` lacks a "date" or "asset" column.
    """
    # Check that the required columns are present in the DataFrame
    if not {'date', 'asset'}.issubset(df.columns):
        raise ValueError('Input DataFrame must contain "date" and "asset" columns.')

    # Ensure that the 'date' column is of datetime type. `assign` returns a new
    # frame, so the caller's DataFrame keeps its original 'date' column.
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df = df.assign(date=pd.to_datetime(df['date']))

    # Year and month are loop-invariant, so extract them once.
    years = df['date'].dt.year.to_numpy()
    months = df['date'].dt.month.to_numpy()

    # Accumulate a positional mask of rows to drop: rows that match a key's
    # year and month but whose asset is not in that key's universe.
    drop_mask = np.zeros(len(df), dtype=bool)
    for key, values in asset_universe_dict.items():
        # Extract the year and month from the key
        year, month = key.split('-')[:2]
        in_month = (years == int(year)) & (months == int(month))
        drop_mask |= in_month & ~df['asset'].isin(values).to_numpy()

    return df[~drop_mask]


In [3]:
def windsorize(df: pd.DataFrame, lhs_col: str, clip_prctl: float=0.01) -> pd.DataFrame:
    """
    Windsorize the values in the specified column of a DataFrame.
    
    Values below the clip_prctl quantile are replaced with that quantile's value,
    and values above the 1-clip_prctl quantile are replaced with that quantile's
    value. The input DataFrame is left untouched; a clipped copy is returned.
    
    Parameters:
    - df: DataFrame containing the column to be windsorized.
    - lhs_col: Name of the column to be windsorized.
    - clip_prctl: Quantile (as a fraction in [0, 0.5]) of the left tail to clip below;
                    1-clip_prctl is used for the right tail.
    
    Returns:
    - Copy of the DataFrame with windsorized values in lhs_col.

    Raises:
    - ValueError: if clip_prctl is outside [0, 0.5] (the left bound would
                    otherwise exceed the right bound).
    """
    if not 0 <= clip_prctl <= 0.5:
        raise ValueError('clip_prctl must be between 0 and 0.5 (inclusive).')

    # Calculate quantiles
    p_left = df[lhs_col].quantile(clip_prctl)
    p_right = df[lhs_col].quantile(1-clip_prctl)

    # Windsorize on a copy so the caller's DataFrame is not modified.
    # Series.clip leaves values unclipped when a bound is NaN (e.g. an all-NaN column).
    clipped_df = df.copy()
    clipped_df[lhs_col] = clipped_df[lhs_col].clip(lower=p_left, upper=p_right)

    return clipped_df


In [4]:
def formPortfolioSortResultsTable(df: pd.DataFrame, rhs_col: str, lhs_col: str, 
    ts_avg_method: str, annualized: bool, periods_in_year: int, num_quantiles: int,
    mcap_weighted: bool) -> pd.DataFrame:
    """
    Forms a portfolio sort results table: assets are sorted into quantiles by the
    given RHS col within each date, and the LHS col is averaged per quantile.
    
    Parameters:
    - df (pd.DataFrame): The input DataFrame. Must contain 'date' (datetime), 'asset',
                         lhs_col and rhs_col, plus 'mcap' when mcap_weighted is True.
    - rhs_col (str): The right-hand side column name to use to form quantiles.
    - lhs_col (str): The left-hand side column name of future returns.
    - ts_avg_method (str): The method for calculating the time series average, either 'arithmetic' or 'geometric'.
    - annualized (bool): If True, the results are annualized.
    - periods_in_year (int): Number of periods in a year (e.g., 52 for weekly data).
    - num_quantiles (int): Number of quantiles for ranking.
    - mcap_weighted (bool): Whether to mcap weight or just equally weight.
    
    Returns:
    - pd.DataFrame: One row per year plus an 'all' row and a 't_stat' row; one column
                    per quantile, a '<top>-<bottom>' spread column (object dtype, the
                    'all' cell is a string carrying significance stars), and constant
                    'rhs_col' and 'sig' ('yes'/'no') columns.
    """
    # Check for valid input
    assert ts_avg_method in ['arithmetic', 'geometric'], "Incorrect input for the ts_avg_method."

    # Pick the time-series averaging helper once instead of branching at every use.
    if ts_avg_method == 'geometric':
        avg_func = QuantTools.calcGeomAvg
    else:
        avg_func = QuantTools.calcTSAvgReturn

    def _ts_avg(x):
        # Time-series average of a return series under the requested settings.
        return avg_func(x, annualized=annualized, periods_in_year=periods_in_year)

    # Form relevant df
    keep_cols = ['date', 'asset', lhs_col, rhs_col]
    if mcap_weighted:
        keep_cols.append('mcap')
    t_df = df[keep_cols].copy()

    # Randomly sort all rows of the dataframe (fixed seed) so that ties in
    # rhs_col are ranked in a reproducible but arbitrary order below.
    t_df = t_df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Sort the dataframe by 'date' and rhs_col column
    t_df = t_df.sort_values(['date', rhs_col])

    # Form quantiles: rank within date, scale to (0, 1], then cut into equal bins.
    t_df['rank_within_date'] = t_df.groupby('date')[rhs_col].rank(method='first')
    t_df['rank_ratio'] = t_df.groupby('date')['rank_within_date'].transform(lambda x: x / x.max())
    quantile_bins = list(np.arange(0, num_quantiles+1)/num_quantiles)
    t_df['quant'] = 1+pd.cut(t_df['rank_ratio'], bins=quantile_bins, labels=False, include_lowest=True)
    t_df = t_df.drop(columns=['rank_within_date', 'rank_ratio'])

    # Calculate the average return for each quantile within each date
    if mcap_weighted:
        # Only rows with an observed return may contribute weight; otherwise the
        # mcap of a NaN-return row inflates the denominator and biases the
        # weighted average toward zero.
        t_df['_weight'] = t_df['mcap'].where(t_df[lhs_col].notna())
        t_df['weighted_return'] = t_df[lhs_col] * t_df['_weight']
        grouped_df = t_df.groupby(['date', 'quant'])[['weighted_return', '_weight']].sum().reset_index()
        grouped_df[lhs_col] = grouped_df['weighted_return'] / grouped_df['_weight']
        date_quantile_avg_returns_df = grouped_df[['date', 'quant', lhs_col]].copy()
    else:
        date_quantile_avg_returns_df = t_df.groupby(['date', 'quant'])[[lhs_col]].mean().reset_index()

    # Calculate the time series average of each quantile's average returns
    quantile_avg_returns = date_quantile_avg_returns_df.groupby('quant')[lhs_col].apply(_ts_avg)

    # Calculate the time series average for each year
    date_quantile_avg_returns_df['year'] = date_quantile_avg_returns_df['date'].dt.year
    yearly_avg_returns = (date_quantile_avg_returns_df.groupby(['year', 'quant'])[lhs_col]
                            .apply(_ts_avg).unstack(level=1))

    # Calculate the t statistics for the overall period, using each quantile's
    # own number of non-missing observations as the sample size.
    returns_by_quant = date_quantile_avg_returns_df.groupby('quant')[lhs_col]
    n_obs = returns_by_quant.count()
    t_stats = (np.sqrt(n_obs)*returns_by_quant.apply(lambda x: QuantTools.calcTSAvgReturn(x, annualized=False)) 
                / returns_by_quant.apply(lambda x: QuantTools.calcSD(x, annualized=False)))

    # Calculate the time series average of the difference between the top and bottom quantile's average returns
    top_quantile = num_quantiles
    bottom_quantile = 1
    diff_date_quantile_avg_returns_df = date_quantile_avg_returns_df.pivot_table(index='date', columns='quant', values=lhs_col)
    diff_date_quantile_avg_returns_df['year'] = diff_date_quantile_avg_returns_df.index.year
    diff_date_quantile_avg_returns_df['top_bottom_diff'] = (diff_date_quantile_avg_returns_df[top_quantile] 
                                                            - diff_date_quantile_avg_returns_df[bottom_quantile])
    top_bottom_diff = diff_date_quantile_avg_returns_df['top_bottom_diff']
    top_bottom_diff_average = _ts_avg(top_bottom_diff)

    # Calculate the yearly top_bottom_diff
    yearly_diff_avg_returns = diff_date_quantile_avg_returns_df.groupby('year')['top_bottom_diff'].apply(_ts_avg)

    # Calculate the overall t stat for the top minus bottom portfolio; dates
    # where either leg is missing (NaN spread) do not count as observations.
    t_stat_top_bottom_diff = (np.sqrt(top_bottom_diff.count())*QuantTools.calcTSAvgReturn(top_bottom_diff, annualized=False)
                    / QuantTools.calcSD(top_bottom_diff, annualized=False))

    # Combine results
    results = np.round(yearly_avg_returns.copy(), 4)
    results.loc['all'] = np.round(quantile_avg_returns, 4)
    results.loc['t_stat'] = np.round(t_stats, 2)
    top_bottom_diff_col = str(top_quantile)+'-'+str(bottom_quantile)
    results[top_bottom_diff_col] = np.round(yearly_diff_avg_returns, 4)
    # The 'all' cell of this column holds a star-suffixed string, so make the
    # column object dtype up front rather than relying on implicit upcasting
    # of a float column (deprecated in pandas, see PDEP-6).
    results[top_bottom_diff_col] = results[top_bottom_diff_col].astype(object)
    results.loc['t_stat', top_bottom_diff_col] = np.round(t_stat_top_bottom_diff, 2)
    top_bottom_diff_avg_rounded = np.round(top_bottom_diff_average, 4)
    results['rhs_col'] = rhs_col

    # Two-sided normal critical values: 1% -> ***, 5% -> **, 10% -> *.
    abs_t_stat = np.abs(t_stat_top_bottom_diff)
    sig = 'yes'
    if abs_t_stat > 2.576:
        stars = "***"
    elif abs_t_stat > 1.96:
        stars = "**"
    elif abs_t_stat > 1.645:
        stars = "*"
    else:
        # Also reached when the t stat is NaN.
        stars = ""
        sig = 'no'
    results['sig'] = sig
    results.loc['all', top_bottom_diff_col] = str(top_bottom_diff_avg_rounded)+stars
    
    return results


In [5]:
def calcUnivariateFactorResults(df: pd.DataFrame, rhs_cols: List[str],
    lhs_col: str, ts_avg_method: str, annualized: bool, periods_in_year: int,
    num_quantiles: int, mcap_weighted: bool, out_fp: str, out_sheet: str) -> None:
    """
    Run a univariate portfolio sort for every RHS column and save the results to Excel.

    The per-column tables from formPortfolioSortResultsTable are stacked and split
    into characteristics flagged significant (sig == 'yes') and the rest, then
    written to sheets '<out_sheet>_sig' and '<out_sheet>_insig' of the workbook.

    Parameters:
    - df (pd.DataFrame): Panel passed through to formPortfolioSortResultsTable.
    - rhs_cols (List[str]): Columns to sort on. 'char_vc_t' is skipped (handled
                            separately); the caller's list is not modified.
    - lhs_col (str): The left-hand side column name of future returns.
    - ts_avg_method (str): 'arithmetic' or 'geometric'.
    - annualized (bool): If True, the results are annualized.
    - periods_in_year (int): Number of periods in a year.
    - num_quantiles (int): Number of quantiles for ranking.
    - mcap_weighted (bool): Whether to mcap weight or just equally weight.
    - out_fp (str): Path of the Excel workbook. Existing workbooks are appended to
                    (same-named sheets replaced); otherwise a new one is created.
    - out_sheet (str): Prefix for the two sheet names.

    Returns:
    - None

    Raises:
    - ValueError: if no columns remain to analyse.
    """
    import os

    # Remove vc column as will handle separately (on a new list, so the
    # caller's list is left intact)
    cols_to_run = [col for col in rhs_cols if col != 'char_vc_t']
    if not cols_to_run:
        raise ValueError('rhs_cols must contain at least one column other than "char_vc_t".')
        
    # Form results: collect the tables and concatenate once
    result_tables = [
        formPortfolioSortResultsTable(df, rhs_col, lhs_col, 
            ts_avg_method, annualized, periods_in_year, num_quantiles, mcap_weighted)
        for rhs_col in cols_to_run
    ]
    results_df = pd.concat(result_tables)

    # Separate results
    sig_chars = list(set(results_df[results_df.sig=='yes'].rhs_col.values))
    sig_results_df = results_df[results_df.rhs_col.isin(sig_chars)]
    insig_results_df = results_df[~results_df.rhs_col.isin(sig_chars)]

    # Save results. Append mode (and if_sheet_exists) is only usable when the
    # workbook already exists; otherwise create it.
    writer_kwargs = {'engine': 'openpyxl', 'mode': 'w'}
    if os.path.isfile(out_fp):
        writer_kwargs.update(mode='a', if_sheet_exists='replace')
    with pd.ExcelWriter(out_fp, **writer_kwargs) as writer: 
        sig_results_df.to_excel(writer, sheet_name=out_sheet+'_sig')
        insig_results_df.to_excel(writer, sheet_name=out_sheet+'_insig')


In [6]:
if __name__ == "__main__":
    # set args
    PANEL_IN_FP     = '../data/clean/panel_weekly.pkl' 
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    OUT_FP          = '../output/low_dim_fm/univariate_factor_analysis.xlsx'
    OUT_SHEET       = 'raw_uni'
    PERIODS_IN_YEAR = 52
    TS_AVG_METHOD   = 'arithmetic'
    LHS_COL         = 'r_ex_tp7'
    ANNUALIZED      = False
    NUM_QUANTILES   = 5
    WINDSORIZE      = False
    MCAP_WEIGHTED   = True
    RHS_COLS = ['char_tx_volume_tm7',
        'char_addr_active_tm7',
        'char_addr_new_log_delta_tm14_tm7',
        'char_addr_new_tm7',
        'char_addr_total_t',
        'char_circulation_tm7',
        'char_age_destroyed_tm7',
        'char_delta_flow_dist_tm7',
        'char_delta_holders_dist_tm7',
        'char_prct_supply_in_profit_t',
        'char_cex_prct_circ_supply_t',
        'char_dex_prct_circ_supply_t',
        'char_defi_prct_circ_supply_t',
        'char_traders_prct_circ_supply_t',
        'char_exchange_inflow_tm7',
        'char_exchange_outflow_tm7',
        'char_num_pairs_t',
        'char_social_volume_tm7',
        'char_social_volume_reddit_tm7',
        'char_social_volume_twitter_tm7',
        'char_sent_pos_reddit_tm7',
        'char_sent_pos_twitter_tm7',
        'char_sent_neg_reddit_tm7',
        'char_sent_neg_twitter_tm7',
        'char_dev_activity_tm7',
        'char_vc_t',
        'char_r_tm7',
        'char_r_tm14',
        'char_r_tm30',
        'char_r_tm60',
        'char_r_tm90',
        'char_r_tm14_tm7',
        'char_r_tm30_tm14',
        'char_r_tm90_tm30',
        'char_r_ath_t',
        'char_r_atl_t',
        'char_r_industry_tm30',
        'char_r_industry_tm60',
        'char_trades_sum_tm7',
        'char_volume_sum_tm7',
        'char_spread_bps_t',
        'char_ask_size_t',
        'char_bid_size_t',
        'char_illiq_tm7',
        'char_turnover_tm7',
        'char_price_t',
        'char_size_t',
        'char_mvrv_t',
        'char_alpha_tm7',
        'char_alpha_tm30',
        'char_beta_tm7',
        'char_beta_tm30',
        'char_beta_downside_tm30',
        'char_coskew_tm30',
        'char_iskew_tm30',
        'char_shortfall5_tm7',
        'char_var5_tm7',
        'char_vol_tm7',
        'char_vol_tm30',
        'char_vol_tm90',
        'char_ivol_tm7',
        'char_ivol_tm30',
        'char_ivol_tm90']

    # edit sheet name based on args; an explicit raise (rather than an assert,
    # which is stripped under `python -O`) guards unsupported quantile counts
    QUANTILE_SHEET_SUFFIXES = {2: '_bi', 3: '_ter', 5: '_quin'}
    if NUM_QUANTILES not in QUANTILE_SHEET_SUFFIXES:
        raise ValueError('set new quantiles naming of sheet!')
    OUT_SHEET += QUANTILE_SHEET_SUFFIXES[NUM_QUANTILES]
    if MCAP_WEIGHTED:
        OUT_SHEET += '_mcap'

    # import
    # NOTE(review): pickle.load / pd.read_pickle can execute arbitrary code;
    # only load these files if they were produced by a trusted pipeline.
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    df = pd.read_pickle(PANEL_IN_FP)

    # drop columns not needed in weekly panel
    df = df.rename(columns={'char_mcap_t': 'mcap'})
    df = df[['date', 'asset', LHS_COL, 'mcap']+RHS_COLS].copy()

    # drop rows that are not in the asset universe
    df = subsetToAssetUniverse(df, asset_universe_dict)

    # windsorize
    if WINDSORIZE:
        OUT_SHEET += '_wind'
        df = windsorize(df, LHS_COL)
    
    # calculate results
    calcUnivariateFactorResults(df, RHS_COLS,
        LHS_COL, TS_AVG_METHOD, ANNUALIZED, PERIODS_IN_YEAR,
        NUM_QUANTILES, MCAP_WEIGHTED, OUT_FP, OUT_SHEET)
