In [1]:
# Make the local quant_tools package importable: it lives outside the default module search path, so its directory is added to sys.path (machine-specific absolute path)
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# Import packages
import matplotlib.pyplot as plt
from typing import Dict, List
from tools import QuantTools
from scipy.stats import norm
import scipy.stats as stats
import yfinance as yf
import pandas as pd
import numpy as np
import pickle

In [2]:
def subsetToAssetUniverse(df: pd.DataFrame, asset_universe_dict: Dict[str, List[str]]) -> pd.DataFrame:
    """
    Subset a DataFrame based on a dictionary of asset universes.

    The year and month are read from each key of ``asset_universe_dict``.
    Rows of ``df`` dated in that calendar month are kept only when their asset
    is listed under the key. Rows dated in a month that no key refers to are
    kept as they are.

    The caller's DataFrame is left untouched: the datetime conversion of the
    "date" column is applied to a new frame, which is what gets returned.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame. Must contain columns "date" and "asset".
    asset_universe_dict : Dict[str, List[str]]
        A dictionary where keys are dates in 'YYYY-MM-DD' format and values are lists of asset names.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the retained rows (original index labels kept)
        and "date" converted to datetime.

    Raises
    ------
    ValueError
        If "date" or "asset" is missing from ``df``.
    """
    # Check that the required columns are present in the DataFrame
    if not {'date', 'asset'}.issubset(df.columns):
        raise ValueError('Input DataFrame must contain "date" and "asset" columns.')

    # Convert on a new frame so the caller's object is not modified in place
    df = df.assign(date=pd.to_datetime(df['date']))

    # Calendar parts are computed once instead of once per dictionary key
    years = df['date'].dt.year.to_numpy()
    months = df['date'].dt.month.to_numpy()

    # Positional mask of the rows to retain; starts with every row kept
    keep = np.ones(len(df), dtype=bool)

    # Loop over all months with their relevant assets
    for key, values in asset_universe_dict.items():
        # Extract the year and month from the key
        year, month = key.split('-')[:2]

        # A row survives this key if it is outside the month or its asset is listed
        in_month = (years == int(year)) & (months == int(month))
        listed = df['asset'].isin(values).to_numpy()
        keep &= ~in_month | listed

    return df[keep]

In [3]:
def plotReturnHistograms(df: pd.DataFrame, out_fp: str) -> None:
    """
    This function takes a DataFrame containing time series data for returns of
    btc, eth, and the cmkt and saves a histogram plot for all three to given fp.
    Each histogram also includes a normal distribution fit, rescaled so that
    its peak matches the tallest histogram bar.

    The cmkt series is the per-date mean of 'macro_cmkt_tm7'; btc and eth are
    the 'char_r_tm7' values of the rows whose asset is 'btc' / 'eth'. Only
    dates present in all three series are plotted (inner merges).

    Parameters:
    df (pd.DataFrame): A DataFrame containing columns 'date', 'asset', 'char_r_tm7', 'macro_cmkt_tm7'.
    out_fp (str): A string specifying the filepath where the plot should be saved.

    Returns:
    None
    """
    # extract relevant returns
    btc_df = df.loc[df.asset == 'btc', ['date', 'char_r_tm7']].rename(columns={'char_r_tm7': 'btc'})
    eth_df = df.loc[df.asset == 'eth', ['date', 'char_r_tm7']].rename(columns={'char_r_tm7': 'eth'})
    cmkt_df = (df.groupby('date')[['macro_cmkt_tm7']].mean()
                 .reset_index()
                 .rename(columns={'macro_cmkt_tm7': 'cmkt'}))

    # form single dataframe
    hist_df = cmkt_df.merge(btc_df, on='date', how='inner', validate='one_to_one')
    hist_df = hist_df.merge(eth_df, on='date', how='inner', validate='one_to_one')

    # initiate the plot; one panel per series, sharing both axes
    fig, axs = plt.subplots(3, sharex=True, sharey=True, figsize=(6.4, 4), facecolor='none')

    # the figure is closed even if fitting or saving fails, so it cannot leak
    try:
        # np.linspace(0, 10) gives 50 sample points; only the first three rows
        # are used below (sample points 0, ~0.20 and ~0.41).
        # NOTE(review): viridis is normally sampled on [0, 1] - confirm the
        # 0-10 range is intentional.
        colors = plt.get_cmap('viridis')(np.linspace(0, 10))
        data_columns = ['cmkt', 'btc', 'eth']

        # plot the data with the normal dist fit
        for ax, column, color in zip(axs, data_columns, colors):
            data = hist_df[column]
            counts, _, _ = ax.hist(data, bins=30, color=color, alpha=1)

            # Fit a normal distribution
            mu, std = norm.fit(data)

            # Scale the pdf so its value at the mean equals the tallest bar
            # (the histogram holds raw counts, not densities)
            scale = counts.max() / norm.pdf(mu, mu, std)

            # Plot the PDF across the current x-range of the panel
            xmin, xmax = ax.get_xlim()
            x = np.linspace(xmin, xmax, 100)
            ax.plot(x, norm.pdf(x, mu, std) * scale, 'k', linewidth=2)

            for spine in ax.spines.values():
                spine.set_visible(False)

        # tighten up the plot
        fig.tight_layout()

        # adjust x axis labels; the x-axis is shared, so setting the ticks on
        # the last panel applies to all three
        axs[-1].set_xticks(np.arange(-.5, 0.7, 0.1))

        # output
        fig.savefig(out_fp)
    finally:
        # close the figure
        plt.close(fig)

In [4]:
def plotCumulativeReturns(df: pd.DataFrame, out_fp: str) -> None:
    """
    Plot the time series of cumulative returns to the given output filepath.

    One line is drawn per asset found in the 'asset' column (cumulative
    product of 1 + 'char_r_tm7' in date order), plus a 'cmkt' line built from
    'macro_cmkt_tm7' as recorded on the btc rows. btc, eth and cmkt are drawn
    last, thicker and in colour, and are labelled; all other assets are thin
    gray lines. The y-axis is logarithmic.

    Args:
        df (pd.DataFrame): DataFrame containing the time series data. Must
            contain columns 'date', 'asset', 'char_r_tm7' and 'macro_cmkt_tm7',
            with at most one row per (date, asset) pair.
        out_fp (str): a relative filepath to save the figure to.

    Returns:
        None

    Raises:
        ValueError: if the panel has no 'btc' or no 'eth' rows, or if it
            contains duplicate (date, asset) pairs.
    """
    # one column of returns per asset, indexed by date; a single reshape
    # replaces merging the assets on one at a time
    returns_wide = df.pivot(index='date', columns='asset', values='char_r_tm7').sort_index()

    missing = [name for name in ('btc', 'eth') if name not in returns_wide.columns]
    if missing:
        raise ValueError(f'Panel is missing required assets: {missing}')

    # cumulative returns; dates on which an asset has no observation stay NaN
    # and do not interrupt the running product
    plot_df = (1 + returns_wide).cumprod()

    # form the cmkt return from the market series stored on the btc rows
    cmkt_returns = (df.loc[df.asset == 'btc', ['date', 'macro_cmkt_tm7']]
                      .sort_values(by='date')
                      .set_index('date')['macro_cmkt_tm7'])
    plot_df['cmkt'] = (1 + cmkt_returns).cumprod()

    # highlighted series; insertion order is the drawing order, so cmkt ends up on top
    highlighted = {'eth': '#2D708EFF', 'btc': '#FDE725FF', 'cmkt': '#482677FF'}
    background = [column for column in plot_df.columns if column not in highlighted]

    # shared styling of the text labels
    label_style = dict(fontweight='bold', verticalalignment='center',
                       bbox=dict(facecolor='none', edgecolor='none'))

    # the cmkt label is anchored 45 rows (x) / 52 rows (y) from the end;
    # clamp so that shorter panels do not raise IndexError
    n_rows = len(plot_df)
    cmkt_label_x = plot_df.index[-min(45, n_rows)]
    cmkt_label_y = plot_df['cmkt'].iloc[-min(52, n_rows)] + 5

    fig, ax = plt.subplots(figsize=(8, 5), facecolor='none')

    # the figure is closed even if drawing or saving fails
    try:
        # thin gray lines for every other asset, then the highlighted series
        for column in background:
            ax.plot(plot_df.index, plot_df[column], color='gray', linewidth=0.5)
        for column, color in highlighted.items():
            ax.plot(plot_df.index, plot_df[column], color=color, linewidth=2)

        # Set y-axis to logarithmic scale and remove its minor ticks
        ax.set_yscale('log')
        ax.yaxis.set_minor_locator(plt.NullLocator())

        # Customize the plot: horizontal major grid lines only, no frame
        ax.grid(visible=True, which='major', axis='y', linewidth=0.5)
        ax.set_frame_on(False)

        # Add custom labels for important time series (offsets are in data units)
        ax.text(cmkt_label_x, cmkt_label_y, 'cmkt', color=highlighted['cmkt'], **label_style)
        ax.text(plot_df.index[-1], plot_df['btc'].iloc[-1] - 0.1, 'btc', color=highlighted['btc'], **label_style)
        ax.text(plot_df.index[-1], plot_df['eth'].iloc[-1] + 0.3, 'eth', color=highlighted['eth'], **label_style)

        # output
        fig.savefig(out_fp)
    finally:
        plt.close(fig)

In [5]:
def genSummaryStatistics(df: pd.DataFrame, lhs_col: str, out_fp: str) -> None:
    """
    Generates summary statistics for the panel and saves them to an Excel file.

    Three sheets are written to the workbook at out_fp:

    - 'raw_ret_stats': return statistics for the size-weighted market return
      (CMKT), Bitcoin, Ethereum and the Nasdaq Composite (downloaded through
      yfinance, so network access is required).
    - 'raw_extreme_stats': count and share of CMKT observations beyond a set
      of negative and positive return thresholds.
    - 'raw_yearly_stats': per-year and overall asset counts, median market cap
      and volume, total market cap on each year's last date, and the
      compounded CMKT return.

    If the workbook already exists only these three sheets are replaced;
    otherwise a new workbook is created.

    :param df: DataFrame containing asset return data. Must contain the columns
        'date' (datetime), 'asset', lhs_col, 'macro_snp500_t', 'char_size_t'
        and 'char_volume_sum_tm7'.
    :param lhs_col: Column in df that contains the return data.
    :param out_fp: Output file path for the Excel file.
    """
    import os  # local import: only needed to pick the Excel write mode

    # the statistics below are annualised from 52 periods per year, matching
    # the weekly ('W') resampling applied to the Nasdaq series
    periods_in_year = 52

    # define function for calculating return statistics
    def calcReturnStats(temp_df: pd.DataFrame, asset: str, return_col: str) -> dict:
        returns = temp_df[return_col]
        return {'asset': asset,
                'Mean': QuantTools.calcTSAvgReturn(returns.values, annualized=True, periods_in_year=periods_in_year),
                'SD': QuantTools.calcSD(returns.values, annualized=True, periods_in_year=periods_in_year),
                'Sharpe': QuantTools.calcSharpe(returns.values, periods_in_year=periods_in_year),
                'Skewness': stats.skew(returns.values) / np.sqrt(periods_in_year),
                'Kurtosis': stats.kurtosis(returns.values) / periods_in_year,
                'Pct pos': np.sum(returns > 0) / len(temp_df)}

    # drop to only necessary columns; the copy keeps the caller's frame untouched
    df = df[['date', 'asset', lhs_col, 'macro_snp500_t', 'char_size_t', 'char_volume_sum_tm7']].copy()

    # form btc and eth returns
    btc_df = df[df.asset == 'btc'].set_index('date')[[lhs_col]]
    eth_df = df[df.asset == 'eth'].set_index('date')[[lhs_col]]

    # form cmkt return: size-weighted average of the asset returns per date
    df['weighted_return'] = df[lhs_col] * df['char_size_t']
    total_market_cap = df.groupby('date')['char_size_t'].sum()
    cmkt_returns = df.groupby('date')['weighted_return'].sum() / total_market_cap
    # name the series explicitly instead of relying on the default column label
    cmkt_df = cmkt_returns.rename('return').to_frame()

    # obtain nasdaq weekly return; explicit start/end define the window, so
    # only the daily bar size is passed alongside them
    nsdq_df = yf.Ticker('^IXIC').history(interval='1d', start='2017-12-29', end='2022-12-31').reset_index()
    # reduce the timestamps to plain calendar dates
    nsdq_df['Date'] = pd.to_datetime(nsdq_df.Date).to_numpy(dtype='datetime64[D]')
    nsdq_df = nsdq_df[['Date', 'Close']].rename(columns={'Date': 'date', 'Close': 'return'}).set_index('date')
    # last close of each week -> week-over-week percentage change
    nsdq_df = nsdq_df.resample('W').last().pct_change().dropna()

    # calc return statistics
    ret_df = pd.DataFrame([calcReturnStats(cmkt_df, 'CMKT', 'return'),
                           calcReturnStats(btc_df, 'Bitcoin', lhs_col),
                           calcReturnStats(eth_df, 'Ethereum', lhs_col),
                           calcReturnStats(nsdq_df, 'Nasdaq', 'return')])

    # calc extreme event statistics: observations below each negative
    # threshold and above each positive threshold
    num_obs = len(cmkt_df)
    ext_rows = []
    for threshold in [-.3, -.2, -.1, -.05, .05, .1, .2, .3]:
        if threshold < 0:
            count = (cmkt_df['return'] < threshold).sum()
        else:
            count = (cmkt_df['return'] > threshold).sum()
        ext_rows.append({'threshold': threshold, 'count': count, 'percent': count / num_obs})
    ext_df = pd.DataFrame(ext_rows, columns=['threshold', 'count', 'percent'])

    # calculate yearly stats of unique assets and median mcap and volume
    df['year'] = df['date'].dt.year
    yr_df = pd.DataFrame({
        'num_unique_assets': df.groupby(['year'])['asset'].nunique(),
        'median_market_cap': df.groupby(['year'])['char_size_t'].median(),
        'median_weekly_asset_volume': df.groupby(['year'])['char_volume_sum_tm7'].median()}).reset_index()
    all_df = pd.DataFrame({
        'num_unique_assets': [df['asset'].nunique()],
        'median_market_cap': [df['char_size_t'].median()],
        'median_weekly_asset_volume': [df['char_volume_sum_tm7'].median()]})
    all_df['year'] = 'all'
    yr_df = pd.concat([yr_df, all_df])

    # calculate the total mcap in the last week of each year
    max_dates = df.groupby('year')['date'].max()
    filtered_df = df[df['date'].isin(max_dates)]
    total_mcap_by_year = filtered_df.groupby('year')[['char_size_t']].sum().reset_index()
    yr_df = yr_df.merge(total_mcap_by_year, on='year', how='outer', validate='one_to_one')

    # extract yearly returns for the years actually present in the data, so a
    # missing year cannot raise and extra years are not silently skipped
    cmkt_df = cmkt_df.reset_index()
    cmkt_df['year'] = cmkt_df.date.dt.year
    for year, year_returns in cmkt_df.groupby('year')['return']:
        yr_df.loc[yr_df.year == year, 'cmkt_ret'] = ((year_returns + 1).cumprod() - 1).values[-1]
    yr_df.loc[yr_df.year == 'all', 'cmkt_ret'] = ((cmkt_df['return'] + 1).cumprod() - 1).values[-1]

    # save results; append mode (and if_sheet_exists) is only valid for an
    # existing workbook, so a new file is written on the first run
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer:
        ret_df.to_excel(writer, sheet_name='raw_ret_stats')
        ext_df.to_excel(writer, sheet_name='raw_extreme_stats')
        yr_df.to_excel(writer, sheet_name='raw_yearly_stats')
        


In [6]:
if __name__ == "__main__":
    # Driver cell: load the weekly panel and the asset universe, restrict the
    # panel to the universe, then write the two figures and the Excel tables.

    # set args: input/output file paths (relative to the working directory)
    # and analysis settings
    PANEL_IN_FP     = '../data/clean/panel_weekly.pkl' 
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    HIST_OUT_FP     = '../output/desc_stats/histograms.png'
    CUM_RET_OUT_FP  = '../output/desc_stats/cumulative_returns.png'
    OUT_FP          = '../output/desc_stats/descriptive_statistics.xlsx'
    # NOTE(review): PERIODS_IN_YEAR, TS_AVG_METHOD and ANNUALIZED are not read
    # by any code in this cell - confirm whether they are still needed.
    PERIODS_IN_YEAR = 52
    TS_AVG_METHOD   = 'arithmetic'
    LHS_COL         = 'r_ex_tp7'
    ANNUALIZED      = False

    # import
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # a trusted source.
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    df = pd.read_pickle(PANEL_IN_FP)

    # drop rows that are not in the asset universe
    df = subsetToAssetUniverse(df, asset_universe_dict)

    # generate plots (each call saves one image file and returns nothing)
    plotReturnHistograms(df, HIST_OUT_FP)
    plotCumulativeReturns(df, CUM_RET_OUT_FP)

    # generate tables (written as sheets of the Excel workbook at OUT_FP)
    genSummaryStatistics(df, LHS_COL, OUT_FP)

    # # TODO: check whether the results change much after winsorizing LHS_COL
    # # at the 1st / 99th percentiles. Sketch (upper tail is clipped to p99;
    # # the earlier draft assigned p1 there by mistake):
    # p1 = df[LHS_COL].quantile(0.01)
    # p99 = df[LHS_COL].quantile(0.99)
    # df.loc[df[LHS_COL] < p1, LHS_COL] = p1
    # df.loc[df[LHS_COL] > p99, LHS_COL] = p99
