In [1]:
# TODO: the data import and preparation code in this notebook needs to be cleaned up;
# it was left messy to save time.

In [2]:
# Add the quant_tools code directory to sys.path so that `tools.QuantTools` can be imported
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# Import packages
from joblib import Parallel, delayed
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from datetime import timedelta
from typing import Dict, List
from tools import QuantTools
from scipy.stats import norm
import statsmodels.api as sm
import scipy.stats as stats
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib
import datetime
import pickle
import random

# set color map
viridis = matplotlib.colormaps['viridis']


In [3]:
def importYahoo(ticker: str, start_date: str, end_date: str, rf_df: pd.DataFrame, 
            new_ret_col: str, resample_freq: str='W', rf_col: str='r_rf_tm7') -> pd.DataFrame:
    """ 
    Import Yahoo Finance closes for a ticker and return its excess returns.

    The closing prices are resampled to `resample_freq` (last close in each
    period), turned into simple percentage returns, inner-joined with `rf_df`
    on `date`, and the risk-free return in `rf_col` is subtracted.

    Parameters:
    ticker (str): The ticker symbol to get data for.
    start_date (str): The start date of the data retrieval period.
    end_date (str): The end date of the data retrieval period.
    rf_df (DataFrame): Risk-free returns; must contain a `date` column and
        `rf_col`, with at most one row per date (the merge is validated
        as one-to-one).
    new_ret_col (str): The name of the excess-return column in the output.
    resample_freq (str): pandas resample alias, default 'W'. For the
        month-end aliases 'M' and 'ME' the period labels are moved one day
        forward, i.e. to the first day of the following month.
    rf_col (str): Column of `rf_df` holding the per-period risk-free return.

    Returns:
    DataFrame: Columns `date` and `new_ret_col`, restricted to the dates
        present in both the resampled prices and `rf_df`.
    """
    # import the data
    # NOTE(review): `period` is presumably ignored by yfinance when start/end
    # are supplied -- confirm against the yfinance documentation.
    df = yf.Ticker(ticker).history(period='1d', start=start_date, end=end_date).reset_index()
    
    # reformat: truncate the timestamps to calendar days and keep only the close
    df['Date'] = pd.to_datetime(df.Date).to_numpy(dtype='datetime64[D]')
    df = df[['Date', 'Close']].rename(columns={'Date': 'date', 'Close': new_ret_col}).set_index('date')
    df = df.resample(resample_freq).last().pct_change().dropna()

    # adjust if resampling monthly: relabel month-end dates as the next
    # month's first day. Both spellings of the month-end alias are accepted
    # ('ME' is the name newer pandas versions use for what used to be 'M').
    if resample_freq in ('M', 'ME'):
        df.index = df.index + pd.Timedelta(days=1)

    # take out rf rate (the `date` index level is matched against rf_df's
    # `date` column; only dates present on both sides survive)
    df = df.merge(rf_df, on='date', how='inner', validate='one_to_one')
    df[new_ret_col] = df[new_ret_col] - df[rf_col]

    # check for NaN values in new_ret_col
    if df[new_ret_col].isnull().values.any():
        print("Warning: NaN values found in return data")
    
    return df[['date', new_ret_col]]


In [4]:
def subsetToAssetUniverse(df: pd.DataFrame, asset_universe_dict: Dict[str, List[str]]) -> pd.DataFrame:
    """
    Subset a DataFrame based on a dictionary of asset universes.

    For every key of `asset_universe_dict`, rows whose date falls in that
    key's year and month are kept only if their asset is in the key's list.
    Rows in months that no key refers to are kept unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame. Must contain columns "date" and "asset".
        It is never modified; if "date" is not already a datetime column the
        conversion is done on a copy.
    asset_universe_dict : Dict[str, List[str]]
        A dictionary where keys are dates in 'YYYY-MM-DD' format (only the
        year and month parts are used) and values are lists of asset names.

    Returns
    -------
    pd.DataFrame
        The subsetted DataFrame, with "date" as a datetime column.

    Raises
    ------
    ValueError
        If "date" or "asset" is missing from `df`.
    """
    # Check that the required columns are present in the DataFrame
    if not {'date', 'asset'}.issubset(df.columns):
        raise ValueError('Input DataFrame must contain "date" and "asset" columns.')

    # Ensure that the 'date' column is of datetime type. `assign` returns a
    # new frame, so the caller's DataFrame is left untouched.
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df = df.assign(date=pd.to_datetime(df['date']))

    # Pull the year/month components once instead of once per dictionary key
    years  = df['date'].dt.year.to_numpy()
    months = df['date'].dt.month.to_numpy()

    # Accumulate one boolean mask of rows to drop: a row is dropped when it
    # lies in a listed month but its asset is not in that month's universe
    drop_mask = np.zeros(len(df), dtype=bool)
    for key, values in asset_universe_dict.items():
        # Extract the year and month from the key
        year, month = key.split('-')[:2]
        in_month = (years == int(year)) & (months == int(month))
        drop_mask |= in_month & ~df['asset'].isin(values).to_numpy()

    return df[~drop_mask]

In [5]:
def plotReturnHistograms(df: pd.DataFrame, out_fp: str):
    """
    Save a three-panel histogram figure of the cmkt, btc and eth returns.

    Each panel shows a 30-bin count histogram of one series with a fitted
    normal density drawn on top, rescaled so that its peak matches the
    tallest bin. Only dates on which all three series are present are used.

    Parameters:
    df (pd.DataFrame): A DataFrame containing columns 'date', 'asset', 'char_r_tm7', 'macro_cmkt_tm7'.
    out_fp (str): A string specifying the filepath where the plot should be saved.

    Returns:
    None
    """
    def assetReturns(asset: str) -> pd.DataFrame:
        # one asset's return series, with the return column named after the asset
        subset = df[df.asset==asset][['date', 'char_r_tm7']]
        return subset.rename(columns={'char_r_tm7': asset})

    # market return: average of macro_cmkt_tm7 over the rows of each date
    combined = df.groupby('date')[['macro_cmkt_tm7']].mean().reset_index()
    combined = combined.rename(columns={'macro_cmkt_tm7': 'cmkt'})

    # line the three series up on their common dates
    for asset in ('btc', 'eth'):
        combined = combined.merge(assetReturns(asset), on='date', how='inner', validate='one_to_one')

    # one stacked panel per series, sharing both axes
    fig, axs = plt.subplots(3, sharex=True, sharey=True, figsize=(6.4,4), facecolor='none')

    # np.linspace(0, 10) yields 50 sample points; only the first three
    # (0, ~0.20, ~0.41) are consumed, one per panel
    palette = plt.get_cmap('viridis')(np.linspace(0, 10))
    series_names = ['cmkt', 'btc', 'eth']

    for panel, name, shade in zip(axs, series_names, palette):
        values = combined[name]

        # raw-count histogram
        counts, _, _ = panel.hist(values, bins=30, color=shade, alpha=1)

        # normal fit, rescaled so the density's peak equals the tallest bin
        loc, sigma = norm.fit(values)
        peak_ratio = counts.max() / norm.pdf(loc, loc, sigma).max()

        # draw the rescaled density across the panel's current x range
        left, right = panel.get_xlim()
        grid = np.linspace(left, right, 100)
        panel.plot(grid, norm.pdf(grid, loc, sigma) * peak_ratio, 'k', linewidth=2)

        # no frame around the panel
        for edge in panel.spines.values():
            edge.set_visible(False)

    # tighten up the layout, then fix the x tick positions
    fig.tight_layout()
    plt.xticks(np.arange(-.5, 0.7, 0.1))

    # write the figure out and release it
    plt.savefig(out_fp)
    plt.close(fig)

In [6]:
def plotCumulativeReturns(df: pd.DataFrame, out_fp: str) -> None:
    """
    Plot every asset's cumulative return, plus the cmkt's, on a log scale.

    btc, eth and cmkt are drawn last with thick coloured lines and text
    labels; every other asset is drawn as a thin gray line.

    Args:
        df (pd.DataFrame): panel with columns 'date', 'asset', 'char_r_tm7'
            and 'macro_cmkt_tm7'; it must include the assets 'btc' and 'eth'.
        out_fp (str): a relative filepath to save the figure to.

    Returns:
        None
    """
    def growthOfOne(rows: pd.DataFrame, ret_col: str, label: str) -> pd.DataFrame:
        # date-sorted cumulative product of (1 + return), stored under `label`
        ordered = rows[['date', ret_col]].sort_values(by='date', ignore_index=True)
        ordered[label] = (1 + ordered[ret_col]).cumprod()
        return ordered[['date', label]]

    # start from an empty frame and outer-merge one column per asset
    plot_df = pd.DataFrame(data={'date': []})
    for asset in list(np.unique(df.asset.values)):
        asset_path = growthOfOne(df[df.asset==asset], 'char_r_tm7', asset)
        plot_df = plot_df.merge(asset_path, on='date', how='outer', validate='one_to_one')

    # the cmkt series is read off the btc rows
    cmkt_path = growthOfOne(df[df.asset=='btc'], 'macro_cmkt_tm7', 'cmkt')
    plot_df = plot_df.merge(cmkt_path, on='date', how='outer', validate='one_to_one')

    # chronological order, indexed by date
    plot_df = plot_df.sort_values(by='date', ignore_index=True)
    plot_df = plot_df.set_index('date')

    # open the figure
    plt.figure(figsize=(4*1.61, 4), facecolor='none')

    # move the highlighted series to the end so they are drawn on top
    columns = list(plot_df.columns)
    for name in ('btc', 'eth', 'cmkt'):
        columns.remove(name)
    columns = columns + ['eth', 'btc', 'cmkt']

    # colours of the highlighted series; everything else is thin and gray
    highlight = {'btc': '#FDE725FF', 'eth': '#2D708EFF', 'cmkt': '#482677FF'}
    for column in columns:
        if column in highlight:
            color, linewidth = highlight[column], 2
        else:
            color, linewidth = 'gray', 0.5
        plt.plot(plot_df.index, plot_df[column], color=color, linewidth=linewidth)

    # log y axis without minor ticks
    plt.yscale('log')
    plt.gca().yaxis.set_minor_locator(plt.NullLocator())

    # horizontal grid lines only, no frame
    plt.grid(visible=True, which='major', axis='y', linewidth=0.5)
    plt.box(False)

    def annotate(x, y, label: str) -> None:
        # bold label in the series' own colour with an invisible text box
        plt.text(x, y, label, color=highlight[label], fontweight='bold',
                 verticalalignment='center', bbox=dict(facecolor='none', edgecolor='none'))

    # label positions are hand-tuned offsets from the end of each series
    annotate(plot_df.index[-45], plot_df['cmkt'].iloc[-52]+5, 'cmkt')
    annotate(plot_df.index[-1], plot_df['btc'].iloc[-1]-0.1, 'btc')
    annotate(plot_df.index[-1], plot_df['eth'].iloc[-1]+0.3, 'eth')

    # output
    plt.savefig(out_fp)
    plt.close()

In [7]:
def genSummaryStatistics(df: pd.DataFrame, lhs_col: str, out_fp: str) -> None:
    """
    Generates summary statistics for the panel and saves them to an Excel file.

    Three sheets are written (replacing sheets of the same name):
    'raw_ret_stats' (return statistics for the value-weighted crypto market,
    Bitcoin, Ethereum and the Nasdaq), 'raw_extreme_stats' (counts of market
    returns beyond fixed thresholds) and 'raw_yearly_stats' (per-year and
    overall asset counts, medians, year-end total market cap and cumulative
    market return).

    :param df: DataFrame containing asset return data. Must contain 'date'
        (datetime), 'asset', `lhs_col`, 'char_size_t', 'char_volume_sum_tm7',
        'macro_snp500_t' and 'macro_dgs1mo_t'. It is not modified.
    :param lhs_col: Column in df that contains the return data.
    :param out_fp: Output file path for the Excel file. The workbook is opened
        in append mode, so it is expected to exist already.
    """
    # define function for calculating return statistics
    def calcReturnStats(temp_df: pd.DataFrame, asset: str, return_col: str) -> dict:
        returns = temp_df[return_col].values
        return {'asset': asset,
            'Mean': QuantTools.calcTSAvgReturn(returns, annualized=True, periods_in_year=52),
            'SD': QuantTools.calcSD(returns, annualized=True, periods_in_year=52),
            'Sharpe': QuantTools.calcSharpe(returns, periods_in_year=52),
            # weekly skewness and excess kurtosis scaled by sqrt(52) and 52
            'Skewness': stats.skew(returns) / np.sqrt(52),
            'Kurtosis': stats.kurtosis(returns) / 52,
            'Pct pos': np.sum(temp_df[return_col]>0) / len(temp_df)}

    # drop to only necessary columns (working on a copy of the caller's frame)
    df = df[['date', 'asset', lhs_col, 'char_size_t', 'char_volume_sum_tm7', 
        'macro_snp500_t', 'macro_dgs1mo_t']].copy()

    # form btc and eth returns
    btc_df = df[df.asset=='btc'].set_index('date')[[lhs_col]]
    eth_df = df[df.asset=='eth'].set_index('date')[[lhs_col]]

    # form cmkt return: size-weighted average return across assets per date
    df['weighted_return'] = df[lhs_col] * df['char_size_t']
    total_market_cap = df.groupby('date')['char_size_t'].sum()
    cmkt_df = (df.groupby('date')['weighted_return'].sum() / total_market_cap).to_frame('return')

    # import nasdaq data and take out risk free rate
    # (annual percent yield converted to a seven-day return)
    rf_df = df[['date', 'macro_dgs1mo_t']].drop_duplicates()
    rf_df['r_rf_tm7'] = (1 + rf_df.macro_dgs1mo_t.values / 100) ** (1 / (365 / 7)) - 1
    rf_df = rf_df[['date', 'r_rf_tm7']]
    nsdq_df = importYahoo('^IXIC', '2017-12-29', '2022-12-31', rf_df, 'r_nsdq_tm7')    

    # calc return statistics
    ret_df = pd.DataFrame([calcReturnStats(cmkt_df, 'CMKT', 'return'),
                           calcReturnStats(btc_df, 'Bitcoin', lhs_col),
                           calcReturnStats(eth_df, 'Ethereum', lhs_col),
                           calcReturnStats(nsdq_df, 'Nasdaq', 'r_nsdq_tm7')])
    
    # calc extreme event statistics: observations below each negative
    # threshold and above each positive one
    ext_data = {'threshold': [], 'count': [], 'percent': []}
    num_obs  = len(cmkt_df)
    for threshold in [-.3, -.2, -.1, -.05, .05, .1, .2, .3]:
        if threshold < 0:
            count = (cmkt_df['return'] < threshold).sum()
        else:
            count = (cmkt_df['return'] > threshold).sum()
        ext_data['threshold'].append(threshold)
        ext_data['count'].append(count)
        ext_data['percent'].append(count / num_obs)
    ext_df = pd.DataFrame(ext_data)
    
    # calculate yearly stats of unique assets and median mcap and volume
    df['year'] = df['date'].dt.year
    by_year = df.groupby(['year'])
    yr_df = pd.DataFrame({
        'num_unique_assets': by_year['asset'].nunique(),
        'median_market_cap': by_year['char_size_t'].median(),
        'median_weekly_asset_volume': by_year['char_volume_sum_tm7'].median()}).reset_index()
    all_df = pd.DataFrame({
        'num_unique_assets': [df['asset'].nunique()],
        'median_market_cap': [df['char_size_t'].median()],
        'median_weekly_asset_volume': [df['char_volume_sum_tm7'].median()]})
    all_df['year'] = 'all'
    yr_df = pd.concat([yr_df, all_df])

    # calculate the total mcap in the last week of each year
    max_dates = df.groupby('year')['date'].max()
    filtered_df = df[df['date'].isin(max_dates)]
    total_mcap_by_year = filtered_df.groupby('year')[['char_size_t']].sum().reset_index()
    yr_df = yr_df.merge(total_mcap_by_year, on='year', how='outer', validate='one_to_one')

    # extract yearly returns. The years come from the data itself rather than
    # a fixed 2018-2022 list, so a panel that lacks one of those years no
    # longer fails on an empty selection and other years are filled in too.
    cmkt_df = cmkt_df.reset_index()
    cmkt_df['year'] = cmkt_df.date.dt.year
    for year in sorted(cmkt_df.year.unique()):
        year = int(year)
        year_returns = cmkt_df.loc[cmkt_df.year==year, 'return']
        yr_df.loc[yr_df.year==year, 'cmkt_ret'] = ((year_returns+1).cumprod()-1).values[-1]
    yr_df.loc[yr_df.year=='all', 'cmkt_ret'] = ((cmkt_df['return']+1).cumprod()-1).values[-1]
    
    # save results
    with pd.ExcelWriter(out_fp, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer: 
        ret_df.to_excel(writer, sheet_name='raw_ret_stats')
        ext_df.to_excel(writer, sheet_name='raw_extreme_stats')
        yr_df.to_excel(writer, sheet_name='raw_yearly_stats')
        


In [8]:
def plotRollingSharpe(out_fp: str) -> None:
    """
    Plot rolling four year sharpe ratio with new data for the study period.

    Weekly excess returns of Bitcoin and seven Yahoo Finance series are
    combined on their common dates, a 208-week rolling Sharpe ratio
    (sqrt(52) * mean / std) is computed for each, and the 2018-2022 part is
    saved as a line chart.

    :param out_fp: File path the figure is saved to.
    """
    # Obtain risk-free rate: FRED series DGS1MO, converted from an annual
    # percent yield to a seven-day return
    rf_df = pdr.DataReader('DGS1MO', 'fred', '2013-12-31').reset_index()
    rf_df['r_rf_tm7'] = (1 + rf_df.DGS1MO.values / 100) ** (1 / (365 / 7)) - 1
    rf_df['date'] = pd.to_datetime(rf_df.DATE)
    rf_df = rf_df[['date', 'r_rf_tm7']].set_index('date')

    # expand to every calendar day and carry the last observed rate forward.
    # The forward fill is assigned back explicitly: an inplace fillna on the
    # selected column is a chained assignment, which does not update rf_df
    # when pandas' Copy-on-Write is active.
    date_range = pd.date_range(start=rf_df.index.min(), end=rf_df.index.max(), freq='D')
    rf_df = rf_df.reindex(date_range)
    rf_df['r_rf_tm7'] = rf_df['r_rf_tm7'].ffill()
    rf_df = rf_df.reset_index().rename(columns={'index': 'date'})

    # import other asset class data
    start_date = '2013-12-29'
    end_date   = '2022-12-31'
    other_assets = [('^IXIC', 'Nasdaq'),
                    ('^GSPC', 'SnP 500'),
                    ('VT', 'Global Stocks'),
                    ('BND', 'US Bonds'),
                    ('VNQ', 'US Real Estate'),
                    ('EBND', 'Emerging Currencies'),
                    ('GLD', 'Gold')]
    other_dfs = [importYahoo(ticker, start_date, end_date, rf_df, label)
                 for ticker, label in other_assets]

    # import the btc data and form weekly excess returns
    # (headerless file: column 0 is a unix timestamp in seconds, column 4 is
    # used as the price)
    btc_df = pd.read_csv('../data/raw/XBTUSD_1440.csv',
                        header=None)
    btc_df[0] = pd.to_datetime(btc_df[0], unit='s')
    btc_df = btc_df[[0, 4]].rename(columns={0: 'date', 4: 'price'}).set_index('date')
    btc_df = btc_df.resample('W').last()
    btc_df['Bitcoin'] = btc_df['price'].pct_change()
    btc_df = btc_df.merge(rf_df, on='date', how='inner', validate='one_to_one')
    btc_df['Bitcoin'] = btc_df['Bitcoin'] - btc_df.r_rf_tm7
    btc_df = btc_df.drop(['price', 'r_rf_tm7'], axis=1)

    # form single dataframe on the dates common to every series
    df = btc_df
    for temp_df in other_dfs:
        df = df.merge(temp_df, on='date', how='inner', validate='one_to_one')

    # form sharpe ratios: annualised rolling mean over rolling std
    window_size = 208
    df = df.set_index('date')
    df = np.sqrt(52) * df.rolling(window_size).mean() / df.rolling(window_size).std()

    # subset to relevant time period
    df = df[(df.index.year >= 2018)
        & (df.index.year <= 2022)]

    # form sharpe ratio plot. DataFrame.plot() opens its own figure when no
    # `ax` is given, so it is the only figure created here and the one that
    # is closed afterwards.
    # NOTE(review): the original first opened a separate (4*1.61, 4) figure
    # that was never drawn on, saved or closed; pass figsize to plot() if
    # that size was intended for the output.
    ax = df.plot(cmap='viridis')
    fig = ax.figure
    plt.box(False)
    plt.grid(visible=True, which='major', axis='x', linewidth=0.5)
    #plt.title('Sharpe Ratios: Bitcoin vs Major Asset Classes$^{12}$')
    plt.legend(labels=df.columns.values, 
                loc='lower center',
                ncol=3,
                bbox_to_anchor=(0.5, -0.37),
                frameon=False)
    plt.xlabel("")
    plt.tick_params(axis='both', which='both', bottom=False)
    plt.xticks(rotation=0, ha='center')
    plt.savefig(out_fp, bbox_inches='tight')
    plt.close(fig)


In [9]:
def plotTransactionStats(df: pd.DataFrame, out_fp: str) -> None:
    """
    Plot rolling transaction statistics for Bitcoin.

    The transfer value column is summed and the median fee column is
    medianed per calendar month, and both monthly series are drawn on a log
    scale and saved to `out_fp`.

    :param df: Panel with columns 'date' (datetime), 'macro_btc_fee_med_usd_t'
        and 'macro_btc_tx_tfr_val_adj_usd_t'.
    :param out_fp: File path the figure is saved to.
    """
    # Extract relevant transaction data (the macro columns repeat for every
    # asset on a date, so duplicates are dropped)
    tx_df = df[['date', 'macro_btc_fee_med_usd_t', 'macro_btc_tx_tfr_val_adj_usd_t']].drop_duplicates().copy()

    # reformat the data: monthly total volume and monthly median fee
    tx_df = tx_df.set_index('date')
    temp1_df = tx_df[['macro_btc_tx_tfr_val_adj_usd_t']].resample('M').sum()
    temp2_df = tx_df[['macro_btc_fee_med_usd_t']].resample('M').median()
    r_df = temp1_df.merge(temp2_df, how='inner', left_index=True, right_index=True, validate='one_to_one')
    r_df = r_df.rename(columns={'macro_btc_fee_med_usd_t': 'Median Fee (USD)',
                                'macro_btc_tx_tfr_val_adj_usd_t': 'Monthly Volume (USD)'})

    # index by 'YYYY-MM' strings, so the x axis runs over month positions 0..N
    r_df['date'] = r_df.index
    r_df['date'] = r_df.date.dt.strftime('%Y-%m')
    r_df = r_df.set_index('date')

    # plot the data. DataFrame.plot() opens its own figure when no `ax` is
    # given, so it is the only figure created here and the one that is
    # closed afterwards.
    # NOTE(review): the original first opened a separate (4*1.61, 4) figure
    # that was never drawn on, saved or closed; pass figsize to plot() if
    # that size was intended for the output.
    ax = r_df.plot(color=[viridis.colors[0], viridis.colors[111]])
    fig = ax.figure
    plt.box(False)
    plt.grid(visible=True, which='major', axis='x', linewidth=0.5)
    plt.legend(labels=r_df.columns.values, 
            loc='lower center',
            ncol=3,
            bbox_to_anchor=(0.5, -0.22),
            frameon=False)
    plt.yscale("log")
    plt.xlabel("")
    # one tick every twelfth month position, labelled 2018..2022
    # (assumes the first month in the data is January 2018 -- TODO confirm)
    plt.xticks(np.array([0,12,24,36,48]), 
            ['2018', '2019', '2020', '2021', '2022'],
            rotation=0, ha='center')
    plt.tick_params(axis='both', which='both', bottom=False)
    plt.savefig(out_fp, bbox_inches='tight')
    plt.close(fig)


In [10]:
def plotHodlingStats(df: pd.DataFrame, out_fp: str) -> None:
    """
    Plot the 'macro_btc_utxo_age_med_t' series over time and save the figure.

    :param df: Panel with columns 'date' and 'macro_btc_utxo_age_med_t'.
    :param out_fp: File path the figure is saved to.
    """
    # form the hodl data: one row per distinct (date, value) pair, indexed by date
    utxo_df = df[['date', 'macro_btc_utxo_age_med_t']].copy()
    utxo_df = utxo_df.drop_duplicates()
    utxo_df = utxo_df.set_index('date')
    utxo_df = utxo_df.rename(columns={'macro_btc_utxo_age_med_t':
                                    'UTXO Median Age (Days)'})

    # Plot the hodl data. DataFrame.plot() opens its own figure when no `ax`
    # is given, so it is the only figure created here and the one that is
    # closed afterwards.
    # NOTE(review): the original first opened a separate (4*1.61, 4) figure
    # that was never drawn on, saved or closed; pass figsize to plot() if
    # that size was intended for the output.
    ax = utxo_df.plot(color=[viridis.colors[0]],
                legend=None)
    fig = ax.figure
    plt.box(False)
    plt.grid(visible=True, which='major', axis='x', linewidth=0.5)
    plt.xlabel("")
    plt.tick_params(axis='both', which='both', bottom=False)
    plt.savefig(out_fp, bbox_inches='tight')
    plt.close(fig)

In [11]:
def genForkStatistics(df: pd.DataFrame, out_fp: str) -> None:
    """
    Generate event studies for BTC fork dates.

    For each daily percentage-change series of Bitcoin (price, volume,
    active addresses, developer activity, social volume, hash rate) the
    estimate is the average, over the fork dates, of the mean in the
    `window_size` days after a fork minus the mean in the `window_size` days
    before it. The standard error is the standard deviation of a bootstrap
    distribution: the same before/after difference is computed around every
    date in the sample, and means of `len(forks)` differences drawn with
    replacement are formed `num_bs_samples` times.

    :param df: Not used; the data are re-read from the raw files under
        '../data/raw/'. Kept so existing callers keep working.
    :param out_fp: Excel workbook to which sheet 'raw_forks' is written
        (replacing a sheet of that name). The workbook is opened in append
        mode, so it is expected to exist already.
    """
    # PARAMETERS
    num_bs_samples = int(1e6)
    bs_batch_size  = 100_000   # bootstrap draws generated per batch (bounds memory)
    window_size    = 8

    # DEFINE RELEVANT FORKS
    forks = {'bitcoin-21': datetime.datetime(2016,4,17),
            'zcash': datetime.datetime(2016,10,28),
            'bitcoin-cash': datetime.datetime(2017,7,31),
            'bitcoin-gold': datetime.datetime(2017,10,24),
            'bitcoin-diamond': datetime.datetime(2017,11,24),
            'lightning-bitcoin': datetime.datetime(2017,12,18),
            'bitcoinfast': datetime.datetime(2017,12,26),
            'bitcoin2': datetime.datetime(2017,12,28),
            'bitcoin-plus': datetime.datetime(2018,1,2),
            'bitcoin-interest': datetime.datetime(2018,1,22),
            'bitcoin-atom': datetime.datetime(2018,1,24),
            'bitcoin-private': datetime.datetime(2018,2,28),
            'microbitcoin': datetime.datetime(2018,5,29),
            'bitcoin-bep2': datetime.datetime(2018,6,29),
            'bitcoin-sv': datetime.datetime(2018,11,11)}

    # PULL IN OLD DATA TO BUILD RELEVANT TIMESERIES FOR BTC
    df = pd.read_csv('../data/raw/cmc_price_vol_mcap_panel.csv')
    df = df[df.cmc_id==1]
    df = df[['date', 'usd_per_token', 'usd_volume_24h']]
    df['date'] = pd.to_datetime(df.date)
    df = df.drop_duplicates(subset='date')
    df = df.reset_index(drop=True)
    # NOTE: unpickling can execute arbitrary code; only load a trusted file here
    san_df = pd.read_pickle('../data/raw/santiment_panel.pkl')
    san_df = san_df[san_df.san_slug=='bitcoin'][['date', 'active_addresses_24h', 'github_activity',
                                                'social_volume_total']].reset_index(drop=True)
    hash_df = pd.read_excel('../data/raw/coinmetrics_btc_hashrate.xlsx')
    hash_df = hash_df.rename(columns={'Time': 'date',
                                    'BTC / Mean Hash Rate': 'hash_rate'})
    hash_df['date'] = pd.to_datetime(hash_df['date'])
    df = df.merge(san_df,
                on=['date'],
                how='left',
                validate='one_to_one')
    df = df.merge(hash_df,
                on='date',
                how='left',
                validate='one_to_one')
    del san_df, hash_df

    # CLEAN UP THE PANEL

    # ensure it has all days from 2015-01-01 through 2021-12-31
    sdate   = datetime.date(2015, 1, 1) 
    edate   = datetime.date(2021, 12, 31)  
    days    = [sdate + timedelta(days=i) for i in range((edate - sdate).days + 1)]
    days_df = pd.DataFrame(data={'date': days})
    df['date'] = df.date.dt.date
    df      = df.merge(days_df,
                    on='date',
                    how='outer',
                    validate='one_to_one')
    df = df.sort_values('date')

    # fill gaps by linear interpolation; the (object-typed) date column is
    # parked in the index so that only the numeric columns are interpolated
    df = df.set_index('date').interpolate().reset_index()

    # form pct change in all columns and clean up improper values
    cols = list(df.columns.values)
    cols.remove('date')
    df[cols] = df[cols].pct_change()
    # NOTE(review): the original followed dropna() with fillna(0), which can
    # never change anything; if NaNs were meant to become 0 instead of being
    # dropped, the two calls need to be swapped.
    df = df.dropna()
    df = df.replace([np.inf, -np.inf], 0)

    # subset to time period of interest (the original computed this filter
    # but discarded the result, so later years were never removed)
    df['date'] = pd.to_datetime(df['date'])
    df = df[df.date.dt.year <= 2021]
    df = df.reset_index(drop=True)

    # rename columns
    df = df.rename(columns = {'usd_per_token': 'return',
                            'usd_volume_24h': 'usd_volume',
                            'active_addresses_24h': 'active_addresses',
                            'github_activity': 'developer_activity',
                            'social_volume_total': 'social_volume'})

    # INITIALIZE RESULTS 
    # NOTE(review): the label says '7 days' while window_size is 8 -- confirm
    cols = list(df.columns.values)
    cols.remove('date')
    results_df = pd.DataFrame(data={'stat': cols,
                                    'window': np.repeat('7 days', len(cols)),
                                    'est': np.zeros(len(cols)),
                                    'se': np.zeros(len(cols))})

    def windowMeanDiff(col, event_date):
        # mean of `col` over (event, event + window] minus its mean over
        # [event - window, event); the event day itself is in neither window
        pre_date  = event_date-pd.Timedelta(window_size, unit="d")
        post_date = event_date+pd.Timedelta(window_size, unit="d")
        pre_mean  = np.mean(df[(df.date >= pre_date) & (df.date < event_date)][col])
        post_mean = np.mean(df[(df.date > event_date) & (df.date <= post_date)][col])
        return post_mean-pre_mean

    # CALC STAT
    for col in cols:
        diffs = [windowMeanDiff(col, fork_date) for fork_date in forks.values()]
        results_df.loc[results_df.stat==col, 'est'] = np.mean(diffs)

    # CALC STANDARD ERROR
    num_forks = len(forks)
    rel_dates = list(df['date'].iloc[window_size:-window_size])
    for col in cols:
        # calc all diffs across the panel, with the same window as the
        # estimate above (the original hard-coded 31-day windows here, so the
        # null distribution described a different statistic)
        diffs = np.asarray([windowMeanDiff(col, date) for date in rel_dates], dtype=float)

        # calc bootstrap distribution: each row of `draws` picks num_forks
        # differences with replacement; its mean is one bootstrap statistic
        bs_stat = np.empty(num_bs_samples)
        for start in range(0, num_bs_samples, bs_batch_size):
            stop  = min(start + bs_batch_size, num_bs_samples)
            draws = np.random.randint(low=0, high=len(diffs), size=(stop - start, num_forks))
            bs_stat[start:stop] = diffs[draws].mean(axis=1)

        # calc standard error and add it to results
        se = np.std(bs_stat)
        results_df.loc[results_df.stat==col, 'se'] = se

    # OUTPUT
    with pd.ExcelWriter(out_fp, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer: 
        results_df.to_excel(writer, sheet_name='raw_forks')


In [12]:
def genCorrStatistics(df: pd.DataFrame, out_fp: str) -> None:
    """
    Build table of correlation statistics.

    Weekly excess returns of the crypto market, Bitcoin, Ethereum and ten
    Yahoo Finance series are combined on their common dates, and the
    correlation matrix over 2018-01-01 to 2022-12-31 is written to the
    'raw_corr' sheet of the workbook at `out_fp`.

    :param df: Panel with columns 'date', 'asset', 'char_r_tm7' and
        'macro_cmkt_tm7'.
    :param out_fp: Excel workbook to write to. It is opened in append mode,
        so it is expected to exist already; a 'raw_corr' sheet is replaced.
    """
    # Set dates
    start_date = '2013-12-01'
    end_date   = '2023-07-01'

    # Obtain risk-free rate: FRED series DGS1MO, converted from an annual
    # percent yield to a seven-day return
    rf_df = pdr.DataReader('DGS1MO', 'fred', '2013-12-31').reset_index()
    rf_df['r_rf_tm7'] = (1 + rf_df.DGS1MO.values / 100) ** (1 / (365 / 7)) - 1
    rf_df['date'] = pd.to_datetime(rf_df.DATE)
    rf_df = rf_df[['date', 'r_rf_tm7']].set_index('date')

    # expand to every calendar day and carry the last observed rate forward.
    # The forward fill is assigned back explicitly: an inplace fillna on the
    # selected column is a chained assignment, which does not update rf_df
    # when pandas' Copy-on-Write is active.
    date_range = pd.date_range(start=rf_df.index.min(), end=rf_df.index.max(), freq='D')
    rf_df = rf_df.reindex(date_range)
    rf_df['r_rf_tm7'] = rf_df['r_rf_tm7'].ffill()
    rf_df = rf_df.reset_index().rename(columns={'index': 'date'})

    # Import other asset class data (the list order fixes the column order
    # of the correlation table)
    other_assets = [('^IXIC', 'Nasdaq'),
                    ('^GSPC', 'SnP 500'),
                    ('^RUT', 'Russell 2000'),
                    ('VT', 'Global Stocks'),
                    ('BND', 'US Bonds'),
                    ('BNDX', 'Ex-US Global Bonds'),
                    ('VNQ', 'US Real Estate'),
                    ('EBND', 'Emerging Currencies'),
                    ('DBC', 'Commodities'),
                    ('GLD', 'Gold')]
    other_dfs = [importYahoo(ticker, start_date, end_date, rf_df, label)
                 for ticker, label in other_assets]

    # Form crypto time series
    btc_df = df[df.asset=='btc'][['date', 'char_r_tm7']].reset_index(drop=True).copy()
    btc_df = btc_df.rename(columns={'char_r_tm7': 'Bitcoin'})

    eth_df = df[df.asset=='eth'][['date', 'char_r_tm7']].reset_index(drop=True).copy()
    eth_df = eth_df.rename(columns={'char_r_tm7': 'Ethereum'})

    # market return: average of macro_cmkt_tm7 over the rows of each date
    cmkt_df = df.groupby('date')[['macro_cmkt_tm7']].mean().reset_index()
    cmkt_df = cmkt_df.rename(columns={'macro_cmkt_tm7': 'Crypto Market'})

    # form single dataframe on the dates common to every series
    t_df = cmkt_df.merge(btc_df, on='date', how='inner', validate='one_to_one')
    for temp_df in [eth_df] + other_dfs:
        t_df = t_df.merge(temp_df, on='date', how='inner', validate='one_to_one')
    t_df = t_df.set_index('date')

    # Export
    with pd.ExcelWriter(out_fp, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer: 
        t_df[(t_df.index >= '2018-01-01') 
            & (t_df.index <= '2022-12-31')].corr().to_excel(writer, sheet_name='raw_corr')


In [13]:
def plotRollingCorrelations(out_fp: str) -> None:
    """
    Plot rolling correlations between BTC and other asset classes.

    Monthly excess returns of Bitcoin and six Yahoo Finance series, plus the
    monthly percentage change of the FRED series EXPINF1YR, are combined on
    their common dates. The 48-month rolling correlation of Bitcoin with
    each other series is plotted for dates before 2023-01-01 and saved.

    :param out_fp: File path the figure is saved to.
    """
    # Parameters
    start_date = '2013-11-01'
    end_date   = '2023-07-02'
    rolling_window = 48

    # Pull and form exp inf data
    ei_df = pdr.DataReader('EXPINF1YR', 'fred', start_date).reset_index()
    ei_df = ei_df.rename(columns={'DATE': 'date'})
    ei_df = ei_df.set_index('date')
    ei_df = ei_df.pct_change().dropna().reset_index()
    ei_df = ei_df[(ei_df.date >= '2014-01-01') & (ei_df.date <= '2023-07-01')]

    # Obtain risk-free rate: FRED series DGS1MO, converted from an annual
    # percent yield to a one-month return
    rf_df = pdr.DataReader('DGS1MO', 'fred', '2013-12-15').reset_index()
    rf_df['r_rf'] = (1 + rf_df.DGS1MO.values / 100) ** (1 / 12) - 1
    rf_df['date'] = pd.to_datetime(rf_df.DATE)
    rf_df = rf_df[['date', 'r_rf']].set_index('date')

    # expand to every calendar day and carry the last observed rate forward.
    # The forward fill is assigned back explicitly: an inplace fillna on the
    # selected column is a chained assignment, which does not update rf_df
    # when pandas' Copy-on-Write is active.
    date_range = pd.date_range(start=rf_df.index.min(), end=rf_df.index.max(), freq='D')
    rf_df = rf_df.reindex(date_range)
    rf_df['r_rf'] = rf_df['r_rf'].ffill()
    rf_df = rf_df.reset_index().rename(columns={'index': 'date'})
    rf_df = rf_df[(rf_df.date >= '2014-01-01') & (rf_df.date <= '2023-07-01')]

    # Import other asset class data (monthly; importYahoo dates each
    # month-end return on the first day of the following month)
    nsdq_df = importYahoo('^IXIC', start_date, end_date, rf_df, 'Nasdaq', 'M', 'r_rf')
    bnd_df  = importYahoo('BND', start_date, end_date, rf_df, 'US Bonds', 'M', 'r_rf')
    real_df = importYahoo('VNQ', start_date, end_date, rf_df, 'US Real Estate', 'M', 'r_rf')
    emrg_df = importYahoo('EBND', start_date, end_date, rf_df, 'Emerging Currencies', 'M', 'r_rf')
    dbc_df  = importYahoo('DBC', start_date, end_date, rf_df, 'Commodities', 'M', 'r_rf')
    gld_df  = importYahoo('GLD', start_date, end_date, rf_df, 'Gold', 'M', 'r_rf')

    # import the btc data and form monthly excess returns, dated like the
    # other series (month end plus one day)
    btc_df = pd.read_csv('../data/raw/cm_btc.csv')
    btc_df['date'] = pd.to_datetime(btc_df.date)
    btc_df = btc_df.rename(columns={'btc': 'Bitcoin'})
    btc_df = btc_df.set_index('date')
    btc_df = btc_df.resample('M').last()
    btc_df.index = btc_df.index + pd.Timedelta(days=1)
    btc_df['Bitcoin'] = btc_df['Bitcoin'].pct_change()
    btc_df = btc_df.merge(rf_df, on='date', how='inner', validate='one_to_one')
    btc_df['Bitcoin'] = btc_df['Bitcoin'] - btc_df.r_rf
    btc_df = btc_df.drop(['r_rf'], axis=1)

    # form single dataframe on the dates common to every series
    df = btc_df.merge(nsdq_df, on='date', how='inner', validate='one_to_one')
    for temp_df in [bnd_df, real_df, emrg_df, dbc_df, gld_df, ei_df]:
        df = df.merge(temp_df, on='date', how='inner', validate='one_to_one')
    df = df.set_index('date')

    # Calc rolling correlations of Bitcoin (the first column) with each
    # other column; the incomplete leading windows are dropped
    results_df = pd.DataFrame(data = {'date': df.index.values})
    results_df = results_df.set_index('date')
    col_list = list(df.columns.values)[1:]
    for col in col_list:
        temp_df = df[['Bitcoin']].rolling(rolling_window).corr(df[col])
        temp_df = temp_df.rename(columns = {'Bitcoin': col})
        temp_df = temp_df.dropna()
        results_df = results_df.merge(temp_df,
                                    how='inner',
                                    left_index=True,
                                    right_index=True,
                                    validate='one_to_one')
    results_df = results_df[results_df.index < '2023-01-01']

    # Form figure. DataFrame.plot() opens its own figure when no `ax` is
    # given, so it is the only figure created here and the one that is
    # closed afterwards.
    # NOTE(review): the original first opened a separate (4*1.61, 4) figure
    # that was never drawn on, saved or closed; pass figsize to plot() if
    # that size was intended for the output.
    ax = results_df.plot(color=[viridis.colors[0], viridis.colors[44], viridis.colors[88],
        viridis.colors[132], viridis.colors[178], viridis.colors[222], viridis.colors[255]])
    fig = ax.figure
    plt.box(False)
    plt.grid(visible=True, which='major', axis='x', linewidth=0.5)
    plt.legend(labels=results_df.columns.values, 
                loc='lower center',
                ncol=3,
                bbox_to_anchor=(0.5, -0.37),
                frameon=False)
    plt.xlabel("")
    plt.tick_params(axis='both', which='both', bottom=False)
    plt.xticks(rotation=0, ha='center')
    plt.savefig(out_fp, bbox_inches='tight')
    plt.close(fig)


In [14]:
def formRiskReturnScatterPlot(df: pd.DataFrame, out_fp: str) -> None:
    """
    Plot all assets in annualized risk-return space and save the figure.

    Builds a table of annualized geometric return and standard deviation
    (52 periods per year) for every asset in the panel, then appends rows for
    the 1-month T-bill, the Nasdaq (pulled through importYahoo), the crypto
    market return (cmkt), and the Nasdaq/cmkt mix with the highest
    return-to-risk ratio found on a 5% grid. Dotted lines are drawn from the
    T-bill point through the nsdq, cmkt and optimal-mix points.

    Parameters:
    df (DataFrame): Panel with columns date, asset, char_r_tm7, macro_dgs1mo_t
        and macro_cmkt_tm7.
    out_fp (str): File path the figure is saved to.

    Returns:
    None. Prints the grid-search results and writes the figure to out_fp.

    Raises:
    ValueError: if no grid point produces a comparable return-to-risk ratio
        (previously this surfaced as an UnboundLocalError).
    """
    # calc each asset's annualized return and risk
    r_df = df[['date', 'asset', 'char_r_tm7', 'macro_dgs1mo_t']].copy()
    # add the weekly T-bill return back onto char_r_tm7
    # (presumably char_r_tm7 is an excess return -- TODO confirm)
    r_df['char_r_tm7'] = r_df.char_r_tm7 + ((r_df.macro_dgs1mo_t.values/100)+1)**(1/52)-1
    r_df = r_df.drop('macro_dgs1mo_t', axis=1)
    r_df['annual_return'] = r_df.groupby('asset')['char_r_tm7'].transform(lambda x: 
                                QuantTools.calcGeomAvg(x, annualized=True, periods_in_year=52))
    r_df['annual_risk'] = r_df.groupby('asset')['char_r_tm7'].transform(lambda x: 
                                QuantTools.calcSD(x, annualized=True, periods_in_year=52))
    r_df = r_df[['asset', 'annual_return', 'annual_risk']].drop_duplicates()
    r_df = r_df.sort_values(by='asset', ignore_index=True)

    # add 1 month tbill to data
    # Sort by date so the positional add onto cmkt_df further down lines up with
    # the date-sorted output of groupby('date'), whatever the panel's row order.
    t_df = (df[['date', 'macro_dgs1mo_t']]
            .drop_duplicates()
            .sort_values(by='date')
            .reset_index(drop=True))
    t_weekly_returns = ((t_df.macro_dgs1mo_t.values/100)+1)**(1/52)-1
    t_annual_return = QuantTools.calcGeomAvg(t_weekly_returns, annualized=True, periods_in_year=52)
    r_df = pd.concat([r_df, 
                    pd.DataFrame(data={'asset': ['1moTbill'], 
                                    'annual_return': [t_annual_return],
                                    'annual_risk': [0]})]).reset_index(drop=True)

    # Calc risk free rate
    rf_df = pdr.DataReader('DGS1MO', 'fred', '2013-12-31').reset_index()
    rf_df['r_rf_tm7'] = (1 + rf_df.DGS1MO.values / 100) ** (1 / (365 / 7)) - 1
    rf_df['date'] = pd.to_datetime(rf_df.DATE)
    rf_df = rf_df[['date', 'r_rf_tm7']]
    rf_df = rf_df.set_index('date')
    # expand to a daily calendar and carry the last published rate forward
    date_range = pd.date_range(start=rf_df.index.min(), end=rf_df.index.max(), freq='D')
    rf_df = rf_df.reindex(date_range)
    # Assign the filled column back: a chained in-place fillna on a column
    # selection is deprecated and does not update rf_df under Copy-on-Write.
    rf_df['r_rf_tm7'] = rf_df['r_rf_tm7'].ffill()
    rf_df = rf_df.reset_index()
    rf_df = rf_df.rename(columns={'index': 'date'})

    # Import other asset class data
    nsdq_df = importYahoo('^IXIC', '2017-11-30', '2023-01-08', rf_df, 'Nasdaq')
    nsdq_df = nsdq_df[(nsdq_df.date >= '2018-01-01') & (nsdq_df.date <= '2022-12-31')]

    # Add nsdq
    nsdq_annual_ret = QuantTools.calcGeomAvg(nsdq_df.Nasdaq.values, annualized=True, periods_in_year=52)
    nsdq_sd = QuantTools.calcSD(nsdq_df.Nasdaq.values, annualized=True, periods_in_year=52)
    r_df = pd.concat([r_df, 
                    pd.DataFrame(data={'asset': ['nsdq'], 
                            'annual_return': [nsdq_annual_ret],
                            'annual_risk': [nsdq_sd]})]).reset_index(drop=True)

    # Add cmkt
    cmkt_df = df.groupby('date')[['macro_cmkt_tm7']].mean().reset_index()
    cmkt_df = cmkt_df.rename(columns={'macro_cmkt_tm7': 'cmkt'})
    # positional add: relies on t_df holding exactly one row per panel date
    cmkt_df['cmkt'] = cmkt_df.cmkt+t_weekly_returns
    cmkt_annual_ret = QuantTools.calcGeomAvg(cmkt_df.cmkt.values, annualized=True, periods_in_year=52)
    cmkt_sd = QuantTools.calcSD(cmkt_df.cmkt.values, annualized=True, periods_in_year=52)
    r_df = pd.concat([r_df,
                    pd.DataFrame(data={'asset': ['cmkt'], 
                            'annual_return': [cmkt_annual_ret],
                            'annual_risk': [cmkt_sd]})]).reset_index(drop=True)

    # Form optimal risk portfolio
    # NOTE(review): the two return arrays are combined positionally, so nsdq_df and
    # cmkt_df must cover the same weeks in the same order -- confirm against the panel.
    nsdq_pcts = list(np.arange(0, 1, 0.05)) + [.99]
    # Start below any attainable ratio so the best grid point is always recorded,
    # even when every ratio is zero or negative.
    max_sharpe = -np.inf
    nsdq_opt   = 0
    opt_return = np.nan
    opt_sd     = np.nan
    for nsdq_pct in nsdq_pcts:
        cmkt_pct = 1 - nsdq_pct
        d_ret = nsdq_df.Nasdaq.values*nsdq_pct + cmkt_df.cmkt.values*cmkt_pct
        d_annual_ret = QuantTools.calcGeomAvg(d_ret, annualized=True, periods_in_year=52)
        d_sd = QuantTools.calcSD(d_ret, annualized=True, periods_in_year=52)
        print(f"for nsdq pct {np.round(nsdq_pct, 2)}, the sharpe is {np.round(d_annual_ret / d_sd, 3)} and annual return is {np.round(d_annual_ret, 4)}")
        if (d_annual_ret / d_sd) > max_sharpe:
            nsdq_opt = nsdq_pct
            opt_return = d_annual_ret
            opt_sd     = d_sd
            max_sharpe = opt_return / opt_sd

    if max_sharpe == -np.inf:
        # every ratio was NaN/-inf, so there is no optimal mix to plot
        raise ValueError("Could not determine an optimal nsdq/cmkt mix: no valid return-to-risk ratio.")

    print(f"\n for a sharpe of {np.round(max_sharpe, 4)}, allocate {np.round(nsdq_opt, 4)} to nsdq and {np.round(1-nsdq_opt, 4)} to cmkt.")

    opt_col_name = 'nsdq_'+str(np.round(nsdq_opt, 2))+'_cmkt_'+str(np.round(1-nsdq_opt, 2))
    r_df = pd.concat([r_df,
                    pd.DataFrame(data={'asset': [opt_col_name], 
                            'annual_return': [opt_return],
                            'annual_risk': [opt_sd]})]).reset_index(drop=True)

    # Final edits
    r_df = r_df.set_index('asset')
    r_df = r_df[r_df.index != 'luna']

    # Form figure
    plt.figure(figsize=(4*1.61, 4))
    plt.scatter(r_df['annual_risk'], r_df['annual_return'], color=viridis.colors[111])

    # re-draw the highlighted assets on top of the base scatter in their own colors
    labels = ["1moTbill", "btc", "cmkt", "eth", "nsdq", opt_col_name]
    colors = ['lightgrey', viridis.colors[255], viridis.colors[33], 
        viridis.colors[177], 'grey', 'black']
    for label, color in zip(labels, colors):
        asset_df = r_df[r_df.index==label]
        plt.scatter(asset_df['annual_risk'], asset_df['annual_return'], color=color)

    # lines from the T-bill point through nsdq, cmkt and the optimal mix,
    # extended out to the largest risk value in the table
    point_rf = r_df[r_df.index == "1moTbill"].iloc[0]
    point_nsdq = r_df[r_df.index == "nsdq"].iloc[0]
    point_cmkt = r_df[r_df.index == "cmkt"].iloc[0]
    point_opt  = r_df[r_df.index == opt_col_name].iloc[0]
    slope_nsdq = (point_nsdq['annual_return'] - point_rf['annual_return']) / (point_nsdq['annual_risk'] - point_rf['annual_risk'])
    slope_cmkt = (point_cmkt['annual_return'] - point_rf['annual_return']) / (point_cmkt['annual_risk'] - point_rf['annual_risk'])
    slope_opt = (point_opt['annual_return'] - point_rf['annual_return']) / (point_opt['annual_risk'] - point_rf['annual_risk'])
    max_x = r_df['annual_risk'].max()
    max_y_nsdq = slope_nsdq * (max_x - point_rf['annual_risk']) + point_rf['annual_return']
    max_y_cmkt = slope_cmkt * (max_x - point_rf['annual_risk']) + point_rf['annual_return']
    max_y_opt  = slope_opt  * (max_x - point_rf['annual_risk']) + point_rf['annual_return']
    plt.plot([point_rf['annual_risk'], max_x], [point_rf['annual_return'], max_y_nsdq], ':', color='grey')
    plt.plot([point_rf['annual_risk'], max_x], [point_rf['annual_return'], max_y_cmkt], ':', color=viridis.colors[33])
    plt.plot([point_rf['annual_risk'], max_x], [point_rf['annual_return'], max_y_opt], ':', color='black')

    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    plt.xlim(0, max_x + 0.3)

    plt.savefig(out_fp, bbox_inches='tight')
    plt.close()


In [15]:
def reportICOStatistics(df: pd.DataFrame) -> None:
    """
    Print two average buy-and-hold returns for the assets flagged as ICOs.

    An asset counts as an ICO asset when any of its rows has char_ico == 1.
    Two figures are printed, each an equal-weighted mean across those assets:
      1. price on the asset's last panel date relative to char_ico_price, and
      2. price on the asset's last panel date relative to its price on the
         first date where char_ico_days_since_t > 0.

    Parameters:
    df (DataFrame): Panel with columns date, asset, r_ex_tp7, char_price_t,
        char_ico, char_ico_days_since_t and char_ico_price, holding at most one
        row per (date, asset) pair (the merges validate this).

    Returns:
    None. Results are written to stdout.
    """
    key_cols = ['date', 'asset']

    # Keep only the needed columns, restricted to assets that ever carry the ICO flag
    needed_cols = ['date', 'asset', 'r_ex_tp7', 'char_price_t',
                   'char_ico', 'char_ico_days_since_t', 'char_ico_price']
    panel = df[needed_cols].copy()
    flagged_assets = np.unique(panel.loc[panel.char_ico == 1, 'asset'].values)
    panel = panel[panel.asset.isin(flagged_assets)].drop(columns='char_ico')

    # Last observation of every ICO asset, with its closing and ICO price
    final_obs = (panel.groupby('asset')[['date']].max()
                      .reset_index()
                      .merge(panel[key_cols + ['char_price_t', 'char_ico_price']],
                             on=key_cols, how='left', validate='one_to_one'))

    # Mean return from the ICO price to the last observed price
    ico_to_end = (final_obs.char_price_t / final_obs.char_ico_price - 1).mean()
    print(f"Return on ICO assets from ICO date to end of panel is: {int(ico_to_end)}x.")

    # Price on the first date after the ICO for every asset
    post_ico = panel[panel.char_ico_days_since_t > 0]
    first_px = (post_ico.groupby('asset')[['date']].min()
                        .reset_index()
                        .merge(panel[key_cols + ['char_price_t']],
                               on=key_cols, how='left', validate='one_to_one')
                        .rename(columns={'char_price_t': 'first_trade_price'})
                        .drop('date', axis=1))

    # The last traded price is already in final_obs; just relabel it
    last_px = (final_obs[['asset', 'char_price_t']]
               .rename(columns={'char_price_t': 'last_trade_price'}))

    # Mean return from the first tradable price to the last observed price
    both_px = last_px.merge(first_px, on='asset', how='inner', validate='one_to_one')
    first_to_end = (both_px.last_trade_price / both_px.first_trade_price - 1).mean()
    print(f"Return on ICO assets from first tradable date to end of panel is: {np.round(100*first_to_end, 1)}%.")

In [16]:
def inflationStatistics(df: pd.DataFrame, out_fp: str) -> None:
    """
    Relate monthly crypto returns to changes in expected inflation.

    Steps:
      1. Compound the weekly cmkt return (macro_cmkt_tm7) into monthly returns.
      2. Pull 10-year expected inflation (EXPINF10YR) from FRED, convert it to a
         monthly rate and take its month-over-month percent change.
      3. Pull gold and Nasdaq through importYahoo and build monthly bitcoin
         excess returns from ../data/raw/cm_btc.csv.
      4. Print correlation tables, regress bitcoin on cmkt and the inflation
         series, and run a two-pass regression (per-asset inflation betas, then
         mean asset returns on those betas).

    Parameters:
    df (DataFrame): Panel with columns date, asset, char_r_tm7 and macro_cmkt_tm7.
    out_fp (str): Path of an existing Excel workbook; the sheets 'raw_inf_reg'
        and 'raw_fama_macbeth' are added or replaced (the writer opens in
        append mode).

    Returns:
    None. Prints correlation tables and writes two sheets to out_fp.
    """
    # Keep the raw panel: `df` is rebound to the monthly frame further down
    temp_df = df.copy()

    # Params
    start_date = '2013-11-01'
    end_date   = '2023-07-02'

    # Obtain monthly excess returns for cmkt
    cmkt_df = df[['date', 'macro_cmkt_tm7']].drop_duplicates().copy()
    cmkt_df = cmkt_df.set_index('date')
    cmkt_df['cmkt'] = cmkt_df.macro_cmkt_tm7+1
    cmkt_df = cmkt_df.drop('macro_cmkt_tm7', axis=1)
    cmkt_df = cmkt_df.resample('M')[['cmkt']].prod()-1
    # relabel each month-end stamp as the first day of that same month
    cmkt_df.index = cmkt_df.index + pd.DateOffset(days=1) - pd.DateOffset(months=1)

    # Pull and form exp inf data
    inf_col = 'EXPINF10YR'
    ei_df = pdr.DataReader(inf_col, 'fred', start_date).reset_index()
    ei_df = ei_df.rename(columns={'DATE': 'date'})
    ei_df = ei_df.set_index('date')
    # FRED publishes this series in percent, so scale to a decimal before
    # compounding (same treatment as DGS1MO below); annual rate to monthly rate
    ei_df[inf_col] = (1+ei_df[inf_col]/100)**(1/12)-1
    ei_df = ei_df.pct_change().dropna().reset_index()
    ei_df = ei_df[(ei_df.date >= '2014-01-01') & (ei_df.date <= '2023-07-01')]

    # Obtain risk-free rate
    rf_df = pdr.DataReader('DGS1MO', 'fred', '2013-12-15').reset_index()
    rf_df['r_rf'] = (1 + rf_df.DGS1MO.values / 100) ** (1 / 12) - 1
    rf_df['date'] = pd.to_datetime(rf_df.DATE)
    rf_df = rf_df[['date', 'r_rf']]
    rf_df = rf_df.set_index('date')
    # expand to a daily calendar and carry the last published rate forward
    date_range = pd.date_range(start=rf_df.index.min(), end=rf_df.index.max(), freq='D')
    rf_df = rf_df.reindex(date_range)
    # Assign the filled column back: a chained in-place fillna on a column
    # selection is deprecated and does not update rf_df under Copy-on-Write.
    rf_df['r_rf'] = rf_df['r_rf'].ffill()
    rf_df = rf_df.reset_index()
    rf_df = rf_df.rename(columns={'index': 'date'})
    rf_df = rf_df[(rf_df.date >= '2014-01-01') & (rf_df.date <= '2023-07-01')]

    # Import other asset class data
    gld_df  = importYahoo('GLD', start_date, end_date, rf_df, 'Gold', 'M', 'r_rf')
    nsdq_df  = importYahoo('^IXIC', start_date, end_date, rf_df, 'Nasdaq', 'M', 'r_rf')

    # import the btc data and form monthly excess returns
    btc_df = pd.read_csv('../data/raw/cm_btc.csv')
    btc_df['date'] = pd.to_datetime(btc_df.date)
    btc_df = btc_df.rename(columns={'btc': 'Bitcoin'})
    btc_df = btc_df.set_index('date')
    btc_df = btc_df.resample('M').last()
    # shift month-end stamps to the first day of the following month
    btc_df.index = btc_df.index + pd.Timedelta(days=1)
    btc_df['Bitcoin'] = btc_df['Bitcoin'].pct_change()
    btc_df = btc_df.merge(rf_df, on='date', how='inner', validate='one_to_one')
    btc_df['Bitcoin'] = btc_df['Bitcoin'] - btc_df.r_rf
    btc_df = btc_df.drop(['r_rf'], axis=1)

    # form single dataframe
    df = cmkt_df.merge(btc_df, on='date', how='right', validate='one_to_one')
    df = df.merge(ei_df, on='date', how='outer', validate='one_to_one')
    df = df.merge(gld_df, on='date', how='outer', validate='one_to_one')
    df = df.merge(nsdq_df, on='date', how='outer', validate='one_to_one')
    df = df.set_index('date')
    df = df[df.index <= '2023-01-01']

    # OVERALL CORR OF MONTHLY RETURNS AND INF INNOV
    print(df.index.min())
    print(df.index.max())
    print(df.drop('cmkt', axis=1).corr())

    # OVERALL CORR OF MONTHLY RETURNS AND INF INNOV, ONLY MONTHS WHERE CMKT EXISTS
    print(df[df.cmkt.notnull()].corr())

    # CORRELATIONS ON THE 12 MONTHS WITH THE LARGEST ABSOLUTE CHANGE IN inf_col:
    # first over the full sample (cmkt dropped), then over months where cmkt exists
    df['inf_abs'] = np.abs(df[inf_col])
    print(df.drop('cmkt', axis=1).sort_values(by='inf_abs', ascending=False)[:12].corr())
    print(df[df.cmkt.notnull()].sort_values(by='inf_abs', ascending=False)[:12].corr())
    df = df.drop('inf_abs', axis=1)

    # Regress BTC returns on CMKT and inf
    df = df[df.cmkt.notnull()]
    y = df.Bitcoin
    X = df[['cmkt', inf_col]]
    X = sm.add_constant(X)
    model = sm.OLS(y, X)
    results = model.fit()
    results_summary = results.summary2().tables
    summary_df = pd.DataFrame(results_summary[1])
    summary_df['N'] = len(y)
    summary_df['R2'] = results.rsquared
    with pd.ExcelWriter(out_fp, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer: 
        summary_df.to_excel(writer, sheet_name='raw_inf_reg')

    # Fama Macbeth
    # First build monthly returns per asset by compounding the weekly returns.
    temp_df = temp_df[['date', 'asset', 'char_r_tm7']].copy()
    monthly_frames = []
    for asset in np.unique(temp_df.asset.values):
        t_df = temp_df.loc[temp_df.asset==asset, ['date', 'char_r_tm7']].set_index('date')
        t_df['r'] = t_df['char_r_tm7']+1
        t_df = t_df.drop('char_r_tm7', axis=1)
        t_df = t_df.resample('M')[['r']].prod()-1
        # relabel each month-end stamp as the first day of that same month
        t_df.index = t_df.index + pd.DateOffset(days=1) - pd.DateOffset(months=1)
        t_df = t_df.reset_index()
        t_df['asset'] = asset
        monthly_frames.append(t_df)
    # one concat instead of growing the frame inside the loop
    asset_df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

    # Pass 1: time-series regression of each asset's monthly return on inf_col.
    # 'inv' is excluded here and in pass 2; filtering (rather than list.remove)
    # keeps this from raising when 'inv' is not in the panel.
    beta_hats = []
    assets = [asset for asset in np.unique(asset_df.asset.values) if asset != 'inv']
    for asset in assets:
        a_df = asset_df[asset_df.asset==asset]
        a_df = a_df.merge(df[[inf_col]].reset_index(), on='date', how='inner', validate='one_to_one')
        y = a_df.r
        X = a_df[[inf_col]]
        X = sm.add_constant(X)
        model = sm.OLS(y, X)
        results = model.fit()
        # params is labelled ['const', inf_col]; take the slope by position
        beta_hats.append(results.params.iloc[1])

    # Pass 2: cross-sectional regression of mean asset returns on the betas.
    # Both `assets` and the groupby output are sorted by asset name, so the
    # rows of y and x line up.
    y = asset_df[asset_df.asset!='inv'].groupby('asset')['r'].mean().values
    x = np.array(beta_hats)
    X = sm.add_constant(x)
    model = sm.OLS(y, X)
    results = model.fit()

    results_summary = results.summary2().tables
    summary_df = pd.DataFrame(results_summary[1])
    summary_df['N'] = len(y)
    summary_df['R2'] = results.rsquared
    with pd.ExcelWriter(out_fp, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer: 
        summary_df.to_excel(writer, sheet_name='raw_fama_macbeth')



In [17]:
# Driver: load the weekly panel, restrict it to the asset universe, then produce
# every descriptive-statistics figure, table and printed report.
if __name__ == "__main__":
    # set args
    # input files
    PANEL_IN_FP      = '../data/clean/panel_weekly.pkl' 
    ASSET_IN_FP      = '../data/clean/asset_universe_dict.pickle'
    # output figures
    HIST_OUT_FP      = '../output/desc_stats/histograms.png'
    CUM_RET_OUT_FP   = '../output/desc_stats/cumulative_returns.png'
    SHARPE_OUT_FP    = '../output/desc_stats/sharpe.png'
    TX_OUT_FP        = '../output/desc_stats/btc_tx.png'
    HODL_OUT_FP      = '../output/desc_stats/hodl.png'
    CORR_OUT_FP      = '../output/desc_stats/corr.png'
    RISK_RTRN_OUT_FP = '../output/desc_stats/risk_return.png'
    # Excel workbook that receives the tables
    OUT_FP           = '../output/desc_stats/descriptive_statistics.xlsx'
    # NOTE(review): the next three settings and ANNUALIZED are not referenced
    # anywhere in this block -- confirm whether other cells still read them.
    PERIODS_IN_YEAR  = 52
    TS_AVG_METHOD    = 'arithmetic'
    LHS_COL          = 'r_ex_tp7'
    ANNUALIZED       = False

    # import
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    df = pd.read_pickle(PANEL_IN_FP)

    # drop rows that are not in the asset universe
    df = subsetToAssetUniverse(df, asset_universe_dict)

    # generate plots
    plotReturnHistograms(df, HIST_OUT_FP)
    plotCumulativeReturns(df, CUM_RET_OUT_FP)
    plotRollingSharpe(SHARPE_OUT_FP)
    plotTransactionStats(df, TX_OUT_FP)
    plotHodlingStats(df, HODL_OUT_FP)
    plotRollingCorrelations(CORR_OUT_FP)
    formRiskReturnScatterPlot(df, RISK_RTRN_OUT_FP)

    # generate tables
    # Order matters: inflationStatistics opens OUT_FP in append mode, so the
    # workbook must already exist (presumably created by genSummaryStatistics --
    # TODO confirm).
    genSummaryStatistics(df, LHS_COL, OUT_FP)
    # genForkStatistics(df, OUT_FP)
    genCorrStatistics(df, OUT_FP)

    # report statistics
    reportICOStatistics(df)
    inflationStatistics(df, OUT_FP)
    
    # # TODO SCOPE IF RESULTS FOR ALL CHANGE MUCH AFTER A WINSOR
    # # (disabled sketch; the upper tail is clamped to p99 here -- the earlier
    # #  draft assigned p1 to it by mistake)
    # p1 = df[LHS_COL].quantile(0.01)
    # p99 = df[LHS_COL].quantile(0.99)
    # df.loc[df[LHS_COL] < p1, LHS_COL] = p1 
    # df.loc[df[LHS_COL] > p99, LHS_COL] = p99 


for nsdq pct 0.0, the sharpe is 0.327 and annual return is 0.2643
for nsdq pct 0.05, the sharpe is 0.355 and annual return is 0.2733
for nsdq pct 0.1, the sharpe is 0.382 and annual return is 0.2801
for nsdq pct 0.15, the sharpe is 0.409 and annual return is 0.2849
for nsdq pct 0.2, the sharpe is 0.436 and annual return is 0.2876
for nsdq pct 0.25, the sharpe is 0.462 and annual return is 0.2882
for nsdq pct 0.3, the sharpe is 0.488 and annual return is 0.2867
for nsdq pct 0.35, the sharpe is 0.514 and annual return is 0.2832
for nsdq pct 0.4, the sharpe is 0.538 and annual return is 0.2777
for nsdq pct 0.45, the sharpe is 0.561 and annual return is 0.2702
for nsdq pct 0.5, the sharpe is 0.582 and annual return is 0.2608
for nsdq pct 0.55, the sharpe is 0.601 and annual return is 0.2495
for nsdq pct 0.6, the sharpe is 0.617 and annual return is 0.2363
for nsdq pct 0.65, the sharpe is 0.628 and annual return is 0.2214
for nsdq pct 0.7, the sharpe is 0.632 and annual return is 0.2049
for

<Figure size 644x400 with 0 Axes>

<Figure size 644x400 with 0 Axes>

<Figure size 644x400 with 0 Axes>

<Figure size 644x400 with 0 Axes>