In [1]:
# Make the local quant_tools code importable: its directory is prepended to the
# module search path so that `from tools import QuantTools` below resolves.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run on another machine unless the path is adjusted.
import sys
sys.path.insert(0, '/home/adam/Dropbox/2-creations/2-crafts/7-buidl/0-utils/quant_tools/code')

# Import packages
from joblib import Parallel, delayed
from datetime import timedelta
import pandas_datareader as pdr
import matplotlib.pyplot as plt
from typing import Dict, List
from tools import QuantTools
from scipy.stats import norm
import scipy.stats as stats
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib
import datetime
import pickle
import random

# Module-level handle on the viridis colormap; the plotting functions below
# index into `viridis.colors` to pick line colours.
viridis = matplotlib.colormaps['viridis']

In [2]:
def importYahoo(ticker: str, start_date: str, end_date: str, rf_df: pd.DataFrame, new_ret_col: str) -> pd.DataFrame:
    """
    Download Yahoo Finance closing prices for one ticker and turn them into
    weekly returns in excess of the risk-free rate supplied in rf_df.

    Parameters:
    ticker (str): The ticker symbol to download.
    start_date (str): First date of the download window.
    end_date (str): Last date of the download window.
    rf_df (DataFrame): Risk-free rates; must provide 'date' and 'r_rf_tm7'.
    new_ret_col (str): Name given to the excess-return column in the result.

    Returns:
    DataFrame: Columns 'date' and `new_ret_col`.

    Note: prices are resampled to weekly ('W') frequency before returns are
    formed, so rf_df has to carry rates at matching dates; a different
    frequency would require changing the resample rule.
    """
    # download the raw price history
    history = yf.Ticker(ticker).history(period='1d', start=start_date, end=end_date)
    prices = history.reset_index()

    # normalise the timestamps to calendar days and keep only the close
    prices['Date'] = pd.to_datetime(prices['Date']).to_numpy(dtype='datetime64[D]')
    closes = prices[['Date', 'Close']].rename(columns={'Date': 'date', 'Close': new_ret_col})
    closes = closes.set_index('date')

    # last close of each week -> weekly simple return
    weekly = closes.resample('W').last()
    weekly = weekly.pct_change().dropna()

    # subtract the risk-free rate observed on the same date
    merged = weekly.merge(rf_df, on='date', how='inner', validate='one_to_one')
    merged[new_ret_col] = merged[new_ret_col] - merged['r_rf_tm7']

    # warn (but do not fail) when the excess returns contain gaps
    has_missing = merged[new_ret_col].isnull().values.any()
    if has_missing:
        print("Warning: NaN values found in return data")

    return merged[['date', new_ret_col]]


In [3]:
def subsetToAssetUniverse(df: pd.DataFrame, asset_universe_dict: Dict[str, List[str]]) -> pd.DataFrame:
    """
    Subset a DataFrame based on a dictionary of asset universes.

    For every key in `asset_universe_dict`, rows that fall in the key's
    year-month are kept only if their asset is in that key's list. Rows in
    year-months that have no key are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame. Must contain columns "date" and "asset".
        It is never modified; if "date" is not a datetime column, the
        conversion is applied to a copy.
    asset_universe_dict : Dict[str, List[str]]
        A dictionary where keys are dates in 'YYYY-MM-DD' format and values are lists of asset names.
        Only the year and month parts of each key are used.

    Returns
    -------
    pd.DataFrame
        The subsetted DataFrame, with "date" as a datetime column and the
        original index labels of the surviving rows.

    Raises
    ------
    ValueError
        If "date" or "asset" is missing from `df`.
    """
    # Check that the required columns are present in the DataFrame
    if not {'date', 'asset'}.issubset(df.columns):
        raise ValueError('Input DataFrame must contain "date" and "asset" columns.')

    # Ensure that the 'date' column is of datetime type; assign() returns a new
    # frame so the caller's DataFrame is not mutated as a side effect.
    if not pd.api.types.is_datetime64_any_dtype(df['date']):
        df = df.assign(date=pd.to_datetime(df['date']))

    # Extract the calendar parts once instead of once per dictionary key
    years = df['date'].dt.year.to_numpy()
    months = df['date'].dt.month.to_numpy()

    # Accumulate a single positional drop mask over all months
    drop_mask = np.zeros(len(df), dtype=bool)
    for key, assets in asset_universe_dict.items():
        # Extract the year and month from the key
        year, month = key.split('-')[:2]

        # Rows in this year-month whose asset is not in the month's universe
        in_period = (years == int(year)) & (months == int(month))
        drop_mask |= in_period & ~df['asset'].isin(assets).to_numpy()

    return df[~drop_mask]

In [4]:
def plotReturnHistograms(df: pd.DataFrame, out_fp: str):
    """
    Save a three-panel histogram of cmkt, btc and eth returns to `out_fp`.

    Each panel overlays a normal density fitted to the same data and rescaled
    so that its peak matches the tallest histogram bar.

    Parameters:
    df (pd.DataFrame): Panel with columns 'date', 'asset', 'char_r_tm7', 'macro_cmkt_tm7'.
    out_fp (str): Filepath the figure is written to.

    Returns:
    None
    """
    def _assetReturns(asset: str) -> pd.DataFrame:
        # one asset's return series, with the return column named after the asset
        subset = df[df.asset == asset][['date', 'char_r_tm7']]
        return subset.rename(columns={'char_r_tm7': asset})

    # per-series frames: btc, eth, and the per-date mean of the market return column
    btc_returns = _assetReturns('btc')
    eth_returns = _assetReturns('eth')
    market_returns = (df.groupby('date')[['macro_cmkt_tm7']]
                        .mean()
                        .reset_index()
                        .rename(columns={'macro_cmkt_tm7': 'cmkt'}))

    # align the three series on dates present in all of them
    merged = market_returns.merge(btc_returns, on='date', how='inner', validate='one_to_one')
    merged = merged.merge(eth_returns, on='date', how='inner', validate='one_to_one')

    # one shared-axis panel per series; colours are the leading palette entries
    fig, axs = plt.subplots(3, sharex=True, sharey=True, figsize=(6.4,4), facecolor='none')
    palette = plt.get_cmap('viridis')(np.linspace(0, 10))
    series_names = ['cmkt', 'btc', 'eth']

    for ax, name, panel_color in zip(axs, series_names, palette):
        values = merged[name]
        counts, _, _ = ax.hist(values, bins=30, color=panel_color, alpha=1)

        # fit a normal and rescale it so its peak equals the tallest bar
        mu, std = norm.fit(values)
        peak_ratio = counts.max() / norm.pdf(mu, mu, std).max()

        # draw the rescaled density across the panel's current x-range
        left, right = ax.get_xlim()
        grid = np.linspace(left, right, 100)
        ax.plot(grid, norm.pdf(grid, mu, std) * peak_ratio, 'k', linewidth=2)

        # drop the frame around the panel
        for spine in ax.spines.values():
            spine.set_visible(False)

    # tighten the layout, fix the x ticks, write the file, release the figure
    fig.tight_layout()
    plt.xticks(np.arange(-.5, 0.7, 0.1))
    plt.savefig(out_fp)
    plt.close(fig)

In [5]:
def plotCumulativeReturns(df: pd.DataFrame, out_fp: str) -> None:
    """
    Draw every asset's cumulative return (log scale) and save it to `out_fp`.

    btc, eth and the market series (cmkt) are drawn last, thicker and in
    colour, and are labelled on the chart; all other assets are thin gray lines.

    Args:
        df (pd.DataFrame): panel with 'date', 'asset', 'char_r_tm7' and 'macro_cmkt_tm7'
        out_fp (str): a relative filepath to save the figure to.

    Returns:
        None
    """
    # colours of the three highlighted series
    highlight_colors = {'btc': '#FDE725FF', 'eth': '#2D708EFF', 'cmkt': '#482677FF'}

    # wide frame of cumulative returns, one column per asset, built by outer merges
    cum_df = pd.DataFrame(data={'date': []})
    for asset in list(np.unique(df.asset.values)):
        asset_df = df[df.asset==asset][['date', 'char_r_tm7']]
        asset_df = asset_df.sort_values(by='date', ignore_index=True)
        asset_df[asset] = (1 + asset_df['char_r_tm7']).cumprod()
        cum_df = cum_df.merge(asset_df[['date', asset]], on='date', how='outer', validate='one_to_one')

    # the market return is read off the btc rows
    market_df = df[df.asset=='btc'][['date', 'macro_cmkt_tm7']]
    market_df = market_df.sort_values(by='date', ignore_index=True)
    market_df['cmkt'] = (1 + market_df['macro_cmkt_tm7']).cumprod()
    cum_df = cum_df.merge(market_df[['date', 'cmkt']], on='date', how='outer', validate='one_to_one')

    # chronological order, indexed by date
    plot_df = cum_df.sort_values(by='date', ignore_index=True).set_index('date')

    # start the figure
    plt.figure(figsize=(4*1.61, 4), facecolor='none')

    # draw order: everything else first, then eth, btc, cmkt on top
    draw_order = list(plot_df.columns)
    for name in ('btc', 'eth', 'cmkt'):
        draw_order.remove(name)
    draw_order = draw_order + ['eth', 'btc', 'cmkt']

    for name in draw_order:
        if name in highlight_colors:
            line_color, line_width = highlight_colors[name], 2
        else:
            line_color, line_width = 'gray', 0.5
        plt.plot(plot_df.index, plot_df[name], color=line_color, linewidth=line_width)

    # log y-axis without minor ticks, horizontal grid lines, no frame
    plt.yscale('log')
    plt.gca().yaxis.set_minor_locator(plt.NullLocator())
    plt.grid(visible=True, which='major', axis='y', linewidth=0.5)
    plt.box(False)

    # hand-placed labels for the highlighted series: (x, y, text)
    annotations = [
        (plot_df.index[-45], plot_df['cmkt'].iloc[-52]+5, 'cmkt'),
        (plot_df.index[-1], plot_df['btc'].iloc[-1]-0.1, 'btc'),
        (plot_df.index[-1], plot_df['eth'].iloc[-1]+0.3, 'eth'),
    ]
    for x_pos, y_pos, label in annotations:
        plt.text(x_pos, y_pos, label, color=highlight_colors[label], fontweight='bold',
                 verticalalignment='center', bbox=dict(facecolor='none', edgecolor='none'))

    # output
    plt.savefig(out_fp)
    plt.close()

In [6]:
def genSummaryStatistics(df: pd.DataFrame, lhs_col: str, out_fp: str) -> None:
    """
    Generates summary statistics for the panel and saves them to an Excel file.

    Three sheets are written to `out_fp`:
      - 'raw_ret_stats': annualized mean, SD, Sharpe, scaled skewness/kurtosis and
        share of positive weeks for the size-weighted market (CMKT), Bitcoin,
        Ethereum and the Nasdaq (downloaded via importYahoo).
      - 'raw_extreme_stats': count and share of weeks in which the CMKT return is
        beyond each of a fixed set of thresholds.
      - 'raw_yearly_stats': per-year (and overall) number of unique assets, median
        market cap, median weekly volume, total market cap on the last date of
        the year, and cumulative CMKT return.

    If `out_fp` already exists the sheets are replaced inside it and all other
    sheets are kept; otherwise a new workbook is created.

    :param df: DataFrame containing asset return data. Must contain 'date', 'asset',
        `lhs_col`, 'char_size_t', 'char_volume_sum_tm7', 'macro_snp500_t' and
        'macro_dgs1mo_t'.
    :param lhs_col: Column in df that contains the return data.
    :param out_fp: Output file path for the Excel file.
    """
    import os  # local import: only needed to check whether out_fp already exists

    # define function for calculating return statistics
    def calcReturnStats(temp_df: pd.DataFrame, asset: str, return_col: str) -> dict:
        mean_return = QuantTools.calcTSAvgReturn(temp_df[return_col].values, annualized=True, periods_in_year=52)
        std_dev = QuantTools.calcSD(temp_df[return_col].values, annualized=True, periods_in_year=52)
        sharpe_ratio = QuantTools.calcSharpe(temp_df[return_col].values, periods_in_year=52)
        # weekly skewness / excess kurtosis (scipy defaults) scaled by sqrt(52) and 52
        skewness = stats.skew(temp_df[return_col].values) / np.sqrt(52)
        kurtosis = stats.kurtosis(temp_df[return_col].values) / 52
        perc_return_above_zero = np.sum(temp_df[return_col]>0) / len(temp_df)
        
        return {'asset': asset,
            'Mean': mean_return,
            'SD': std_dev,
            'Sharpe': sharpe_ratio,
            'Skewness': skewness,
            'Kurtosis': kurtosis,
            'Pct pos': perc_return_above_zero}

    # drop to only necessary columns; the copy keeps the caller's frame untouched
    # when helper columns are added below
    df = df[['date', 'asset', lhs_col, 'char_size_t', 'char_volume_sum_tm7', 
        'macro_snp500_t', 'macro_dgs1mo_t']].copy()

    # form btc and eth returns
    btc_df = df[df.asset=='btc'].set_index('date')[[lhs_col]]
    eth_df = df[df.asset=='eth'].set_index('date')[[lhs_col]]

    # form cmkt return: size-weighted average of lhs_col on each date
    df['weighted_return'] = df[lhs_col] * df['char_size_t']
    total_market_cap = df.groupby('date')['char_size_t'].sum()
    cmkt_df = df.groupby('date')['weighted_return'].sum() / total_market_cap
    # the ratio of two differently named Series is unnamed, hence column label 0
    cmkt_df = pd.DataFrame(cmkt_df).rename(columns={0: 'return'})

    # import nasdaq data and take out risk free rate
    # (macro_dgs1mo_t is treated as an annual percent yield and converted to a weekly rate)
    rf_df = df[['date', 'macro_dgs1mo_t']].drop_duplicates()
    rf_df['r_rf_tm7'] = (1 + rf_df.macro_dgs1mo_t.values / 100) ** (1 / (365 / 7)) - 1
    rf_df = rf_df[['date', 'r_rf_tm7']]
    nsdq_df = importYahoo('^IXIC', '2017-12-29', '2022-12-31', rf_df, 'r_nsdq_tm7')    

    # calc return statistics
    cmkt_stats = calcReturnStats(cmkt_df, 'CMKT', 'return')
    btc_stats  = calcReturnStats(btc_df, 'Bitcoin', lhs_col)
    eth_stats  = calcReturnStats(eth_df, 'Ethereum', lhs_col)
    nsdq_stats = calcReturnStats(nsdq_df, 'Nasdaq', 'r_nsdq_tm7')
    ret_df = pd.DataFrame([cmkt_stats, btc_stats, eth_stats, nsdq_stats])
    
    # calc extreme event statistics: weeks below each negative threshold and
    # above each positive threshold
    ext_data = {'threshold': [], 'count': [], 'percent': []}
    num_obs  = len(cmkt_df)
    for threshold in [-.3, -.2, -.1, -.05, .05, .1, .2, .3]:
        ext_data['threshold'].append(threshold)
        if threshold < 0:
            count = (cmkt_df['return'] < threshold).sum()
        else:
            count = (cmkt_df['return'] > threshold).sum()
        ext_data['count'].append(count)
        ext_data['percent'].append(count / num_obs)
    ext_df = pd.DataFrame(ext_data)
    
    # calculate yearly stats of unique assets and median mcap and volume
    df['year'] = df['date'].dt.year
    yr_df = pd.DataFrame({
        'num_unique_assets': df.groupby(['year'])['asset'].nunique(),
        'median_market_cap': df.groupby(['year'])['char_size_t'].median(),
        'median_weekly_asset_volume': df.groupby(['year'])['char_volume_sum_tm7'].median()}).reset_index()
    all_df = pd.DataFrame({
        'num_unique_assets': [df['asset'].nunique()],
        'median_market_cap': [df['char_size_t'].median()],
        'median_weekly_asset_volume': [df['char_volume_sum_tm7'].median()]})
    all_df['year'] = 'all'
    yr_df = pd.concat([yr_df, all_df])

    # calculate the total mcap in the last week of each year
    max_dates = df.groupby('year')['date'].max()
    filtered_df = df[df['date'].isin(max_dates)]
    total_mcap_by_year = filtered_df.groupby('year')[['char_size_t']].sum().reset_index()
    yr_df = yr_df.merge(total_mcap_by_year, on='year', how='outer', validate='one_to_one')

    # extract yearly returns (cumulative cmkt return within each year, then overall)
    # NOTE(review): the year list is hard-coded; a year with no observations
    # raises IndexError on `.values[-1]`.
    cmkt_df = cmkt_df.reset_index()
    cmkt_df['year'] = cmkt_df.date.dt.year
    for year in [2018, 2019, 2020, 2021, 2022]:
        yr_df.loc[yr_df.year==year, 'cmkt_ret'] = ((cmkt_df[cmkt_df.year==year]['return']+1).cumprod()-1).values[-1]
    yr_df.loc[yr_df.year=='all', 'cmkt_ret'] = ((cmkt_df['return']+1).cumprod()-1).values[-1]
    
    # save results: append mode (which keeps the workbook's other sheets) needs an
    # existing file, and `if_sheet_exists` is only accepted in append mode, so
    # fall back to a plain write when the workbook has not been created yet
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer: 
        ret_df.to_excel(writer, sheet_name='raw_ret_stats')
        ext_df.to_excel(writer, sheet_name='raw_extreme_stats')
        yr_df.to_excel(writer, sheet_name='raw_yearly_stats')
        


In [7]:
def plotRollingSharpe(out_fp: str) -> None:
    """
    Plot rolling four year sharpe ratio with new data for the study period.

    Weekly excess returns are built for Bitcoin (from a local CSV) and for a set
    of Yahoo Finance tickers (via importYahoo), using the FRED 'DGS1MO' series as
    the risk-free rate. A 208-week rolling Sharpe ratio, annualized with
    sqrt(52), is computed per series, restricted to 2018-2022 and saved as a
    line chart.

    Args:
        out_fp (str): filepath the figure is saved to.

    Returns:
        None
    """
    # Obtain risk-free rate (annual percent yield -> weekly rate)
    rf_df = pdr.DataReader('DGS1MO', 'fred', '2013-12-31').reset_index()
    rf_df['r_rf_tm7'] = (1 + rf_df.DGS1MO.values / 100) ** (1 / (365 / 7)) - 1
    rf_df['date'] = pd.to_datetime(rf_df.DATE)
    rf_df = rf_df[['date', 'r_rf_tm7']].set_index('date')

    # expand to every calendar day and carry the last known rate forward;
    # assigning the filled column back (instead of a chained in-place fillna)
    # is what guarantees the fill actually lands in rf_df
    date_range = pd.date_range(start=rf_df.index.min(), end=rf_df.index.max(), freq='D')
    rf_df = rf_df.reindex(date_range)
    rf_df['r_rf_tm7'] = rf_df['r_rf_tm7'].ffill()
    rf_df = rf_df.reset_index().rename(columns={'index': 'date'})

    # import other asset class data (order here is the column/legend order)
    start_date = '2013-12-29'
    end_date   = '2022-12-31'
    yahoo_series = [('^IXIC', 'Nasdaq'),
                    ('^GSPC', 'SnP 500'),
                    ('VT', 'Global Stocks'),
                    ('BND', 'US Bonds'),
                    ('VNQ', 'US Real Estate'),
                    ('EBND', 'Emerging Currencies'),
                    ('GLD', 'Gold')]
    asset_dfs = [importYahoo(ticker, start_date, end_date, rf_df, label)
                 for ticker, label in yahoo_series]

    # import the btc data and form weekly excess returns
    # (column 0 is parsed as unix seconds; column 4 is used as the price)
    btc_df = pd.read_csv('../data/raw/XBTUSD_1440.csv',
                        header=None)
    btc_df[0] = pd.to_datetime(btc_df[0], unit='s')
    btc_df = btc_df[[0, 4]]
    btc_df = btc_df.rename(columns={0: 'date', 4: 'price'})
    btc_df = btc_df.set_index('date')
    btc_df = btc_df.resample('W').last()
    btc_df['Bitcoin'] = btc_df['price'].pct_change()
    btc_df = btc_df.merge(rf_df, on='date', how='inner', validate='one_to_one')
    btc_df['Bitcoin'] = btc_df['Bitcoin'] - btc_df.r_rf_tm7
    btc_df = btc_df.drop(['price', 'r_rf_tm7'], axis=1)

    # form single dataframe on the dates shared by every series
    df = btc_df
    for temp_df in asset_dfs:
        df = df.merge(temp_df, on='date', how='inner', validate='one_to_one')

    # form sharpe ratios: annualized rolling mean over rolling std, per column
    window_size = 208
    df = df.set_index('date')
    rolling = df.rolling(window_size)
    df = np.sqrt(52) * rolling.mean() / rolling.std()

    # subset to relevant time period
    df = df[(df.index.year >= 2018)
        & (df.index.year <= 2022)]

    # form sharpe ratio plot; the axes are created here and handed to pandas so
    # that the figsize applies and no empty extra figure is left open
    fig, ax = plt.subplots(figsize=(4*1.61, 4))
    df.plot(ax=ax, cmap='viridis')
    ax.set_frame_on(False)
    ax.grid(visible=True, which='major', axis='x', linewidth=0.5)
    ax.legend(labels=df.columns.values, 
                loc='lower center',
                ncol=3,
                bbox_to_anchor=(0.5, -0.37),
                frameon=False)
    ax.set_xlabel("")
    ax.tick_params(axis='both', which='both', bottom=False)
    plt.setp(ax.get_xticklabels(), rotation=0, ha='center')
    fig.savefig(out_fp, bbox_inches='tight')
    plt.close(fig)


In [8]:
def plotTransactionStats(df: pd.DataFrame, out_fp: str) -> None:
    """
    Plot monthly transaction statistics for Bitcoin.

    The monthly sum of 'macro_btc_tx_tfr_val_adj_usd_t' and the monthly median of
    'macro_btc_fee_med_usd_t' are drawn as two lines on a log y-axis and saved to
    `out_fp`.

    Args:
        df (pd.DataFrame): panel containing 'date' and the two macro columns above.
        out_fp (str): filepath the figure is saved to.

    Returns:
        None
    """
    # Extract relevant transaction data (the macro columns repeat across assets,
    # so duplicates are dropped)
    tx_df = df[['date', 'macro_btc_fee_med_usd_t', 'macro_btc_tx_tfr_val_adj_usd_t']].drop_duplicates().copy()

    # reformat the data to monthly frequency
    tx_df = tx_df.set_index('date')
    temp1_df = tx_df[['macro_btc_tx_tfr_val_adj_usd_t']].resample('M').sum()
    temp2_df = tx_df[['macro_btc_fee_med_usd_t']].resample('M').median()
    r_df = temp1_df.merge(temp2_df, how='inner', left_index=True, right_index=True, validate='one_to_one')
    r_df = r_df.rename(columns={'macro_btc_fee_med_usd_t': 'Median Fee (USD)',
                                'macro_btc_tx_tfr_val_adj_usd_t': 'Monthly Volume (USD)'})

    # index by 'YYYY-MM' strings so the x-axis is positional (one tick slot per month)
    r_df['date'] = r_df.index
    r_df['date'] = r_df.date.dt.strftime('%Y-%m')
    r_df = r_df.set_index('date')

    # plot the data; the axes are created here and handed to pandas so that the
    # figsize applies and no empty extra figure is left open
    fig, ax = plt.subplots(figsize=(4*1.61, 4))
    r_df.plot(ax=ax, color=[viridis.colors[0], viridis.colors[111]])
    ax.set_frame_on(False)
    ax.grid(visible=True, which='major', axis='x', linewidth=0.5)
    ax.legend(labels=r_df.columns.values, 
            loc='lower center',
            ncol=3,
            bbox_to_anchor=(0.5, -0.22),
            frameon=False)
    ax.set_yscale("log")
    ax.set_xlabel("")
    # NOTE(review): tick positions assume the first month is January 2018 and
    # that there is one row per month - confirm against the panel's date range
    ax.set_xticks(np.array([0,12,24,36,48]))
    ax.set_xticklabels(['2018', '2019', '2020', '2021', '2022'],
            rotation=0, ha='center')
    ax.tick_params(axis='both', which='both', bottom=False)
    fig.savefig(out_fp, bbox_inches='tight')
    plt.close(fig)


In [9]:
def plotHodlingStats(df: pd.DataFrame, out_fp: str) -> None:
    """
    Plot the 'macro_btc_utxo_age_med_t' series over time and save it to `out_fp`.

    Args:
        df (pd.DataFrame): panel containing 'date' and 'macro_btc_utxo_age_med_t'.
        out_fp (str): filepath the figure is saved to.

    Returns:
        None
    """
    # form the hodl data (the macro column repeats across assets, so duplicates
    # are dropped before indexing by date)
    utxo_df = df[['date', 'macro_btc_utxo_age_med_t']].copy()
    utxo_df = utxo_df.drop_duplicates()
    utxo_df = utxo_df.set_index('date')
    utxo_df = utxo_df.rename(columns={'macro_btc_utxo_age_med_t':
                                    'UTXO Median Age (Days)'})

    # Plot the hodl data; the axes are created here and handed to pandas so that
    # the figsize applies and no empty extra figure is left open
    fig, ax = plt.subplots(figsize=(4*1.61, 4))
    utxo_df.plot(ax=ax,
                color=[viridis.colors[0]],
                legend=None)
    ax.set_frame_on(False)
    ax.grid(visible=True, which='major', axis='x', linewidth=0.5)
    ax.set_xlabel("")
    ax.tick_params(axis='both', which='both', bottom=False)
    fig.savefig(out_fp, bbox_inches='tight')
    plt.close(fig)

In [10]:
def genForkStatistics(df: pd.DataFrame, out_fp: str) -> None:
    """
    Generate event studies for BTC fork dates.

    A daily BTC panel (price, volume, active addresses, developer activity,
    social volume, hash rate) is rebuilt from raw files and converted to daily
    percent changes. For each series the estimate is the average, across fork
    dates, of (mean over the `window_size` days after the fork) minus (mean over
    the `window_size` days before it). The standard error is the standard
    deviation of a bootstrap distribution: the same pre/post difference is
    computed around every date in the sample, and `num_forks` of those
    differences are repeatedly drawn with replacement and averaged.

    Results are written to sheet 'raw_forks' of `out_fp`; an existing workbook
    keeps its other sheets, a missing one is created.

    Args:
        df (pd.DataFrame): unused - it is immediately replaced by data read from
            '../data/raw/'; kept so existing callers do not break.
        out_fp (str): filepath of the Excel workbook to write to.

    Returns:
        None
    """
    import os  # local import: only needed to check whether out_fp already exists

    # PARAMETERS
    num_bs_samples = int(1e6)
    window_size    = 8
    bs_chunk_size  = int(1e5)   # bootstrap draws per numpy batch (bounds memory)

    # DEFINE RELEVANT FORKS
    forks = {'bitcoin-21': datetime.datetime(2016,4,17),
            'zcash': datetime.datetime(2016,10,28),
            'bitcoin-cash': datetime.datetime(2017,7,31),
            'bitcoin-gold': datetime.datetime(2017,10,24),
            'bitcoin-diamond': datetime.datetime(2017,11,24),
            'lightning-bitcoin': datetime.datetime(2017,12,18),
            'bitcoinfast': datetime.datetime(2017,12,26),
            'bitcoin2': datetime.datetime(2017,12,28),
            'bitcoin-plus': datetime.datetime(2018,1,2),
            'bitcoin-interest': datetime.datetime(2018,1,22),
            'bitcoin-atom': datetime.datetime(2018,1,24),
            'bitcoin-private': datetime.datetime(2018,2,28),
            'microbitcoin': datetime.datetime(2018,5,29),
            'bitcoin-bep2': datetime.datetime(2018,6,29),
            'bitcoin-sv': datetime.datetime(2018,11,11)}

    # PULL IN OLD DATA TO BUILD RELEVANT TIMESERIES FOR BTC
    df = pd.read_csv('../data/raw/cmc_price_vol_mcap_panel.csv')
    df = df[df.cmc_id==1]
    df = df[['date', 'usd_per_token', 'usd_volume_24h']]
    df['date'] = pd.to_datetime(df.date)
    df = df.drop_duplicates(subset='date')
    df = df.reset_index(drop=True)
    san_df = pd.read_pickle('../data/raw/santiment_panel.pkl')
    san_df = san_df[san_df.san_slug=='bitcoin'][['date', 'active_addresses_24h', 'github_activity',
                                                'social_volume_total']].reset_index(drop=True)
    hash_df = pd.read_excel('../data/raw/coinmetrics_btc_hashrate.xlsx')
    hash_df = hash_df.rename(columns={'Time': 'date',
                                    'BTC / Mean Hash Rate': 'hash_rate'})
    hash_df['date'] = pd.to_datetime(hash_df['date'])
    df = df.merge(san_df,
                on=['date'],
                how='left',
                validate='one_to_one')
    df = df.merge(hash_df,
                on='date',
                how='left',
                validate='one_to_one')
    del san_df, hash_df

    # CLEAN UP THE PANEL

    # ensure it has all days: outer-merge onto a full daily calendar and
    # interpolate the gaps this introduces
    sdate   = datetime.date(2015, 1, 1) 
    edate   = datetime.date(2021, 12, 31)  
    days    = [sdate+timedelta(days=i) for i in range((edate - sdate).days + 1)]
    days_df = pd.DataFrame(data={'date':days})
    df['date'] = df.date.dt.date
    df      = df.merge(days_df,
                    on='date',
                    how='outer',
                    validate='one_to_one')
    df = df.sort_values('date')
    df = df.interpolate()

    # form pct change in all columns and clean up improper values
    # (rows with any missing value are dropped, infinite changes are set to 0)
    cols = list(df.columns.values)
    cols.remove('date')
    for col in cols:
        df[col] = df[col].pct_change()
    df = df.dropna()
    df = df.replace([np.inf, -np.inf], 0)

    # subset to time period of interest; the filtered frame has to be assigned
    # back, otherwise rows after 2021 stay in the sample
    df['date'] = pd.to_datetime(df['date'])
    df = df[df.date.dt.year <= 2021]
    df = df.reset_index(drop=True)

    # rename columns
    df = df.rename(columns = {'usd_per_token': 'return',
                            'usd_volume_24h': 'usd_volume',
                            'active_addresses_24h': 'active_addresses',
                            'github_activity': 'developer_activity',
                            'social_volume_total': 'social_volume'})

    # INITIALIZE RESULTS 
    # NOTE(review): the label says '7 days' while window_size is 8 - confirm
    # which of the two is intended
    cols = list(df.columns.values)
    cols.remove('date')
    results_df = pd.DataFrame(data={'stat': cols,
                                    'window': np.repeat('7 days', len(cols)),
                                    'est': np.zeros(len(cols)),
                                    'se': np.zeros(len(cols))})

    def calcPrePostDiff(col, event_date):
        """ Mean of col over the window after event_date minus the mean over the window before. """
        window    = pd.Timedelta(window_size, unit="d")
        pre_date  = event_date-window
        post_date = event_date+window
        pre_mean  = np.mean(df[(df.date >= pre_date) & (df.date < event_date)][col])
        post_mean = np.mean(df[(df.date > event_date) & (df.date <= post_date)][col])
        return post_mean-pre_mean

    # CALC STAT
    for col in cols:
        diffs = [calcPrePostDiff(col, fork_date) for fork_date in forks.values()]
        results_df.loc[results_df.stat==col, 'est'] = np.mean(diffs)

    # CALC STANDARD ERROR
    num_forks = len(forks)
    rel_dates = list(df.date.values)[window_size:-window_size]
    for col in cols:
        # calc all diffs across the panel, using the same window length as the
        # estimate above so the placebo distribution is comparable to it
        # NOTE(review): this previously used a hard-coded 31-day window - confirm
        # the mismatch with window_size was not intentional
        diffs = np.asarray([calcPrePostDiff(col, date) for date in rel_dates])

        # calc bootstrap distribution: each row of idx is one bootstrap sample of
        # num_forks placebo diffs drawn with replacement; drawn in batches so the
        # index matrix stays small
        bs_means = []
        for start in range(0, num_bs_samples, bs_chunk_size):
            num_draws = min(bs_chunk_size, num_bs_samples - start)
            idx = np.random.randint(low=0, high=len(diffs), size=(num_draws, num_forks))
            bs_means.append(diffs[idx].mean(axis=1))

        # calc standard error and add it to results
        se = np.std(np.concatenate(bs_means))
        results_df.loc[results_df.stat==col, 'se'] = se

    # OUTPUT: append mode (which keeps the workbook's other sheets) needs an
    # existing file, and `if_sheet_exists` is only accepted in append mode, so
    # fall back to a plain write when the workbook has not been created yet
    if os.path.exists(out_fp):
        writer_kwargs = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        writer_kwargs = {'mode': 'w'}
    with pd.ExcelWriter(out_fp, engine='openpyxl', **writer_kwargs) as writer: 
        results_df.to_excel(writer, sheet_name='raw_forks')


In [11]:
# Driver: load the weekly panel, restrict it to the asset universe, then write
# every descriptive-statistics figure and table to ../output/desc_stats/.
if __name__ == "__main__":
    # set args
    # inputs
    PANEL_IN_FP     = '../data/clean/panel_weekly.pkl' 
    ASSET_IN_FP     = '../data/clean/asset_universe_dict.pickle'
    # figure outputs
    HIST_OUT_FP     = '../output/desc_stats/histograms.png'
    CUM_RET_OUT_FP  = '../output/desc_stats/cumulative_returns.png'
    SHARPE_OUT_FP   = '../output/desc_stats/sharpe.png'
    TX_OUT_FP       = '../output/desc_stats/btc_tx.png'
    HODL_OUT_FP     = '../output/desc_stats/hodl.png'
    # workbook shared by both table generators (each writes its own sheets)
    OUT_FP          = '../output/desc_stats/descriptive_statistics.xlsx'
    # NOTE(review): PERIODS_IN_YEAR, TS_AVG_METHOD and ANNUALIZED are not passed
    # to any call below - confirm whether they are still needed
    PERIODS_IN_YEAR = 52
    TS_AVG_METHOD   = 'arithmetic'
    # return column used for the summary tables
    LHS_COL         = 'r_ex_tp7'
    ANNUALIZED      = False

    # import
    # (pickle.load can execute arbitrary code - only load files you trust)
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    df = pd.read_pickle(PANEL_IN_FP)

    # drop rows that are not in the asset universe
    df = subsetToAssetUniverse(df, asset_universe_dict)

    # generate plots
    plotReturnHistograms(df, HIST_OUT_FP)
    plotCumulativeReturns(df, CUM_RET_OUT_FP)
    plotRollingSharpe(SHARPE_OUT_FP)
    plotTransactionStats(df, TX_OUT_FP)
    plotHodlingStats(df, HODL_OUT_FP)

    # generate tables
    genSummaryStatistics(df, LHS_COL, OUT_FP)
    genForkStatistics(df, OUT_FP)

    # # TODO SCOPE IF RESULTS FOR ALL CHANGE MUCH AFTER A WINSOR
    # # (sketch of a 1%/99% winsorization of LHS_COL; the upper tail must be
    # #  clipped to p99, not p1)
    # p1 = df[LHS_COL].quantile(0.01)
    # p99 = df[LHS_COL].quantile(0.99)
    # df.loc[df[LHS_COL] < p1, LHS_COL] = p1 
    # df.loc[df[LHS_COL] > p99, LHS_COL] = p99 


<Figure size 644x400 with 0 Axes>

<Figure size 644x400 with 0 Axes>

<Figure size 644x400 with 0 Axes>