In [None]:
# TODO UPDATE THIS OLD MESSY CODE WITH NEW PANEL

In [33]:
import pandas as pd
import numpy as np
from sklearn import decomposition
import statsmodels.formula.api as smf
pd.options.mode.chained_assignment = None

In [34]:
# helper function
def data_preprocessing_table2(df, factors, ast, var = ['week_idx'], index ='week_idx', max_index = 262):
    '''
    Data preprocessing of table 2:
        keep the specified asset, factors, other variables (index by default);
        calculate the percentage change of each factor;
        keep the data within a date range.

    The raw dataframe is not modified; all work happens on a copy.

    Keyword arguments:
        df -- raw panel dataframe; must contain an 'asset' column
        factors -- list of factor columns to inspect
        ast -- asset to explore
        var -- other columns to keep; must contain the index column (never mutated here)
        index -- index column
        max_index -- exclusive cutoff of the specified index
                     (week_idx of the last datapoint of 2020 is 262)

    Return value:
        df_ast -- preprocessed dataframe holding the var columns and the pct change columns
        factors_pct -- list of strings of the new columns that are pct change in factors
    '''

    # only keep the specified asset's rows (panel -> single timeseries) and the
    # requested columns; copy so new columns are not written into a slice of df
    colns = list(var) + list(factors)
    df_ast = df.loc[df['asset'] == ast, colns].copy()

    # calculate factor pct change (relative to the previous row of this asset)
    factors_pct = []
    for factor in factors:
        factor_pct = 'pct_' + factor
        # a change from a zero base is +/-inf; treat it as missing so that it is
        # removed by dropna() below instead of reaching the PCA / regressions
        df_ast[factor_pct] = df_ast[factor].pct_change().replace([np.inf, -np.inf], np.nan)
        factors_pct.append(factor_pct)

    # drop every row with a missing value (this includes the first row, whose
    # pct change is undefined)
    df_ast = df_ast.dropna()

    # drop original factor colns
    df_ast = df_ast.drop(columns=factors)

    # only keep data strictly before the cutoff (2016 to 2020 with the default)
    df_ast = df_ast[df_ast[index] < max_index]

    return df_ast, factors_pct

In [35]:
def generate_principal_component(df, var, ncomponents = 1, pc_col_name = 'pca'):

    '''
    Fit a PCA on the columns in var and add the principal component scores
    to the dataframe.

    Keyword arguments:
        df -- input dataframe (modified in place: the score column(s) are added)
        var -- list of variables to run the PCA on
        ncomponents -- number of components of PCA
        pc_col_name -- name of the PC column when a single component is produced;
                       with several components the columns are named
                       pc_col_name_1, pc_col_name_2, ...

    Return value:
        df -- the same dataframe object, now with the PC column(s)
    '''

    # pca
    pca = decomposition.PCA(n_components=ncomponents)
    scores = np.asarray(pca.fit_transform(df[var]))

    # add pc to dataframe; the scores come back as an (n_rows, n_components)
    # array, so each component has to be stored as its own 1-D column
    if scores.shape[1] == 1:
        df[pc_col_name] = scores[:, 0]
    else:
        for k in range(scores.shape[1]):
            df[pc_col_name + '_' + str(k + 1)] = scores[:, k]

    return df

In [36]:
def generate_table_2_panel_a(df, factors, ast, index = 'week_idx', max_index = 262):

    '''
    Generate Panel A of Table 2

    Keyword arguments:
        df -- raw dataframe
        factors -- list of factors to explore
        ast -- asset of interest
        index -- index column
        max_index -- cutoff of the specified index (week_idx of the last datapoint of 2020 is 262)

    Return value:
        df_corr -- correlation matrix of the pct change factors and their PC:
                   one row per pct change factor plus a final 'pca' row, one
                   column per pct change factor; entries below the diagonal of
                   the factor rows are set to NaN because they mirror the
                   entries above it
    '''

    # DATA PREPROCESSING
    # forward index / max_index so that non-default values are honoured
    df_ast, factors_pct = data_preprocessing_table2(df, factors, ast,
                                                    var=[index],
                                                    index=index,
                                                    max_index=max_index)

    # GENERATE PRINCIPAL COMPONENTS
    df_ast = generate_principal_component(df_ast, factors_pct)

    # GENERATE CORRELATION MATRIX
    # calculate correlation matrix without the index col; the 'pca' column is
    # dropped because the 'pca' row already holds those correlations
    df_corr = df_ast.drop(columns = index).corr().drop(columns = ['pca'])

    # drop redundant values in the matrix: blank the entries left of the
    # diagonal in every row except the first (nothing to blank) and the last
    # (the 'pca' row, which is kept in full)
    for r in range(1, len(df_corr.index) - 1):
        df_corr.iloc[r, :r] = np.nan

    return df_corr

In [37]:
def generate_table_2_panel_b(df, predictors, response, ast, index = 'week_idx', max_index = 262):
    '''
    Generate Panel B of Table 2

    For every pct change predictor, and for their principal component, a
    separate univariate OLS regression of the response on that regressor is run.

    Keyword arguments:
        df -- raw dataframe
        predictors -- list of predictors to inspect; not really predicting, it is same time period
        response -- response variable
        ast -- asset to inspect
        index -- index column
        max_index -- cutoff of the specified index (week_idx of the last datapoint of 2020 is 262)

    Return value:
        df_res -- regression results: column j (numbered from 1) belongs to the
                  j-th regression and holds the slope with significance stars
                  ("***" p<=0.01, "**" p<=0.05, "*" p<=0.1), the t-stat in
                  parentheses and the R^2; all other cells are NaN
    '''

    # DATA PREPROCESSING
    # forward index / max_index so that non-default values are honoured
    keep_cols = [index, response]
    df_ast, pct_predictors = data_preprocessing_table2(df, predictors, ast,
                                                       var=keep_cols,
                                                       index=index,
                                                       max_index=max_index)

    # GENERATE PRINCIPAL COMPONENTS
    df_ast = generate_principal_component(df_ast, pct_predictors)
    regressors = pct_predictors + ['pca']

    # GENERATE RESULT DF
    # rows: coef and t-stat per regressor, then a shared R^2 row
    res_index = []
    for name in regressors:
        res_index += [name + "_coef", name + "_t_stat"]
    res_index.append("R_squared")
    # columns: one per regression
    res_col = range(1, 1 + len(regressors))
    # object dtype because the cells mix formatted strings and floats
    df_res = pd.DataFrame(np.nan, index=res_index, columns=res_col, dtype=object)

    # REGRESSIONS
    # run regression for each regressor and fill in the panel
    for j, name in enumerate(regressors):
        fml = response + " ~ " + name
        results = smf.ols(formula = fml, data = df_ast).fit(cov_type = 'HC1')

        # TODO: CONFIRM THIS IS CORRECT STANDARD ERROR TO USE.

        # position 0 is the intercept, position 1 the slope; use .iloc because
        # the result Series are indexed by term name, not by integers
        coef  = results.params.iloc[1]
        tstat = results.tvalues.iloc[1]
        pval  = results.pvalues.iloc[1]

        # formatting
        if pval <= 0.01:
            stars = "***"
        elif pval <= 0.05:
            stars = "**"
        elif pval <= 0.1:
            stars = "*"
        else:
            stars = ""

        # fill results into df_res
        df_res.iloc[2 * j, j]     = str(coef) + stars
        df_res.iloc[2 * j + 1, j] = "(" + str(tstat) + ")"
        df_res.iloc[len(res_index) - 1, j] = results.rsquared

    return df_res

In [38]:
def generate_table_3(df, predictor, responses, ast, maxlag = 8, index = 'week_idx', max_index = 262):
    '''
    Generate Table 3

    For every response and every lead 1..maxlag, regress the pct change of the
    response `lead` rows ahead on the current value of the predictor.

    Keyword arguments:
        df -- raw dataframe
        predictor -- string of predictor variable is the return of the asset class
        responses -- list of LHS response variables, which are the covariates
        ast -- asset to inspect
        maxlag -- the maximum lag
        index -- index column
        max_index -- cutoff of the specified index (week_idx of the last datapoint of 2020 is 262)

    Return value:
        df_res -- regression results: five rows per response ('cmkt_coef',
                  'cmkt_t_stat', 'cons_coef', 'cons_t_stat', 'r_squared'),
                  identified by the 'response' and 'stat' columns, plus one
                  column per lead (named by the integers 1..maxlag).
                  Coefficients carry significance stars ("***" p<=0.01,
                  "**" p<=0.05, "*" p<=0.1); t-stats are in parentheses.
    '''

    def _starred(value, pval):
        # append significance stars to a coefficient
        if pval <= 0.01:
            return str(value) + "***"
        if pval <= 0.05:
            return str(value) + "**"
        if pval <= 0.1:
            return str(value) + "*"
        return str(value)

    lags = range(1, maxlag + 1)

    # DATA PREPROCESSING
    # Only keep the specified asset, the needed columns and the rows before the
    # cutoff. The cutoff is applied BEFORE the responses are shifted forward:
    # otherwise the last in-sample rows would be paired with responses observed
    # at or after max_index, i.e. with the data that is meant to stay unseen.
    in_sample = (df['asset'] == ast) & (df[index] < max_index)
    df_ast = df.loc[in_sample, [index, predictor] + list(responses)].copy()

    # create the shifted response variables
    for response in responses:
        pct = df_ast[response].pct_change()
        for shift in lags:
            df_ast['pct_' + response + 'plus' + str(shift)] = pct.shift(-shift)

    # Drop unshifted cols and rows with missing values (this removes the last
    # maxlag rows, whose leads are unknown), and reindex. All regressions
    # therefore share one common sample.
    df_ast = df_ast.drop(columns=responses).dropna().reset_index(drop=True)

    # RUN REGRESSIONS AND COLLECT THE CELLS OF THE RESULT DF
    stats_list = ['cmkt_coef', 'cmkt_t_stat',
                  'cons_coef', 'cons_t_stat',
                  'r_squared']
    responses_col = []
    stats_col     = []
    cells         = {shift: [] for shift in lags}
    for response in responses:
        responses_col += [response] * len(stats_list)
        stats_col     += stats_list
        for shift in lags:
            # fit regression
            fml = 'pct_' + response + 'plus' + str(shift) + " ~ " + predictor
            results = smf.ols(formula = fml, data = df_ast).fit(cov_type = 'HC1')

            # position 1 is the predictor ('cmkt'), position 0 the intercept
            # ('cons'); use .iloc because the result Series are indexed by
            # term name, not by integers. Order must match stats_list.
            cells[shift] += [
                _starred(results.params.iloc[1], results.pvalues.iloc[1]),
                "(" + str(results.tvalues.iloc[1]) + ")",
                _starred(results.params.iloc[0], results.pvalues.iloc[0]),
                "(" + str(results.tvalues.iloc[0]) + ")",
                results.rsquared,
            ]

    # ASSEMBLE RESULT DF
    df_res = pd.DataFrame(data={'response': responses_col,
                                'stat': stats_col})
    for shift in lags:
        # object dtype because the cells mix formatted strings and floats
        df_res[shift] = pd.Series(cells[shift], index=df_res.index, dtype=object)

    return df_res

In [39]:
# NOTE FOR FRANCESCO TO DELETE ONCE SCRIPT IS COMPLETE
# -identifying columns are date/week_idx + asset
# -any column that starts with "macro_" is a variable that is constant across assets WITHIN week
# -macro_mcap_t is the total market cap of crypto at that date
# -macro_mcap_ret_t is the return on the total market cap of crypto from last week to that date
# -r_tplus7 is the return for that asset over the SUBSEQUENT week
# -all columns that start with "covar_" are our covariates or RHS variables that we are interested in 
#  exploring if they have explanatory power for the cross-section of returns, i.e. r_tplus7

# -we do not want to look at 2021 data because later in this project we use it as
#  out-of-sample data to check the performance of models; so we don't want to learn
#  about relations in that data

In [40]:
if __name__ == '__main__':
    # READ IN THE DATA
    df = pd.read_csv('liu_panel.csv')

    # FORM GROUPS OF FEATURES

    # identifier / macro / target columns that are never candidate features
    non_feature_cols = ['Unnamed: 0',
                        'week_idx',
                        'date',
                        'asset',
                        'macro_mcap_t',
                        'macro_mcap_ret_t',
                        'r_tplus7']
    # substrings marking columns we are not interested in
    excluded_substrings = ['covar_mcap', 'macro_med']

    # Build the candidate list in a single pass. (Calling list.remove() while
    # looping over the same list skips the element after each removal, so some
    # unwanted columns would survive.)
    all_cols = [col for col in df.columns
                if col not in non_feature_cols
                and not any(sub in col for sub in excluded_substrings)]

    # Feature groups: a column belongs to a group when its name contains one of
    # the group's substrings. Groups are tried in the order listed and a column
    # is assigned to the FIRST group that matches, which keeps the groups
    # mutually exclusive.
    # NOTE(review): the order decides ties (e.g. 'dormant_circulation' vs
    # 'circulation', '_ex_volume_' vs '_volume_') and short substrings such as
    # 'age_' may match more than intended -- check the overlap report below.
    feature_group_patterns = [
        ('token_age_feats',      ['age_', 'dormant_circulation']),
        ('token_activity_feats', ['payments_', 'active_addresses', '_transaction_',
                                  'velocity', 'circulation']),
        ('dev_feats',            ['github', '_dev_activ_']),
        ('rank_feats',           ['_alexa_rank_', '_rank_']),
        ('sentiment_feats',      ['_sentiment_', '_san_sent_']),
        ('social_feats',         ['_social_', '_twitter_']),
        ('price_feats',          ['price_', '_p_']),
        ('return_feats',         ['kurt', '_r_', '_vol_']),
        ('volume_feats',         ['_volume_']),
        ('rv_feats',             ['_realized_value_', 'mvrv_', 'nvt_']),
        ('exchange_feats',       ['num_market_pairs', '_active_cryptos_', '_active_ex_',
                                  '_ex_pairs_', '_ex_volume_']),
        ('defi_feats',           ['_cex_to_dex_flow_', '_dex_to_defi_', '_ex_to_defi_flow_',
                                  '_traders_to_defi_', '_whale_defi_', '_mcd_collat_ratio_',
                                  '_whale_to_defi_']),
        ('eth_feats',            ['_eth_']),
        ('btc_feats',            ['_btc_']),
        ('supply_feats',         ['_supply_']),
        ('fed_feats',            ['_fed_']),
        ('nft_feats',            ['_nft_']),
        ('stf_feats',            ['stock_to_flow_']),
        ('usdt_feats',           ['_usdt_']),
    ]

    feature_groups = {name: [] for name, _ in feature_group_patterns}
    ungrouped_cols = []
    overlapping_cols = []
    for col in all_cols:
        matches = [name for name, patterns in feature_group_patterns
                   if any(pattern in col for pattern in patterns)]
        if not matches:
            ungrouped_cols.append(col)
            continue
        if len(matches) > 1:
            overlapping_cols.append((col, matches))
        feature_groups[matches[0]].append(col)

    # report coverage so the grouping can be checked by eye
    if ungrouped_cols:
        print('columns not covered by any feature group:', ungrouped_cols)
    if overlapping_cols:
        print('columns matching several groups (first group used):', overlapping_cols)

    # GENERATE THE TABLES FOR EVERY FEATURE GROUP AND ASSET
    # the group name is part of the file name so that groups do not overwrite
    # each other's output
    assets = [('bitcoin', 'btc'), ('ethereum', 'eth')]
    for group_name, factors in feature_groups.items():
        if not factors:
            # no column matched this group; nothing to analyse
            continue
        for ast, tag in assets:
            prefix = './' + tag + '_' + group_name
            generate_table_2_panel_a(df, factors, ast).to_csv(prefix + '_t2pa.csv')
            generate_table_2_panel_b(df, factors, 'macro_mcap_ret_t', ast).to_csv(prefix + '_t2pb.csv')
            generate_table_3(df, 'macro_mcap_ret_t', factors, ast).to_csv(prefix + '_t3.csv')