In [1]:
from typing import Dict, List, Tuple
from joblib import Parallel, delayed
import statsmodels.api as sm
import pandas as pd
import numpy as np
import scipy.stats
import pickle
import gc


In [2]:
def processMonth(panel_df: pd.DataFrame, date_key: str, asset_list: List[str]) -> pd.DataFrame:
    """ Process a single month's data and return the resulting DataFrame.
    """
    # Convert the date_key string to a datetime object to work with pandas
    date_key_dt = pd.to_datetime(date_key)

    # Create a date mask for the month
    date_mask = (panel_df['date'].dt.year == date_key_dt.year) & (panel_df['date'].dt.month == date_key_dt.month)

    # Subset the panel_df DataFrame based on the date mask and the asset list
    subset = panel_df[date_mask & panel_df['asset'].isin(asset_list)]

    return subset

def subsetByMonthToAssetUniverse(panel_df: pd.DataFrame, asset_universe_dict: Dict[str, List[str]], n_jobs=-1) -> pd.DataFrame:
    """
    Subset the panel data to the assets in each month (key) of asset_universe_dict.

    Args:
        panel_df: A Pandas DataFrame containing panel data at the asset-hour level
                    with ID columns 'date' and 'asset'.
        asset_universe_dict: A dictionary with keys as dates in the format YYYY-MM-DD 
                                and values as lists of asset strings.
        n_jobs: Number of CPU cores to use for parallelization. Default is -1, which means using all available cores.
    
    Returns: A Pandas DataFrame without the rows not included in the study.
    """
    # Run the process_month function in parallel using joblib
    results = Parallel(n_jobs=n_jobs)(delayed(processMonth)(panel_df, date_key, asset_list) for date_key, asset_list in asset_universe_dict.items())

    # Combine the results into a single DataFrame
    new_df = pd.concat(results, ignore_index=True)

    return new_df

def subsetToWeeklyFreq(df: pd.DataFrame) -> pd.DataFrame:
    """ Takes DataFrame with datetime column "date" to subset
        it to observations on Sunday at midnight. """
    return df[(df.date.dt.day_name() == 'Sunday') 
            & (df.date.dt.time == pd.Timestamp('00:00:00').time())]

def calcReturn(x: pd.Series) -> float:
    return (x.iloc[-1] - x.iloc[0]) / x.iloc[0]
            
def setMissingIfIncomplete(panel_df: pd.DataFrame, return_col: str, hours_to_check: int) -> pd.DataFrame:
    """
    Set the specified return column to missing (np.nan) if the DataFrame is missing any of the previous hours
    specified by hours_to_check for each asset.

    Args:
        panel_df (pd.DataFrame): The input DataFrame
        return_col (str): The name of the column to set to missing (np.nan) if any of the previous hours are missing
        hours_to_check (int): The number of previous hours to check for.

    Returns: The DataFrame with the return column set to missing if any of the previous hours are missing
    """
    # Shift the date column by the specified hours_to_check
    panel_df['prev_date'] = panel_df.groupby('asset')['date'].shift(hours_to_check)
    
    # Calculate the rolling sum of hour differences over a window of size hours_to_check
    panel_df['hours_present'] = panel_df.groupby('asset')['date'].transform(
        lambda x: x.diff().dt.total_seconds().rolling(window=hours_to_check).sum() / 3600
    )

    # Set the return column value to missing (None) if the total number of hours present is not equal to hours_to_check
    panel_df.loc[panel_df['hours_present'] != hours_to_check, return_col] = np.nan
    
    # Drop temporary columns
    panel_df.drop(columns=['prev_date', 'hours_present'], inplace=True)

    return panel_df

def process_asset(asset_data: pd.DataFrame, target_col: str, new_col: str, range_hours: int, func) -> pd.DataFrame:
    """ Process a single asset's data and return the resulting DataFrame.
    """
    # Apply the function to each asset's previous values and store the result in a new Series
    # note: we add one to range hours so we get the appropriate window
    asset_data[new_col] = asset_data[target_col].rolling(range_hours+1).apply(func)

    # Reset missing values
    asset_data = setMissingIfIncomplete(asset_data, new_col, range_hours)

    return asset_data[['date', 'asset', new_col]]

def formNewColumnByAsset(panel_df: pd.DataFrame, target_col: str, new_col: str, range_hours: int, func, n_jobs=-1) -> pd.DataFrame:
    """ Adds a new column to a Pandas DataFrame containing panel data at the asset-hour level.
    The new column is calculated by applying a function to a range of previous values for each asset.
    Any values that do not have the previous range_hours are reset to missing (np.nan).

    Args:
        panel_df: Pandas DataFrame containing the panel data.
        target_col: Name of the column to apply the given function to.
        new_col: Name of the new column to add.
        range_hours: Number of previous hours to consider for each asset.
        func: Function to apply to the range of values for each asset.
        n_jobs: Number of CPU cores to use for parallelization. Default is -1, which means using all available cores.
    
    Returns: A new DataFrame with the columns "date", "asset", and new_col for each asset.
    """
    # Group the DataFrame by asset
    grouped = panel_df.groupby('asset')
    
    # Run the process_asset function in parallel using joblib
    results = Parallel(n_jobs=n_jobs)(delayed(process_asset)(asset_data, target_col, new_col, range_hours, func) for _, asset_data in grouped)

    # Combine the results into a single DataFrame
    result_df = pd.concat(results, ignore_index=True)

    return result_df


In [3]:
def formStaticCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    # set columns to keep
    static_cols = ['char_industry_asset_mgmt',
        'char_industry_cex',
        'char_industry_cloud_compute',
        'char_industry_currency',
        'char_industry_data_mgmt',
        'char_industry_dex',
        'char_industry_gaming',
        'char_industry_infra',
        'char_industry_interop',
        'char_industry_lending',
        'char_industry_media',
        'char_industry_other_defi',
        'char_industry_smart_contract',
        'char_asset_usage_access',
        'char_asset_usage_discount',
        'char_asset_usage_dividends',
        'char_asset_usage_payments',
        'char_asset_usage_vote',
        'char_asset_usage_work',
        'char_pow',
        'char_pos',
        'char_ico_price',
        'char_ico']
    
    # form column subset
    static_df = panel_df[['date', 'asset']+static_cols]

    return static_df


In [4]:
def formDescStatCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    # identify cols
    cols = ['char_price_global_t', 'char_volume_24h_global_t', 
            'char_ico_days_since_t', 'char_vc_t',
            'char_rank_cmc_t', 'char_num_pairs_t']

    # subset to cols
    desc_stat_df = panel_df[['date', 'asset']+cols]

    return desc_stat_df


In [5]:
def formMomentumCols(panel_df: pd.DataFrame, static_df: pd.DataFrame) -> pd.DataFrame:
    # Form relevant data
    temp_df = panel_df[['date', 'asset', 'char_price_t', 'char_mcap_t']].copy()

    # Form momentums 
    mom1h_df    = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm1h', range_hours=1, func=calcReturn)
    mom2h_df    = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm2h', range_hours=2, func=calcReturn)
    mom6h_df    = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm6h', range_hours=6, func=calcReturn)
    mom12h_df   = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm12h', range_hours=12, func=calcReturn)
    mom1_df     = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm1', range_hours=24, func=calcReturn)
    mom7_df     = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm7', range_hours=168, func=calcReturn)
    mom14_df    = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm14', range_hours=336, func=calcReturn)
    mom30_14_df = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm30_tm14', range_hours=384, func=calcReturn)
    mom30_df    = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm30', range_hours=720, func=calcReturn)
    mom60_df    = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm60', range_hours=1440, func=calcReturn)
    mom90_df    = formNewColumnByAsset(temp_df, target_col='char_price_t', new_col='char_r_tm90', range_hours=2160, func=calcReturn)
    del temp_df

    # Prep industry column for forming industry momentums
    industry_cols = [col for col in static_df.columns if 'industry' in col]
    temp_df = static_df[['date', 'asset']+industry_cols].copy()
    def collapse_industries(df):
        # Extract the columns containing industry information
        industry_columns = [col for col in df.columns if col.startswith('char_industry_')]

        # Verify that each date-asset combination has only one industry with a value of 1
        for _, row in df[industry_columns].sum(axis=1).iteritems():
            assert row == 1, f"Error: {row.name[0]} and asset {row.name[1]} has {row} industries with a value of 1."
        
        # Create a new column with the industry name, removing the 'char_industry_' prefix
        df['industry'] = df[industry_columns].idxmax(axis=1).str.replace('char_industry_', '')
        
        # Drop the original industry columns
        df_result = df.drop(columns=industry_columns)
        
        return df_result

    industry_df = collapse_industries(temp_df)

    assert not industry_df.duplicated(subset=['date', 'asset']).any()

    # Form industry momentums
    temp_df  = panel_df[['date', 'asset', 'char_mcap_t']].copy()
    mom6h_df = mom6h_df.dropna()
    mom6h_df = mom6h_df.merge(industry_df, on=['date', 'asset'], how='inner', validate='one_to_one')
    mom6h_df = mom6h_df.merge(temp_df[['date', 'asset', 'char_mcap_t']], on=['date', 'asset'], how='inner', validate='one_to_one')
    weighted_averages = mom6h_df.groupby(['date', 'industry']).apply(lambda x: (x['char_r_tm6h'] * x['char_mcap_t']).sum() / x['char_mcap_t'].sum())
    weighted_averages = weighted_averages.reset_index(name='char_r_industry_tm6h')
    mom6h_df = mom6h_df.merge(weighted_averages, on=['date', 'industry'])
    mom6h_df = mom6h_df.drop(columns=['industry', 'char_mcap_t'])
    mom30_df = mom30_df.dropna()
    mom30_df = mom30_df.merge(industry_df, on=['date', 'asset'], how='inner', validate='one_to_one')
    mom30_df = mom30_df.merge(temp_df[['date', 'asset', 'char_mcap_t']], on=['date', 'asset'], how='inner', validate='one_to_one')
    weighted_averages = mom30_df.groupby(['date', 'industry']).apply(lambda x: (x['char_r_tm30'] * x['char_mcap_t']).sum() / x['char_mcap_t'].sum())
    weighted_averages = weighted_averages.reset_index(name='char_r_industry_tm30')
    mom30_df = mom30_df.merge(weighted_averages, on=['date', 'industry'])
    mom30_df = mom30_df.drop(columns=['industry', 'char_mcap_t'])
    mom60_df = mom60_df.dropna()
    mom60_df = mom60_df.merge(industry_df, on=['date', 'asset'], how='inner', validate='one_to_one')
    mom60_df = mom60_df.merge(temp_df[['date', 'asset', 'char_mcap_t']], on=['date', 'asset'], how='inner', validate='one_to_one')
    weighted_averages = mom60_df.groupby(['date', 'industry']).apply(lambda x: (x['char_r_tm60'] * x['char_mcap_t']).sum() / x['char_mcap_t'].sum())
    weighted_averages = weighted_averages.reset_index(name='char_r_industry_tm60')
    mom60_df = mom60_df.merge(weighted_averages, on=['date', 'industry'])
    mom60_df = mom60_df.drop(columns=['industry', 'char_mcap_t'])
    del temp_df

    # Form reversals
    mom7_df['char_r_tm14_tm7'] = mom7_df.groupby('asset')['char_r_tm7'].shift(168)
    mom30_14_df['char_r_tm30_tm14'] = mom30_14_df.groupby('asset')['char_r_tm30_tm14'].shift(336)
    mom60_df['char_r_tm90_tm30'] = mom60_df.groupby('asset')['char_r_tm60'].shift(720)

    # Form single momentum df
    mom_df = mom1h_df.copy()
    for df in [mom2h_df, mom6h_df, mom12h_df, mom1_df, mom7_df, mom14_df, mom30_df, mom60_df, mom90_df, mom30_14_df]:
        mom_df = mom_df.merge(df, on=['date', 'asset'], how='inner', validate='one_to_one')

    # Take out risk free rate from all returns
    temp_df = panel_df[['date', 'macro_dgs1mo_t']].drop_duplicates().copy()
    temp_df['r_rf_tm1h'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365*24))-1
    temp_df['r_rf_tm2h'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365*12))-1
    temp_df['r_rf_tm6h'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365*4))-1
    temp_df['r_rf_tm12h'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365*2))-1
    temp_df['r_rf_tm1'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365))-1
    temp_df['r_rf_tm7'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365/7))-1
    temp_df['r_rf_tm14'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365/14))-1
    temp_df['r_rf_tm16'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365/16))-1
    temp_df['r_rf_tm30'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365/30))-1
    temp_df['r_rf_tm60'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365/60))-1
    temp_df['r_rf_tm90'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365/90))-1
    temp_df = temp_df.drop('macro_dgs1mo_t', axis=1)
    temp_df['r_rf_tm1h'] = temp_df['r_rf_tm1h'].shift(1)
    temp_df['r_rf_tm2h'] = temp_df['r_rf_tm2h'].shift(2)
    temp_df['r_rf_tm6h'] = temp_df['r_rf_tm6h'].shift(6)
    temp_df['r_rf_tm12h'] = temp_df['r_rf_tm12h'].shift(12)
    temp_df['r_rf_tm1'] = temp_df['r_rf_tm1'].shift(24)
    temp_df['r_rf_tm7'] = temp_df['r_rf_tm7'].shift(24*7)
    temp_df['r_rf_tm14'] = temp_df['r_rf_tm14'].shift(24*14)
    temp_df['r_rf_tm16'] = temp_df['r_rf_tm16'].shift(24*16) 
    temp_df['r_rf_tm30'] = temp_df['r_rf_tm30'].shift(24*30)
    temp_df['r_rf_tm60'] = temp_df['r_rf_tm60'].shift(24*60) 
    temp_df['r_rf_tm90'] = temp_df['r_rf_tm90'].shift(24*90) 
    temp_df['r_rf_tm14_tm7'] = temp_df['r_rf_tm7'].shift(24*7) 
    temp_df['r_rf_tm30_tm14'] = temp_df['r_rf_tm16'].shift(24*14) 
    temp_df = temp_df.drop('r_rf_tm16', axis=1)
    temp_df['r_rf_tm90_tm30'] = temp_df['r_rf_tm60'].shift(24*30)
    mom_df = mom_df.merge(temp_df, on=['date'], how='inner', validate='many_to_one')
    mom_df['char_r_tm1h']          = mom_df['char_r_tm1h'] - mom_df['r_rf_tm1h']
    mom_df['char_r_tm2h']          = mom_df['char_r_tm2h'] - mom_df['r_rf_tm2h']
    mom_df['char_r_tm6h']          = mom_df['char_r_tm6h'] - mom_df['r_rf_tm6h']
    mom_df['char_r_industry_tm6h'] = mom_df['char_r_industry_tm6h'] - mom_df['r_rf_tm6h']
    mom_df['char_r_tm12h']         = mom_df['char_r_tm12h'] - mom_df['r_rf_tm12h']
    mom_df['char_r_tm1']           = mom_df['char_r_tm1'] - mom_df['r_rf_tm1']
    mom_df['char_r_tm7']           = mom_df['char_r_tm7'] - mom_df['r_rf_tm7']
    mom_df['char_r_tm14_tm7']      = mom_df['char_r_tm14_tm7'] - mom_df['r_rf_tm14_tm7']
    mom_df['char_r_tm14']          = mom_df['char_r_tm14'] - mom_df['r_rf_tm14']
    mom_df['char_r_tm30']          = mom_df['char_r_tm30'] - mom_df['r_rf_tm30']
    mom_df['char_r_industry_tm30'] = mom_df['char_r_industry_tm30'] - mom_df['r_rf_tm30']
    mom_df['char_r_tm60']          = mom_df['char_r_tm60'] - mom_df['r_rf_tm60']
    mom_df['char_r_industry_tm60'] = mom_df['char_r_industry_tm60'] - mom_df['r_rf_tm60']
    mom_df['char_r_tm90_tm30']     = mom_df['char_r_tm90_tm30'] - mom_df['r_rf_tm90_tm30']
    mom_df['char_r_tm90']          = mom_df['char_r_tm90'] - mom_df['r_rf_tm90']
    mom_df['char_r_tm30_tm14']     = mom_df['char_r_tm30_tm14'] - mom_df['r_rf_tm30_tm14']
    rf_cols = [col for col in mom_df.columns if '_rf_' in col]
    mom_df = mom_df.drop(rf_cols, axis=1)

    # Fix outliers
    ret_cols = [col for col in mom_df.columns if col not in ['date', 'asset']]
    for col in ret_cols:
        mom_df.loc[mom_df[col] < -.95, col] = -0.95
        mom_df.loc[mom_df[col] > 100, col] = 100

    return mom_df


In [6]:
def formCmktCol(panel_df: pd.DataFrame, asset_universe_dict: Dict[str, List[str]]) -> pd.DataFrame:
    # form mom cols, subset to relevant assets, and merge together for temp data to use to form cmkt return
    mom7_df = formNewColumnByAsset(panel_df[['date', 'asset', 'char_price_t']], target_col='char_price_t', new_col='char_r_tm7', range_hours=168, func=calcReturn)
    mom1h_df = formNewColumnByAsset(panel_df[['date', 'asset', 'char_price_t']], target_col='char_price_t', new_col='char_r_tm1h', range_hours=1, func=calcReturn)
    rel_assets_dt_df = subsetByMonthToAssetUniverse(panel_df[['date', 'asset', 'char_mcap_t', 'char_price_t']], 
                                                    asset_universe_dict)
    rel_assets_dt_df = rel_assets_dt_df.drop('char_price_t', axis=1)
    temp_df = rel_assets_dt_df.merge(mom7_df, on=['date', 'asset'], how='inner', validate='one_to_one')
    temp_1h_df = rel_assets_dt_df.merge(mom1h_df, on=['date', 'asset'], how='inner', validate='one_to_one')
    temp_1h_df.loc[temp_1h_df['char_r_tm1h'] < -.95, 'char_r_tm1h'] = -0.95
    temp_1h_df.loc[temp_1h_df['char_r_tm1h'] > 100, 'char_r_tm1h'] = 100
    temp_df.loc[temp_df['char_r_tm7'] < -.95, 'char_r_tm7'] = -0.95
    temp_df.loc[temp_df['char_r_tm7'] > 100, 'char_r_tm7'] = 100

    # form cmkt-weighted average return by week and by day
    cmkt_df = temp_df.groupby('date').apply(lambda x: (x['char_r_tm7']*x['char_mcap_t']).sum() / x['char_mcap_t'].sum())
    cmkt_1h_df = temp_1h_df.groupby('date').apply(lambda x: (x['char_r_tm1h']*x['char_mcap_t']).sum() / x['char_mcap_t'].sum())

    # clean it up
    cmkt_df = pd.DataFrame(cmkt_df)
    cmkt_df.columns = ['macro_cmkt_tm7']
    cmkt_df = cmkt_df.reset_index()
    cmkt_1h_df = pd.DataFrame(cmkt_1h_df)
    cmkt_1h_df.columns = ['macro_cmkt_tm1h']
    cmkt_1h_df = cmkt_1h_df.reset_index()

    # Take out risk free rate
    temp_df = panel_df[['date', 'macro_dgs1mo_t']].drop_duplicates().copy()
    temp_df['r_rf_tm1h'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365*24))-1
    temp_df['r_rf_tm7'] = (1+temp_df.macro_dgs1mo_t.values/100)**(1/(365/7))-1
    temp_df = temp_df.drop('macro_dgs1mo_t', axis=1)
    temp_df['r_rf_tm1h'] = temp_df['r_rf_tm1h'].shift(1)
    temp_df['r_rf_tm7'] = temp_df['r_rf_tm7'].shift(24*7)
    cmkt_df = cmkt_df.merge(temp_df[['date', 'r_rf_tm7']], on=['date'], how='inner', validate='one_to_one')
    cmkt_1h_df = cmkt_1h_df.merge(temp_df[['date', 'r_rf_tm1h']], on=['date'], how='inner', validate='one_to_one')
    cmkt_df['macro_cmkt_tm7'] = cmkt_df['macro_cmkt_tm7'] - cmkt_df['r_rf_tm7']
    cmkt_df = cmkt_df.drop('r_rf_tm7', axis=1)
    cmkt_1h_df['macro_cmkt_tm1h'] = cmkt_1h_df['macro_cmkt_tm1h'] - cmkt_1h_df['r_rf_tm1h']
    cmkt_1h_df = cmkt_1h_df.drop('r_rf_tm1h', axis=1)

    # Fix outliers
    cmkt_1h_df.loc[cmkt_1h_df.macro_cmkt_tm1h>0.5, 'macro_cmkt_tm1h'] = 0.1
    cmkt_df.loc[cmkt_df.macro_cmkt_tm7>0.65, 'macro_cmkt_tm7'] = 0.65

    return cmkt_df, cmkt_1h_df


In [7]:
def formCumRetCols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds two new columns to the input DataFrame containing panel data at the asset-hour level.
    The new columns are 'char_r_ath_t' and 'char_r_atl_t', representing the cumulative return since
    each asset's historical all-time high price and all-time low price, respectively.

    Args:
        df (pd.DataFrame): The input DataFrame containing panel data at the asset-hour level.

    Returns:
        pd.DataFrame: The modified DataFrame with the new columns added.
    """
    # Form group mask
    grouped = df.groupby('asset')

    # Calculate the cumulative maximum for the 'char_price_t' column within each group
    df['cummax_price'] = grouped['char_price_t'].cummax()

    # Calculate the return since the all-time high price
    df['char_r_ath_t'] = df['char_price_t'] / df['cummax_price'] - 1

    # Calculate the cumulative minimum for the 'char_price_t' column within each group
    df['cummin_price'] = grouped['char_price_t'].cummin()

    # Calculate the return since the all-time low price
    df['char_r_atl_t'] = df['char_price_t'] / df['cummin_price'] - 1

    # Drop the temporary 'cummax_price' and 'cummin_price' columns
    df.drop(columns=['cummax_price', 'cummin_price'], inplace=True)

    return df

def perform_regression(group, window_size, include_squared_term, lhs_col, rhs_col):
    group = group.dropna(subset=[lhs_col, rhs_col]).reset_index(drop=True).copy()

    group['intercept'] = np.nan
    group['slope'] = np.nan
    group['residuals'] = np.nan

    if include_squared_term:
        group['coskew'] = np.nan
        group[f"{rhs_col}_sq"] = group[rhs_col]**2

    for i in range(window_size, len(group)):
        X = group.iloc[i-window_size:i][rhs_col].reset_index(drop=True)

        if include_squared_term:
            X = pd.concat([X, X**2], axis=1)
            X.columns = [rhs_col, f'{rhs_col}_sq']

        y = group.iloc[i-window_size:i][lhs_col].reset_index(drop=True)

        X = pd.concat([pd.DataFrame(data={'const': np.ones(window_size)}), pd.DataFrame(X)], axis=1) 
        model = sm.OLS(y, X).fit()

        group.loc[group.index[i], 'intercept'] = model.params['const']
        group.loc[group.index[i], 'slope'] = model.params[rhs_col]

        constant = np.array([1])
        selected_row = group.loc[group.index[i]][list(X.columns[1:])].values
        with_constant = np.concatenate([constant, selected_row])

        y_pred = model.predict(with_constant)
        group.loc[group.index[i], 'residuals'] = group.loc[group.index[i], lhs_col] - y_pred

        if include_squared_term:
            group.loc[group.index[i], 'coskew'] = model.params[f'{rhs_col}_sq']

    return group

def formFinancialCols(panel_df: pd.DataFrame, cmkt_df: pd.DataFrame, cmkt_1h_df: pd.DataFrame) -> pd.DataFrame:
    """ Form all financial columns. """
    # subset to needed columns
    fin_df = panel_df[['date', 'asset', 'char_price_t', 'char_mcap_t']].copy()

    # merge on new data
    fin_df = fin_df.merge(cmkt_df, on='date', how='left', validate='many_to_one') 
    fin_df = fin_df.merge(cmkt_1h_df, on='date', how='left', validate='many_to_one') 

    # form characteristics
    fin_df   = fin_df.rename(columns={'char_mcap_t': 'char_size_t'})
    temp_df = formNewColumnByAsset(fin_df, target_col='char_price_t', new_col='char_r_tm1h', range_hours=1, func=calcReturn)
    temp_df.loc[temp_df['char_r_tm1h'] < -.95, 'char_r_tm1h'] = -0.95
    temp_df.loc[temp_df['char_r_tm1h'] > 100, 'char_r_tm1h'] = 100
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_r_max_tm12h', range_hours=12, func=np.max)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_r_max_tm1', range_hours=24, func=np.max)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_r_max_tm7', range_hours=168, func=np.max)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_r_max_tm30', range_hours=720, func=np.max)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_vol_tm6h', range_hours=6, func=np.std)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_vol_tm12h', range_hours=12, func=np.std)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_vol_tm1', range_hours=24, func=np.std)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_vol_tm7', range_hours=168, func=np.std)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_vol_tm30', range_hours=720, func=np.std)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_vol_tm90', range_hours=2160, func=np.std)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    fin_df['char_tradable_t'] = (fin_df['date'] - fin_df.groupby('asset')['date'].transform('min')).dt.total_seconds() / 3600

    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_var5_tm1', range_hours=24, func=lambda x: x.quantile(0.05))
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_var5_tm7', range_hours=168, func=lambda x: x.quantile(0.05))
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_var5_tm90', range_hours=2160, func=lambda x: x.quantile(0.05))
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_shortfall5_tm1', range_hours=24, func=lambda x: x[x < x.quantile(0.05)].mean())
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_shortfall5_tm7', range_hours=168, func=lambda x: x[x < x.quantile(0.05)].mean())
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(fin_df, target_col='char_r_tm1h', new_col='char_shortfall5_tm90', range_hours=2160, func=lambda x: x[x < x.quantile(0.05)].mean())
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    fin_df = formCumRetCols(fin_df)


    # Form regression characteristics

    # Extract unique asset names
    asset_names = fin_df['asset'].unique()

    # Perform the 7 day reg
    params = {
        'window_size': 168,
        'include_squared_term': False,
        'lhs_col': 'char_r_tm1h',
        'rhs_col': 'macro_cmkt_tm7'
    }
    reg_df = fin_df[['date', 'asset', 'char_r_tm1h', 'macro_cmkt_tm7']].copy()
    reg_df = reg_df.dropna()
    results = Parallel(n_jobs=-1)(delayed(perform_regression)(reg_df[reg_df['asset'] == asset], **params) for asset in asset_names)
    reg_df = pd.concat(results).sort_values(by=['date', 'asset'])
    reg_df = reg_df[['date', 'asset', 'residuals', 'intercept', 'slope']]
    temp_df = formNewColumnByAsset(
        reg_df,
        'residuals',
        'char_ivol_tm7',
        168,
        lambda x: x.std()
    )
    reg_df = reg_df.drop('residuals', axis=1)
    reg_df = reg_df.rename(columns={'intercept': 'char_alpha_tm7',
                                    'slope': 'char_beta_tm7'})
    reg_df = setMissingIfIncomplete(reg_df, 'char_alpha_tm7', 168)
    reg_df = setMissingIfIncomplete(reg_df, 'char_beta_tm7', 168)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    fin_df = fin_df.merge(reg_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # Perform the 30 day reg 
    params = {
        'window_size': 720,
        'include_squared_term': False,
        'lhs_col': 'char_r_tm1h',
        'rhs_col': 'macro_cmkt_tm7'
    }
    reg_df = fin_df[['date', 'asset', 'char_r_tm1h', 'macro_cmkt_tm7']].copy()
    reg_df = reg_df.dropna()
    results = Parallel(n_jobs=-1)(delayed(perform_regression)(reg_df[reg_df['asset'] == asset], **params) for asset in asset_names)
    reg_df = pd.concat(results).sort_values(by=['date', 'asset'])
    reg_df = reg_df[['date', 'asset', 'residuals', 'intercept', 'slope']]
    temp_df = formNewColumnByAsset(
        reg_df[['date', 'asset', 'residuals']],
        'residuals',
        'char_ivol_tm30',
        720,
        lambda x: x.std()
    )
    reg_df = reg_df.drop('residuals', axis=1)
    reg_df = reg_df.rename(columns={'intercept': 'char_alpha_tm30',
                                    'slope': 'char_beta_tm30'})
    reg_df = setMissingIfIncomplete(reg_df, 'char_alpha_tm30', 720)
    reg_df = setMissingIfIncomplete(reg_df, 'char_beta_tm30', 720)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    fin_df = fin_df.merge(reg_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # Perform the 90 day reg
    params = {
        'window_size': 2160,
        'include_squared_term': False,
        'lhs_col': 'char_r_tm1h',
        'rhs_col': 'macro_cmkt_tm7'
    }
    reg_df = fin_df[['date', 'asset', 'char_r_tm1h', 'macro_cmkt_tm7']].copy()
    reg_df = reg_df.dropna()
    results = Parallel(n_jobs=-1)(delayed(perform_regression)(reg_df[reg_df['asset'] == asset], **params) for asset in asset_names)
    reg_df = pd.concat(results).sort_values(by=['date', 'asset'])
    reg_df = reg_df[['date', 'asset', 'residuals']]
    temp_df = formNewColumnByAsset(
        reg_df,
        'residuals',
        'char_ivol_tm90',
        2160,
        lambda x: x.std()
    )
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # Perform the 30 day reg with negative returns to extract slope
    fin_df['char_r_neg_tm1h'] = 0
    fin_df['macro_cmkt_neg_t'] = 0
    fin_df.loc[fin_df.char_r_tm1h<0, 'char_r_neg_tm1h'] = fin_df.loc[fin_df.char_r_tm1h<0, 'char_r_tm1h']
    fin_df.loc[fin_df.macro_cmkt_tm7<0, 'macro_cmkt_neg_t'] = fin_df.loc[fin_df.macro_cmkt_tm7<0, 'macro_cmkt_tm7']
    params = {
        'window_size': 720,
        'include_squared_term': False,
        'lhs_col': 'char_r_neg_tm1h',
        'rhs_col': 'macro_cmkt_neg_t'
    }
    reg_df = fin_df[['date', 'asset', 'char_r_neg_tm1h', 'macro_cmkt_neg_t']].copy()
    reg_df = reg_df.dropna()
    results = Parallel(n_jobs=-1)(delayed(perform_regression)(reg_df[reg_df['asset'] == asset], **params) for asset in asset_names)
    reg_df = pd.concat(results).sort_values(by=['date', 'asset'])
    reg_df = reg_df[['date', 'asset', 'slope']]
    reg_df = reg_df.rename(columns={'slope': 'char_beta_downside_tm30'})
    reg_df = setMissingIfIncomplete(reg_df, 'char_beta_downside_tm30', 720)
    fin_df = fin_df.merge(reg_df, on=['date', 'asset'], how='left', validate='one_to_one')
    fin_df = fin_df.drop(['char_r_neg_tm1h', 'macro_cmkt_neg_t'], axis=1)

    # Perform the 30 day reg with two RHS including cmkt^2
    params = {
        'window_size': 720,
        'include_squared_term': True,
        'lhs_col': 'char_r_tm1h',
        'rhs_col': 'macro_cmkt_tm7'
    }
    reg_df = fin_df[['date', 'asset', 'char_r_tm1h', 'macro_cmkt_tm7']].copy()
    reg_df = reg_df.dropna()
    results = Parallel(n_jobs=-1)(delayed(perform_regression)(reg_df[reg_df['asset'] == asset], **params) for asset in asset_names)
    reg_df = pd.concat(results).sort_values(by=['date', 'asset'])
    reg_df = reg_df[['date', 'asset', 'residuals', 'coskew']]
    temp_df = formNewColumnByAsset(
        reg_df[['date', 'asset', 'residuals']],
        'residuals',
        'char_iskew_tm30',
        720,
        lambda x: scipy.stats.skew(x)
    )
    reg_df = reg_df.drop('residuals', axis=1)
    reg_df = reg_df.rename(columns={'coskew': 'char_coskew_tm30'})
    reg_df = setMissingIfIncomplete(reg_df, 'char_coskew_tm30', 720)
    fin_df = fin_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    fin_df = fin_df.merge(reg_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # drop cols we dont need
    fin_df = fin_df.drop(columns=['macro_cmkt_tm7', 'macro_cmkt_tm1h', 'char_r_tm1h'])

    return fin_df


In [8]:
def formMicrostructureCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    # Subset to needed columns
    mic_df = panel_df[['date', 'asset', 'char_price_t', 'char_volume_t', 'char_trades_t',
        'char_bidask_t', 'char_bid_t', 'char_ask_t', 'char_bid_size_t', 'char_ask_size_t', 
        'char_supply_circ_t']].copy()

    # Form bidask in bps
    mic_df['char_spread_bps_t'] = mic_df.char_bidask_t / mic_df.char_price_t

    # Add on needed momentum col
    temp_df = formNewColumnByAsset(mic_df, target_col='char_price_t', new_col='char_r_tm1h', range_hours=1, func=calcReturn)
    temp_df.loc[temp_df['char_r_tm1h'] < -.95, 'char_r_tm1h'] = -0.95
    temp_df.loc[temp_df['char_r_tm1h'] > 100, 'char_r_tm1h'] = 100
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # Add temporary columns
    mic_df['temp_volume_price_t'] = mic_df.char_volume_t * mic_df.char_price_t
    temp_df = formNewColumnByAsset(mic_df, target_col='char_r_tm1h', new_col='temp_r_1m1h_abs_avg_tm7', range_hours=168, func=lambda x: np.mean(np.abs(x)))
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='char_r_tm1h', new_col='temp_r_1m1h_abs_avg_tm1', range_hours=24, func=lambda x: np.mean(np.abs(x)))
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # Form characteristics
    temp_df = formNewColumnByAsset(mic_df, target_col='char_volume_t', new_col='char_volume_sum_tm12h', range_hours=12, func=np.sum)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='char_volume_t', new_col='char_volume_sum_tm1', range_hours=24, func=np.sum)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='char_volume_t', new_col='char_volume_sum_tm7', range_hours=168, func=np.sum)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='char_volume_t', new_col='char_volume_std_tm12h', range_hours=12, func=np.std)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='char_volume_t', new_col='char_volume_std_tm7', range_hours=168, func=np.std)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='char_trades_t', new_col='char_trades_sum_tm7', range_hours=168, func=np.sum)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='char_trades_t', new_col='char_trades_std_tm7', range_hours=168, func=np.std)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='temp_volume_price_t', new_col='char_volume_price_avg_tm7', range_hours=168, func=np.mean)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(mic_df, target_col='temp_volume_price_t', new_col='char_volume_price_std_tm7', range_hours=168, func=np.std)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    mic_df['char_turnover_tm1'] = mic_df.char_volume_sum_tm1 / mic_df.char_supply_circ_t
    mic_df['char_turnover_tm7'] = mic_df.char_volume_sum_tm7 / mic_df.char_supply_circ_t
    mic_df['char_illiq_tm1'] = mic_df.temp_r_1m1h_abs_avg_tm1 / (mic_df.char_volume_sum_tm1/24)
    mic_df['char_illiq_tm7'] = mic_df.temp_r_1m1h_abs_avg_tm7 / (mic_df.char_volume_sum_tm7/168)

    # drop unneeded columns
    mic_df = mic_df.drop(['char_price_t', 'temp_volume_price_t', 
        'char_supply_circ_t', 'char_r_tm1h', 
        'temp_r_1m1h_abs_avg_tm1', 'temp_r_1m1h_abs_avg_tm7'], axis=1)

    # Form std of residuals from regressing hourly turnover on a constant over last thirty days
    temp_df = formNewColumnByAsset(mic_df, target_col='char_turnover_tm7', new_col='char_turnover_avg_tm30', range_hours=720, func=np.mean)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    mic_df['temp_diff'] = mic_df.char_turnover_tm7 - mic_df.char_turnover_avg_tm30
    temp_df = formNewColumnByAsset(mic_df, target_col='temp_diff', new_col='char_turnover_res_vol_tm30', range_hours=720, func=np.std)
    mic_df = mic_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    mic_df = mic_df.drop(columns=['char_turnover_avg_tm30', 'temp_diff'], axis=1)

    return mic_df


In [9]:
def formOnchainCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    # Subset to the needed columns
    oc_df = panel_df[['date', 'asset', 'char_network_growth_t',
        'char_holders_distribution_total_t', 'char_active_addr_t', 
        'char_tx_volume_t',
        'char_circulation_7d_t', 'char_circulation_30d_t', 
        'char_circulation_90d_t', 'char_circulation_365d_t', 
        'char_circulation_3y_t',  'char_dormant_circulation_365d_t', 
        'char_supply_circ_t', 'char_supply_max_t', 
        'char_age_mean_dollar_t', 'char_age_destroyed_t']].copy()

    # Rename columns
    oc_df = oc_df.rename(columns={'char_holders_distribution_total_t': 'char_addr_total_t',
                                'char_active_addr_t': 'char_addr_active_t',
                                'char_circulation_7d_t': 'char_circulation_tm7',
                                'char_circulation_30d_t': 'char_circulation_tm30',
                                'char_circulation_90d_t': 'char_circulation_tm90',
                                'char_circulation_365d_t': 'char_circulation_tm365',
                                'char_circulation_3y_t': 'char_circulation_tm3y',
                                'char_dormant_circulation_365d_t': 'char_circulation_dormant_tm365'})

    # Form new characteristics
    temp_df = formNewColumnByAsset(oc_df, target_col='char_network_growth_t', new_col='char_addr_new_tm1', range_hours=24, func=np.sum)
    oc_df = oc_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(oc_df, target_col='char_network_growth_t', new_col='char_addr_new_tm7', range_hours=168, func=np.sum)
    oc_df = oc_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(oc_df, target_col='char_addr_active_t', new_col='char_addr_active_tm1', range_hours=24, func=np.sum)
    oc_df = oc_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(oc_df, target_col='char_addr_active_t', new_col='char_addr_active_tm7', range_hours=168, func=np.sum)
    oc_df = oc_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    oc_df = oc_df.rename(columns={'char_network_growth_t': 'char_addr_new_tm1h',
                                'char_addr_active_t': 'char_addr_active_tm1h'})
    temp_df = formNewColumnByAsset(oc_df, target_col='char_tx_volume_t', new_col='char_tx_volume_tm1', range_hours=24, func=np.sum)
    oc_df = oc_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(oc_df, target_col='char_tx_volume_t', new_col='char_tx_volume_tm7', range_hours=168, func=np.sum)
    oc_df = oc_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(oc_df, target_col='char_age_destroyed_t', new_col='char_age_destroyed_tm1', range_hours=24, func=np.mean)
    oc_df = oc_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(oc_df, target_col='char_age_destroyed_t', new_col='char_age_destroyed_tm7', range_hours=168, func=np.mean)
    oc_df = oc_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # Drop columns we do not need
    oc_df = oc_df.drop(columns=['char_age_destroyed_t'], axis=1)

    # Form change in week over week of char_addr_new_tm7
    oc_df['char_addr_new_log_delta_tm2_tm1'] = oc_df.groupby('asset')['char_addr_new_tm1'].transform(lambda x: np.log(x).diff(24))
    oc_df['char_addr_new_log_delta_tm14_tm7'] = oc_df.groupby('asset')['char_addr_new_tm7'].transform(lambda x: np.log(x).diff(168))

    return oc_df

In [10]:
def formDevCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    # Subset to the needed columns
    dev_df = panel_df[['date', 'asset', 'char_dev_activity_t']].copy()

    # Form dev column
    temp_df = formNewColumnByAsset(dev_df, target_col='char_dev_activity_t', new_col='char_dev_activity_tm1', range_hours=24, func=np.mean)
    dev_df = dev_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(dev_df, target_col='char_dev_activity_t', new_col='char_dev_activity_tm7', range_hours=168, func=np.mean)
    dev_df = dev_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # Drop nonnecssary columns
    dev_df = dev_df.drop(['char_dev_activity_t'], axis=1)
    
    return dev_df

In [11]:
def formSocialCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    # Subset to the needed columns
    s_df = panel_df[['date', 'asset', 
        'char_sentiment_negative_reddit_t', 'char_sentiment_negative_twitter_t',
        'char_sentiment_positive_reddit_t', 'char_sentiment_positive_twitter_t',
        'char_sentiment_volume_consumed_total_t', 'char_social_dominance_total_t',
        'char_social_volume_reddit_t', 'char_social_volume_twitter_t', 
        'char_unique_social_volume_total_1h_t']].copy()

    # Form columns
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_negative_reddit_t', new_col='char_sent_neg_reddit_tm1', range_hours=24, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_negative_twitter_t', new_col='char_sent_neg_twitter_tm1', range_hours=24, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_positive_reddit_t', new_col='char_sent_pos_reddit_tm1', range_hours=24, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_positive_twitter_t', new_col='char_sent_pos_twitter_tm1', range_hours=24, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_social_volume_reddit_t', new_col='char_social_volume_reddit_tm1', range_hours=24, func=np.sum)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_social_volume_twitter_t', new_col='char_social_volume_twitter_tm1', range_hours=24, func=np.sum)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_volume_consumed_total_t', new_col='char_sent_volume_consumed_tm1', range_hours=24, func=np.sum)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_social_dominance_total_t', new_col='char_social_dom_avg_tm1', range_hours=24, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_unique_social_volume_total_1h_t', new_col='char_social_volume_tm1', range_hours=24, func=np.sum)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_negative_reddit_t', new_col='char_sent_neg_reddit_tm7', range_hours=168, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_negative_twitter_t', new_col='char_sent_neg_twitter_tm7', range_hours=168, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_positive_reddit_t', new_col='char_sent_pos_reddit_tm7', range_hours=168, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_positive_twitter_t', new_col='char_sent_pos_twitter_tm7', range_hours=168, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_social_volume_reddit_t', new_col='char_social_volume_reddit_tm7', range_hours=168, func=np.sum)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_social_volume_twitter_t', new_col='char_social_volume_twitter_tm7', range_hours=168, func=np.sum)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_sentiment_volume_consumed_total_t', new_col='char_sent_volume_consumed_tm7', range_hours=168, func=np.sum)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_social_dominance_total_t', new_col='char_social_dom_avg_tm7', range_hours=168, func=np.mean)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(s_df, target_col='char_unique_social_volume_total_1h_t', new_col='char_social_volume_tm7', range_hours=168, func=np.sum)
    s_df = s_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')

    # Drop unncessary columns
    s_df = s_df.drop(columns=['char_sentiment_negative_reddit_t', 'char_sentiment_negative_twitter_t',
        'char_sentiment_positive_reddit_t', 'char_sentiment_positive_twitter_t',
        'char_sentiment_volume_consumed_total_t', 'char_social_dominance_total_t',
        'char_social_volume_reddit_t', 'char_social_volume_twitter_t',
        'char_unique_social_volume_total_1h_t'], axis=1)

    return s_df


In [12]:
def formValueCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    # form new df with relevant columns
    val_df = panel_df[['date', 'asset', 
        'char_percent_of_total_supply_in_profit_t',
        'char_mvrv_long_short_diff_usd_t', 'char_mvrv_usd_t',
        'char_realized_value_usd_t']].copy()

    # Form RHS
    val_df = val_df.rename(columns={'char_percent_of_total_supply_in_profit_t': 'char_prct_supply_in_profit_t',
                                    'char_mvrv_long_short_diff_usd_t': 'char_mvrv_long_short_diff_t',
                                    'char_mvrv_usd_t': 'char_mvrv_t',
                                    'char_realized_value_usd_t': 'char_size_realized_t'})

    return val_df

In [13]:
def formFlowCols(panel_df: pd.DataFrame) -> pd.DataFrame:
    # Form new dataframe to build
    bal_df = panel_df[['date', 'asset',
        'char_exchange_inflow_usd_t', 'char_exchange_outflow_usd_t',
        'char_supply_circ_t',
        'char_traders_balance_t',
        'char_exchange_balance_t', 'char_dex_balance_t',
        'char_cex_balance_t', 'char_amount_in_top_holders_t',
        'char_defi_balance_t',
        'char_holders_distribution_over_100_t',
        'char_holders_distribution_over_100k_t',
        'char_holders_distribution_over_10_t',
        'char_holders_distribution_over_10k_t',
        'char_holders_distribution_over_1M_t',
        'char_holders_distribution_over_1_t',
        'char_holders_distribution_over_1k_t',
        'char_cexes_to_defi_flow_t',
        'char_cexes_to_dex_flow_t',
        'char_cexes_to_dex_traders_flow_t',
        'char_cexes_to_traders_flow_t',
        'char_defi_to_cexes_flow_t',
        'char_defi_to_dex_traders_flow_t',
        'char_defi_to_dexes_flow_t',
        'char_defi_to_exchanges_flow_t',
        'char_defi_to_traders_flow_t',
        'char_dex_to_cexes_flow_t',
        'char_dex_traders_to_cexes_flow_t',
        'char_dex_traders_to_defi_flow_t',
        'char_dex_traders_to_dexes_flow_t',
        'char_dex_traders_to_exchanges_flow_t',
        'char_dexes_to_defi_flow_t',
        'char_dexes_to_dex_traders_flow_t',
        'char_dexes_to_traders_flow_t',
        'char_exchanges_to_defi_flow_t',
        'char_exchanges_to_dex_traders_flow_t',
        'char_exchanges_to_genesis_flow_t',
        'char_exchanges_to_traders_flow_t',
        'char_traders_to_cexes_flow_t',
        'char_traders_to_defi_flow_t',
        'char_traders_to_dexes_flow_t',
        'char_traders_to_exchanges_flow_t']]
        
    # Form asset characteristics
    temp_df = formNewColumnByAsset(bal_df, target_col='char_exchange_inflow_usd_t', new_col='char_exchange_inflow_tm7', range_hours=168, func=np.sum)
    bal_df = bal_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    temp_df = formNewColumnByAsset(bal_df, target_col='char_exchange_outflow_usd_t', new_col='char_exchange_outflow_tm7', range_hours=168, func=np.sum)
    bal_df = bal_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
    bal_df = bal_df.rename(columns={'char_exchange_inflow_usd_t': 'char_exchange_inflow_tm1h',
                                    'char_exchange_outflow_usd_t': 'char_exchange_outflow_tm1h'})
                                    
    bal_df['char_traders_prct_circ_supply_t'] = bal_df['char_traders_balance_t'] / bal_df['char_supply_circ_t']
    bal_df['char_exchange_prct_circ_supply_t'] = bal_df['char_exchange_balance_t'] / bal_df['char_supply_circ_t']
    bal_df['char_dex_prct_circ_supply_t'] = bal_df['char_dex_balance_t'] / bal_df['char_supply_circ_t']
    bal_df['char_cex_prct_circ_supply_t'] = bal_df['char_cex_balance_t'] / bal_df['char_supply_circ_t']
    bal_df['char_hodlers_top_prct_circ_supply_t'] = bal_df['char_amount_in_top_holders_t'] / bal_df['char_supply_circ_t']
    bal_df['char_defi_prct_circ_supply_t'] = bal_df['char_defi_balance_t'] / bal_df['char_supply_circ_t']

    hodlers_cols = ['char_holders_distribution_over_100_t',
        'char_holders_distribution_over_100k_t',
        'char_holders_distribution_over_10_t',
        'char_holders_distribution_over_10k_t',
        'char_holders_distribution_over_1M_t',
        'char_holders_distribution_over_1_t',
        'char_holders_distribution_over_1k_t']

    temp_cols_to_drop = []
    for col in hodlers_cols:
        temp_df = formNewColumnByAsset(bal_df, target_col=col, new_col=col+'m7', range_hours=168, func=np.mean)
        bal_df = bal_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
        bal_df[col+'m7_diff_168h'] = bal_df.groupby('asset')[col+'m7'].diff(periods=168)
        bal_df[col+'m7_diff_1h'] = bal_df.groupby('asset')[col].diff(periods=1)
        temp_cols_to_drop.append(col+'m7')

    diff_cols_168h = [col for col in bal_df.columns if 'diff_168h' in col]
    diff_cols_1h = [col for col in bal_df.columns if 'diff_1h' in col]
    bal_df['char_delta_holders_dist_tm7'] = bal_df[diff_cols_168h].abs().sum(axis=1) / bal_df[hodlers_cols].sum(axis=1)
    bal_df['char_delta_holders_dist_tm1h'] = bal_df[diff_cols_1h].abs().sum(axis=1) / bal_df[hodlers_cols].sum(axis=1)

    bal_df = bal_df.drop(temp_cols_to_drop+diff_cols_168h+diff_cols_1h+hodlers_cols, axis=1)

    flow_cols = ['char_cexes_to_defi_flow_t',
        'char_cexes_to_dex_flow_t',
        'char_cexes_to_dex_traders_flow_t',
        'char_cexes_to_traders_flow_t',
        'char_defi_to_cexes_flow_t',
        'char_defi_to_dex_traders_flow_t',
        'char_defi_to_dexes_flow_t',
        'char_defi_to_exchanges_flow_t',
        'char_defi_to_traders_flow_t',
        'char_dex_to_cexes_flow_t',
        'char_dex_traders_to_cexes_flow_t',
        'char_dex_traders_to_defi_flow_t',
        'char_dex_traders_to_dexes_flow_t',
        'char_dex_traders_to_exchanges_flow_t',
        'char_dexes_to_defi_flow_t',
        'char_dexes_to_dex_traders_flow_t',
        'char_dexes_to_traders_flow_t',
        'char_exchanges_to_defi_flow_t',
        'char_exchanges_to_dex_traders_flow_t',
        'char_exchanges_to_genesis_flow_t',
        'char_exchanges_to_traders_flow_t',
        'char_traders_to_cexes_flow_t',
        'char_traders_to_defi_flow_t',
        'char_traders_to_dexes_flow_t',
        'char_traders_to_exchanges_flow_t']

    temp_cols_to_drop = []
    for col in flow_cols:
        temp_df = formNewColumnByAsset(bal_df, target_col=col, new_col=col+'m7', range_hours=168, func=np.mean)
        bal_df = bal_df.merge(temp_df, on=['date', 'asset'], how='left', validate='one_to_one')
        bal_df[col+'m7_diff_168h'] = bal_df.groupby('asset')[col+'m7'].diff(periods=168)
        bal_df[col+'m7_diff_1h'] = bal_df.groupby('asset')[col].diff(periods=1)
        temp_cols_to_drop.append(col+'m7')

    diff_cols_168h = [col for col in bal_df.columns if 'diff_168h' in col]
    diff_cols_1h = [col for col in bal_df.columns if 'diff_1h' in col]
    bal_df['char_delta_flow_dist_tm7'] = bal_df[diff_cols_168h].sum(axis=1) / bal_df[flow_cols].sum(axis=1)
    bal_df['char_delta_flow_dist_tm1h'] = bal_df[diff_cols_1h].sum(axis=1) / bal_df[flow_cols].sum(axis=1)

    bal_df = bal_df.drop(temp_cols_to_drop+diff_cols_168h+diff_cols_1h+flow_cols, axis=1)

    # Drop unnecessary columns
    bal_df = bal_df.drop(columns=['char_supply_circ_t', 'char_traders_balance_t',
        'char_exchange_balance_t', 'char_dex_balance_t',  'char_cex_balance_t', 
        'char_amount_in_top_holders_t', 'char_defi_balance_t'], axis=1)

    return bal_df


In [14]:
def formMacroCovariates(
    macro_df: pd.DataFrame, cmkt_df: pd.DataFrame, cmkt_1h_df: pd.DataFrame) -> pd.DataFrame:
    # Merge on market returns
    macro_df = macro_df.merge(cmkt_df, on='date', how='outer', validate='one_to_one')
    macro_df = macro_df.merge(cmkt_1h_df, on='date', how='outer', validate='one_to_one')

    # Form rolling sum columns
    sum_cols = ['macro_ico_count_t',
        'macro_dex_volume_t',
        'macro_ex_usd_volume_24h_cex_t',
        'macro_ex_volume_future_usd_t',
        'macro_ex_volume_t',
        'macro_total_nft_retail_trades_t',
        'macro_total_nft_retail_volume_t',
        'macro_total_nft_trades_t',
        'macro_total_nft_volume_t',
        'macro_total_nft_whale_trades_t',
        'macro_total_nft_whale_volume_t',
        'macro_us_ex_volume_future_usd_t',
        'macro_us_ex_volume_spot_usd_t',
        'macro_total_usd_ask_size_t',
        'macro_total_usd_bid_size_t']
    for col in sum_cols:
        macro_df[col[:-2]+'_sum_tm7'] = macro_df[col].rolling(168).sum()

    # Form rolling avg cols
    macro_df['macro_avg_bidask_tm7'] = macro_df['macro_avg_bidask_t'].rolling(168).mean()

    return macro_df


In [15]:
def formRHS(df: pd.DataFrame) -> Tuple:
    """ Form RHS covariates, including asset characteristics and macro covariates. """
    # Form panel and macro dataframes
    macro_cols = [col for col in df.columns if 'macro_' in col]
    panel_df = df.drop(macro_cols, axis=1)
    macro_df = df[['date']+macro_cols]
    macro_df = macro_df.drop_duplicates()
    assert macro_df.date.is_unique

    # add on cols needed for panel data
    panel_df = panel_df.merge(macro_df[['date', 'macro_dgs1mo_t']], on=['date'], how='left', validate='many_to_one')

    # Form all RHS asset characteristics
    static_df = formStaticCols(panel_df)
    static_cols = [col for col in static_df.columns if col not in ['date', 'asset']]
    panel_df = panel_df.drop(static_cols, axis=1)
    desc_stat_df = formDescStatCols(panel_df)
    desc_stat_cols = [col for col in desc_stat_df.columns if col not in ['date', 'asset']]
    panel_df = panel_df.drop(desc_stat_cols, axis=1)
    mom_df = formMomentumCols(panel_df, static_df)
    cmkt_df, cmkt_1h_df = formCmktCol(panel_df, asset_universe_dict)
    panel_df = panel_df.drop('macro_dgs1mo_t', axis=1)
    fin_df  = formFinancialCols(panel_df, cmkt_df, cmkt_1h_df)
    mic_df = formMicrostructureCols(panel_df)
    mic_cols = [col for col in mic_df.columns if col not in ['date', 'asset']]
    drop_cols = set(mic_cols).intersection(set(panel_df.columns))
    panel_df = panel_df.drop(drop_cols, axis=1)
    oc_df  = formOnchainCols(panel_df)
    oc_cols = [col for col in oc_df.columns if col not in ['date', 'asset']]
    oc_cols.remove('char_supply_circ_t')
    drop_cols = set(oc_cols).intersection(set(panel_df.columns))
    panel_df = panel_df.drop(drop_cols, axis=1)
    dev_df = formDevCols(panel_df)
    panel_df = panel_df.drop('char_dev_activity_t', axis=1)
    s_df = formSocialCols(panel_df)
    social_cols = [col for col in s_df.columns if col not in ['date', 'asset']]
    drop_cols = set(social_cols).intersection(set(panel_df.columns))
    panel_df = panel_df.drop(drop_cols, axis=1)
    val_df = formValueCols(panel_df)
    bal_df = formFlowCols(panel_df)
    del panel_df
    gc.collect()

    # Merge all characteristics together and clean up memory
    char_df = static_df.merge(desc_stat_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    del static_df, desc_stat_df
    gc.collect()
    char_df = char_df.merge(mom_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    del mom_df
    gc.collect()
    char_df = char_df.merge(fin_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    del fin_df
    gc.collect()
    char_df = char_df.merge(mic_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    char_df = char_df.merge(oc_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    char_df = char_df.merge(dev_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    del mic_df, oc_df, dev_df
    gc.collect()
    char_df = char_df.merge(s_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    char_df = char_df.merge(val_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    char_df = char_df.merge(bal_df, on=['date', 'asset'], how='outer', validate='one_to_one')
    del s_df, val_df, bal_df
    gc.collect()

    # Add cmkt return to macro and form the rest of the macro covariates
    macro_df = formMacroCovariates(macro_df, cmkt_df, cmkt_1h_df)

    # Return data
    return char_df, macro_df
        

In [16]:
def formWeeklyPanel(panel_df: pd.DataFrame, asset_universe_dict: Dict[str, List[str]]) -> pd.DataFrame:
    # Drop to weekly frequency
    weekly_df = subsetToWeeklyFreq(panel_df)
    weekly_df = weekly_df.copy()

    # Drop other lhs variable
    weekly_df = weekly_df.drop(columns=['r_ex_tp1'], axis=1)

    # Rename weekly LHS to match naming convention for weekly panel
    weekly_df = weekly_df.rename(columns={'r_ex_tp168': 'r_ex_tp7'})

    # Form list of columns
    first_cols = ['date', 'asset', 'r_ex_tp7']
    industry_cols = [col for col in weekly_df.columns if ('industry' in col) & ('industry_tm' not in col)]
    usage_cols = [col for col in weekly_df.columns if 'asset_usage' in col]
    static_cols = (industry_cols + usage_cols + 
        ['char_pow', 'char_pos', 'char_ico_price', 'char_ico'])
    char_cols = [col for col in weekly_df.columns if 'char_' in col]
    char_cols = list(set(char_cols).difference(static_cols))
    char_cols = [col for col in char_cols if col[-1:] != 'h']
    char_cols.sort()
    macro_tm7_cols = [col for col in weekly_df.columns if ('macro_' in col) & (col[-3:] == 'tm7')]
    macro_cols_to_keep_for_weekly = ['macro_snp500_t',
        'macro_snp_div_yield_t',
        'macro_snp_pe_t',
        'macro_vixclsx_t',
        'macro_fedfunds_t',
        'macro_dgs1mo_t',
        'macro_gs1_t',
        'macro_gs5_t',
        'macro_gs10_t',
        'macro_cp3mx_t',
        'macro_aaa_t',
        'macro_baa_t',
        'macro_m1sl_t',
        'macro_m2real_t',
        'macro_m2sl_t',
        'macro_expinf10yr_t',
        'macro_expinf1yr_t',
        'macro_expinf20yr_t',
        'macro_expinf2yr_t',
        'macro_expinf30yr_t',
        'macro_expinf3yr_t',
        'macro_expinf5yr_t',
        'macro_t10yie_t',
        'macro_t20yiem_t',
        'macro_t30yiem_t',
        'macro_t5yie_t',
        'macro_tb3ms_t',
        'macro_tb6ms_t',
        'macro_stablecoin_dev_t',
        'macro_active_cryptos_t',
        'macro_active_exchanges_t',
        'macro_active_market_pairs_t',
        'macro_ex_num_pairs_cex_t',
        'macro_ex_num_pairs_dex_t',
        'macro_ex_open_interest_future_usd_t',
        'macro_aave_med_borrow_apy_t',
        'macro_aave_med_supply_apy_t',
        'macro_mcd_avg_liq_t',
        'macro_mcd_med_collat_ratio_t',
        'macro_btc_diff_mean_t',
        'macro_btc_fee_med_usd_t',
        'macro_btc_fee_tot_usd_t',
        'macro_btc_hash_rate_t',
        'macro_btc_sply_cur_t',
        'macro_btc_sply_ff_t',
        'macro_btc_sply_utxo_loss_t',
        'macro_btc_sply_utxo_prof_t',
        'macro_btc_tx_tfr_cnt_t',
        'macro_btc_tx_tfr_val_adj_usd_t',
        'macro_btc_tx_tfr_val_med_usd_t',
        'macro_btc_utxo_age_med_t',
        'macro_eth_fee_med_t',
        'macro_eth_sply_cur_t',
        'macro_eth_sply_ff_t',
        'macro_eth_stakers_count_t',
        'macro_eth_total_fee_t',
        'macro_eth_tx_tfr_cnt_t',
        'macro_eth_tx_tfr_val_adj_usd_t',
        'macro_eth_tx_tfr_val_med_usd_t',
        'macro_total_aave_borrowed_t',
        'macro_total_aave_deposits_t',
        'macro_total_aave_liq_t',
        'macro_total_aave_new_debt_t',
        'macro_total_aave_supply_t',
        'macro_total_compound_borrowed_t',
        'macro_total_compound_deposits_t',
        'macro_total_compound_liq_t',
        'macro_total_compound_new_debt_t',
        'macro_total_compound_supply_t',
        'macro_total_maker_borrowed_t',
        'macro_total_maker_deposits_t',
        'macro_total_maker_supply_t',
        'macro_total_uni_claims_t',
        'macro_total_usd_mcap_t',
        'macro_ico_count_t',
        'macro_dex_volume_t',
        'macro_ex_usd_volume_24h_cex_t',
        'macro_ex_volume_future_usd_t',
        'macro_ex_volume_t',
        'macro_total_nft_retail_trades_t',
        'macro_total_nft_retail_volume_t',
        'macro_total_nft_trades_t',
        'macro_total_nft_volume_t',
        'macro_total_nft_whale_trades_t',
        'macro_total_nft_whale_volume_t',
        'macro_us_ex_volume_future_usd_t',
        'macro_us_ex_volume_spot_usd_t']
    macro_cols = macro_cols_to_keep_for_weekly+macro_tm7_cols
    macro_cols.sort()
    weekly_df = weekly_df[first_cols+static_cols+char_cols+macro_cols]

    # Drop rows missing LHS variable
    weekly_df = weekly_df[weekly_df.r_ex_tp7.notnull()]

    # Cut to study period
    weekly_df = weekly_df[weekly_df.date.dt.year >= 2018]
    weekly_df = weekly_df[weekly_df.date.dt.year < 2023]

    # Cut down to only tradable date-assets
    weekly_df = subsetByMonthToAssetUniverse(weekly_df, asset_universe_dict)

    # Fix inf values to missing
    weekly_df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Forward fill missing values in RHS columns
    weekly_df = weekly_df.groupby('asset').apply(lambda group: group.fillna(method='ffill'))

    # Fill remaining missings
    weekly_df.fillna(0, inplace=True)

    # Replace negative values with zero in known columns
    circ_cols = [col for col in weekly_df.columns if 'char_circulation_' in col]
    drop_neg_cols = (circ_cols 
        + ['char_age_mean_dollar_t', 'char_exchange_prct_circ_supply_t', 'char_prct_supply_in_profit_t'])
    for col in drop_neg_cols:
        weekly_df.loc[weekly_df[col] < 0, col] = 0

    # Trim max values in known columns
    weekly_df.loc[weekly_df.char_cex_prct_circ_supply_t > 1, 'char_cex_prct_circ_supply_t'] = 1
    weekly_df.loc[weekly_df.char_defi_prct_circ_supply_t > 1, 'char_cex_prct_circ_supply_t'] = 1
    weekly_df.loc[weekly_df.char_traders_prct_circ_supply_t > 1, 'char_traders_prct_circ_supply_t'] = 1

    weekly_df.loc[weekly_df.char_prct_supply_in_profit_t > 100, 'char_prct_supply_in_profit_t'] = 100

    weekly_df.loc[weekly_df.char_alpha_tm30 > 0.3, 'char_alpha_tm30'] = 0.3
    weekly_df.loc[weekly_df.char_alpha_tm7 > 0.3, 'char_alpha_tm7'] = 0.3

    weekly_df.loc[weekly_df.char_beta_downside_tm30 > 0.5, 'char_beta_downside_tm30'] = 0.5

    weekly_df.loc[weekly_df.char_beta_tm30 > 1, 'char_beta_tm30'] = 1
    weekly_df.loc[weekly_df.char_beta_tm7 > 1, 'char_beta_tm7'] = 1

    weekly_df.loc[weekly_df.char_coskew_tm30 > 10, 'char_coskew_tm30'] = 10

    weekly_df.loc[weekly_df.char_delta_flow_dist_tm7 > 100, 'char_delta_flow_dist_tm7'] = 100
    weekly_df.loc[weekly_df.char_delta_flow_dist_tm7 < -100, 'char_delta_flow_dist_tm7'] = -100

    weekly_df.loc[weekly_df.char_ivol_tm7 > 10, 'char_ivol_tm7'] = 10
    weekly_df.loc[weekly_df.char_ivol_tm90 > 2, 'char_ivol_tm90'] = 2

    weekly_df.loc[weekly_df.char_vol_tm1 > 20, 'char_vol_tm1'] = 20
    weekly_df.loc[weekly_df.char_vol_tm30 > 3, 'char_vol_tm30'] = 3
    weekly_df.loc[weekly_df.char_vol_tm7 > 5, 'char_vol_tm7'] = 5
    weekly_df.loc[weekly_df.char_vol_tm90 > 2, 'char_vol_tm90'] = 2

    # Fix known return errors
    weekly_df.loc[(weekly_df.date=='2022-05-22') & (weekly_df.asset=='luna'), 'r_ex_tp7'] = 0

    # Ensure the weekly_df dataframe contains all dates
    earliest_date = weekly_df['date'].min()
    end_date = pd.to_datetime('2022-12-31')
    all_sundays = pd.date_range(earliest_date, end_date, freq='W-SUN')
    assert len(all_sundays) == len(weekly_df.date.unique())

    # Ensure no missing values
    assert 0 == weekly_df.isnull().sum().sum()

    # Sort rows and columns and reset index
    weekly_df = weekly_df.sort_values(by=['date', 'asset'], ignore_index=True)

    # Ensure no duplicates by date and asset
    assert not weekly_df.duplicated(subset=['date', 'asset']).any()

    # Convert all float64 columns to float32
    for col in weekly_df.columns:
        if weekly_df[col].dtype == np.float64:
            weekly_df[col] = weekly_df[col].astype(np.float32)

    return weekly_df


In [17]:
def formHourlyPanels(df: pd.DataFrame) -> pd.DataFrame:
    # Drop other lhs variable
    df = df.drop(columns=['r_ex_tp168'], axis=1)

    # Let's get rid of the char ico price column for now
    df = df.drop('char_ico_price', axis=1)

    # Form list of columns
    first_cols = ['date', 'asset', 'r_ex_tp1']
    industry_cols = [col for col in df.columns if ('industry' in col) & ('industry_tm' not in col)]
    usage_cols = [col for col in df.columns if 'asset_usage' in col]
    static_cols = (industry_cols + usage_cols + 
        ['char_pow', 'char_pos', 'char_ico'])
    char_cols = [col for col in df.columns if 'char_' in col]
    char_cols = list(set(char_cols).difference(static_cols))
    char_cols.sort()
    macro_tm7_cols = [col for col in df.columns if ('macro_' in col) & (col[-3:] == 'tm7')]
    macro_cols = [col for col in df.columns if ('macro_' in col)]
    macro_cols = list(set(macro_cols).difference(set(macro_tm7_cols)))
    macro_cols.sort()

    # Keep relevant columns
    df = df[first_cols+static_cols+char_cols+macro_cols]

    # Drop rows missing LHS variable
    df = df[df.r_ex_tp1.notnull()]

    # Cut to study period
    df = df[df.date.dt.year >= 2018]
    df = df[df.date.dt.year < 2023]

    # Fix inf values to missing
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Forward fill missing values in RHS columns
    df = df.groupby('asset').apply(lambda group: group.fillna(method='ffill'))

    # Fill remaining missing with cross-sectional median
    missing = df.isnull().sum().T
    missing_cols = list(missing[missing > 0].index)
    for col in missing_cols:
        df[col].fillna(df.groupby('date')[col].transform('median'), inplace=True)

    # Ensure the df dataframe contains all dates
    first_date = df['date'].min()
    last_date = df['date'].max()
    all_hours = pd.date_range(first_date, last_date, freq='H')
    assert len(all_hours) == len(df.date.unique())

    # Assert range (sans char and macro cols which i handle in relevant script)
    assert 0 == df[static_cols].min().min()
    assert 1 == df[static_cols].max().max()
    assert -1 < df.r_ex_tp1.min()
    assert 2 >= df.r_ex_tp1.max()

    # Ensure no missing values
    assert 0 == df.isnull().sum().sum()

    # Sort rows and columns and reset index
    df = df.sort_values(by=['date', 'asset'], ignore_index=True)

    # Ensure no duplicates by date and asset
    assert not df.duplicated(subset=['date', 'asset']).any()

    # Convert all float64 columns to float32
    for col in df.columns:
        if df[col].dtype == np.float64:
            df[col] = df[col].astype(np.float32)

    # Break training and test data
    train_df = df[~((df.date.dt.year == 2022) & (df.date.dt.month >= 7))].copy()
    test_df = df[((df.date.dt.year == 2022) & (df.date.dt.month >= 7))].copy()

    return train_df, test_df


In [18]:
if __name__ == "__main__":
    # set args
    ASSET_IN_FP         = '../data/clean/asset_universe_dict.pickle'
    PANEL_IN_FP         = '../data/clean/panel.pkl'
    WEEKLY_PANEL_OUT_FP  = '../data/clean/panel_weekly.pkl' 
    HOURLY_PANEL_TRAIN_OUT_FP = '../data/clean/panel_train.pkl'
    HOURLY_PANEL_TEST_OUT_FP = '../data/clean/panel_test.pkl'

    # import
    with open(ASSET_IN_FP, "rb") as f:
        asset_universe_dict = pickle.load(f)
    df = pd.read_pickle(PANEL_IN_FP)

    # drop unncessary col
    df = df.drop(columns=['r_ex_tp24'], axis=1)

    # form rhs covars
    char_df, macro_df = formRHS(df)
    panel_df = char_df.merge(
        df[['date', 'asset', 'r_ex_tp1', 'r_ex_tp168']], on=['date', 'asset'], how='right', validate='one_to_one')
    panel_df = panel_df.merge(macro_df, on='date', how='left', validate='many_to_one')
    del char_df, macro_df, df
    gc.collect()

    # Form weekly panel
    weekly_df = formWeeklyPanel(panel_df, asset_universe_dict)
    weekly_df.to_pickle(WEEKLY_PANEL_OUT_FP)

    # Form hourly panels
    train_df, test_df = formHourlyPanels(panel_df)
    train_df.to_pickle(HOURLY_PANEL_TRAIN_OUT_FP)
    test_df.to_pickle(HOURLY_PANEL_TEST_OUT_FP)


  for _, row in df[industry_columns].sum(axis=1).iteritems():
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  weekly_df = weekly_df.groupby('asset').apply(lambda group: group.fillna(method='ffill'))
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby('asset').apply(lambda group: group.fillna(method='ffill'))
