In [5]:
import pandas as pd
import numpy as np
import datetime

In [6]:
def createPanelFeatures(panel_df: pd.DataFrame) -> pd.DataFrame:
    ''' Form all features at panel level.

    Every lagged, rolling, and cumulative statistic is computed within each asset 
    (grouped on the `asset` column) using the existing row order; lags and windows 
    are counted in rows. The passed frame is not modified.

    Args:
        panel_df (pd.DataFrame): raw minute-level panel of BTC and ETH: usd per token prices, 
                                 token volume, and trade count. Reads the columns `asset`,
                                 `price`, `volume`, and `trades`; any other column is 
                                 carried through untouched.
                                 NOTE(review): rows are presumably already in chronological
                                 order within each asset; nothing here sorts them.

    Returns:
        panel_df (pd.DataFrame): panel data with feature columns added and the raw columns 
                                 renamed to `covar_p_t`, `covar_volume_t`, `covar_trades_t`.
    '''
    # work on a copy so the caller's raw panel is left as it was passed in
    panel_df = panel_df.copy()

    # statistic label -> function of (one asset's series, window length in rows)
    stat_funcs = {
        'ma':   lambda x, t: x.rolling(t).mean(),
        'ema':  lambda x, t: x.ewm(span=t, adjust=False).mean(),
        'sum':  lambda x, t: x.rolling(t).sum(),
        'min':  lambda x, t: x.rolling(t).min(),
        'max':  lambda x, t: x.rolling(t).max(),
        'vol':  lambda x, t: x.rolling(t).std(),
        'skew': lambda x, t: x.rolling(t).skew(),
        'kurt': lambda x, t: x.rolling(t).kurt(),
    }

    def add_windowed_stats(frame, source_col, stem, windows, labels):
        # add one column per (window, statistic) named <stem>_<label>_tm<window>;
        # the group-by is built once per source column and reused for every statistic
        grouped = frame.groupby('asset')[source_col]
        for t in windows:
            for label in labels:
                stat = stat_funcs[label]
                frame[stem+'_'+label+'_tm'+str(t)] = grouped.transform(lambda x: stat(x, t))

    # simple returns
    price_by_asset = panel_df.groupby('asset')['price']
    for t in [1, 2, 3, 4, 5, 10, 15, 20, 30, 60, 120, 360, 365, 720, 
              1440, 4320, 8640, 20160, 40320, 86400]:
        panel_df['covar_r_tm'+str(t)] = price_by_asset.pct_change(periods=t)

    # moments of 1 minute returns
    add_windowed_stats(panel_df, 'covar_r_tm1', 'covar_r_1min',
                       [5, 10, 20, 30, 60, 360, 720, 4320, 8640, 20160],
                       ['ma', 'ema', 'vol', 'skew', 'kurt'])

    # moments of 5 minute returns
    add_windowed_stats(panel_df, 'covar_r_tm5', 'covar_r_5min',
                       [30, 60, 720, 4320, 8640, 20160],
                       ['ma', 'min', 'max', 'vol', 'skew', 'kurt'])

    # form price variables
    panel_df = panel_df.rename(columns = {'price': 'covar_p_t'})
    panel_df['covar_p_log_t'] = np.log(panel_df.covar_p_t)

    # current volume
    panel_df = panel_df.rename(columns = {'volume': 'covar_volume_t',
                                          'trades': 'covar_trades_t'})

    # form functions of volume
    # NOTE(review): the source column already starts with 'covar_', so these features are 
    # named 'covar_covar_volume_t_...'; the doubled prefix is kept so existing column names 
    # do not change.
    for col in ['covar_volume_t', 'covar_trades_t']:
        add_windowed_stats(panel_df, col, 'covar_'+col,
                           [5, 10, 20, 30, 60, 360, 720, 4320, 8640, 20160],
                           ['ma', 'sum', 'min', 'max', 'vol'])

    # form returns from cum max and min prices; the running extremes are taken per asset 
    # so that one asset's price level never becomes the other asset's reference price
    p_by_asset = panel_df.groupby('asset')['covar_p_t']
    running_max = p_by_asset.cummax()
    running_min = p_by_asset.cummin()
    panel_df['covar_r_cummax_t'] = (panel_df.covar_p_t - running_max) / running_max
    panel_df['covar_r_cummin_t'] = (panel_df.covar_p_t - running_min) / running_min

    return panel_df

In [7]:
def renameColumnsWithAssetName(temp_df: pd.DataFrame, asset: str) -> pd.DataFrame:
    ''' Helper function to insert the asset name right after the 'covar_' prefix of 
        every column except 'date' (e.g. 'covar_p_t' becomes 'covar_btc_p_t').

    Args:
        temp_df (pd.DataFrame): frame with a 'date' column; all other columns must start 
                                with 'covar_' (checked with an assert).
        asset (str): asset abbreviation to include in column names.
    
    Returns:
        temp_df (pd.DataFrame): renamed frame; the passed frame is not modified.
    '''
    prefix = 'covar_'

    # every column but the date is a feature to rename
    feature_names = list(temp_df.columns.values)
    feature_names.remove('date')

    # map old name -> new name, checking the prefix on the way
    name_map = {}
    for old_name in feature_names:
        assert(old_name[:len(prefix)] == prefix)
        name_map[old_name] = prefix + asset + '_' + old_name[len(prefix):]

    return temp_df.rename(columns=name_map)

In [8]:
def collapseToTimeBars(panel_df: pd.DataFrame) -> pd.DataFrame:
    ''' Collapse panel to time bars, ensuring no missing dates.

    The BTC and ETH rows are split apart, their feature columns are renamed to carry 
    the asset name, and the two frames are inner-joined on `date`, giving one row per 
    date with both assets side by side.
    
    Args: 
        panel_df (pd.DataFrame): panel data with features and without raw columns.
                                 Needs the columns `asset` (values 'btc' / 'eth') and 
                                 `date`; all other columns must start with 'covar_'.

    Returns:
        df (pd.DataFrame): time bar level data with RHS features.

    Raises:
        AssertionError: if the number of merged rows differs from the number of 
                        one-minute bars between the first and last date.
        Whatever pandas raises when the `validate='one_to_one'` merge check fails, 
        i.e. when a date repeats within an asset.
    '''
    # form separate dataframes per asset, without the asset column and with the 
    # asset name moved into the column names
    asset_frames = {}
    for asset in ['btc', 'eth']:
        asset_df = panel_df[panel_df.asset == asset].drop('asset', axis=1)
        asset_frames[asset] = renameColumnsWithAssetName(asset_df, asset)

    # merge
    df = asset_frames['btc'].merge(asset_frames['eth'],
                                   on=['date'],
                                   how='inner',
                                   validate='one_to_one')

    # ensure no missing time bars; Timedelta arithmetic keeps the check independent of 
    # the resolution (ns, us, s, ...) the dates happen to be stored in
    span = df['date'].max() - df['date'].min()
    number_of_bars = 1 + span / pd.Timedelta(minutes=1) # minutes spanned plus one bar
    if df.shape[0] != number_of_bars:
        raise AssertionError('time bars have '+str(df.shape[0])+' rows but the date range '
                             'implies '+str(number_of_bars)+' one-minute bars.')
    
    return df

In [9]:
def createLHSVariables(df: pd.DataFrame) -> pd.DataFrame:
    ''' Create LHS target variables in binary and absolute return difference versions.
        Decided to go with six hour frequency as about 90% of the windows have return difference
        such that if forecasted accurately profit would be above fees.
        Window is from every six hours (starting midnight) plus five minutes (to give time to 
        pull data and predict) to the subsequent six hours plus 10 minutes (to repeat the process 
        and give time to place/update trades).

        The target at row t is the 365-row trailing return observed 370 rows later, i.e. 
        the return from row t+5 to row t+370. The shift is by row position, so it is only 
        a time shift if rows are consecutive bars in chronological order.
    
    Args:
        df (pd.DataFrame): time bar level data with only RHS features. Reads the columns
                           `covar_btc_r_tm365` and `covar_eth_r_tm365`. Not modified.
    
    Returns:
        df (pd.DataFrame): time bar level data with LHS and RHS features: a new frame with 
                           `y` and `y_btc_eth_diff_r_tp5_tp370` appended. Both are NaN 
                           where either forward return is missing (e.g. the last 370 rows).
    '''
    # btc and eth returns over the target window, held as local series so that no 
    # temporary columns are written into the caller's frame
    btc_r_tp5_tp370 = df['covar_btc_r_tm365'].shift(-370)
    eth_r_tp5_tp370 = df['covar_eth_r_tm365'].shift(-370)

    # form binary LHS outcome y where 1 if BTC outperforms (ties included) and 0 otherwise;
    # left missing when either return is missing
    both_known = btc_r_tp5_tp370.notna() & eth_r_tp5_tp370.notna()
    y = (btc_r_tp5_tp370 >= eth_r_tp5_tp370).astype(float).where(both_known)

    # form real valued LHS outcome for return difference
    r_diff = btc_r_tp5_tp370 - eth_r_tp5_tp370

    return df.assign(y=y, y_btc_eth_diff_r_tp5_tp370=r_diff)

In [10]:
def createRSIFeatures(df: pd.DataFrame) -> pd.DataFrame:
    ''' Form relative strength index features from 12-row price changes over various windows.

    For each asset the price change over 12 rows is stored as `covar_<asset>_p_delta_tm12`,
    and for each window (in rows) the RSI, 100 - 100/(1 + average gain / average loss), 
    is stored as `covar_<asset>_rsi_tm<window>`. Averages are simple rolling means that 
    start as soon as one price change is available. The RSI is NaN where the price 
    change is not yet available or where the window holds neither gains nor losses.

    NOTE(review): the original notes describe the price change as "one hour", but 12 rows 
    is one hour only on five-minute bars; on one-minute bars it is 12 minutes. Confirm 
    the intended lag before relying on the "1hr" reading.

    Args: 
        df (pd.DataFrame): time bar level data with necessary RHS features. Reads the 
                           columns `covar_btc_p_t` and `covar_eth_p_t`. Not modified.
    
    Returns:
        df (pd.DataFrame): copy of the time bar level data with RSI features added.    

    FUTURE TODO:
        -convert code to work at panel level instead of time bar level
        -pass in freq to work at, noting units
        -pass in windows to create, noting units
        -make generic function in my relevant class
    '''
    # one copy up front so the caller's frame is left as it was passed in
    df = df.copy()

    for asset in ['btc', 'eth']:
        # the price change and its split into falls and rises do not depend on the 
        # window, so they are formed once per asset
        p_delta_col = 'covar_'+asset+'_p_delta_tm12'
        df[p_delta_col] = df['covar_'+asset+'_p_t'].diff(periods=12)
        p_delta = df[p_delta_col]

        # both pieces are <= 0: falls as they are, rises with their sign flipped; 
        # the two minus signs cancel in the ratio below, leaving avg gain / avg loss
        falls = p_delta.clip(upper=0)
        negated_rises = -1*p_delta.clip(lower=0)

        for window in [360, 720, 4320, 8640, 20160]:
            avg_fall = falls.rolling(window, min_periods=1).mean()
            avg_negated_rise = negated_rises.rolling(window, min_periods=1).mean()
            df['covar_'+asset+'_rsi_tm'+str(window)] = 100 - 100/(1 + avg_negated_rise/avg_fall)
    
    return df

In [11]:
def createFeaturesLHSAndCollapseToTimeBars(panel_df: pd.DataFrame) -> pd.DataFrame:
    ''' Transform raw panel to time bars with clean LHS and RHS features.

    Runs the panel feature step, collapses to one row per date, adds the LHS targets 
    and the RSI features, then keeps only the bars stamped exactly 00:00, 06:00, 
    12:00, or 18:00.
    
    Args:
        panel_df (pd.DataFrame): raw minute-level panel of BTC and ETH:
                                 usd per token prices, token volume, and trade count.
    
    Returns:
        df (pd.DataFrame): time bars of LHS and RHS features, with a fresh 0..n-1 index.
    '''
    # panel features -> time bars -> LHS targets
    bars = createLHSVariables(collapseToTimeBars(createPanelFeatures(panel_df)))

    # rsi features are formed on the time bar data
    bars = createRSIFeatures(bars)

    # keep the bars that fall exactly on a six hour mark
    stamp = bars.date.dt
    on_six_hour_mark = stamp.hour.isin([0,6,12,18]) & (stamp.minute==0)

    return bars[on_six_hour_mark].reset_index(drop=True)

In [31]:
def finalClean(df: pd.DataFrame) -> pd.DataFrame:
    ''' Final checks and clean of the time bar data.

    Steps, in order: sort rows by date; fill gaps in skew, kurt, and rsi columns with 
    the trailing 16-row mean; drop rows before 2016 and in June 2022; put `date`, `y`, 
    `y_btc_eth_diff_r_tp5_tp370` first and the remaining columns in sorted order; check 
    the rows form an unbroken run of six-hour bars; drop every column that still has a 
    missing value and print how many were dropped.

    Args:
        df (pd.DataFrame): time bar level data with LHS and RHS features, one row per 
                           six-hour bar. Not modified.

    Returns:
        df (pd.DataFrame): clean time bar level data with sorted columns and rows.    

    Raises:
        AssertionError: if the number of rows differs from the number of six-hour bars 
                        between the first and last remaining date.
        ValueError: if `date`, `y`, or `y_btc_eth_diff_r_tp5_tp370` is not a column.
    '''
    # sort first so the trailing mean below really looks backwards in time; sorting 
    # returns a new frame, which also leaves the caller's frame as it was passed in
    df = df.sort_values(by='date', ignore_index=True)

    # interpolate missing values in skew, kurt, and rsi columns with trailing four day mean
    periods = 16 # four days of six-hour bars
    interpol_cols = [col for col in df.columns 
                     if ('rsi' in col) or ('kurt' in col) or ('skew' in col)]
    for col in interpol_cols:
        trailing_mean = df[col].rolling(periods, min_periods=1).mean()
        df[col] = df[col].fillna(trailing_mean)

    # drop rows pre 2016 and rows in June 2022
    # NOTE(review): the original comment said "post June 2022", but only June 2022 itself 
    # is removed; rows from July 2022 on would be kept (and would then trip the bar-count 
    # check below). Confirm whether "from June 2022 onwards" was intended.
    year = df.date.dt.year
    month = df.date.dt.month
    df = df[(year >= 2016) & ~((year == 2022) & (month == 6))].reset_index(drop=True)

    # order columns
    lead_cols = ['date', 'y', 'y_btc_eth_diff_r_tp5_tp370']
    other_cols = list(df.columns.values)
    for col in lead_cols:
        other_cols.remove(col)
    df = df[lead_cols+sorted(other_cols)]

    # ensure there are the correct number of rows; Timedelta arithmetic keeps the check 
    # independent of the resolution (ns, us, s, ...) the dates happen to be stored in
    span = df['date'].max() - df['date'].min()
    number_of_bars = 1 + span / pd.Timedelta(hours=6) # quarter days spanned plus one bar
    if df.shape[0] != number_of_bars:
        raise AssertionError('time bars have '+str(df.shape[0])+' rows but the date range '
                             'implies '+str(number_of_bars)+' six-hour bars.')

    # drop columns missing any data
    num_cols_pre = df.shape[1]
    df = df.dropna(axis=1)
    print('dropped '+str(int(num_cols_pre-df.shape[1]))+' columns that were still missing data.')

    # ensure no missing data
    assert(0==df.isnull().sum().sum()),('there is missing data to be fixed in the time bar data.')

    return df

In [None]:
def calcCorrelationStatistics(df: pd.DataFrame, fp: str) -> None:
    ''' Placeholder for the correlation statistics step; not implemented yet.

    The current body ignores both arguments and only prints 1.

    Args:
        df (pd.DataFrame): not used by the current body.
        fp (str): not used by the current body.
    '''
    print(1)

In [3]:
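# TODO: compute correlation, *Spearman rank* correlation, mutual information (MI), consistency,
# and independence statistics, both by year and overall, as before.
# Then choose covariates, and be generous about what is kept here.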

In [None]:
# TODO scatterplot of each RHS with LHS to just get a feel of stuff

In [None]:
# TODO 
# (not sure what was originally meant by this, but:)
# confirm the signal with a low-dimensional linear regression
# maybe run lasso on the final feature set to select a subset, so the problem is lower dimensional?
# then start with plain linear regression to ensure it works on this low-dimensional set

In [None]:
# TODO final clean:
# save the final panel in the clean data folder as the full panel; ensure no missing values, all variables have a proper range, set column order, set column names, reset index, save as pickle

In [None]:
if __name__ == "__main__":
    # set args: input panel pickle and output path for the feature engineering statistics
    in_fp       = '../1-data/clean/panel_btceth_1min.pkl'
    fe_stats_fp = '../3-output/feat_eng_stats.csv'  # only referenced by the commented-out call below

    # read in data
    # NOTE(review): unpickling can run arbitrary code; only load pickle files from a trusted source.
    panel_df = pd.read_pickle(in_fp)

    # engineer features: raw panel -> six-hour time bars with LHS and RHS columns
    df = createFeaturesLHSAndCollapseToTimeBars(panel_df)
    
    # ensure time bar data is clean
    df = finalClean(df)

    # calculate correlation statistics (disabled: the function is still a placeholder)
    #calcCorrelationStatistics(df, fe_stats_fp)

    # step 3 (nothing implemented here yet)

In [None]:
# TODO: once done with this, turn the notebook into a Python script.
# TODO: move any functions that may be used elsewhere to a common folder shared across projects, and import them from there when needed.