In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pathlib
import datetime  
from time import process_time

In [2]:
def wrangle(df, block_height):
    '''This function takes in a snapshot of the mempool (pool of transactions) and the block_height'''
    
    df.drop('Unnamed: 0', inplace=True, axis=1)
            
    # fee rate [sats/vB] = transaction fee [satoshis] / weight of transaction [virtual bytes]
    df['sat/vB'] = (df.fee / df.weight) * 100000000 
    
    df['goal_block'] = block_height # The next block for this mempool snapshot
    
    df['blocks_in_pool'] = df['goal_block'] - df['height'] # The next block - height the block entered the mempool
    
    df.drop(['bip125'], inplace=True, axis=1)
        
    return df

In [3]:
# The placeholder rate is used as a filler when there are no confirmed transactions for the next_block_fees.
# It is updated every block that it is not used.
placeholder_rate = 1.0

# This is used to determine time between blocks
previous_block_time = datetime.datetime.now()

def make_block_data(df, block_height, time):
    global placeholder_rate
    global previous_block_time
    block_df = {}
    
    # Get categorical mean fees
    next_pool_blocks = df[(df['blocks_in_pool'] == 1)]
    
    hour_pool_blocks = df[(df['blocks_in_pool'] >= 2) &
                               (df['blocks_in_pool'] < 7)]
    six_hour_pool_blocks = df[(df['blocks_in_pool'] >= 7) &
                               (df['blocks_in_pool'] < 37)]
    day_pool_blocks = df[(df['blocks_in_pool'] >= 37) &
                               (df['blocks_in_pool'] < 145)]
    
    block_df['next_block_mean_fee'] = next_pool_blocks['fee'].mean() * 100000000
    block_df['hour_mean_fee'] = hour_pool_blocks['fee'].mean() * 100000000
    block_df['six_hour_mean_fee'] = six_hour_pool_blocks['fee'].mean() * 100000000
    block_df['day_mean_fee'] = day_pool_blocks['fee'].mean() * 100000000
    
    # Set time between blocks
    block_df['time_btwn_blocks'] = time.timestamp() - previous_block_time.timestamp()
    previous_block_time = time
    
    # Get confirmed transactions to create target data
    confirmed = df[df['confirmed']==True]
    
    # SETTING TARGETS
    # Here I try to form the buckets for categorizing feerates,
    # and set a feerate for that bucket from confirmed transaction data
    
    next_block_fees = confirmed[confirmed['blocks_in_pool'] == 1]
    
    hour_block_fees = confirmed[(confirmed['blocks_in_pool'] >= 2) &
                               (confirmed['blocks_in_pool'] < 7)]
    
    six_hour_block_fees = confirmed[(confirmed['blocks_in_pool'] >= 7) &
                                   (confirmed['blocks_in_pool'] < 37)]
    
    day_block_fees = confirmed[(confirmed['blocks_in_pool'] >= 37) &
                               (confirmed['blocks_in_pool'] < 145)]
    
    # If there are no fees for a category, make a default, otherwise get the median
    if (len(next_block_fees) == 0):
        block_df['next_block_sat/vB'] = placeholder_rate
        block_df['next_block_mean_fee'] = 1.0
    else:
        placeholder_rate = next_block_fees['sat/vB'].median()
        block_df['next_block_sat/vB'] = placeholder_rate
    
    if (len(hour_block_fees) == 0):
        block_df['hour_block_sat/vB'] = 1.0
        block_df['hour_mean_fee'] = 1.0
    else:
        block_df['hour_block_sat/vB'] = hour_block_fees['sat/vB'].median()
    
    if (len(six_hour_block_fees) == 0):
        block_df['six_hour_block_sat/vB'] = 1.0
        block_df['six_hour_mean_fee'] = 1.0
    else:
        block_df['six_hour_block_sat/vB'] = six_hour_block_fees['sat/vB'].median()
    
    if (len(day_block_fees) == 0):
        block_df['day_block_sat/vB'] = 1.0
        block_df['day_mean_fee'] = 1.0
    else:
        block_df['day_block_sat/vB'] = day_block_fees['sat/vB'].median()
        
    # Mempool feature: Mean Fee
    block_df['mempool_mean_fee'] = df['fee'].mean()
    
    # Mempool feature: Mean weight
    block_df['mempool_mean_vBytes'] = ((df['weight'] / 4) / len(block_df)).mean()
    
    block_df['month'] = time.month
    block_df['day'] = time.day
    block_df['hour'] = time.hour
    block_df['minute'] = time.minute
    
    return pd.DataFrame(block_df, index=[block_height]).dropna()

In [4]:
blocks_df = pd.DataFrame()

# I have a csv file for each snapshot of the mempool, for each block.
# I simply collect and concatenate them.    

for i in range(3480): #3480 max
    
    block_height = i + 661074 # Initial block for my samples is 661074
    path = 'pool_data/mem_blk_{0}.csv'.format(block_height)
    
    try:
        df = pd.read_csv(path, index_col='txid')
    except:
        continue
    
    # get block time from file creation time
    fname = pathlib.Path(path)
    assert fname.exists(), f'No such file: {fname}'  # check that the file exists
    mtime = datetime.datetime.fromtimestamp(fname.stat().st_mtime)
            
    # Try to get features from transactions
    df = wrangle(df, block_height)
    
    # Create Block Features
    new_block_df = make_block_data(df, block_height, mtime)
    
    # Add new block data to set
    blocks_df = pd.concat([blocks_df, new_block_df])
    

Elapsed time: 95.178043593 1.581831806
Elapsed time during the whole program in seconds: 93.596211787


In [5]:
blocks_df = blocks_df.iloc[1:] # drop first row due to it being an outlier for time_btwn_blocks
blocks_df.head(20)

Unnamed: 0,next_block_mean_fee,hour_mean_fee,six_hour_mean_fee,day_mean_fee,time_btwn_blocks,next_block_sat/vB,hour_block_sat/vB,six_hour_block_sat/vB,day_block_sat/vB,mempool_mean_fee,mempool_mean_vBytes,month,day,hour,minute
661075,18266.115909,10617.828217,1.0,1.0,210.466294,15.82,14.538874,1.0,1.0,8.7e-05,108.348717,12,12,14,40
661076,19331.651644,7964.942768,3514.957597,1.0,500.328237,15.449123,11.279976,12.1854,1.0,7.6e-05,107.846291,12,12,14,48
661077,20859.861588,8049.703946,5439.564061,1.0,275.941552,15.33485,8.08377,8.245048,1.0,6.6e-05,111.770298,12,12,14,52
661078,29385.714286,7310.821814,5441.877662,3511.660161,80.54595,15.448436,5.75,5.25,0.502232,5.2e-05,125.247508,12,12,14,54
661079,1.0,10478.422744,5103.523536,1.0,59.129249,15.448436,4.813616,4.236277,1.0,4.6e-05,149.864093,12,12,14,55
661080,17857.094488,5612.917526,4420.841066,3307.159538,97.43377,12.696335,3.057426,2.850148,0.506696,4.3e-05,168.693578,12,12,14,56
661081,19977.189142,7990.311594,3722.871249,1.0,683.846266,15.0,14.983558,2.25,1.0,7e-05,161.156144,12,12,15,8
661082,16146.949349,4147.437859,3745.621278,3250.749717,639.493657,15.078534,12.923701,1.916168,0.498224,5.9e-05,158.744432,12,12,15,18
661083,18845.397807,2198.131167,3575.229478,1.0,359.306776,15.078534,1.513577,1.517967,1.0,4.9e-05,177.324552,12,12,15,24
661084,15176.378057,1975.138493,3297.192153,1.0,1250.460956,15.201794,15.032308,0.76009,1.0,7.2e-05,138.046806,12,12,15,45
