In [None]:
# Imports
import pandas as pd
import numpy as np

## Handle block_hash and transaction_overview

In [None]:
# Locations of the raw input dumps: block-hash table and transaction-overview table
bh_filepath = "data/bh.dat"
tx_filepath = "data/tx.dat"

In [None]:
# Read the raw tab-separated dumps into DataFrames.
# header=None: the files are treated as headerless (column names are assigned
# by hand afterwards), so the first record is kept as data instead of being
# consumed as a header row and then overwritten.
# NOTE(review): assumes bh.dat / tx.dat really have no header line -- confirm.
df_block_hash = pd.read_csv(bh_filepath, sep='\t', header=None)
df_transaction = pd.read_csv(tx_filepath, sep='\t', header=None)

In [None]:
# Give both frames descriptive column labels (four columns each)
block_hash_columns = ['block ID', 'hash', 'timestamp', 'number of transactions']
transaction_columns = ['transaction ID', 'block ID', 'input count', 'output count']

df_block_hash.columns = block_hash_columns
df_transaction.columns = transaction_columns

In [None]:
# Remove, in place, the columns that are not kept in the simplified tables:
# block hash keeps (block ID, timestamp); transaction keeps (transaction ID, block ID)
df_block_hash.drop(columns=['hash', 'number of transactions'], inplace=True)
df_transaction.drop(columns=['input count', 'output count'], inplace=True)

In [None]:
# Preview the first rows of each frame to confirm the drop worked
for trimmed_frame in (df_block_hash, df_transaction):
    print(trimmed_frame.head())

In [None]:
# Persist the trimmed block-hash frame as a tab-separated file (no index column)
bh_drop_filepath = 'data/bh_simplified.dat'
df_block_hash.to_csv(
    bh_drop_filepath,
    sep='\t',
    index=False,
    columns=['block ID', 'timestamp'],
)

In [None]:
# Persist the trimmed transaction frame as a tab-separated file (no index column)
tx_drop_filepath = 'data/tx_simplified.dat'
df_transaction.to_csv(
    tx_drop_filepath,
    sep='\t',
    index=False,
    columns=['transaction ID', 'block ID'],
)

## Handle transaction_input

In [None]:
# split transaction_input file
!split -l 70000000 data/txin.dat t

In [None]:
# Number of transaction_input chunk files to process
num_txin_file = 11

# Chunk files named the way `split` names them: txin/taa ... txin/tak.
# The single-letter suffix arithmetic only holds for up to 26 chunks.
txin_input_filepath = ['txin/ta' + chr(ord('a') + k) for k in range(num_txin_file)]

# Trimmed outputs, numbered from 1: txin/txin_1.dat ... txin/txin_11.dat
txin_output_filepath = ['txin/txin_{}.dat'.format(k + 1) for k in range(num_txin_file)]

In [None]:
# Reduce every transaction_input chunk to its (transaction ID, address ID) columns
# and write the result to the matching output file.
txin_columns = ['transaction ID', 'input sequence', 'previous transaction ID',
                'previous output sequence', 'address ID', 'sum']
txin_kept_columns = ['transaction ID', 'address ID']

for x in range(num_txin_file):
    curr_input_filepath = txin_input_filepath[x]
    curr_output_filepath = txin_output_filepath[x]

    # header=None: chunks produced by `split` carry no header line (every chunk
    # after the first certainly has none), so without it the first record of
    # each chunk would be consumed as column names and lost.
    # NOTE(review): assumes the original txin.dat is headerless too -- confirm.
    # usecols: only the two columns that are written back out are loaded,
    # which replaces the explicit drop of the other four.
    df_txin = pd.read_csv(curr_input_filepath, sep='\t', header=None,
                          names=txin_columns, usecols=txin_kept_columns)

    df_txin.to_csv(path_or_buf=curr_output_filepath, sep='\t', index=False,
                   columns=txin_kept_columns)

## Handle transaction_output

In [None]:
# split transaction_output file
!split -l 70000000 data/txout.dat m

In [None]:
# Number of transaction_output chunk files to process
num_txout_file = 12

# Chunk files named the way `split` names them: txout/maa ... txout/mal.
# The single-letter suffix arithmetic only holds for up to 26 chunks.
txout_input_filepath = ['txout/ma' + chr(ord('a') + k) for k in range(num_txout_file)]

# Trimmed outputs, numbered from 1: txout/txout_1.dat ... txout/txout_12.dat
txout_output_filepath = ['txout/txout_{}.dat'.format(k + 1) for k in range(num_txout_file)]

In [None]:
# Reduce every transaction_output chunk to its (transaction ID, address ID) columns
# and write the result to the matching output file.
txout_columns = ['transaction ID', 'output sequence', 'address ID', 'sum']
txout_kept_columns = ['transaction ID', 'address ID']

for x in range(num_txout_file):
    curr_input_filepath = txout_input_filepath[x]
    curr_output_filepath = txout_output_filepath[x]

    # header=None: chunks produced by `split` carry no header line (every chunk
    # after the first certainly has none), so without it the first record of
    # each chunk would be consumed as column names and lost.
    # NOTE(review): assumes the original txout.dat is headerless too -- confirm.
    # usecols: only the two columns that are written back out are loaded,
    # which replaces the explicit drop of the other two.
    df_txout = pd.read_csv(curr_input_filepath, sep='\t', header=None,
                           names=txout_columns, usecols=txout_kept_columns)

    df_txout.to_csv(path_or_buf=curr_output_filepath, sep='\t', index=False,
                    columns=txout_kept_columns)