## COGS 108 - Final Project

## Part 0: Setup

In [None]:
# Imports
import pandas as pd
import numpy as np

## Part 1: Data Cleaning

## 1a) Condense dat files: block hash, transaction overview
Drop the irrelevant columns from bh.dat and tx.dat, then write the resulting dataframes back to bh.dat and tx.dat (overwriting the original files).

Result:

bh.dat: 

| Block ID | Block Timestamp |

tx.dat: 

|Transaction ID | Block ID |

In [None]:
# locations of the block-hash and transaction-overview input files
bh_filepath, tx_filepath = 'data/bh.dat', 'data/tx.dat'

In [None]:
# read data
# The column names are assigned by hand right after loading, so the files are
# treated as header-less here (assumes the raw .dat files contain data rows
# only -- confirm against the dataset). Without header=None pandas would use
# the first record as column labels and that record would be silently lost
# once the labels are overwritten.
df_block_hash = pd.read_csv(bh_filepath, sep='\t', header=None)
df_transaction = pd.read_csv(tx_filepath, sep='\t', header=None)

In [None]:
# label the columns of both frames; labels are assigned positionally
for frame, labels in (
    (df_block_hash, ['block ID', 'hash', 'timestamp', 'number of transactions']),
    (df_transaction, ['transaction ID', 'block ID', 'input count', 'output count']),
):
    frame.columns = labels

In [None]:
# discard, in place, the columns that later steps do not use
df_block_hash.drop(columns=['hash', 'number of transactions'], inplace=True)
df_transaction.drop(columns=['input count', 'output count'], inplace=True)

In [None]:
# sanity check: show the first rows of each condensed frame
for frame in (df_block_hash, df_transaction):
    print(frame.head())

In [None]:
# persist the condensed block-hash frame (tab-separated, without the index)
# NOTE: 'data/bh.dat' is also the path the data was read from, so the
# original file is overwritten by this cell.
bh_drop_filepath = 'data/bh.dat'
df_block_hash.to_csv(
    bh_drop_filepath,
    sep='\t',
    index=False,
    columns=['block ID', 'timestamp'],
)

In [None]:
# persist the condensed transaction frame (tab-separated, without the index)
# NOTE: 'data/tx.dat' is also the path the data was read from, so the
# original file is overwritten by this cell.
tx_drop_filepath = 'data/tx.dat'
df_transaction.to_csv(
    tx_drop_filepath,
    sep='\t',
    index=False,
    columns=['transaction ID', 'block ID'],
)

## 1b) Condense dat files: transaction input
Split txin.dat into smaller dat files; drop the irrelevant columns; store the resulting dataframes in txin_1.dat through txin_11.dat.

Result:

txin_1.dat - txin_11.dat: 

| Transaction ID | Address ID |

In [None]:
# split transaction_input file
!split -l 70000000 data/txin.dat t

In [None]:
# number of txin chunks and the paths they are read from / written to
num_txin_file = 11

# chunk files carry the suffixes aa, ab, ..., ak
txin_input_filepath = ['txin/ta' + suffix for suffix in 'abcdefghijk']
# condensed files are numbered 1 .. num_txin_file
txin_output_filepath = ['txin/txin_{}.dat'.format(idx) for idx in range(1, num_txin_file + 1)]

In [None]:
# Condense every transaction_input chunk down to (transaction ID, address ID).
#
# `split` cuts the file purely by line count, so a chunk cannot be relied on
# to begin with a header row. header=None together with names=... keeps the
# first record of each chunk as data; letting pandas infer a header turned
# that record into column labels, which were then overwritten, so one row per
# chunk was silently lost.
# (Assumes data/txin.dat itself has no header line -- confirm.)
#
# usecols restricts parsing to the two columns that are kept, so the four
# discarded columns are never loaded into memory.
txin_columns = ['transaction ID', 'input sequence', 'previous transaction ID',
                'previous output sequence', 'address ID', 'sum']
txin_kept_columns = ['transaction ID', 'address ID']

for curr_input_filepath, curr_output_filepath in zip(txin_input_filepath, txin_output_filepath):
    df_txin = pd.read_csv(curr_input_filepath, sep='\t', header=None,
                          names=txin_columns, usecols=txin_kept_columns)

    # columns= pins the column order of the written file
    df_txin.to_csv(path_or_buf=curr_output_filepath, sep='\t', index=False,
                   columns=txin_kept_columns)

## 1c) Condense dat files: transaction output
Split txout.dat into smaller dat files; drop the irrelevant columns; store the resulting dataframes in txout_1.dat through txout_12.dat.

Result:

txout_1.dat - txout_12.dat: 

| Transaction ID | Address ID |

In [None]:
# split transaction_output file
!split -l 70000000 data/txout.dat m

In [None]:
# number of txout chunks and the paths they are read from / written to
num_txout_file = 12

# chunk files carry the suffixes aa, ab, ..., al
txout_input_filepath = ['txout/ma' + suffix for suffix in 'abcdefghijkl']
# condensed files are numbered 1 .. num_txout_file
txout_output_filepath = ['txout/txout_{}.dat'.format(idx) for idx in range(1, num_txout_file + 1)]

In [None]:
# Condense every transaction_output chunk down to (transaction ID, address ID).
#
# `split` cuts the file purely by line count, so a chunk cannot be relied on
# to begin with a header row. header=None together with names=... keeps the
# first record of each chunk as data; letting pandas infer a header turned
# that record into column labels, which were then overwritten, so one row per
# chunk was silently lost.
# (Assumes data/txout.dat itself has no header line -- confirm.)
#
# usecols restricts parsing to the two columns that are kept, so the two
# discarded columns are never loaded into memory.
txout_columns = ['transaction ID', 'output sequence', 'address ID', 'sum']
txout_kept_columns = ['transaction ID', 'address ID']

for curr_input_filepath, curr_output_filepath in zip(txout_input_filepath, txout_output_filepath):
    df_txout = pd.read_csv(curr_input_filepath, sep='\t', header=None,
                           names=txout_columns, usecols=txout_kept_columns)

    # columns= pins the column order of the written file
    df_txout.to_csv(path_or_buf=curr_output_filepath, sep='\t', index=False,
                    columns=txout_kept_columns)

## 1d) Convert UNIX time to standard time
Implement a function that converts UNIX time to standard time; add "Month" and "Year" columns to bh.dat.

Before: 

| Block ID | Block Timestamp |

After: 

| Block ID | Block Timestamp | Month | Year |

## 1e) Re-split tx_in and tx_out dat files into groups of 2-year time periods


## Part 2: Sample Datasets
Data files at this point:

block hash: bh.dat

transaction overview: tx.dat

transaction input: txin_2010.dat, txin_2012.dat, txin_2014.dat, txin_2016.dat

transaction output: txout_2010.dat, txout_2012.dat, txout_2014.dat, txout_2016.dat

## 2a) Load the Data
Load data above into dataframes.

In [None]:
# Code here

## 2b) Initialize the Dataframe
Initialize the following dataframe:

| Address ID | Year | NumTX |

Address ID: Bitcoin account identifier

Year: Year in which the Address ID has its first transaction

NumTX: Total number of transactions belonging to that Bitcoin Address ID

In [None]:
# Code here

## 2c) Sample Address ID

Sample 100 address IDs from each transaction input/output dat file.

Update the following columns in the dataframe:

| Address ID | Year |

In [1]:
# Code here

## 2d) Accumulate Number of Transactions

Iterate through all the transaction inputs and outputs; accumulate the numbers of transactions for each sample.

Update the following columns in the dataframe:

| NumTX |

In [None]:
# Code here

## Part 3: Normal Distribution and T-test
Apply Normal Distributions and T-tests to samples from the following 2 groups:

Group 1:

Year in 2010 or 2011

Group 2:

Year in 2016 or 2017

## 3a) Mean number of transactions

## 3d) Normal Distribution Test

## Part 4: Analysis of Variance/ANOVA

## Part 5: Linear Model

## Part 6: Further Analysis