In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import preprocessing

In [2]:
bitcoin_df = pd.read_csv('BitcoinHeistData.csv')

### Basic data comparison
white = bitcoin_df[bitcoin_df['label'] == 'white']
print(len(white))

non_white = bitcoin_df[bitcoin_df['label'] != 'white']
print(len(non_white))

print("Ratio of white to non-white:", len(non_white)/len(white))

2875284
41413
Ratio of white to non-white: 0.014403098963441524


In [6]:
uni_addresses = bitcoin_df['address'].unique()
print(len(uni_addresses))
uni_addr_dict = {}
for i in range(len(uni_addresses)):
    uni_addr_dict[uni_addresses[i]] = i

2631095


In [8]:
### Split to test and train
train, test = train_test_split(
    bitcoin_df, test_size=0.2, random_state=42)

print(train.shape)
print(test.shape)

(2333357, 10)
(583340, 10)


In [9]:
train_uni_addr = train['address'].unique()
print(len(train_uni_addr))
train_addr_dict = {}
for i in range(len(train_uni_addr)):
    train_addr_dict[train_uni_addr[i]] = i
print(len(train_addr_dict))

test_uni_addr = test['address'].unique()
print(len(test_uni_addr))
test_addr_dict = {}
for i in range(len(test_uni_addr)):
    test_addr_dict[test_uni_addr[i]] = i
print(len(test_addr_dict))



2122026
2122026
553546
553546


In [11]:
#replace the string addresses with assigned numbers rather than drop them
index = train.index
for i in range(len(train['address'])):
    try:
        train.at[index[i],'address'] = int(train_addr_dict[train['address'][index[i]]])
    except KeyError:
         continue

index = test.index
for i in range(len(test['address'])):
    try:
        test.at[index[i],'address'] = int(test_addr_dict[test['address'][index[i]]])
    except KeyError:
         continue


In [12]:
print(train.head())
print(test.head())

        address  year  day  length        weight  count  looped  neighbors  \
2083633       0  2016  227     144  9.128820e-02   2090    2034          2   
1426971       1  2014  301     144  2.205756e-04   2034    1913          2   
2701101       2  2018  115       0  1.000000e+00      1       0          2   
1172117       3  2014   46      14  9.347273e-09      5       0          1   
47537         4  2011   11      46  1.000000e+00      1       0          2   

               income  label  
2083633  1.085223e+09  white  
1426971  1.005948e+08  white  
2701101  2.000000e+08  white  
1172117  4.340717e+08  white  
47537    3.211000e+10  white  
        address  year  day  length    weight  count  looped  neighbors  \
2667698       0  2018   82       8  1.311111      3       0          2   
2013400       1  2016  157       2  0.025000     44       0          1   
2704440       2  2018  118     144  3.619217   5991    5991          2   
1650833       3  2015  160     144  0.000484   34

In [16]:
min_max_scaler = preprocessing.MinMaxScaler()
train_clean = train[(train['label'] == 'white')]

Y_tmp_clean = train_clean['label']
Y_train_clean = Y_tmp_clean.str.contains('white')

X_train_clean = train_clean.drop(columns=['label'])
print("Train Y data:\n",Y_train_clean.head())
print("Train X data:\n",X_train_clean.head())

test_clean = test[(test['label'] == 'white')]

Y__tmp = test_clean['label']
Y_test_clean = Y__tmp.str.contains('white')
X_test_clean = test_clean.drop(columns=['label'])
print("Test Y data: \n",Y_test_clean.head())
print("Test X Data: \n",X_test_clean.head())

X_cln_train_scale = np.array(min_max_scaler.fit_transform(X_train_clean))
X_cln_train_scale

X_cln_test_scale = np.array(min_max_scaler.fit_transform(X_test_clean))
X_cln_test_scale

Train Y data:
 2083633    True
1426971    True
2701101    True
1172117    True
47537      True
Name: label, dtype: bool
Train X data:
         address  year  day  length        weight  count  looped  neighbors  \
2083633       0  2016  227     144  9.128820e-02   2090    2034          2   
1426971       1  2014  301     144  2.205756e-04   2034    1913          2   
2701101       2  2018  115       0  1.000000e+00      1       0          2   
1172117       3  2014   46      14  9.347273e-09      5       0          1   
47537         4  2011   11      46  1.000000e+00      1       0          2   

               income  
2083633  1.085223e+09  
1426971  1.005948e+08  
2701101  2.000000e+08  
1172117  4.340717e+08  
47537    3.211000e+10  
Test Y data: 
 2667698    True
2013400    True
2704440    True
1650833    True
1586811    True
Name: label, dtype: bool
Test X Data: 
         address  year  day  length    weight  count  looped  neighbors  \
2667698       0  2018   82       8  1.31111

array([[0.00000000e+00, 1.00000000e+00, 2.22527473e-01, ...,
        0.00000000e+00, 7.74053719e-05, 1.10190830e-05],
       [1.80653786e-06, 7.14285714e-01, 4.28571429e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.40943981e-06],
       [3.61307572e-06, 1.00000000e+00, 3.21428571e-01, ...,
        4.13286424e-01, 7.74053719e-05, 5.43540394e-06],
       ...,
       [9.99996387e-01, 1.00000000e+00, 5.57692308e-01, ...,
        0.00000000e+00, 7.74053719e-05, 4.02697087e-07],
       [9.99998193e-01, 5.71428571e-01, 9.78021978e-01, ...,
        0.00000000e+00, 7.74053719e-05, 1.95308087e-05],
       [1.00000000e+00, 4.28571429e-01, 6.92307692e-01, ...,
        0.00000000e+00, 7.74053719e-05, 3.20106379e-06]])

In [17]:
train_dirty = train[(train['label'] != 'white')]


Y_tmp_dirty = train_dirty['label']
Y_train_dirty = Y_tmp_dirty.str.contains('white')

X_train_dirty = train_dirty.drop(columns=['label'])
print("Train Y data:\n",Y_train_dirty.head())
print("Train X data:\n",X_train_dirty.head())

test_dirty = test[(test['label'] != 'white')]

Y_test_tmp = test_dirty['label']
Y_test_dirty = Y_test_tmp.str.contains('white')
X_test_dirty = test_dirty.drop(columns=['label'])
print("Test Y data: \n",Y_test_dirty.head())
print("Test X Data: \n",X_test_dirty.head())

Train Y data:
 4357     False
13784    False
30055    False
8328     False
12013    False
Name: label, dtype: bool
Train X data:
       address  year  day  length    weight  count  looped  neighbors  \
4357       52  2013  311     144  1.231075   2082       2          5   
13784     135  2013  325       2  0.050000      1       0          1   
30055     177  2013  191       0  1.000000      1       0          2   
8328      184  2013  277       2  1.000000      1       0          2   
12013     262  2014   64      54  0.015625      1       0          1   

            income  
4357   392850000.0  
13784   31572719.0  
30055  102660000.0  
8328    79000000.0  
12013   34400000.0  
Test Y data: 
 3898     False
31915    False
39100    False
28205    False
12166    False
Name: label, dtype: bool
Test X Data: 
       address  year  day  length    weight  count  looped  neighbors  \
3898       66  2014  163       2  3.000000      3       0          3   
31915      80  2014   81     144  0.6

In [18]:
X_train_scale = np.array(min_max_scaler.fit_transform(X_train_dirty))
X_train_scale

X_test_scale = np.array(min_max_scaler.fit_transform(X_test_dirty))
X_test_scale

array([[0.00000000e+00, 4.28571429e-01, 4.45054945e-01, ...,
        0.00000000e+00, 3.22580645e-02, 4.09235861e-03],
       [2.53120158e-05, 4.28571429e-01, 2.19780220e-01, ...,
        1.31440776e-01, 1.61290323e-02, 6.81202788e-04],
       [1.33792083e-04, 4.28571429e-01, 4.75274725e-01, ...,
        0.00000000e+00, 3.22580645e-02, 8.25714835e-04],
       ...,
       [3.89206595e-01, 4.28571429e-01, 2.11538462e-01, ...,
        1.72021002e-01, 6.45161290e-02, 8.43603187e-03],
       [9.27811939e-02, 2.85714286e-01, 5.08241758e-01, ...,
        0.00000000e+00, 1.61290323e-02, 9.77820200e-04],
       [7.28660615e-02, 4.28571429e-01, 2.17032967e-01, ...,
        0.00000000e+00, 1.61290323e-02, 1.75059368e-05]])

In [19]:
X_val, X_test, Y_val, Y_test = train_test_split(X_test_scale, Y_test_dirty, test_size=0.5)
print(X_val.shape, X_test.shape, Y_val.shape, Y_test.shape)

(4051, 9) (4052, 9) (4051,) (4052,)


In [20]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential([
    Dense(32, activation='relu', input_shape=(9,)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [21]:
model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [22]:
hist = model.fit(X_train_scale, Y_train_dirty,
          batch_size=32, epochs=5,
          validation_data=(X_val, Y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
#Evaluate against only white data; no surprise, accuracy is 0
model.evaluate(X_cln_test_scale, Y_test_clean)[1]



0.0

In [24]:
### Split new test and train 
mix_train, mix_test = train_test_split(
    bitcoin_df, test_size=0.1, random_state=42)

print(train.shape)
print(test.shape)

(2333357, 10)
(583340, 10)


In [25]:
mix_uni_addr = mix_test['address'].unique()
print(len(mix_uni_addr))
mix_addr_dict = {}
for i in range(len(mix_uni_addr)):
    mix_addr_dict[mix_uni_addr[i]] = i
print(len(mix_addr_dict))

index = mix_test.index
for i in range(len(mix_test['address'])):
    try:
        mix_test.at[index[i],'address'] = int(mix_addr_dict[mix_test['address'][index[i]]])
    except KeyError:
         continue
            

Y_mix_tmp = mix_test['label']
Y_test_mix = Y_mix_tmp.str.contains('white')
X_test_mix = mix_test.drop(columns=['label'])
print("Test Y data: \n",Y_test_mix.head())
print("Test X Data: \n",X_test_mix.head())

X_mix_test_scale = np.array(min_max_scaler.fit_transform(X_test_mix))


281409
281409
Test Y data: 
 2667698    True
2013400    True
2704440    True
1650833    True
1586811    True
Name: label, dtype: bool
Test X Data: 
         address  year  day  length    weight  count  looped  neighbors  \
2667698       0  2018   82       8  1.311111      3       0          2   
2013400       1  2016  157       2  0.025000     44       0          1   
2704440       2  2018  118     144  3.619217   5991    5991          2   
1650833       3  2015  160     144  0.000484   3452    3049          2   
1586811       4  2015   96     128  0.625000      2       0          2   

              income  
2667698  577264100.0  
2013400  100000000.0  
2704440  299950000.0  
1650833  299000000.0  
1586811   40000000.0  


In [26]:
#Running evaluate against the mixed test set returns almost the exact percentage as the ratio for white to hiest data
model.evaluate(X_mix_test_scale, Y_test_mix)[1]





0.014163266867399216