# CMPE-255 Data Mining: Group Project
## Group 1: Alex Dodd, Tom Casaletto, Jonathan Conover
Dataset: Bitcoin Heist Ransomeware Address Dataset
https://archive.ics.uci.edu/ml/datasets/BitcoinHeistRansomwareAddressDataset

In [59]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras.layers import Dense



#### Assign the heist dataset to a pandas dataframe object

In [2]:
df = pd.read_csv('BitcoinHeistData.csv')
df

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,111K8kZAEnJg245r2cM6y9zgJGHZtJPy6,2017,11,18,0.008333,1,0,2,1.000500e+08,princetonCerber
1,1123pJv8jzeFQaCV4w644pzQJzVWay2zcA,2016,132,44,0.000244,1,0,1,1.000000e+08,princetonLocky
2,112536im7hy6wtKbpH1qYDWtTyMRAcA2p7,2016,246,0,1.000000,1,0,2,2.000000e+08,princetonCerber
3,1126eDRw2wqSkWosjTCre8cjjQW8sSeWH7,2016,322,72,0.003906,1,0,2,7.120000e+07,princetonCerber
4,1129TSjKtx65E35GiUo4AYVeyo48twbrGX,2016,238,144,0.072848,456,0,1,2.000000e+08,princetonLocky
...,...,...,...,...,...,...,...,...,...,...
2916692,12D3trgho1vJ4mGtWBRPyHdMJK96TRYSry,2018,330,0,0.111111,1,0,1,1.255809e+09,white
2916693,1P7PputTcVkhXBmXBvSD9MJ3UYPsiou1u2,2018,330,0,1.000000,1,0,1,4.409699e+07,white
2916694,1KYiKJEfdJtap9QX2v9BXJMpz2SfU4pgZw,2018,330,2,12.000000,6,6,35,2.398267e+09,white
2916695,15iPUJsRNZQZHmZZVwmQ63srsmughCXV4a,2018,330,0,0.500000,1,0,1,1.780427e+08,white


#### The next two code blocks will assign numerical values for the address cells. Done to avoid losing any context these attributes add to the dataset

In [3]:
uni_addr = df['address'].unique()
print(len(uni_addr))
addr_dict = {}
for i in range(len(uni_addr)):
    addr_dict[uni_addr[i]] = i
print(len(addr_dict))

2631095
2631095


In [4]:
index = df.index
for i in range(len(df['address'])):
    try:
        df.at[index[i],'address'] = int(addr_dict[df['address'][index[i]]])
    except KeyError:
         continue

#### Visual inspect that data replacement occured correctly

In [5]:
df

Unnamed: 0,address,year,day,length,weight,count,looped,neighbors,income,label
0,0,2017,11,18,0.008333,1,0,2,1.000500e+08,princetonCerber
1,1,2016,132,44,0.000244,1,0,1,1.000000e+08,princetonLocky
2,2,2016,246,0,1.000000,1,0,2,2.000000e+08,princetonCerber
3,3,2016,322,72,0.003906,1,0,2,7.120000e+07,princetonCerber
4,4,2016,238,144,0.072848,456,0,1,2.000000e+08,princetonLocky
...,...,...,...,...,...,...,...,...,...,...
2916692,2631091,2018,330,0,0.111111,1,0,1,1.255809e+09,white
2916693,2631092,2018,330,0,1.000000,1,0,1,4.409699e+07,white
2916694,2177756,2018,330,2,12.000000,6,6,35,2.398267e+09,white
2916695,2631093,2018,330,0,0.500000,1,0,1,1.780427e+08,white


#### Seperate into X and Y and transform the Y labels into either True for White and False for heist. This directly supports our objective to train a model that distinguishes between white and heist transactions.

In [14]:
Y = df['label']
Y = Y.str.contains('white')
X = df.drop(columns=['label'])
print(Y.shape,X.shape)

(2916697,) (2916697, 9)


#### Scale X by MinMaxScaler

In [7]:
min_max_scaler = preprocessing.MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)
X_scale

array([[0.00000000e+00, 8.57142857e-01, 2.74725275e-02, ...,
        0.00000000e+00, 7.74053719e-05, 1.40199911e-06],
       [3.80070039e-07, 7.14285714e-01, 3.59890110e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.40099840e-06],
       [7.60140079e-07, 7.14285714e-01, 6.73076923e-01, ...,
        0.00000000e+00, 7.74053719e-05, 3.40242469e-06],
       ...,
       [8.27699809e-01, 1.00000000e+00, 9.03846154e-01, ...,
        4.13907285e-04, 2.63178265e-03, 4.73991240e-05],
       [9.99999620e-01, 1.00000000e+00, 9.03846154e-01, ...,
        0.00000000e+00, 0.00000000e+00, 2.96296491e-06],
       [1.00000000e+00, 1.00000000e+00, 9.03846154e-01, ...,
        0.00000000e+00, 7.74053719e-05, 1.64817455e-06]])

#### Next two blocks will split the data into a training and test. Then further split the test set into a test and validation sets; validation set is used while training the neural net for on the spot evaluation

In [15]:
### Split to test and train
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(2333357, 9) (583340, 9) (2333357,) (583340,)


In [16]:
X_val, X_test_net, Y_val, Y_test_net = train_test_split(X_test, Y_test, test_size=0.5)
print(X_val.shape, X_test_net.shape, Y_val.shape, Y_test_net.shape)

(291670, 9) (291670, 9) (291670,) (291670,)


#### Create and train the Keras Neural Net 

In [38]:
model = Sequential([
    Dense(32, activation='relu', input_shape=(9,)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [39]:
hist = model.fit(X_train, Y_train,
          batch_size=32, epochs=5,
          validation_data=(X_val, Y_val))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


#### Evaluate the test set with labels provided and observe accuracy

In [40]:
model.evaluate(X_test, Y_test)[1]



0.999787449836731

#### Call the predict function against the test set without labels provided for complete evaluation and then compare to the original numbers

In [41]:
# For softmax activation
# np.argmax(model.predict(X_test), axis=-1)

# For sigmoid activiation
classes = (model.predict(X_test) > 0.5).astype("int32")

In [52]:
pred_heist = []
pred_white = []
for i in classes:
    if i == 1:
        pred_white.append(i)
    else:
        pred_heist.append(i)

print("Predicted number white",len(pred_white))
print("Predicted number heist",len(pred_heist))

test_white = []
test_heist = []
for i in Y_test:
    if i == True:
        test_white.append(i)
    else:
        test_heist.append(i)

print("Actual number white",len(pred_white))
print("Actual number heist",len(pred_heist))
        

Predicted number white 575273
Predicted number heist 8067
Actual number white 575273
Actual number heist 8067


In [60]:
def scorer(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    p_score = precision_score(y_test, y_pred)
    r_score = recall_score(y_test, y_pred)
    f_score = f1_score(y_test, y_pred)#2*((p_score*r_score)/(p_score+r_score))
    
    return accuracy, p_score, r_score, f_score

accuracy, p_score, r_score, f_score = scorer(Y_test, classes)
print("The Accuracy is: ",accuracy,"\nThe Precision is: ",p_score,"\nThe Recall is: ",r_score,"\nThe F1 Score is: ",f_score)

The Accuracy is:  0.9997874310007886 
The Precision is:  0.9998609355905804 
The Recall is:  0.9999235097881395 
The F1 Score is:  0.9998922217103718


#### The Neural Net is able to correctly distinguish between the white and heist data.