# Neural Net using BitcoinHeistData

Inspired by: https://www.freecodecamp.org/news/how-to-build-your-first-neural-network-to-predict-house-prices-with-keras-f8db83049159/

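This notebook trains a small feed-forward neural network (and, for comparison, a random forest) on the BitcoinHeist dataset to predict whether a record is labelled 'white' or not.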

# Load and process the data

In [None]:
import pandas as pd
# Read the BitcoinHeist CSV (path is relative to the working directory)
# into a DataFrame.
df = pd.read_csv(filepath_or_buffer='BitcoinHeistData.csv')
# Bare expression on the last line so the notebook renders the table.
df

Split the dataset into our input features (X) and the label (Y) we wish to predict.

In [None]:
import numpy as np

# Label: the last column. Features: the columns at positions 3 through 8.
Ytmp = df.iloc[:, -1]
X = df.iloc[:, 3:9]

# Sanity-check the sizes first, then dump the values themselves.
print(X.shape, Ytmp.shape, sep='\n')
print(X, Ytmp, sep='\n')

# Collapse the string label into a boolean for simplicity:
# True wherever the label text contains 'white'.
Y = Ytmp.str.contains('white')
print(Y)

# Fraction of rows whose boolean label is True.
numWhite = np.count_nonzero(Y)
print(numWhite / len(Y))

Normalize the data so one feature does not dominate.
Use a min-max scaler from scikit-learn which scales our data to be between 0 and 1.

In [None]:
from sklearn import preprocessing

# Rescale every feature column into the [0, 1] range so that no single
# feature dominates because of its magnitude.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/validation/test split -- consider fitting on the training part only.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
X_scale = min_max_scaler.fit_transform(X)
# Bare expression so the notebook displays the scaled array.
X_scale

Split the dataset into a training set (70%), a validation set (15%) and a test set (15%).

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: 70% training, then the remaining 30% is halved into
# validation and test (15% each).
#
# random_state pins the shuffle so that re-running the notebook trains and
# evaluates on the same rows; stratify keeps the True/False label ratio the
# same in every subset, which matters if the classes are imbalanced.
X_train, X_val_and_test, Y_train, Y_val_and_test = train_test_split(
    X_scale, Y, test_size=0.3, random_state=42, stratify=Y)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_val_and_test, Y_val_and_test, test_size=0.5, random_state=42,
    stratify=Y_val_and_test)
print(X_train.shape, X_val.shape, X_test.shape, Y_train.shape, Y_val.shape, Y_test.shape)

# Build and train the Neural Network

Use Keras to build the neural net.

In [None]:
from keras.models import Sequential
from keras.layers import Dense

We will be using the Sequential model, which means that we merely need to describe the layers below in sequence. Our neural network has three layers:

- Hidden layer 1: 32 neurons, ReLU activation
- Hidden layer 2: 32 neurons, ReLU activation
- Output Layer: 1 neuron, Sigmoid activation

In [None]:
# Fully-connected binary classifier: 6 inputs -> 32 -> 32 -> 1.
model = Sequential()
# First hidden layer; input_shape declares the six input features.
model.add(Dense(32, activation='relu', input_shape=(6,)))
# Second hidden layer.
model.add(Dense(32, activation='relu'))
# Single sigmoid unit producing a value in (0, 1) for the True class.
model.add(Dense(1, activation='sigmoid'))

Now that we've got our architecture specified, we need to find the best numbers for it. Before we start our training, we have to configure the model by
- Telling it what algorithm you want to use to do the optimization (we'll use stochastic gradient descent)
- Telling it what loss function to use (for binary classification, we will use binary cross entropy)
- Telling it what other metrics you want to track apart from the loss function (we want to track accuracy as well)

We do so below:

In [None]:
# Configure training: binary cross-entropy loss for the two-class problem,
# plain stochastic gradient descent as the optimizer, and accuracy tracked
# as an additional metric alongside the loss.
model.compile(
    loss='binary_crossentropy',
    optimizer='sgd',
    metrics=['accuracy'],
)

Training on the data is pretty straightforward and requires us to write one line of code. The function is called 'fit' as we are fitting the parameters to the data. We specify:
- what data we are training on, which is X_train and Y_train
- the size of our mini-batch 
- how long we want to train it for (epochs)
- what our validation data is so that the model will tell us how we are doing on the validation data at each point.

This function will output a history, which we save under the variable hist. We'll use this variable a little later.

In [None]:
# NOTE: 100 epochs would be the usual choice, but every epoch looked the
# same here, so only 5 are run (unresolved -- is something wrong?).
# The returned history is kept in `hist` for the plots further down.
hist = model.fit(
    x=X_train,
    y=Y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_val, Y_val),
)

Evaluating our model on the test set:

In [None]:
# evaluate() returns [loss, accuracy] for the metrics compiled above;
# index 1 picks the accuracy on the held-out test set.
model.evaluate(x=X_test, y=Y_test)[1]

# Visualizing Loss and Accuracy

In [None]:
import matplotlib.pyplot as plt

We want to visualize the training loss and the validation loss like this:

In [None]:
# Training vs. validation loss, one point per epoch.
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])

# Axis labelling and legend (legend entries follow the plot order above).
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model loss')
plt.legend(['Train', 'Val'], loc='upper right')

plt.show()

# List which series the history object recorded.
print(hist.history.keys())

We can also visualize the training accuracy and the validation accuracy like this:

In [None]:
# Training vs. validation accuracy, one marker per epoch
# ('o' for training, 'x' for validation).
plt.plot(hist.history['accuracy'], '-o')
plt.plot(hist.history['val_accuracy'], '-x')

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Model accuracy')
# Optional: zoom in on the top of the range.
#plt.ylim(0.95, 1)
plt.legend(['Train', 'Val'], loc='lower right')

plt.show()

In [None]:
# Quick inspection of the test split and of what the training history holds.
# One print call with a newline separator yields one value per line.
print(
    X_test.shape,
    Y_test.shape,
    type(X_test),
    type(Y_test),
    hist.params,
    hist.history.keys(),
    sep='\n',
)

# TODO: look at results where the label is not 'white'.
# Do we ever predict a heist?
# Disabled sketch, kept for reference:
#print(X_test[1,:])
#fooY = Y_test.to_numpy()
#print(fooY[1])
#idxWhite = np.where(Y_test)[0]
#print(type(idxWhite))
#print(idxWhite.shape)
#model.evaluate(X_test[idxWhite,:], Y_test[idxWhite])

# Disabled per-sample evaluation sketch:
#for i in range(X_test):
#    xt = X_test[i]
#    yt = Y_test[i]
#    model.evaluate(xt, yt)[1]
#    break;

In [None]:
# Random forest classifier trained on the same split, for comparison.
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Define the model.
# NOTE: this rebinds `model`, so the Keras network built in the earlier
# cells is no longer reachable under that name after this cell has run.
model = RandomForestClassifier()

# Optional repeated stratified k-fold cross-validation (disabled).
# The original sketch passed an undefined `y`; the label variable is `Y`.
#cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#n_scores = cross_val_score(model, X, Y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
#print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Fit on the training split and predict the held-out test split.
model.fit(X_train, Y_train)
ynew = model.predict(X_test)
# The predictions are left as returned by predict(): the former
# `ynew.astype(int)` statement discarded its result (astype returns a new
# array), so it never changed `ynew` and has been removed.
print(ynew)
# Distinct predicted values -- shows whether both classes are ever predicted.
print(np.unique(ynew))


In [None]:
# Compare the random-forest predictions (ynew) against the held-out labels
# (Y_test) and report accuracy, confusion-matrix counts, precision, recall
# and F1. The "positive" class is the boolean label True.
print(type(ynew))
print(type(Y_test))
print(type(ynew[0]))
# Y_test is a pandas Series that keeps the row labels of the original data,
# so Y_test[0] is a lookup by label and raises KeyError unless row 0 happens
# to be in the test split. Use positional access instead.
print(type(Y_test.iloc[0]))
print(ynew.shape)
print(Y_test.shape)

Y_test_np = Y_test.to_numpy()
print(ynew[3])
print(Y_test_np[3])

# Vectorized confusion-matrix counts over *all* test samples (the former
# index loop stopped one element early and so ignored the last sample).
predicted = np.asarray(ynew).astype(bool)
actual = np.asarray(Y_test_np).astype(bool)

tp = int(np.count_nonzero(predicted & actual))
tn = int(np.count_nonzero(~predicted & ~actual))
fp = int(np.count_nonzero(predicted & ~actual))
fn = int(np.count_nonzero(~predicted & actual))

# A prediction is correct when it matches the label (the two counters were
# previously swapped, which made "Accuracy" report the error rate).
numCorrect = tp + tn
numIncorrect = fp + fn

# Ratios fall back to NaN instead of raising ZeroDivisionError when their
# denominator is zero (e.g. the positive class is never predicted).
nan = float('nan')
total = predicted.size
accuracy = numCorrect / total if total else nan
precision = tp / (tp + fp) if (tp + fp) else nan
recall = tp / (tp + fn) if (tp + fn) else nan
f1 = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) else nan

print("numCorrect   =" + str(numCorrect))
print("numIncorrect =" + str(numIncorrect))
print("Accuracy     =" + str(accuracy))

print("tp   =" + str(tp))
print("tn   =" + str(tn))
print("fp   =" + str(fp))
print("fn   =" + str(fn))

print("precision=" + str(precision))
print("recall   =" + str(recall))
print("F1       =" + str(f1))