This program creates a neural network from scratch, using sigmoid and tanh as activation functions, in order to predict whether a credit card charge is fraudulent.


## Import Statements

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
import logging
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from google.colab import files
from pathlib import Path
from datetime import datetime
import pytz

## Import Data

In [None]:
# Download the credit-card transaction dataset (CSV) from its online repository.
url = "https://raw.githubusercontent.com/arjund1999learn/DataSets/main/card_transdata.csv"
data = pd.read_csv(url)
# `display` is not imported in this file; presumably the notebook (IPython/Colab) built-in.
display(data)

# Wrap the loaded data in a DataFrame. Despite the name, nothing has been
# preprocessed yet: this frame is what is later handed to preprocess_data.
preprocessed_data = pd.DataFrame(data)

# Preview the first rows (shown as the cell output when run in a notebook).
preprocessed_data.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1,1,0,0,0
1,10.829943,0.175592,1.294219,1,0,0,0,0
2,5.091079,0.805153,0.427715,1,0,0,1,0
3,2.247564,5.600044,0.362663,1,1,0,1,0
4,44.190936,0.566486,2.222767,1,1,0,1,0
...,...,...,...,...,...,...,...,...
2494,39.551837,7.143371,6.418562,1,1,0,0,0
2495,23.294824,1.980737,2.112573,1,0,0,0,0
2496,11.736348,0.630609,0.661695,1,0,0,0,0
2497,3.684748,9.527367,0.247976,1,0,0,1,0


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1,1,0,0,0
1,10.829943,0.175592,1.294219,1,0,0,0,0
2,5.091079,0.805153,0.427715,1,0,0,1,0
3,2.247564,5.600044,0.362663,1,1,0,1,0
4,44.190936,0.566486,2.222767,1,1,0,1,0


## Prepare Logging

The logs.txt file is stored in the content folder of this colab's disk space.

In [None]:
# Dedicated logger used to record each run's hyperparameters and metrics.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.DEBUG)  # Set the logging level for the logger

# Configure the logging format: timestamp, two tabs, then the message
log_format = '%(asctime)s\t\t%(message)s'


# Define the log file path
log_file_path = '/content/logs.txt'

# Root logger configuration. Note that basicConfig only installs a handler when
# the root logger has none yet, so it can be a no-op in a hosted notebook.
logging.getLogger().setLevel(logging.DEBUG)
logging.basicConfig(format=log_format, level=logging.DEBUG)

# logging.getLogger returns the same logger object every time this cell runs.
# Without this cleanup each re-run would stack one more FileHandler on it and
# every record would be written to the log file once per previous run.
for stale_handler in list(logger.handlers):
  if isinstance(stale_handler, logging.FileHandler):
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(log_file_path)
file_handler.setLevel(logging.DEBUG)

formatter = logging.Formatter(log_format)
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)

## Preprocess Data

In [None]:
def preprocess_data(dataframe):
  """Clean the raw transaction table and standardize its continuous columns.

  Steps, in order:
    1. drop exact duplicate rows,
    2. drop every row that contains a missing value,
    3. renumber the rows 0..n-1,
    4. z-score the three continuous columns (subtract the column mean and
       divide by the column standard deviation).

  Args:
    dataframe: pandas DataFrame containing at least the columns
      'distance_from_home', 'distance_from_last_transaction' and
      'ratio_to_median_purchase_price'.

  Returns:
    A new DataFrame; the input frame is not modified.
  """
  continuous_param = ['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price']

  # `dataframe == None` compares element-wise and is False everywhere (NaN is
  # not equal to None), so it never removed anything; dropna() is the real
  # missing-value filter.
  # reset_index makes the index labels equal to row positions again after rows
  # were removed; code in this notebook looks rows up with `iloc` using these
  # labels. It also returns a fresh frame, so the assignment below does not
  # write into a slice of the caller's data.
  cleaned = dataframe.drop_duplicates().dropna().reset_index(drop=True)

  # Normalize continuous parameters
  mean_values = cleaned[continuous_param].mean()
  std_values = cleaned[continuous_param].std()
  cleaned[continuous_param] = (cleaned[continuous_param] - mean_values) / std_values

  return cleaned

## Split Data into Training and Testing Sets

In [None]:
def split_data(dataframe, train_percent, random_state=10):
  """Shuffle the rows and split them into training and testing sets.

  Args:
    dataframe: pandas DataFrame holding the seven feature columns listed
      below plus the 'fraud' label column.
    train_percent: fraction of the rows that goes into the training set;
      the first int(train_percent * number_of_rows) shuffled rows are used
      for training and the remainder for testing.
    random_state: seed for the shuffle. Defaults to 10, the value this
      function previously hard-coded, so existing calls split identically.

  Returns:
    (X_train, X_test, Y_train, Y_test) as DataFrames. The X frames hold the
    feature columns, the Y frames the single 'fraud' column. Rows keep the
    index labels of the input frame.
  """
  feature_columns = ["distance_from_home", "distance_from_last_transaction", "ratio_to_median_purchase_price", "repeat_retailer", "used_chip", "used_pin_number", "online_order"]
  target_columns = ["fraud"]

  # sample(frac=1) returns every row exactly once, in shuffled order.
  df_randomized = dataframe.sample(frac=1, random_state=random_state)

  split_index = int(train_percent * len(df_randomized))
  train_df = df_randomized.iloc[:split_index]
  test_df = df_randomized.iloc[split_index:]

  X_train = train_df[feature_columns]
  X_test = test_df[feature_columns]

  Y_train = train_df[target_columns]
  Y_test = test_df[target_columns]

  return X_train, X_test, Y_train, Y_test

## Initialize Weights, Biases, and Outputs

Bias terms, weights, and outputs used to calculate the net for each layer are stored in an array containing the weight and output vectors for each neuron in the layer, assuming that there is a connection between each neuron and all neurons in the previous layer. The function also generates an appropriate delta and momentum cache for use in training the model. This function takes in an array with each element declaring the number of neurons in that layer.

In [None]:
def init_model(network_dim):
  """Allocate the weight, output, delta and momentum caches for a network.

  Args:
    network_dim: sequence of layer sizes, one entry per layer (at least two
      entries are required).

  Returns:
    (layer_weights, layer_outputs, layer_deltas, layer_momentum), each a list
    of numpy arrays:
      * layer_weights[0] has shape (network_dim[1], network_dim[0]); every
        later matrix i has shape (network_dim[i+1], network_dim[i] + 1).
        All are drawn from a standard normal distribution.
      * layer_momentum mirrors the weight shapes and starts at zero.
      * layer_outputs[i] is a vector of ones of length network_dim[i] + 1.
      * layer_deltas[i] is a vector of ones of length network_dim[i].
  """
  input_width, first_width = network_dim[0], network_dim[1]

  # The first matrix takes its column count straight from network_dim[0];
  # only the later matrices get one extra column.
  layer_weights = [np.random.randn(first_width, input_width)]
  layer_momentum = [np.zeros((first_width, input_width))]

  for source_width, target_width in zip(network_dim[1:-1], network_dim[2:]):
    layer_weights.append(np.random.randn(target_width, source_width + 1))
    layer_momentum.append(np.zeros((target_width, source_width + 1)))

  # One output vector (with an extra leading slot) and one delta vector per layer.
  layer_outputs = [np.ones(width + 1) for width in network_dim]
  layer_deltas = [np.ones(width) for width in network_dim]

  return layer_weights, layer_outputs, layer_deltas, layer_momentum

## Net or Summing Function
The net of each neuron is the dot product of the incoming weights and inputs.

\begin{align}
        net = \sum_{i=0}^{n} w_i x_i
    \end{align}

In [None]:
# Weighted sum (dot product) of one neuron's incoming weights and its inputs
def net(current_weight, current_input):
  """Return np.dot(current_weight, current_input)."""
  return np.dot(current_weight, current_input)

## Sigmoid Function

This function applies the sigmoid function and its derivative to the net and output, respectively.

\begin{align}
        \sigma(x) = \frac{1}{1 + e^{-x}}
    \end{align}

\begin{align}
      \sigma'(x) = \sigma(x) \cdot (1 - \sigma(x))
    \end{align}

In [None]:
def activation_sigmoid(net):
  """Logistic sigmoid of `net`: 1 / (1 + e^(-net))."""
  denominator = 1 + np.exp(-net)
  return 1 / denominator

def derivative_sigmoid(output):
  """Sigmoid derivative expressed through the sigmoid's own output value."""
  complement = 1 - output
  return output * complement

## Tanh(x) Function

This function applies the tanh function and its derivative to the net and output, respectively.

\begin{align}
        \tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}
    \end{align}

\begin{align}
      \tanh'(x) = 1 - \tanh^2(x)
    \end{align}

In [None]:
def activation_tanh(net):
  """Hyperbolic tangent of `net` (scalar or numpy array).

  Uses np.tanh rather than the explicit (e^x - e^-x) / (e^x + e^-x) quotient:
  for large |net| the exponentials overflow to infinity and the quotient
  becomes inf / inf = NaN, whereas np.tanh saturates cleanly at +/-1.
  """
  return np.tanh(net)

def derivative_tanh(output):
  """Tanh derivative expressed through the tanh output value: 1 - output^2."""
  squared = output**2
  return 1 - squared

## Forward Propagation

The following function computes the output by applying the given activation function to the computed net of each neuron and stores each output in its respective position in the output cache.

In [None]:
def forward_prop(activation_function, layer_weights, layer_outputs):
  """Run one forward pass, filling the output cache in place.

  For every layer except the last, each weight vector of that layer is
  combined with the layer's output vector via net(), passed through
  `activation_function`, and stored in the next layer's output vector at
  position (neuron index + 1). Slot 0 of each output vector is never written
  here. Returns nothing; `layer_outputs` is mutated.
  """
  last_layer = len(layer_outputs) - 1

  for source_index in range(last_layer):
    source_values = layer_outputs[source_index]
    target_values = layer_outputs[source_index + 1]

    # One weight vector per neuron of the next layer
    for neuron_index, weight_vector in enumerate(layer_weights[source_index]):
      target_values[neuron_index + 1] = activation_function(net(weight_vector, source_values))

## Error Function ($E_{d}(w)$)

This function calculates the 𝛿 term for each neuron in the model. It uses the current layer index to determine whether the 𝛿 term should be calculated using case 1 or case 2.

Case 1: Layer is an output layer. In which case the 𝛿 term is the product of $-(t_{j} - o_{j})$ and the derivative of the activation function. Note that for this dataset, the final output will be 1 or 0 depending on the given threshold.

\begin{align}
        𝛿_{j} = \frac{𝜕𝐸_{d}}{𝜕net_{j}} = \frac{𝜕𝐸_{d}}{𝜕o_{j}} \cdot \frac{𝜕o_{j}}{𝜕net_{j}}
    \end{align}

\begin{align}\frac{𝜕𝐸_{d}}{𝜕o_{j}} = -(t_{j} - o_{j})\end{align}
\begin{align}\frac{𝜕o_{j}}{𝜕net_{j}} = \text{activation function derivative} \end{align}

Case 2: Layer is a hidden layer. In which case the 𝛿 term is the product of the derivative of the activation function and the weighted summation of downstream neurons.

\begin{align}
        𝛿_{j} = \frac{𝜕o_{j}}{𝜕net_{j}} \sum_{k∈downstream(j)} 𝛿_{k} w_{kj}
    \end{align}




In [None]:
def delta_output(cur_layer, layer_weights, layer_outputs, layer_deltas, derivative_activation, actual, threshold):
  """Compute the delta terms of layer `cur_layer` and store them in `layer_deltas`.

  Args:
    cur_layer: index of the layer whose deltas are computed.
    layer_weights: list of weight matrices; layer_weights[cur_layer] holds the
      weights leaving `cur_layer` (column 0 is skipped when summing).
    layer_outputs: list of output vectors; slot 0 of each vector is skipped.
    layer_deltas: list of delta vectors, updated in place.
    derivative_activation: derivative of the activation, taking a neuron's output.
    actual: target value for the current sample.
    threshold: classification threshold forwarded to apply_threshold.

  Output layer (last index of layer_outputs): delta is the activation
  derivative times (actual - thresholded prediction).
  Any other layer: delta is the activation derivative times the sum of each
  downstream delta multiplied by the corresponding outgoing weight.
  """
  activations = layer_outputs[cur_layer]
  neuron_count = len(activations) - 1

  # Case 1 - output layer
  if cur_layer == len(layer_outputs) - 1:
    for neuron in range(neuron_count):
      raw_output = activations[neuron + 1]

      # Class label (0 or 1) obtained by thresholding the raw activation
      predicted = apply_threshold(raw_output, threshold, derivative_activation)

      layer_deltas[cur_layer][neuron] = derivative_activation(raw_output)*(actual - predicted)
    return

  # Case 2 - hidden layer: accumulate the weighted downstream deltas
  downstream_deltas = layer_deltas[cur_layer + 1]
  outgoing_weights = layer_weights[cur_layer]

  for downstream_index, downstream_delta in enumerate(downstream_deltas):
    contribution = downstream_delta * outgoing_weights[downstream_index][1:]
    if downstream_index == 0:
      # First term replaces whatever the cache held from the previous sample
      layer_deltas[cur_layer] = contribution
    else:
      layer_deltas[cur_layer] += contribution

  # Scale each accumulated sum by the activation derivative of its neuron
  for neuron in range(neuron_count):
    layer_deltas[cur_layer][neuron] *= derivative_activation(activations[neuron + 1])

## Threshold

This function applies the given threshold to a raw predicted output to assign a binary class, 1 or 0, to the prediction. The given threshold is assumed to be between 0 and 1. For tanh, whose output range (-1 to 1) differs from the probability-friendly range (0 to 1) of sigmoid outputs, we rescale the threshold so that it is centered around 0. For ReLU, we assume a static threshold at 0.

In [None]:
def apply_threshold(raw_output, threshold, derivative_activation):
    """Turn a raw network output into a class label (1 or 0).

    The cutoff depends on which activation is in use, identified by the
    derivative function that is passed in:
      * derivative_sigmoid: the threshold is used as given;
      * derivative_tanh: the threshold is rescaled to 2 * threshold - 1;
      * anything else: the cutoff is fixed at 0.
    Returns 1 when raw_output is strictly greater than the cutoff, else 0.
    """
    if derivative_activation is derivative_sigmoid:
      cutoff = threshold
    elif derivative_activation is derivative_tanh:
      cutoff = (threshold * 2) - 1 # Normalize threshold around 0
    else: # Activation function is ReLu
      cutoff = 0

    return 1 if raw_output > cutoff else 0

## Backward Propagation

The following function performs backward propagation in two passes. The first pass calculates the 𝛿 term of each neuron and stores it in the delta cache. The second pass then updates each weight using the deltas stored in the cache plus a momentum term that adds a fraction γ of the previous step (weight change).

\begin{align}
        w_{ji} \leftarrow w_{ji} + \Delta w_{ji}
    \end{align}

\begin{align}
        \Delta w_{ji}(n) = \eta \delta_j x_{ji} + \gamma \Delta w_{ji}(n-1)
    \end{align}

In [None]:
def backward_prop(target, derivative_activation, learning_rate, momentum, threshold):
  """Run one backward pass and update the weights in place.

  This function takes no cache arguments: it reads and mutates the
  module-level `layer_weights`, `layer_outputs`, `layer_deltas` and
  `layer_momentum` lists.

  Args:
    target: target value of the sample that was just forward-propagated.
    derivative_activation: derivative of the activation function in use.
    learning_rate: step size applied to delta * input.
    momentum: fraction of the previous weight change added to the new one.
    threshold: classification threshold forwarded to delta_output.

  Pass 1 fills the delta cache from the last layer back to layer 1.
  Pass 2 applies, for every weight vector,
      step = learning_rate * delta * inputs + momentum * previous_step
  and stores `step` as the new previous step.
  """
  last_layer = len(layer_outputs) - 1

  # Calculate deltas in each layer (all deltas are computed before any weight changes)
  for layer_index in range(last_layer, 0, -1):
    delta_output(layer_index, layer_weights, layer_outputs, layer_deltas, derivative_activation, target, threshold)

  # Update weights
  for layer_index in range(last_layer - 1, -1, -1):
    for weight_index in range(len(layer_weights[layer_index])):

      gradient_step = learning_rate * layer_deltas[layer_index + 1][weight_index] * layer_outputs[layer_index]

      # The full step includes the momentum contribution. The cache must hold
      # this full step (not just the gradient part), otherwise the next update
      # adds a fraction of something other than the previous weight change.
      # On the first iteration the cache is all zeros.
      weight_change = gradient_step + (momentum * layer_momentum[layer_index][weight_index])

      layer_weights[layer_index][weight_index] += weight_change
      layer_momentum[layer_index][weight_index] = weight_change

## Train Model

The following function trains the network on the training set and returns the resulting set of weights, given the following model parameters and hyperparameters.

In [None]:
def train_model(x_train, y_train, learning_rate, layer_weights, layer_outputs, layer_deltas, momentum, threshold, activation_type, derivative_type, epoch):
  """Train the network one sample at a time and return the weights.

  Args:
    x_train: training features, one row per sample (DataFrame or 2-D array).
    y_train: training targets aligned row-by-row with `x_train`.
    learning_rate, momentum, threshold: forwarded to backward_prop.
    layer_weights, layer_outputs, layer_deltas: model caches. backward_prop
      works on the module-level caches, so these must be those same objects.
    activation_type: activation function used by forward_prop.
    derivative_type: matching derivative used by backward_prop.
    epoch: number of full passes over the training data.

  Returns:
    `layer_weights` (updated in place during training).
  """
  # Read the samples from the arguments rather than from module-level frames.
  # Looking rows up with `iloc` by index label selects the wrong rows (or
  # raises) whenever the labels are not exactly the row positions.
  features = np.asarray(x_train, dtype=float)
  targets = np.asarray(y_train).ravel()

  for _ in range(epoch):

      # Iterate through each data point in the training data
      for sample, target in zip(features, targets):
        # Repopulate the input layer with the next data point; slot 0 is the constant 1
        layer_outputs[0] = np.insert(sample, 0, 1)

        forward_prop(activation_type, layer_weights, layer_outputs)
        backward_prop(target, derivative_type, learning_rate, momentum, threshold)

  return layer_weights

## Testing

The following function uses the generated model weights and calculates training and testing predictions to be used in the model evaluation functions.

In [None]:
def _predict_classes(features, layer_weights, layer_outputs, activation_type, derivative_activation, threshold):
  # Forward-propagate every row of `features` and return the thresholded class labels.
  last_layer = len(layer_outputs) - 1 # Index of last layer
  predictions = []

  for sample in np.asarray(features, dtype=float):
    layer_outputs[0] = np.insert(sample, 0, 1)
    forward_prop(activation_type, layer_weights, layer_outputs)
    # The final output lives in slot 1 of the last output vector
    predictions.append(apply_threshold(layer_outputs[last_layer][1], threshold, derivative_activation))

  return predictions


def test_model(layer_weights,layer_outputs, X_test, activation_type, derivative_activation, threshold, train_features=None):
  """Predict class labels for the training and the testing features.

  Args:
    layer_weights: trained weight matrices.
    layer_outputs: output cache; overwritten while predicting.
    X_test: testing features, one row per sample.
    activation_type: activation function used by forward_prop.
    derivative_activation: matching derivative; apply_threshold uses it to
      pick the cutoff.
    threshold: classification threshold.
    train_features: training features, one row per sample. When omitted, the
      module-level `X_train` is used, which is what this function always
      read before the parameter existed.

  Returns:
    (Y_train_pred, Y_test_pred): two lists of 0/1 labels, in row order.
  """
  if train_features is None:
    train_features = X_train

  # Rows are taken from the feature frames themselves. Looking them up in a
  # module-level frame with `iloc` by index label picks the wrong rows (or
  # raises) whenever labels and row positions differ.
  Y_train_pred = _predict_classes(train_features, layer_weights, layer_outputs, activation_type, derivative_activation, threshold)
  Y_test_pred = _predict_classes(X_test, layer_weights, layer_outputs, activation_type, derivative_activation, threshold)

  return Y_train_pred, Y_test_pred

## Main

In [None]:
# Preprocess data (de-duplicate, clean, and standardize the continuous columns)
df = preprocess_data(preprocessed_data)

# Split into training and testing sets (90% of the rows go to training)
X_train, X_test, Y_train, Y_test = split_data(df, 0.9)

# Initialize hyper parameters
learning_rate = 0.05
momentum= 0.5
threshold= 0.5
# The activation and its derivative must be a matching pair: apply_threshold
# picks its cutoff based on which derivative function it receives.
activation_type = activation_tanh
derivative_type = derivative_tanh
epoch = 1
# Layer sizes handed to init_model. The first entry is the length of the input
# vector after a constant 1 is prepended to each sample's feature values
# (split_data selects 7 feature columns, so presumably 7 + 1 = 8).
model_dim = [8, 4, 1]

# Initialize model
# NOTE(review): the weights come from np.random.randn and no NumPy seed is set
# in this file, so metrics will vary between runs - confirm whether that is intended.
layer_weights, layer_outputs, layer_deltas, layer_momentum = init_model(model_dim)

# Train model
layer_weights = train_model(X_train, Y_train, learning_rate, layer_weights, layer_outputs, layer_deltas, momentum, threshold, activation_type, derivative_type, epoch)

# Test model on training and testing data
Y_train_pred, Y_test_pred = test_model(layer_weights,layer_outputs, X_test, activation_type= activation_type, derivative_activation = derivative_type, threshold = threshold)

# Print last time the program ran (current time converted to US Central time)
print("Last runtime of program:", datetime.now(pytz.utc).astimezone(pytz.timezone('America/Chicago')))


Last runtime of program: 2023-07-28 17:44:53.111180-05:00


## Metrics and Logging

In [None]:
def _rounded_scores(y_true, y_pred):
  # Accuracy, precision, recall and F1 (in that order), each rounded to 3 decimals.
  return tuple(round(metric(y_true, y_pred), 3)
               for metric in (accuracy_score, precision_score, recall_score, f1_score))


print("Testing Statistics")

# Score the testing split first, then the training split
test_accuracy, test_precision, test_recall, test_f1 = _rounded_scores(Y_test, Y_test_pred)
train_accuracy, train_precision, train_recall, train_f1 = _rounded_scores(Y_train, Y_train_pred)

for label, value in (("Accuracy: \t", test_accuracy),
                     ("Precision: \t", test_precision),
                     ("Recall: \t", test_recall),
                     ("F1: \t\t", test_f1)):
  print(label, value)

print("\nTraining Statistics")
for label, value in (("Accuracy: \t", train_accuracy),
                     ("Precision: \t", train_precision),
                     ("Recall: \t", train_recall),
                     ("F1: \t\t", train_f1)):
  print(label, value)

# Record the run's hyperparameters and all eight metrics as a single log entry
logger.info(f"Learning Rate: {learning_rate}, Momentum: {momentum}, Threshold: {threshold}, "
            f"Activation Func.: {activation_type.__name__}, Epochs: {epoch}, "
            f"Model Dim: {model_dim}, Test Accuracy: {test_accuracy}, Test Precision: {test_precision}, "
            f"Test Recall: {test_recall}, Test F1: {test_f1}, Train Accuracy: {train_accuracy}, "
            f"Train Precision: {train_precision}, Train Recall: {train_recall}, Train F1: {train_f1}")


Testing Statistics
Accuracy: 	 0.94
Precision: 	 0.593
Recall: 	 0.8
F1: 		 0.681

Training Statistics

INFO:my_logger:Learning Rate: 0.05, Momentum: 0.5, Threshold: 0.5, Activation Func.: activation_tanh, Epochs: 1, Model Dim: [8, 4, 1], Test Accuracy: 0.94, Test Precision: 0.593, Test Recall: 0.8, Test F1: 0.681, Train Accuracy: 0.92, Train Precision: 0.538, Train Recall: 0.877, Train F1: 0.667



Accuracy: 	 0.92
Precision: 	 0.538
Recall: 	 0.877
F1: 		 0.667
