Download csv from:  
https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/data
- Anonymized credit card transactions labeled as fraudulent or genuine

I used mini batch gradient descent since it uses batches of 64 samples instead of the whole dataset. Using something like full batch would not be a good idea since the computational cost is exceptionally greater due to the size of observations

In [25]:
# upload csv
from google.colab import files
uploaded = files.upload()

In [22]:
import numpy as np
import pandas as pd

In [23]:
class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        np.random.seed(42)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # init weights and biases
        self.W1 = np.random.randn(self.hidden_size, self.input_size) * 0.01
        self.b1 = np.zeros((self.hidden_size, 1))
        self.W2 = np.random.randn(self.output_size, self.hidden_size) * 0.01
        self.b2 = np.zeros((self.output_size, 1))

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def sigmoid_derivative(self, z):
        return self.sigmoid(z) * (1 - self.sigmoid(z))

    def forward_propagation(self, X):
        self.Z1 = np.dot(self.W1, X) + self.b1
        self.A1 = self.sigmoid(self.Z1)
        self.Z2 = np.dot(self.W2, self.A1) + self.b2
        self.A2 = self.sigmoid(self.Z2)
        return self.A2

    def calc_cost(self, Y_hat, Y):
        m = Y.shape[1]
        return -np.sum(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat)) / m

    def backward_propagation(self, X, Y):
        m = X.shape[1]
        dZ2 = self.A2 - Y
        dW2 = np.dot(dZ2, self.A1.T) / m
        db2 = np.sum(dZ2, axis=1, keepdims=True) / m

        dZ1 = np.dot(self.W2.T, dZ2) * self.sigmoid_derivative(self.Z1)
        dW1 = np.dot(dZ1, X.T) / m
        db1 = np.sum(dZ1, axis=1, keepdims=True) / m

        # update params
        self.W1 -= self.learning_rate * dW1
        self.b1 -= self.learning_rate * db1
        self.W2 -= self.learning_rate * dW2
        self.b2 -= self.learning_rate * db2

    def train(self, X, Y, epochs=1000, batch_size=64):
        m = X.shape[1]
        for epoch in range(epochs):
            for i in range(0, m, batch_size):
                X_batch = X[:, i:i+batch_size]
                Y_batch = Y[:, i:i+batch_size]

                Y_hat = self.forward_propagation(X_batch)
                self.backward_propagation(X_batch, Y_batch)

            if epoch % 100 == 0:
                cost = self.calc_cost(self.forward_propagation(X), Y)
                print(f"Epoch {epoch}, Cost: {cost}")

    def predict(self, X):
        Y_hat = self.forward_propagation(X)
        return (Y_hat > 0.5).astype(int)

Normalized Amount feature and removed irrelevant columns for faster convergence

In [9]:
data = pd.read_csv("creditcard.csv")
data["normalized_amount"] = (data["Amount"] - data["Amount"].mean()) / data["Amount"].std()
data = data.drop(columns=["Time", "Amount"]) # Drop unnecessary columns

# features and labels
X = data.drop(columns=["Class"]).values.T
Y = data["Class"].values.reshape(1, -1)

# train
nn = NeuralNetwork(input_size=X.shape[0], hidden_size=16, output_size=1, learning_rate=0.01)
nn.train(X, Y, epochs=1000, batch_size=64)

# test
y_pred = nn.predict(X)
accuracy = np.mean(y_pred == Y)
print(f"Accuracy: {accuracy * 100}%")

Epoch 0, Cost: 0.010059005683291582
Epoch 100, Cost: 0.003125603705055983
Epoch 200, Cost: 0.002933489430289504
Epoch 300, Cost: 0.002840752665224149
Epoch 400, Cost: 0.0027966355811873196
Epoch 500, Cost: 0.002765029827188171
Epoch 600, Cost: 0.0027373481512771927
Epoch 700, Cost: 0.0027121637688535473
Epoch 800, Cost: 0.0026890145123767005
Epoch 900, Cost: 0.0026666176051912014
Accuracy: 99.94452383543944%


understnading tensors for pytorch:  
https://pytorch.org/tutorials/beginner/basics/tensorqs_tutorial.html


understanding PyTorch's autograd:  
https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html

building the network:  
https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html

choosing an optimizer:  
https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html

how to handle large datasets:  
https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [14]:
data = pd.read_csv("creditcard.csv")

# normalize Amount
scaler = StandardScaler()
data["normalized_amount"] = scaler.fit_transform(data["Amount"].values.reshape(-1, 1))
data.drop(columns=["Time", "Amount"], inplace=True)

X = data.drop(columns=["Class"]).values
Y = data["Class"].values

In [15]:
# train-dev-test split
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=42)
X_dev, X_test, Y_dev, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

In [16]:
# convert to tensors

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32).view(-1, 1)

X_dev_tensor = torch.tensor(X_dev, dtype=torch.float32)
Y_dev_tensor = torch.tensor(Y_dev, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
Y_test_tensor = torch.tensor(Y_test, dtype=torch.float32).view(-1, 1)

In [17]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [18]:
input_size = X_train.shape[1]
hidden_size = 16
output_size = 1

model = NeuralNetwork(input_size, hidden_size, output_size)

In [19]:
# loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [20]:
# training
epochs = 100
batch_size = 64

for epoch in range(epochs):
    permutation = torch.randperm(X_train_tensor.size(0))

    for i in range(0, X_train_tensor.size(0), batch_size):
        indices = permutation[i:i + batch_size]
        X_batch, Y_batch = X_train_tensor[indices], Y_train_tensor[indices]

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, Y_batch)
        loss.backward()
        optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 0.0009942662436515093
Epoch 10, Loss: 6.954128184588626e-05
Epoch 20, Loss: 9.2010423031752e-06
Epoch 30, Loss: 0.00016459383186884224
Epoch 40, Loss: 9.878210403257981e-05
Epoch 50, Loss: 1.6216266885749064e-05
Epoch 60, Loss: 3.6888632166665047e-06
Epoch 70, Loss: 9.368735845782794e-06
Epoch 80, Loss: 2.6807897484104615e-06
Epoch 90, Loss: 5.668979611073155e-06


In [24]:
# eval on test set
with torch.no_grad():
    y_pred = model(X_test_tensor)
    y_pred = (y_pred > 0.5).float()
    accuracy = (y_pred == Y_test_tensor).float().mean()
    print(f'Test Accuracy: {accuracy.item() * 100}%')

Test Accuracy: 99.92275834083557%


I chose the hyperparameters by looking at the model complexity, training stability, and generalization performance. I tested using different learning rates and batch sizes, and with activation functions to find the best balance between convergence, speed, and accuracy. I also used L2 regularization so there will not be overfitting. I also used the Adam optimizer since it tweaks the learning rate during the training.