Logistic Regression model to distinguish between forged notes and genuine bank notes.

# install libraries

# load dataset

In [1]:
import pandas as pd

In [6]:
# header=None: the first line of the file is treated as data, and pandas labels
# the columns with integers (0..4). Columns 0-3 are used as features and
# column 4 as the class label in the cells below.
df = pd.read_csv('data_banknote_authentication.txt', header=None)
df.head(3)

Unnamed: 0,0,1,2,3,4
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0


Check the distribution of labels

In [8]:
import numpy as np

# Class balance check: entry i of the result is the number of rows whose
# label (column 4) equals i.
np.bincount(df[4])

array([762, 610])

Define features and labels

In [12]:
# Columns 0-3 form the feature matrix and column 4 the label vector;
# .values hands back plain NumPy arrays instead of pandas objects.
X_features = df[[0, 1, 2, 3]].values
y_labels = df[4].values

In [13]:
# Quick look at the label vector (NumPy abbreviates long arrays in the repr).
y_labels

array([0, 0, 0, ..., 1, 1, 1])

In [14]:
# Quick look at the feature matrix: one row per sample, one column per feature.
X_features

array([[  3.6216 ,   8.6661 ,  -2.8073 ,  -0.44699],
       [  4.5459 ,   8.1674 ,  -2.4586 ,  -1.4621 ],
       [  3.866  ,  -2.6383 ,   1.9242 ,   0.10645],
       ...,
       [ -3.7503 , -13.4586 ,  17.5932 ,  -2.7771 ],
       [ -3.5637 ,  -8.3827 ,  12.393  ,  -1.2823 ],
       [ -2.5419 ,  -0.65804,   2.6842 ,   1.1952 ]])

In [15]:
# Sanity check: (number of samples, 4 feature columns).
X_features.shape

(1372, 4)

In [16]:
# Sanity check: one label per row of X_features.
y_labels.shape

(1372,)

# split data into 80% train, 20% val

In [20]:
# int() truncates, so the training split gets the floor of 80% of the rows.
train_size = int(X_features.shape[0]*0.80)
train_size

1097

In [22]:
# The validation split takes every remaining row, so the two sizes add up
# exactly to the number of rows in the dataset.
val_size = X_features.shape[0] - train_size
val_size

275

# define a custom Dataset (the DataLoaders are created in the next cell)

In [23]:
from torch.utils.data import Dataset, DataLoader


class MyDataset(Dataset):
    """In-memory dataset that pairs each feature row with its label.

    Both inputs are copied into float32 tensors once, at construction time,
    so indexing afterwards is a plain tensor lookup.

    Args:
        X: Array-like of features; the first dimension indexes samples.
        y: Array-like of labels with one entry per sample.

    Raises:
        ValueError: If ``X`` or ``y`` has no sample dimension, or if they do
            not contain the same number of samples.
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        labels = torch.tensor(y, dtype=torch.float32)

        if features.dim() == 0 or labels.dim() == 0:
            raise ValueError("X and y must each have a leading sample dimension")

        # Fail fast on misaligned inputs: __len__ only looks at the labels, so
        # a mismatch would otherwise either silently ignore surplus feature
        # rows or raise an IndexError in the middle of an epoch.
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                f"X and y must contain the same number of samples, "
                f"got {features.shape[0]} and {labels.shape[0]}"
            )

        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.labels.shape[0]

# generate training and val set

In [28]:
import torch

dataset = MyDataset(X_features, y_labels)

# Use a dedicated, explicitly seeded generator for the split. The global torch
# seed is only set later (in the training cell), so relying on the global RNG
# here would put different rows in train/val after every kernel restart and
# make the reported accuracies non-reproducible.
split_generator = torch.Generator().manual_seed(1)

train_set, val_set = torch.utils.data.random_split(
    dataset,
    (train_size, val_size),
    generator=split_generator,
)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=12,
    shuffle=True,
)

# Validation order does not matter for accuracy, so keep it fixed.
val_loader = DataLoader(
    dataset=val_set,
    batch_size=10,
    shuffle=False,
)

# implement the model

In [30]:
import torch

class LogisticRegression(torch.nn.Module):
    """A single linear layer followed by a sigmoid.

    Args:
        num_features: Length of each input sample's feature vector.
    """

    def __init__(self, num_features):
        super().__init__()
        # One output unit: the layer produces a single score per sample.
        self.linear = torch.nn.Linear(in_features=num_features, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        return torch.sigmoid(self.linear(x))

# training

In [35]:
import torch.nn.functional as F

# Fix the global seed so weight initialisation and batch shuffling repeat.
torch.manual_seed(1)
model = LogisticRegression(num_features=4)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

num_epochs = 50

for epoch in range(num_epochs):
    model = model.train()
    for batch_idx, (features, class_labels) in enumerate(train_loader):
        probas = model(features)
        # The labels come out of the loader as a flat vector; reshape them to
        # the model's output shape before computing the loss.
        targets = class_labels.view(probas.shape)
        loss = F.binary_cross_entropy(probas, targets)

        # Parameter update: clear old gradients, backpropagate, take a step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        ### LOGGING
        if batch_idx % 20 == 0:  # log every 20th batch
            print(f'Epoch: {epoch+1:03d}/{num_epochs:03d}'
                   f' | Batch {batch_idx:03d}/{len(train_loader):03d}'
                   f' | Loss: {loss:.2f}')
        

Epoch: 001/050 | Batch 000/092 | Loss: 1.03
Epoch: 001/050 | Batch 020/092 | Loss: 0.57
Epoch: 001/050 | Batch 040/092 | Loss: 0.64
Epoch: 001/050 | Batch 060/092 | Loss: 0.39
Epoch: 001/050 | Batch 080/092 | Loss: 0.37
Epoch: 002/050 | Batch 000/092 | Loss: 0.57
Epoch: 002/050 | Batch 020/092 | Loss: 0.28
Epoch: 002/050 | Batch 040/092 | Loss: 0.18
Epoch: 002/050 | Batch 060/092 | Loss: 0.24
Epoch: 002/050 | Batch 080/092 | Loss: 0.14
Epoch: 003/050 | Batch 000/092 | Loss: 0.49
Epoch: 003/050 | Batch 020/092 | Loss: 0.25
Epoch: 003/050 | Batch 040/092 | Loss: 0.31
Epoch: 003/050 | Batch 060/092 | Loss: 0.12
Epoch: 003/050 | Batch 080/092 | Loss: 0.36
Epoch: 004/050 | Batch 000/092 | Loss: 0.29
Epoch: 004/050 | Batch 020/092 | Loss: 0.31
Epoch: 004/050 | Batch 040/092 | Loss: 0.24
Epoch: 004/050 | Batch 060/092 | Loss: 0.26
Epoch: 004/050 | Batch 080/092 | Loss: 0.28
Epoch: 005/050 | Batch 000/092 | Loss: 0.32
Epoch: 005/050 | Batch 020/092 | Loss: 0.15
Epoch: 005/050 | Batch 040/092 |

In [41]:
def compute_accuracy(model, dataloader):
    """Return the fraction of examples in ``dataloader`` that ``model`` classifies correctly.

    The model is switched to eval mode (and left there). Its outputs are
    treated as probabilities and thresholded at 0.5 to obtain 0/1 predictions.

    Args:
        model: Module mapping a batch of features to per-example probabilities.
        dataloader: Iterable yielding ``(features, class_labels)`` batches.

    Returns:
        The accuracy as a scalar tensor.
    """
    model = model.eval()

    num_correct = 0.0
    num_seen = 0

    for features, class_labels in dataloader:
        with torch.no_grad():
            probas = model(features)

        # Hard 0/1 predictions from the probabilities.
        predicted = (probas > 0.5).to(torch.int64)
        # Bring the labels to the predictions' shape and dtype before comparing.
        targets = class_labels.view(predicted.shape).to(predicted.dtype)

        matches = predicted.eq(targets)
        num_correct += matches.sum()
        num_seen += len(matches)

    return num_correct / num_seen

In [42]:
# Accuracy on the same split the model was fitted on.
train_acc = compute_accuracy(model, train_loader)
train_acc

tensor(0.9854)

In [43]:
# Accuracy on the held-out validation split, which the training loop never saw.
val_acc = compute_accuracy(model, val_loader)
val_acc

tensor(0.9964)