In [20]:
import numpy as np

class MultiClassMLP:
    def __init__(self, input_dim, hidden_dims, output_dim, activation='relu', random_seed=42):
        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.output_dim = output_dim
        self.layers = []
        self.num_layers = len(hidden_dims) + 1
        
        # Initialize weights and biases for each layer
        np.random.seed(random_seed)
        layer_dims = [input_dim] + hidden_dims + [output_dim]
        self.activations = [self._sigmoid if activation == 'sigmoid' else self._tanh if activation == 'tanh' else self._relu for i in range(self.num_layers)]
        for i in range(self.num_layers):
            fan_in = layer_dims[i]
            fan_out = layer_dims[i+1]
            W = np.random.randn(fan_in, fan_out) / np.sqrt(fan_in)
            b = np.zeros((1, fan_out))
            self.layers.append({'W': W, 'b': b})
        
    def _softmax(self, X):
        exps = np.exp(X - np.max(X, axis=1, keepdims=True))
        return exps / np.sum(exps, axis=1, keepdims=True)
    
    def _tanh(self, X):
        return np.tanh(X)
    
    def _sigmoid(self, X):
        return 1 / (1 + np.exp(-X))
    
    def _relu(self, X):
        return np.maximum(0, X)

    def delta_cross_entropy(self, X, y):
        """
        X is the output from fully connected layer (num_examples x num_classes)
        y is labels (num_examples x 1)
            Note that y is not one-hot encoded vector.
            It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.
        """
        m = y.shape[0]
        grad = self._softmax(X)
        grad[range(m), y] -= 1
        grad = grad / m
        return grad
    
    def _forward(self, X):
        activations = [X]
        for i in range(self.num_layers):
            Z = np.dot(activations[-1], self.layers[i]['W']) + self.layers[i]['b']
            A = self.activations[i](Z)
            activations.append(A)
        probs = self._softmax(activations[-1]) # Using softmax activation function for output layer
        return activations, probs
    
    def _backward(self, X, y, activations, probs, learning_rate):
        dL_dO = probs - y
        for i in reversed(range(self.num_layers)):
            if self.activations[i] == self._sigmoid:
                dA = dL_dO * activations[i+1] * (1 - activations[i+1])
            elif self.activations[i] == self._tanh:
                dA = dL_dO * (1 - activations[i+1]**2) # Derivative of tanh activation function
            elif self.activations[i] == self._relu:
                dA = dL_dO * np.where(activations[i+1] > 0, 1, 0) # Derivative of ReLU activation function
            else:
                raise ValueError("Invalid activation function")
                
            dZ = np.dot(dA, self.layers[i]['W'].T)
            dW = np.dot(activations[i].T, dA)
            db = np.sum(dA, axis=0, keepdims=True)
            self.layers[i]['W'] -= learning_rate * dW
            self.layers[i]['b'] -= learning_rate * db
            
            dL_dO = dZ
        
    def train(self, X_train, y_train, learning_rate=0.1, num_epochs=100):
        Y_train = np.eye(self.output_dim)[y_train]
        for epoch in range(num_epochs):
            activations, probs = self._forward(X_train)
            self._backward(X_train, Y_train, activations, probs, learning_rate)
            loss = -np.sum(Y_train * np.log(probs)) / X_train.shape[0]
            if epoch % 10 == 0:
                print(f"Epoch {epoch}: Loss={loss:.4f}")
                
    def predict_proba(self, X):
        _, probs = self._forward(X)
        return probs
    
    def predict(self, X):
        probs = self.predict_proba(X)
        return np.argmax(probs, axis=1)




In [2]:
import matplotlib.pyplot as plt
# torch is just for the feature extractor and the dataset (NOT FOR IMPLEMENTING NEURAL NETWORKS!)
import torch
from torchvision import datasets
import torchvision.transforms as transforms
from torchvision.models import resnet34
import torch.nn as nn
# sklearn is just for evaluation (NOT FOR IMPLEMENTING NEURAL NETWORKS!)
from sklearn.metrics import confusion_matrix, f1_score

In [3]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
feature_extractor = resnet34(pretrained=True)
input_dim = feature_extractor.fc.in_features
for param in feature_extractor.parameters():
  param.requires_grad = False

feature_extractor.fc = nn.Identity()

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /root/.cache/torch/hub/checkpoints/resnet34-b627a593.pth
100%|██████████| 83.3M/83.3M [00:00<00:00, 230MB/s]


In [4]:
train_data = datasets.CIFAR10('data', train=True,
                              download=True, transform=transform)
test_data = datasets.CIFAR10('data', train=False,
                             download=True, transform=transform)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:02<00:00, 78155759.13it/s]


Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified


In [5]:
from torch.utils.data import DataLoader

In [6]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

In [7]:
embeddings = []
labels = []
i = 0
for x, y in train_loader:
  if i == 50:
    break
  i += 1
  print(i)
  embeddings += feature_extractor(x)
  labels += y

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


In [8]:
 len(next(iter(train_loader))[0])

64

In [9]:
len(embeddings)

3200

In [10]:
labels

[tensor(9),
 tensor(9),
 tensor(4),
 tensor(1),
 tensor(6),
 tensor(2),
 tensor(6),
 tensor(1),
 tensor(9),
 tensor(3),
 tensor(3),
 tensor(5),
 tensor(3),
 tensor(8),
 tensor(5),
 tensor(1),
 tensor(8),
 tensor(4),
 tensor(5),
 tensor(1),
 tensor(2),
 tensor(3),
 tensor(6),
 tensor(7),
 tensor(5),
 tensor(7),
 tensor(1),
 tensor(1),
 tensor(9),
 tensor(7),
 tensor(7),
 tensor(1),
 tensor(6),
 tensor(3),
 tensor(8),
 tensor(8),
 tensor(9),
 tensor(0),
 tensor(1),
 tensor(9),
 tensor(8),
 tensor(1),
 tensor(9),
 tensor(1),
 tensor(4),
 tensor(0),
 tensor(9),
 tensor(4),
 tensor(7),
 tensor(6),
 tensor(3),
 tensor(6),
 tensor(8),
 tensor(4),
 tensor(0),
 tensor(5),
 tensor(5),
 tensor(8),
 tensor(9),
 tensor(1),
 tensor(8),
 tensor(7),
 tensor(7),
 tensor(6),
 tensor(2),
 tensor(2),
 tensor(8),
 tensor(0),
 tensor(9),
 tensor(2),
 tensor(7),
 tensor(0),
 tensor(2),
 tensor(0),
 tensor(4),
 tensor(4),
 tensor(6),
 tensor(1),
 tensor(5),
 tensor(4),
 tensor(9),
 tensor(0),
 tensor(0),
 ten

In [11]:
# data_tensor = torch.cat(embeddings, dim=0)

# Convert the tensor to a NumPy array
X = np.array(embeddings)
# data_tensor = torch.cat(labels, dim=0)
for i in range(len(X)):
  X[i] = np.array(X[i])
# Convert the tensor to a NumPy array
Y = np.array(labels)

  X = np.array(embeddings)
  X = np.array(embeddings)


In [12]:
result = []
for i in X:
  result.append(i)
result = np.array(result)
result.shape

(3200, 512)

In [31]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1, max_iter=1000, hidden_layer_sizes=[20]).fit(result, Y)
clf.score(result, Y)

0.999375

In [35]:
mlp = MultiClassMLP(input_dim=512, hidden_dims=[100], output_dim=10, activation="tanh")
mlp.train(result, Y, learning_rate=0.001, num_epochs=1000)

Epoch 0: Loss=2.4453
Epoch 10: Loss=2.3689
Epoch 20: Loss=2.1965
Epoch 30: Loss=2.2292
Epoch 40: Loss=2.1404
Epoch 50: Loss=2.1093
Epoch 60: Loss=2.0677
Epoch 70: Loss=2.0357
Epoch 80: Loss=1.8525
Epoch 90: Loss=1.7978
Epoch 100: Loss=1.7345
Epoch 110: Loss=1.8562
Epoch 120: Loss=1.5971
Epoch 130: Loss=1.7594
Epoch 140: Loss=1.7704
Epoch 150: Loss=1.7224
Epoch 160: Loss=1.6534
Epoch 170: Loss=1.5692
Epoch 180: Loss=1.5759
Epoch 190: Loss=1.6665
Epoch 200: Loss=1.5495
Epoch 210: Loss=1.3887
Epoch 220: Loss=1.4652
Epoch 230: Loss=1.5551
Epoch 240: Loss=1.4326
Epoch 250: Loss=1.4783
Epoch 260: Loss=1.4594
Epoch 270: Loss=1.4211
Epoch 280: Loss=1.2683
Epoch 290: Loss=1.4031
Epoch 300: Loss=1.2291
Epoch 310: Loss=1.6110
Epoch 320: Loss=1.3188
Epoch 330: Loss=1.3167
Epoch 340: Loss=1.2982
Epoch 350: Loss=1.2409
Epoch 360: Loss=1.2686
Epoch 370: Loss=1.2485
Epoch 380: Loss=1.2787
Epoch 390: Loss=1.2009
Epoch 400: Loss=1.2387
Epoch 410: Loss=1.1639
Epoch 420: Loss=1.1168
Epoch 430: Loss=1.0988

In [36]:
counter = 0
for i, t in enumerate(result):
  if mlp.predict(t) == Y[i]:
    counter += 1
print(counter / len(result))

0.8690625


In [23]:
X_train = np.random.randn(100, 10)
y_train = np.random.randint(0, 5, size=100)
# Y_train = np.eye(5)[y_train]

# Initialize and train the MLP
mlp2 = MultiClassMLP(input_dim=10, hidden_dims=[20], output_dim=5, activation="sigmoid")
mlp2.train(X_train, y_train, learning_rate=0.1, num_epochs=1000)

Epoch 0: Loss=1.5996
Epoch 10: Loss=1.5479
Epoch 20: Loss=1.5098
Epoch 30: Loss=1.4817
Epoch 40: Loss=1.4593
Epoch 50: Loss=1.4382
Epoch 60: Loss=1.4175
Epoch 70: Loss=1.3972
Epoch 80: Loss=1.3778
Epoch 90: Loss=1.3594
Epoch 100: Loss=1.3424
Epoch 110: Loss=1.3268
Epoch 120: Loss=1.3122
Epoch 130: Loss=1.2984
Epoch 140: Loss=1.2854
Epoch 150: Loss=1.2729
Epoch 160: Loss=1.2605
Epoch 170: Loss=1.2485
Epoch 180: Loss=1.2379
Epoch 190: Loss=1.2283
Epoch 200: Loss=1.2195
Epoch 210: Loss=1.2114
Epoch 220: Loss=1.2039
Epoch 230: Loss=1.1970
Epoch 240: Loss=1.1905
Epoch 250: Loss=1.1842
Epoch 260: Loss=1.1781
Epoch 270: Loss=1.1726
Epoch 280: Loss=1.1675
Epoch 290: Loss=1.1631
Epoch 300: Loss=1.1593
Epoch 310: Loss=1.1560
Epoch 320: Loss=1.1531
Epoch 330: Loss=1.1504
Epoch 340: Loss=1.1480
Epoch 350: Loss=1.1458
Epoch 360: Loss=1.1438
Epoch 370: Loss=1.1418
Epoch 380: Loss=1.1399
Epoch 390: Loss=1.1381
Epoch 400: Loss=1.1364
Epoch 410: Loss=1.1347
Epoch 420: Loss=1.1332
Epoch 430: Loss=1.1318