In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt

torch.manual_seed(42)
np.random.seed(42)

from components import Bilinear

### (a+b)mod p dataset

In [2]:
# p = 113
# data_set = np.zeros((p*p, 2*p))
# labels = np.zeros((p*p, p))
# for number1 in range(p):
#   for number2 in range(p):
#     data_set[number1*p + number2][number1] = 1
#     data_set[number1*p + number2][number2+p] = 1
#     labels[number1*p + number2][(number1 + number2) % p] = 1

# # shuffle the dataset
# shuffle = np.random.permutation(p*p)
# data_set = data_set[shuffle]
# labels = labels[shuffle]

# # divide in train and validation set
# train_proportion = 0.8
# train_data = data_set[:int(train_proportion*p*p)]
# train_labels = labels[:int(train_proportion*p*p)]
# val_data = data_set[int(train_proportion*p*p):]
# val_labels = labels[int(train_proportion*p*p):]


### (a+1)mod p and (a+2)mod p dataset

In [3]:
p = 113
# define the dataset for modular addition
data_set = np.zeros((2*p, 2*p))
labels = np.zeros((2*p, p))
idx = 0
for a in range(p):
    for offset in [10, 20]:          # only +1 and +2
        # encode a
        data_set[idx, a] = 1
        # encode offset (put offset at position p+offset)
        data_set[idx, p + offset] = 1
        # label = (a + offset) mod p
        labels[idx, (a + offset) % p] = 1
        idx += 1
        
        
# shuffle the dataset
shuffle = np.random.permutation(2*p)
data_set = data_set[shuffle]
labels = labels[shuffle]

# divide in train and validation set
train_proportion = 0.8
train_data = data_set[:int(train_proportion*2*p)]
train_labels = labels[:int(train_proportion*2*p)]
val_data = data_set[int(train_proportion*2*p):]
val_labels = labels[int(train_proportion*2*p):]

In [4]:
# Check if CUDA is available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# convert to tensors
train_data = torch.from_numpy(train_data).float().to(device)
train_labels = torch.from_numpy(train_labels).float().to(device)
val_data = torch.from_numpy(val_data).float().to(device)
val_labels = torch.from_numpy(val_labels).float().to(device)

# define the 1-hidden layer MLP
class MLP(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.bl1 = Bilinear(input_size, hidden_size, bias=False)
        self.bl2 = Bilinear(hidden_size, output_size, bias=False)
        self.output_size = output_size
        self.hidden_size = hidden_size

    def forward(self, x, val=False):
        pre_relu = self.bl1(x)
        post_relu = torch.nn.functional.relu(pre_relu)
        x = self.bl2(post_relu)
        return x, pre_relu, post_relu



# define the training loop
def train(model, train_data, train_labels, val_data, val_labels, epochs, batch_size, lr):
    optimzer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.5)
    loss_fn = torch.nn.CrossEntropyLoss()

    train_loss_values = []
    val_loss_values = []
    train_acc_values = []
    val_acc_values = []

    running_train_loss = 0
    print(epochs)
    for epoch in range(epochs):
        model.train()
        correct_train_preds = 0
        total_train_preds = 0
        for batch in range(0, len(train_data), batch_size):
            optimzer.zero_grad()
            output, pre_relu, post_relu = model(train_data[batch:batch+batch_size])
            loss = loss_fn(output, torch.argmax(train_labels[batch:batch+batch_size], axis=1)) #+ 0.0000001*l1_norm
            running_train_loss += loss.item()
            preds = torch.argmax(output, axis=1)
            correct_train_preds += (preds == torch.argmax(train_labels[batch:batch+batch_size], axis=1)).sum().item()
            total_train_preds += len(preds)
            loss.backward()
            optimzer.step()
        model.eval()


        output, _, _ = model(val_data, val=True)
        val_loss = loss_fn(output, torch.argmax(val_labels, axis=1)).item()
        val_preds = torch.argmax(output, axis=1)
        correct_val_preds = (val_preds == torch.argmax(val_labels, axis=1)).sum().item()
        total_val_preds = len(val_preds)
        avg_train_loss = running_train_loss / (len(train_data) / batch_size)
        train_acc = correct_train_preds / total_train_preds
        val_acc = correct_val_preds / total_val_preds
        train_loss_values.append(avg_train_loss)
        val_loss_values.append(val_loss)
        train_acc_values.append(train_acc)
        val_acc_values.append(val_acc)

        print("Epoch: {} | Train loss: {:.2f} | Validation loss: {:.2f} | Train accuracy: {:.2f} | Validation accuracy: {:.2f}".format(epoch, avg_train_loss, val_loss, train_acc, val_acc))

        running_train_loss = 0
    return model, train_loss_values, val_loss_values, train_acc_values, val_acc_values

# train the model
model = MLP(2*p, 50, p).to(device)
num_epochs = 500
model, train_loss_values, val_loss_values, train_acc_values, val_acc_values = train(model, train_data, train_labels, val_data, val_labels, num_epochs, 128, 0.003)


500
Epoch: 0 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.01 | Validation accuracy: 0.00
Epoch: 1 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.05 | Validation accuracy: 0.00
Epoch: 2 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.14 | Validation accuracy: 0.00
Epoch: 3 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.25 | Validation accuracy: 0.00
Epoch: 4 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.34 | Validation accuracy: 0.00
Epoch: 5 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.45 | Validation accuracy: 0.00
Epoch: 6 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.49 | Validation accuracy: 0.00
Epoch: 7 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.53 | Validation accuracy: 0.00
Epoch: 8 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: 0.61 | Validation accuracy: 0.00
Epoch: 9 | Train loss: 6.72 | Validation loss: 4.73 | Train accuracy: