In [6]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import random
import torch.nn as nn
import torch.nn.functional as F
from sklearn.decomposition import PCA       # Needed for PCA of embeddings
import os

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
    print("Using GPU", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

Using CPU


In [7]:
SEED = 42

def set_seed(seed_value=0):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

In [8]:
# Prime number for modular addition
P = 53

# Create the dataset
set_seed(SEED)
data = []
for i in range(P):
    for j in range(P):
        data.append([i,j,(i+j)%P])
data = np.array(data)

# Split into train and test
TRAIN_FRACTION = 0.5
np.random.shuffle(data)
train_data = data[:int(len(data) * TRAIN_FRACTION)]
test_data = data[int(len(data) * TRAIN_FRACTION):]

# Convert to tensors and create dataloaders with batch size
BATCH_SIZE = 32
train_data = torch.tensor(train_data, dtype=torch.long, device=device)
test_data = torch.tensor(test_data, dtype=torch.long, device=device)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

D_EMBED = 128
HIDDEN = 256
P = 59

class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.embed = nn.Embedding(P, D_EMBED)
        self.rnn = nn.RNN(input_size=D_EMBED, hidden_size=HIDDEN, batch_first=True)
        self.linear = nn.Linear(HIDDEN, P)
        self.init_weights()

    def forward(self, x1, x2):
        x1 = self.embed(x1)
        x2 = self.embed(x2)
        x = torch.stack((x1, x2), dim=1)
        _, h_n = self.rnn(x)
        x = self.linear(h_n.squeeze(0))
        return x

    # Weight initialization
    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.xavier_normal_(m.weight)
            if isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                nn.init.zeros_(m.bias)

In [10]:
NB_EPOCHS = 5000
LEARNING_RATE = 0.0003
WEIGHT_DECAY = 1

set_seed(SEED)
model = RNN().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Recording
REPORT_INTERVAL = 20    # How often we print
SAVE_INTERVAL = 50      # How often we save
model_folder = 'intermediary'
# Create folder if it doesn't exist
if not os.path.exists(model_folder):
    os.makedirs(model_folder)

train_loss_history = []
train_acc_history = []
test_loss_history = []
test_acc_history = []

for epoch in range(NB_EPOCHS):
    
    # Training phase
    model.train()
    train_loss = 0.0
    train_acc = 0.0
    for batch in train_loader:
        x1, x2, y = batch[:,0], batch[:,1], batch[:,2]
        optimizer.zero_grad()
        output = model(x1,x2)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_acc += (output.argmax(dim=1) == y).sum().item()
        
    train_loss /= len(train_loader)
    train_loss_history.append(train_loss)
    train_acc /= len(train_data)
    train_acc_history.append(train_acc)
    
    # Testing phase
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        test_acc = 0.0
        for batch in test_loader:
            x1, x2, y = batch[:,0], batch[:,1], batch[:,2]
            output = model(x1,x2)
            loss = criterion(output, y)
            test_loss += loss.item()
            test_acc += (output.argmax(dim=1) == y).sum().item()
        
        test_loss /= len(test_loader)
        test_loss_history.append(test_loss)
        test_acc /= len(test_data)
        test_acc_history.append(test_acc)
        
    if epoch % REPORT_INTERVAL == 0:
        print(f"{epoch}/{NB_EPOCHS}: Train loss={train_loss:.4f}, acc={100*train_acc:.1f}%  /  Test loss={test_loss:.4f}, acc={100*test_acc:.1f}%")
    
    if epoch % SAVE_INTERVAL == 0:
        # Save model in intermediary folder
        torch.save(model.state_dict(), f"{model_folder}/model_{epoch}.pth")
        
torch.save(model.state_dict(), f"model.pth")

0/5000: Train loss=4.0707, acc=0.9%  /  Test loss=4.0608, acc=1.6%
20/5000: Train loss=3.6826, acc=6.4%  /  Test loss=4.4905, acc=0.0%
40/5000: Train loss=3.4797, acc=7.0%  /  Test loss=5.0123, acc=0.0%
60/5000: Train loss=3.4112, acc=8.5%  /  Test loss=5.1950, acc=0.0%
80/5000: Train loss=3.3849, acc=8.3%  /  Test loss=5.2611, acc=0.0%
100/5000: Train loss=3.3739, acc=8.0%  /  Test loss=5.2920, acc=0.0%
120/5000: Train loss=3.3670, acc=7.5%  /  Test loss=5.3152, acc=0.0%
140/5000: Train loss=3.3621, acc=7.8%  /  Test loss=5.3320, acc=0.0%
160/5000: Train loss=3.3597, acc=7.4%  /  Test loss=5.3472, acc=0.0%
180/5000: Train loss=3.3562, acc=7.1%  /  Test loss=5.3566, acc=0.0%
200/5000: Train loss=3.3535, acc=7.6%  /  Test loss=5.3685, acc=0.0%
220/5000: Train loss=3.3525, acc=8.1%  /  Test loss=5.3783, acc=0.0%
240/5000: Train loss=3.3487, acc=8.0%  /  Test loss=5.3889, acc=0.0%
260/5000: Train loss=3.3456, acc=7.5%  /  Test loss=5.3974, acc=0.0%
280/5000: Train loss=3.3405, acc=8.0%  /