In [1]:
import pickle
import os.path as path
from dataclasses import dataclass

import numpy as np
import torch as t
import torch.nn.functional as F
import torchtext as tt

import mlflow
import mlflow.pytorch

from haikunator import Haikunator

In [2]:
DATAROOT = path.expanduser("~/mldata/pytorch")
DEVICE = t.device("cuda" if t.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [3]:
mlruns_dir = path.expanduser("~/mlruns")
mlflow.set_tracking_uri(mlruns_dir)

In [4]:
agnews_path = path.join(DATAROOT, "agnews")
trainset_path = path.join(agnews_path, "trainset.pkl")
testset_path = path.join(agnews_path, "testset.pkl")

if path.exists(trainset_path):
    print("Found pickled datasets. Loading from there.")
    with open(trainset_path, "rb") as f:
        train_val_set = pickle.load(f)
    with open(testset_path, "rb") as f:
        testset = pickle.load(f)
else:
    print("Downloading and serializing datasets.")
    train_val_set, testset = tt.datasets.AG_NEWS(agnews_path)
    with open(trainset_path, "wb") as f:
        pickle.dump(train_val_set, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(testset_path, "wb") as f:
        pickle.dump(testset, f, protocol=pickle.HIGHEST_PROTOCOL)

Found pickled datasets. Loading from there.


In [5]:
print(len(train_val_set), len(testset))

120000 7600


In [6]:
trainsize = int(len(train_val_set) * 0.9)
valsize = len(train_val_set) - trainsize
trainset, valset = t.utils.data.random_split(train_val_set, [trainsize, valsize])
print(len(trainset), len(valset))

108000 12000


In [7]:
vocab = train_val_set.get_vocab()
n_classes = len(train_val_set.get_labels())

# Experiment 1
Lets learn the embeddings along with the rest of the NN. The NN architecture is very simple, a couple of fully connected layers with ReLU activation.

In [8]:
@dataclass
class Hyperparams:
    max_seq_len: int = 50
    embedding_dim: int = 100
    batch_size: int = 32
    epochs: int = 7
    learning_rate: float = 0.001
    clip: float = 5.0
    l2: float = 0.0

    def to_dict(self):
        return {
            "max_seq_len": self.max_seq_len,
            "embedding_dim": self.embedding_dim,
            "batch_size": self.batch_size,
            "epochs": self.epochs,
            "learning_rate": self.learning_rate,
            "clip": self.clip,
            "l2": self.l2
        }

In [9]:
def build_batch_processor(max_seq_len):
    def process_batch(batch):
        targets = t.empty(len(batch), dtype=t.long)
        contents = t.zeros(len(batch), max_seq_len, dtype=t.long)
        for idx, (target, content) in enumerate(batch):
            targets[idx] = target
            seq_len = content.shape[0]
            if seq_len >= max_seq_len:
                contents[idx] = content[:max_seq_len]
            else:
                contents[idx][:seq_len] = content
        return contents, targets
    return process_batch

In [None]:
class Simple(t.nn.Module):
    def __init__(self, vocab_size, max_seq_len, n_classes, embedding_dim):
        super().__init__()
        self.embedding = t.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.fc1 = t.nn.Linear(max_seq_len * embedding_dim, 1024)
        self.fc2 = t.nn.Linear(1024, 64)
        self.logits = t.nn.Linear(64, n_classes)

    def forward(self, contents):
        batch_size = contents.shape[0]
        x = self.embedding(contents)
        x = x.view(batch_size, -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.logits(x)

In [None]:
hparams = Hyperparams()
model = Simple(len(vocab), hparams.max_seq_len, n_classes, hparams.embedding_dim)
dl = t.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=build_batch_processor(hparams.max_seq_len))

In [None]:
contents, targets = next(iter(dl))

In [None]:
outputs = model(contents)
outputs.shape

In [15]:
def accuracy(outputs, targets):
    assert outputs.shape[0] == targets.shape[0]
    predictions = t.argmax(outputs, dim=1)
    correct = t.sum(predictions == targets).item()
    return correct / targets.shape[0]

In [16]:
def train(model, optim, loss_fn, epochs, trainloader, valloader, hparams):
    run_name = Haikunator().haikunate()
    model = model.to(DEVICE)

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(hparams.to_dict())
        for epoch in range(epochs):
            # Process the training set
            train_losses = []
            train_outputs = t.empty(0, n_classes).to(DEVICE)
            train_targets = t.tensor([], dtype=t.long).to(DEVICE)
            model.train()
            with t.enable_grad():
                for images, targets in trainloader:
                    images = images.to(DEVICE)
                    targets = targets.to(DEVICE)

                    optim.zero_grad()
                    outputs = model.forward(images)
                    loss = loss_fn(outputs, targets)
                    loss.backward()
                    # printed = print_param_stats("BEFORE CLIPPING", model, hparams.clip)
                    t.nn.utils.clip_grad_value_(model.parameters(), hparams.clip)
                    # if printed: print_param_stats("AFTER CLIPPING", model, float("-inf"))
                    optim.step()

                    train_losses.append(loss.detach().item())
                    train_outputs = t.cat((train_outputs, outputs.detach()))
                    train_targets = t.cat((train_targets, targets.detach()))
            train_loss = np.mean(train_losses)
            train_acc = accuracy(train_outputs, train_targets)

            # Calculate the validation metrics
            val_losses = []
            val_outputs = t.empty(0, n_classes).to(DEVICE)
            val_targets = t.tensor([], dtype=t.long).to(DEVICE)
            model.eval()
            with t.no_grad():
                for images, targets in valloader:
                    images = images.to(DEVICE)
                    targets = targets.to(DEVICE)
                    outputs = model(images)
                    loss = loss_fn(outputs, targets)
                    val_losses.append(loss.detach().item())
                    val_outputs = t.cat((val_outputs, outputs.detach()))
                    val_targets = t.cat((val_targets, targets.detach()))
            val_loss = np.mean(val_losses)
            val_acc = accuracy(val_outputs, val_targets)

            mlflow.log_metric("train_loss", np.around(train_loss, 3), step=epoch)
            mlflow.log_metric("val_loss", np.around(val_loss, 3), step=epoch)
            mlflow.log_metric("train_acc", np.around(train_acc, 2), step=epoch)
            mlflow.log_metric("val_acc", np.around(val_acc, 2), step=epoch)
            print(f"\nEpoch {epoch}:")
            print(f"Loss: train={train_loss:.3f}, validation={val_loss:.3f}")
            print(f"Accuracy: train={train_acc:.3f}, validaiton={val_acc:.3f}")

        mlflow.pytorch.log_model(model, "model")

In [None]:
mlflow.set_experiment("Simple")
hparams = Hyperparams()
model = Simple(len(vocab), hparams.max_seq_len, n_classes, hparams.embedding_dim)
optim = t.optim.Adam(model.parameters(), lr=hparams.learning_rate)
loss_fn = t.nn.CrossEntropyLoss()
trainloader = t.utils.data.DataLoader(
    trainset, 
    batch_size=hparams.batch_size, 
    shuffle=True, 
    collate_fn=build_batch_processor(hparams.max_seq_len)
)
valloader = t.utils.data.DataLoader(
    valset,
    batch_size=5000,
    collate_fn=build_batch_processor(hparams.max_seq_len)
)

In [None]:
train(model, optim, loss_fn, hparams.epochs, trainloader, valloader, hparams)

This model was terribly overfit. I can probably try to do the following things:
  * Regularization techniques like L2 or dropouts
  * Data augmentation (how?)

But time for second experiment.

# Experiment 2
Lets use GloVe word vectors instead of trying to learn the embeddings from scratch. But we'll still use a relatively simple architecture of fully connected layers.

In [10]:
glove_datapath = path.join(DATAROOT, "glove")
glove = tt.vocab.GloVe(name="6B", dim=100, cache=glove_datapath)

In [11]:
vocab.load_vectors(glove)

In [None]:
class SimpleGlove(t.nn.Module):
    def __init__(self, max_seq_len, embedding_dim):
        super().__init__()
        self.embedding = t.nn.Embedding.from_pretrained(vocab.vectors)
        self.fc1 = t.nn.Linear(max_seq_len * embedding_dim, 1024)
        self.fc2 = t.nn.Linear(1024, 64)
        self.logits = t.nn.Linear(64, n_classes)

    def forward(self, contents):
        batch_size = contents.shape[0]
        x = self.embedding(contents)
        x = x.view(batch_size, -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.logits(x)

The `embedding` layer is frozen. Verify this.

In [None]:
hparams = Hyperparams()
model = SimpleGlove(hparams.max_seq_len, hparams.embedding_dim)

print("embedding layer -")
for param in model.embedding.parameters():
    print(param.requires_grad)

print("\nfc1 layer -")
for param in model.fc1.parameters():
    print(param.requires_grad)

print("\nfc2 layer -")
for param in model.fc2.parameters():
    print(param.requires_grad)

print("\nlogit layer -")
for param in model.logits.parameters():
    print(param.requires_grad)

In [None]:
mlflow.set_experiment("SimpleGlove")
hparams = Hyperparams()
model = SimpleGlove(hparams.max_seq_len, hparams.embedding_dim)
optim = t.optim.Adam(model.parameters(), lr=hparams.learning_rate)
loss_fn = t.nn.CrossEntropyLoss()
trainloader = t.utils.data.DataLoader(
    trainset, 
    batch_size=hparams.batch_size, 
    shuffle=True, 
    collate_fn=build_batch_processor(hparams.max_seq_len)
)
valloader = t.utils.data.DataLoader(
    valset,
    batch_size=5000,
    collate_fn=build_batch_processor(hparams.max_seq_len)
)

In [None]:
train(model, optim, loss_fn, hparams.epochs, trainloader, valloader, hparams)

Slightly better than last time, but still overfitting like crazy. Time for experiment 3.

# Experiment 3
Use a RNN instead of an FCN along with GloVe embeddings.

In [12]:
class RnnGlove(t.nn.Module):
    def __init__(self, max_seq_len, embedding_dim):
        super().__init__()
        self.embedding = t.nn.Embedding.from_pretrained(vocab.vectors)
        self.rnn = t.nn.RNN(input_size=embedding_dim, hidden_size=128, num_layers=1, batch_first=True, nonlinearity="tanh")
        self.logits = t.nn.Linear(128, n_classes)
        
    def forward(self, contents):
        # contents \in batch_size x max_seq_len
        # x \in batch_size x max_seq_len x embedding_dim
        # h \in 1 x batch_size x hidden_size => batch_size x hidden_size
        # logits \in batch_size x n_classes
        x = self.embedding(contents)
        _, h = self.rnn(x)
        h = h.squeeze()
        return self.logits(h)

In [None]:
hparams = Hyperparams()
model = SimpleGlove(hparams.max_seq_len, hparams.embedding_dim)
dl = t.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=build_batch_processor(hparams.max_seq_len))


In [None]:
contents, targets = next(iter(dl))

In [None]:
outputs = model(contents)
outputs.shape

In [13]:
mlflow.set_experiment("RnnGlove")
hparams = Hyperparams(
    max_seq_len=50,
    embedding_dim=100,
    batch_size=64,
    epochs=10,
    learning_rate=0.00008,
    clip=0.9,
    l2=0.01
)
model = RnnGlove(hparams.max_seq_len, hparams.embedding_dim)
optim = t.optim.Adam(model.parameters(), lr=hparams.learning_rate)
loss_fn = t.nn.CrossEntropyLoss()
trainloader = t.utils.data.DataLoader(
    trainset, 
    batch_size=hparams.batch_size, 
    shuffle=True, 
    collate_fn=build_batch_processor(hparams.max_seq_len)
)
valloader = t.utils.data.DataLoader(
    valset,
    batch_size=5000,
    collate_fn=build_batch_processor(hparams.max_seq_len)
)

INFO: 'RnnGlove' does not exist. Creating a new experiment


In [17]:
train(model, optim, loss_fn, hparams.epochs, trainloader, valloader, hparams)


Epoch 0:
Loss: train=0.741, validation=0.493
Accuracy: train=0.692, validaiton=0.849

Epoch 1:
Loss: train=0.473, validation=0.465
Accuracy: train=0.856, validaiton=0.851

Epoch 2:
Loss: train=0.418, validation=0.381
Accuracy: train=0.872, validaiton=0.879

Epoch 3:
Loss: train=0.374, validation=0.350
Accuracy: train=0.883, validaiton=0.888

Epoch 4:
Loss: train=0.360, validation=0.346
Accuracy: train=0.885, validaiton=0.888

Epoch 5:
Loss: train=0.353, validation=0.342
Accuracy: train=0.886, validaiton=0.888

Epoch 6:
Loss: train=0.346, validation=0.347
Accuracy: train=0.887, validaiton=0.885

Epoch 7:
Loss: train=0.345, validation=0.353
Accuracy: train=0.888, validaiton=0.885

Epoch 8:
Loss: train=0.345, validation=0.337
Accuracy: train=0.887, validaiton=0.889

Epoch 9:
Loss: train=0.342, validation=0.332
Accuracy: train=0.888, validaiton=0.889


  "type " + obj.__name__ + ". It won't be checked "


In [None]:
def print_param_stats(caption, model, threshold):
    to_print = False

    for param in model.rnn.parameters():
        if to_print: break
        if t.max(param.grad) >= threshold or t.min(param.grad) <= -threshold:
            to_print = True
    
    for param in model.logits.parameters():
        if to_print: break
        if t.max(param.grad) >= threshold or t.min(param.grad) <= -threshold:
            to_print = True
    
    if not to_print: return False

    print(f"\n{caption} ---")
    print("embedding layer -")
    for param in model.embedding.parameters():
        print(param.requires_grad, param.shape)

    print("\nrnn layer -")
    for param in model.rnn.parameters():
        print(param.requires_grad, param.shape, t.min(param.grad), t.max(param.grad))

    print("\nlogit layer -")
    for param in model.logits.parameters():
        print(param.requires_grad, param.shape, t.min(param.grad), t.max(param.grad))
    
    return True