# Bayesian Transfer Learning for Deep Networks

In this project we are concerned with **Bayesian Deep Learning**. Specifically, we want to know whether having a deep Bayesian model will improve the transfer of learning. Our hypothesis is that knowledge gained from training a model on task **A**, with the learned weights then used as the basis for learning task **B**, will perform better than training on **B** from scratch - assuming the domains are similar.

![Transfer Learning](https://image.slidesharecdn.com/13aibigdata-160606103446/95/aibigdata-lab-2016-transfer-learning-7-638.jpg?cb=1465209397)

We use Bayes by Backprop, introduced by [Blundell et al., 2015](https://arxiv.org/abs/1505.05424), to learn a probability distribution over each of the weights in the network. These weight distributions are fitted using variational inference given some prior.

By inferring the posterior weight distribution in task **A** $p(w|D_A)$, a model is trained which is able to solve the second task **B** when exposed to new data $D_B$, while remembering task **A**. Variational Bayesian approximations of $p(w|D_A)$ are considered for this operation.

> The model constructed in this notebook tries to dynamically adapt its weights when confronted with new tasks. A method named **elastic weight consolidation (EWC)** ([Kirkpatrick, 2016](http://www.pnas.org/content/114/13/3521.full.pdf)) is implemented that considers data from two different tasks as independent.

### Import packages

In [None]:
import gc
import pickle

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm import tqdm

from auxiliary import merge_add, merge_average
from bbbmlp import BBBMLP
from datasets import LimitedMNIST
from loggers import PrintLogger, WeightLogger

### Hyperparameters

In [None]:
# Use the GPU whenever PyTorch reports one is available.
cuda = torch.cuda.is_available()

# Optimisation schedule.
NUM_EPOCHS = 50  # number of epochs each experiment is trained for
SAVE_EVERY = 9   # weights are dumped whenever epoch % SAVE_EVERY == 0
LR = 0.001       # learning rate handed to Adam
batch_size = 64  # mini-batch size used by both data loaders

# Each mini-batch is repeated N_SAMPLES times before it is fed to the model.
N_SAMPLES = 1

# Root directory passed to LimitedMNIST.
MNIST = "./"

### call log functions for weights

In [None]:
# Receives the model, epoch index and diagnostics through file_logger.dump(...).
file_logger = WeightLogger()
# Receives the epoch index and diagnostics through print_logger.dump(...).
print_logger = PrintLogger()

To attain a tighter bound on the objective, we use [normalizing flows](https://arxiv.org/abs/1505.05770).

In [None]:
# Number of normalizing-flow steps; forwarded to BBBMLP as `nflows`.
number_of_flows = 16

### Define network

Bayes by Backprop Multi Layer Perceptron (BBBMLP) with 2 hidden layers, each with 100 units

In [None]:
# Units per hidden layer; forwarded to BBBMLP as `num_hidden`.
hidden = 100
# Number of hidden layers; forwarded to BBBMLP as `num_layers`.
layers = 2


def get_model(digits):
    """Build a fresh BBBMLP classifier for the given digit classes.

    The network takes 784 input features and has one output class per entry
    of ``digits``. Width, depth and flow count come from the module-level
    ``hidden``, ``layers`` and ``number_of_flows``. When ``cuda`` is set,
    ``.cuda()`` is called on the model before it is returned.
    """
    settings = dict(
        in_features=784,
        num_class=len(digits),
        num_hidden=hidden,
        num_layers=layers,
        p_logvar_init=0.,
        p_pi=1.0,
        q_logvar_init=-5,
        nflows=number_of_flows,
    )
    network = BBBMLP(**settings)

    if cuda:
        network.cuda()

    return network

def get_loaders(digits, fraction):
    """Create the training and validation data loaders for a digit subset.

    Args:
        digits: Digit classes to keep; forwarded to ``LimitedMNIST``.
        fraction: Forwarded to ``LimitedMNIST`` for both the training and
            the validation split.

    Returns:
        A ``(loader_train, loader_val)`` tuple of ``DataLoader`` objects.
    """
    # Labels are shifted so that the smallest selected digit becomes class 0.
    # The minimum is computed once instead of once per sample.
    lowest_digit = min(digits)

    def shift_label(label):
        return label - lowest_digit

    mnist_train = LimitedMNIST(root=MNIST, set_type="train",
                               target_transform=shift_label,
                               digits=digits, fraction=fraction)

    mnist_val = LimitedMNIST(root=MNIST, set_type="validation",
                             target_transform=shift_label,
                             digits=digits, fraction=fraction)

    loader_train = DataLoader(mnist_train, batch_size=batch_size, num_workers=2, pin_memory=cuda)
    loader_val = DataLoader(mnist_val, batch_size=batch_size, num_workers=2, pin_memory=cuda)

    return (loader_train, loader_val)

In [None]:
def train_model(filename, digits=None, fraction=1.0, pretrained=False):
    """Train a BBBMLP on a subset of MNIST digits and log the run.

    Every epoch runs one training pass followed by two validation passes
    (one with ``MAP=False`` and one with ``MAP=True``). Diagnostics are
    handed to the print logger every epoch and to the weight logger every
    ``SAVE_EVERY`` epochs, plus once before and once after training.

    Args:
        filename: Name passed to ``initialise`` on both loggers.
        digits: Digit classes to train on. Defaults to ``[0]``.
        fraction: Forwarded to ``get_loaders``.
        pretrained: When true, the model starts from the state dict pickled
            at ``original/weights/model_epoch49.pkl``.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if digits is None:
        digits = [0]

    # Prior log-variance reported to the weight logger. It mirrors the value
    # get_model() passes to BBBMLP; the name was previously undefined here.
    p_logvar_init = 0.

    (loader_train, loader_val) = get_loaders(digits, fraction)
    model = get_model(digits)

    if pretrained:
        path = "original/weights/model_epoch49.pkl"
        # NOTE: unpickling can execute arbitrary code; only load checkpoints
        # that were produced by a trusted run.
        with open(path, "rb") as checkpoint:
            d = pickle.load(checkpoint)
        model.load_state_dict(d)

    file_logger.initialise(filename)
    print_logger.initialise(filename)

    # Create optimizer (only over parameters that require gradients)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=LR)

    def run_epoch(loader, MAP=False, is_training=False):
        """Run one pass over ``loader`` and return batch-averaged diagnostics."""
        diagnostics = {}
        # len(loader) is the number of batches actually yielded, including a
        # trailing partial batch; floor-dividing the dataset size would
        # under-count it and inflate the averages.
        nbatch_per_epoch = len(loader)

        for data, labels in tqdm(loader):
            # Repeat samples
            x = data.repeat(N_SAMPLES, 1, 1, 1)
            # NOTE(review): on current PyTorch a 0 in repeat() requests zero
            # copies along that dimension - confirm this yields the intended
            # label shape for the PyTorch version in use.
            y = labels.repeat(N_SAMPLES, 0)

            if cuda:
                x = x.cuda()
                y = y.cuda()

            logits, loss, _diagnostics = model.getloss(Variable(x), Variable(y),
                                                       dataset_size=len(loader.dataset), MAP=MAP)
            diagnostics = merge_add(diagnostics, _diagnostics)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        diagnostics = merge_average(diagnostics, nbatch_per_epoch)
        return diagnostics

    diagnostics_batch_train, diagnostics_batch_valid, diagnostics_batch_valid_MAP = [], [], []

    # Snapshot of the initial state, logged as epoch -1.
    file_logger.dump(model, -1, None, p_logvar_init)

    for epoch in range(NUM_EPOCHS):
        diagnostics_batch_train += [run_epoch(loader_train, is_training=True)]
        diagnostics_batch_valid += [run_epoch(loader_val)]
        diagnostics_batch_valid_MAP += [run_epoch(loader_val, MAP=True)]

        batch_diagnostics = [diagnostics_batch_train, diagnostics_batch_valid, diagnostics_batch_valid_MAP]

        if epoch % SAVE_EVERY == 0:
            file_logger.dump(model, epoch, batch_diagnostics, p_logvar_init)

        print_logger.dump(epoch, batch_diagnostics)

        gc.collect()

    # Always log the final epoch, whether or not it falls on SAVE_EVERY.
    file_logger.dump(model, epoch, batch_diagnostics, p_logvar_init)

### Parameters for experiment

In [None]:
# Task A: the digit classes the first model is trained on.
digits = list(range(5))
# Task B: the digit classes used for the from-scratch and transfer runs.
transfer = list(range(5, 10))

### 1. Call model to train it first on the data of 'digits'

To get distribution $p(w | D_A)$

In [None]:
# Train on task A with fraction=1.0; "original" is the name handed to the loggers.
train_model("original", digits, fraction=1.0)

### 2. Call model to train on the data of 'transfer'

To get distribution $p(w|D_B)$ when trained with a **uniform prior**.

Different fractions for comparison of performances.

In [None]:
# Task B trained from scratch, one run per dataset fraction.
# Maps the run name handed to train_model onto the fraction it trains on.
_scratch_runs = (
    ("domain0.05", 0.05),
    ("domain0.1", 0.1),
    ("domain0.2", 0.2),
    ("domain0.3", 0.3),
    ("domain0.5", 0.5),
    ("domain1", 1.0),
)

for _run_name, _run_fraction in _scratch_runs:
    train_model(_run_name, transfer, fraction=_run_fraction, pretrained=False)

### 3. Transfer to the second domain with the trained model

To get distribution $p(w|D_B)$ when trained with $p(w|D_A)$ as its **pretrained prior**.

Different fractions for comparison of performances.

In [None]:
# Task B trained starting from the pickled task-A weights, one run per
# dataset fraction. Maps the run name onto the fraction it trains on.
_transfer_runs = (
    ("transfer_domain0.05", 0.05),
    ("transfer_domain0.1", 0.1),
    ("transfer_domain0.2", 0.2),
    ("transfer_domain0.3", 0.3),
    ("transfer_domain0.5", 0.5),
    ("transfer_domain1", 1.0),
)

for _run_name, _run_fraction in _transfer_runs:
    train_model(_run_name, transfer, fraction=_run_fraction, pretrained=True)

### Plotting results

In [None]:
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import re
import numpy as np

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer accept the bare "seaborn" name. Use whichever
# spelling this installation provides.
_seaborn_style = "seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8"
plt.style.use(_seaborn_style)

def load_data(basename, intervals):
    """Read the final logged accuracies for a series of runs.

    For every entry ``i`` of ``intervals`` the file
    ``<basename><i>/logfile.txt`` is read and every ``acc: <number>``
    occurrence in it is collected. The last three occurrences of each file
    are returned as the train, validation and MAP accuracy, in that order.

    Args:
        basename: Common prefix of the run directories.
        intervals: Suffixes appended to ``basename``, one per run.

    Returns:
        A ``(train, valid, MAP)`` tuple of float32 NumPy arrays with one
        entry per run.

    Raises:
        IndexError: If a log file contains fewer than three accuracies.
    """
    acc = []
    for interval in intervals:
        # Close each log file as soon as it has been read.
        with open("{}{}/logfile.txt".format(basename, interval)) as log:
            text = log.read()
        matches = re.findall(r"(acc: \d.\d+)", text)
        acc.append([match.split(" ")[-1] for match in matches])

    # Compare by value: `is` tests object identity and is unreliable for strings.
    if basename == "domain":
        print(acc)

    train = [run[-3] for run in acc]
    valid = [run[-2] for run in acc]
    MAP = [run[-1] for run in acc]
    return np.array(train).astype(np.float32), np.array(valid).astype(np.float32), np.array(MAP).astype(np.float32)

# Dataset fractions of the runs being compared; also used as x positions.
i = [0.05, 0.1, 0.2, 0.3, 0.5, 1]
f = plt.figure(figsize=(10, 8))

# NOTE(review): the logs are read from "transfer_cnn_domain*" and
# "cnn_domain*", while the training cells in this notebook pass
# "transfer_domain*" and "domain*" to train_model - confirm which runs this
# figure is meant to show.
#
# Load one run per fraction in `i`. The list used here previously ended in
# 0.5 twice, which plotted the 50% run at the 100% tick.
train, valid, MAP = load_data("transfer_cnn_domain", i)

plt.plot(i, train, label=r"Train, prior: $q(w \mid \theta)$", color="#9c209b")
plt.plot(i, valid, "--", label=r"Validation, prior: $q(w \mid \theta)$", color="#d534d3")
plt.plot(i, MAP, "--", label=r"MAP, prior: $q(w \mid \theta)$", color="#e273e1")

train, valid, MAP = load_data("cnn_domain", i)

plt.plot(i, train, label=r"Train, prior: $\mathcal{U}(a, b)$", color="#209c22")
plt.plot(i, valid, "--", label=r"Validation, prior: $\mathcal{U}(a, b)$", color="#34d536")
plt.plot(i, MAP, "--", label=r"MAP, prior: $\mathcal{U}(a, b)$", color="#73e275")

plt.xlabel("Size of transfer dataset")
plt.ylabel("Accuracy")
# Tick labels are passed as a concrete list of percentage strings.
plt.xticks(i, ["{}%".format(int(x*100)) for x in i])
f.suptitle("Accuracy after training for 50 epochs")
plt.legend()

### Save figure

In [None]:
# Write pyplot's current figure to a PDF in the working directory.
plt.savefig("cnn_train_acc.pdf")