# **imports**

In [1]:
!pip install wandb -qU
import wandb
from functools import partial
wandb.login(key='62bc107823f2e5d51fd5f21b1081d81dd76a07db')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.6/188.6 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.8/218.8 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
!pip install wandb -qU
!pip install torch
!pip install torchvision
!pip install matplotlib
import numpy as np
from matplotlib import pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch
import torchvision
import torchvision.transforms as transforms
from functools import partial
import torch.optim as optim

# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(0)

# Device used by the models and batches below: the first CUDA GPU when one is
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
# Drive folder of this exercise; later cells build the saved-model paths and
# the interpolation-figure paths relative to it.
root_dir = '/content/gdrive/MyDrive/NN for Images/Ex2'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **Data Loaders**
The values `(0.1307,), (0.3081,)` are the mean and the standard deviation of the MNIST dataset.

In [None]:
def get_dataloader(is_train, batch_size=64):
    """Build a DataLoader over the MNIST training or test split.

    Args:
        is_train: True selects the training split and enables shuffling;
            False selects the test split, iterated in order.
        batch_size: number of samples per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` over the chosen split. Images are
        converted to tensors and normalized with mean 0.1307 and std 0.3081.
    """
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])

    # The dataset is downloaded into ./data when it is not already there.
    mnist_split = torchvision.datasets.MNIST(
        root='./data',
        train=is_train,
        download=True,
        transform=preprocessing,
    )

    return torch.utils.data.DataLoader(
        mnist_split,
        batch_size=batch_size,
        shuffle=is_train,
    )

# **Encoder Definition**
The Encoder is configured by the following parameters:
1. Amount of convolutional layers (`n_conv_layers`)
2. Kernel size, which is to be constant in all convolutional layers (`kernel_size`)
3. Increasing channels amount factor, which determines how many channels will be added to the next convolution layer (`c_factor`)
4. The latent dimension, referred to in the exercise as *d* (`latent_dim`)

The architecture of the Encoder is basically:

1.   ***Strided Convolution Layer***
2.   ***ReLU***
3.   *repeat 1-2 `n_conv_layers` times*
4.   ***Flatten***
5.   ***Fully Connected layer*** with output dimension of `1/3` of the last convolution output size
6.   ***ReLU***
7.   ***Fully Connected layer*** with output dimension of `d`

These parameters uniquely determine the Encoder's architecture, which the Decoder then mirrors.

In [None]:
class Encoder(nn.Module):
    """Convolutional encoder for 28x28 single-channel images.

    A stack of strided ``Conv2d`` + ReLU layers is followed by a flatten and
    two fully-connected layers; the last one outputs ``latent_dim`` features.

    Args:
        n_conv_layers: number of convolution layers.
        kernel_size: kernel size shared by all convolution layers.
        c_factor: channel count of the first convolution; every following
            convolution has ``c_factor`` more output channels than the
            previous one.
        latent_dim: size of the output (latent) vector.
        stride: stride shared by all convolution layers.
        padding: padding shared by all convolution layers.

    Attributes:
        layers: ``nn.ModuleList`` holding the convolutions followed by the
            two linear layers.
        dimensions: the input size (28), then the spatial size after each
            convolution, then the flattened feature count and the hidden
            fully-connected width.
    """

    def __init__(self, n_conv_layers, kernel_size, c_factor, latent_dim, stride=2, padding=1):
        super().__init__()
        self.layers = nn.ModuleList()
        self.build_layers(n_conv_layers, c_factor, kernel_size, latent_dim, stride, padding)

    def build_layers(self, n_conv_layers, cout, kernel_size, latent_dim, stride, padding):
        """Create all layers and record the data size at every stage.

        ``cout`` is the output-channel count of the first convolution and
        also the amount added for each subsequent convolution.
        """
        channel_step = cout
        in_channels = 1      # grayscale input
        spatial = 28         # height/width of the input images
        self.dimensions = [spatial]

        # Strided convolutions; the spatial size follows the standard
        # convolution output-size formula.
        for _ in range(n_conv_layers):
            self.layers.append(
                nn.Conv2d(in_channels, cout, kernel_size, stride=stride, padding=padding)
            )
            spatial = (spatial + 2 * padding - kernel_size) // stride + 1
            self.dimensions.append(spatial)
            in_channels, cout = cout, cout + channel_step

        # Two fully-connected layers: flattened features -> a third of them
        # -> latent vector. (cout - channel_step) is the channel count of the
        # last convolution.
        flat_features = (cout - channel_step) * self.dimensions[-1] ** 2
        hidden_features = flat_features // 3
        self.dimensions.extend([flat_features, hidden_features])
        self.layers.append(nn.Linear(flat_features, hidden_features))
        self.layers.append(nn.Linear(hidden_features, latent_dim))

    def forward(self, x):
        """Encode a batch of images into latent vectors."""
        # The last two entries of self.layers are the fully-connected layers;
        # everything before them is a convolution.
        *conv_stack, fc_hidden, fc_latent = self.layers
        for conv in conv_stack:
            x = F.relu(conv(x))
        x = x.flatten(start_dim=1)
        return fc_latent(F.relu(fc_hidden(x)))

    def get_decoder_architecture(self):
        """Describe this encoder so that a mirroring Decoder can be built.

        Returns:
            A tuple ``(shapes, dimensions, 2)`` where ``shapes`` lists the
            shape of the first parameter of every layer (the weight tensor
            for Conv2d/Linear), ordered from the last layer to the first,
            ``dimensions`` is ``self.dimensions`` and ``2`` is the number of
            fully-connected layers.
        """
        shapes = []
        for layer in self.layers:
            first_param = next(layer.parameters(), None)
            if first_param is not None:
                shapes.append(first_param.shape)
        shapes.reverse()
        return shapes, self.dimensions, 2

# **Decoder Definition**
Since the decoder mirrors the encoder, it is uniquely determined by the architecture of the encoder. Hence, the parameters for this model are derived from the Encoder and are given by the Encoder's method `get_decoder_architecture()`.

The architecture of the Decoder is basically:

1.   ***Fully Connected layer***
2.   ***ReLU***
3.   ***Fully Connected layer***
4.   ***ReLU***
5.   ***Reshape***  
6.   ***Convolution 2D Transpose Layer***
7.   ***ReLU***
8.   *repeat 6-7 `n_conv_layers-1` times*
9.   ***Convolution 2D Transpose Layer***
10.  ***Sigmoid***

Notice that after the last *Conv2DTranspose* layer (or the single layer, if there is only one) there is no ReLU activation, since the *Sigmoid* serves as the activation.


In [None]:
class Decoder(nn.Module):
    """Decoder that mirrors an Encoder layer by layer.

    The layout is: ``num_of_fc`` fully-connected layers (each followed by
    ReLU), a reshape to the encoder's last feature-map shape, then one
    ``ConvTranspose2d`` per encoder convolution (ReLU after all but the
    last), and a final sigmoid.

    Args:
        architecture: weight shapes of the encoder's layers, ordered from
            the last layer to the first (as returned by
            ``Encoder.get_decoder_architecture``). The first ``num_of_fc``
            entries are ``(out_features, in_features)`` of the linear
            layers; the rest are ``(out_ch, in_ch, k, k)`` of the
            convolutions.
        encoder_dimensions: the encoder's ``dimensions`` list (input size,
            spatial size after every convolution, then the sizes around the
            fully-connected layers).
        num_of_fc: number of fully-connected layers (2 for the Encoder in
            this notebook).
        stride: stride used by the encoder's convolutions.
        padding: padding used by the encoder's convolutions.
    """

    def __init__(self, architecture, encoder_dimensions, num_of_fc, stride=2, padding=1):
        super().__init__()
        self.layers = nn.ModuleList()
        self.num_of_fc = num_of_fc

        # Shape the FC output is reshaped to before the transposed
        # convolutions: (batch, C, D, D), where C is the channel count of the
        # encoder's last convolution and D its output spatial size.
        last_conv_channels = architecture[num_of_fc][0]
        last_conv_size = encoder_dimensions[-num_of_fc - 1]
        self.reshape_shape = (-1, last_conv_channels, last_conv_size, last_conv_size)

        # Only the sizes each transposed convolution must produce are passed
        # on: the encoder's input size and all but the last conv output size.
        self.build_layers(architecture, encoder_dimensions[:-num_of_fc - 1], num_of_fc, stride, padding)

    def build_layers(self, architecture, encoder_dimensions, num_of_fc, stride, padding):
        """Create the linear layers followed by the transposed convolutions."""
        # The first entries of `architecture` are the encoder's FC weights,
        # shaped (out_features, in_features); the decoder maps them backwards.
        for i in range(num_of_fc):
            self.layers.append(nn.Linear(architecture[i][0], architecture[i][1]))

        # The remaining entries are convolution weights (out_ch, in_ch, k, k),
        # paired with the spatial size each transposed convolution must
        # restore (`dim`).
        #
        # The encoder maps size `dim` to floor((dim + 2p - k) / s) + 1, and a
        # plain ConvTranspose2d maps that back to
        # dim - ((dim + 2p - k) mod s). The remainder lost by the floor is
        # exactly the output_padding needed to get `dim` back. It is always
        # smaller than the stride, as ConvTranspose2d requires. For the
        # notebook's k=3, s=2, p=1 this equals the previous
        # "1 when dim is even" rule, but it also holds for other kernel
        # sizes, strides and paddings.
        for layer, dim in zip(architecture[num_of_fc:], encoder_dimensions[::-1]):
            kernel_size = layer[2]
            output_padding = (dim + 2 * padding - kernel_size) % stride
            self.layers.append(
                nn.ConvTranspose2d(layer[0], layer[1], kernel_size, stride=stride,
                                   padding=padding, output_padding=output_padding)
            )

    def forward(self, x):
        """Decode a batch of latent vectors into images with values in (0, 1)."""
        # Fully-connected part.
        for i in range(self.num_of_fc):
            x = F.relu(self.layers[i](x))
        x = torch.reshape(x, self.reshape_shape)

        # Transposed convolutions; the last one is followed by the sigmoid
        # instead of a ReLU.
        for upconv in self.layers[self.num_of_fc:-1]:
            x = F.relu(upconv(x))
        x = self.layers[-1](x)
        return torch.sigmoid(x)

In [None]:
def get_auto_encoder(param):
    """Build an Encoder and its mirroring Decoder as one sequential model.

    Args:
        param: sequence in the order
            [number of conv layers, kernel size, channel increase factor,
            latent dim, stride, padding].

    Returns:
        ``nn.Sequential(encoder, decoder)`` placed on the global ``device``.
    """
    encoder = Encoder(*param).to(device)

    # The decoder is fully described by the encoder, apart from the stride
    # and padding, which are taken from positions 4 and 5 of `param`.
    decoder_spec = encoder.get_decoder_architecture()
    decoder = Decoder(*decoder_spec, param[4], param[5]).to(device)

    auto_encoder = nn.Sequential(encoder, decoder)
    return auto_encoder.to(device)

# **Evaluation**
In this exercise, evaluation only measures the test loss, which is either:
*   Reconstruction error (`|AE(I) - I|`), corresponding to the MSE loss, since squaring is a monotonic, single-valued mapping on non-negative values
*   Cross-entropy loss, for the classification task

In [None]:
def evaluate(model, test_set_loader, criterion, classification=False):
    """Compute the mean per-batch test loss and log it to W&B.

    Args:
        model: the model to evaluate.
        test_set_loader: iterable of batches whose first item holds the
            inputs and whose second item holds the labels.
        criterion: loss function called as ``criterion(outputs, target)``.
        classification: when True the target is the labels (e.g. for
            CrossEntropyLoss); otherwise the target is the inputs themselves
            (reconstruction, e.g. MSELoss).

    The result is logged under the key "Average test loss"; nothing is
    returned.
    """
    per_batch_losses = []

    # No gradients are needed while evaluating.
    with torch.no_grad():
        for batch in test_set_loader:
            images = batch[0].to(device)
            labels = batch[1].to(device)
            predictions = model(images)

            target = labels if classification else images
            batch_loss = criterion(predictions, target)
            per_batch_losses.append(batch_loss.detach().cpu().numpy())

    wandb.log({"Average test loss": np.mean(per_batch_losses)})

# **Train Loop for AutoEncoder**
**This is the train loop for the AutoEncoders training, and not for the classification tasks.**

`model` here is an `nn.Sequential` object containing the Encoder and the Decoder, in that order, and is used as a single model.
The criterion is the MSE loss, which corresponds to the reconstruction error `|AE(I) - I|`.

In [None]:
def train_loop(config=None, dir_path=None):
    """Train an auto-encoder inside one W&B run, logging train/test losses.

    Args:
        config: passed to ``wandb.init``; the hyper-parameters are then read
            from ``wandb.config`` (batch_size, num_of_conv, kernel, c_factor,
            latent_dim, stride, padding, lr, epochs).
        dir_path: optional format string filled with ``num_of_conv`` and
            ``latent_dim``; when given, the trained model's ``state_dict`` is
            saved to the resulting path.
    """
    with wandb.init(config=config):
        config = wandb.config

        # Data
        train_data_loader = get_dataloader(is_train=True, batch_size=config.batch_size)
        test_data_loader = get_dataloader(is_train=False, batch_size=config.batch_size)

        # Model, optimizer and reconstruction loss
        AE = get_auto_encoder([
            config.num_of_conv,
            config.kernel,
            config.c_factor,
            config.latent_dim,
            config.stride,
            config.padding,
        ])
        optimizer = optim.AdamW(AE.parameters(), lr=config.lr)
        criterion = nn.MSELoss()

        for _ in range(config.epochs):
            batch_losses = []
            for batch in train_data_loader:
                images = batch[0].to(device)

                # One optimization step on the reconstruction error.
                optimizer.zero_grad()
                reconstruction_loss = criterion(AE(images), images)
                reconstruction_loss.backward()
                optimizer.step()

                batch_losses.append(reconstruction_loss.detach().cpu().numpy())

            # Log the epoch's mean train loss, then the test loss.
            wandb.log({"Average train loss": np.mean(batch_losses)})
            evaluate(AE, test_data_loader, criterion)

        if dir_path is not None:
            save_path = dir_path.format(config.num_of_conv, config.latent_dim)
            torch.save(AE.state_dict(), save_path)

# **Q1: Explore the reconstruction error**
1. Using lower and higher latent space dimension *d*
2. using a fixed latent dimension *d* but with encoder/decoder architecture with more or fewer layers/weights.

I have implemented a 'sweep' report for searching for the best parameters. The 'grid' method tries every combination of values from the given value lists. The lists are defined in `spec_sweep_configuration`.

In the W&B dashboard I then isolated *d* and the size of the AE; the results are reported in the PDF.

In [None]:
def Q1():
    """Launch the W&B grid sweep for Q1.

    The sweep varies the number of convolution layers (1-5) and the latent
    dimension (5, 10, 15, 20); all other hyper-parameters are fixed. Every
    run is executed by ``train_loop``, which receives the sweep configuration
    dict as its ``config`` argument.
    """
    swept_parameters = {
        "num_of_conv": {'values': [1, 2, 3, 4, 5]},
        "c_factor": {'value': 8},
        "latent_dim": {'values': [5, 10, 15, 20]},
        "kernel": {'value': 3},
        "stride": {'value': 2},
        "padding": {'value': 1},
        "epochs": {'value': 15},
        "lr": {'value': 0.001},
        "batch_size": {'value': 16},
    }
    spec_sweep_configuration = dict(
        method='grid',
        name='Ex2_Q1_AdamW',
        metric=dict(goal='minimize', name='Average test loss'),
        parameters=swept_parameters,
    )

    sweep_id = wandb.sweep(spec_sweep_configuration, project="Ex2_2_FC")
    run_one = partial(train_loop, spec_sweep_configuration)
    wandb.agent(sweep_id, function=run_one)

# **Q2: Interpolation**
Here I've run for the parameters:
`d = {5, 10, 15, 20}`,
`num_of_conv = 3`,
`kernel = 3`,
`c_factor= 8`,
Then I chose the `d = {10, 20}` for report results.

I have calculated the interpolation for 20 different values of alpha, to create continuous interpolation plots.



In [None]:
def get_digits():
    """Pick one random training image for each of two different random digits.

    Returns:
        ``([img_x, img_y], [digit_x, digit_y])`` where each image keeps a
        leading batch dimension of 1 and ``digit_x != digit_y``.

    Raises:
        RuntimeError: if no training batch contains both chosen digits.
    """
    # Two distinct digits drawn from 0..9. The previous
    # torch.randint(0, 9, ...) could never pick 9, because the upper bound of
    # randint is exclusive.
    digit_x, digit_y = torch.randperm(10)[:2].tolist()

    for batch in get_dataloader(True):
        imgs, labels = batch[0], batch[1]
        images_x, images_y = imgs[labels == digit_x], imgs[labels == digit_y]

        # A batch may lack one of the digits; sampling from an empty set
        # would raise, so move on to the next batch in that case.
        if images_x.shape[0] == 0 or images_y.shape[0] == 0:
            continue

        img_x = images_x[torch.randint(0, images_x.shape[0], (1,))]
        img_y = images_y[torch.randint(0, images_y.shape[0], (1,))]
        return [img_x, img_y], [digit_x, digit_y]

    raise RuntimeError(
        "no training batch contains both digits {} and {}".format(digit_x, digit_y)
    )

In [None]:
def interpolate(model, dim, pair, digits):
    """Plot and save a latent-space interpolation between two images.

    Both images are encoded, 20 convex combinations of the two latent vectors
    are decoded, and the decoded 28x28 images are laid out side by side in a
    single figure that is saved under ``root_dir``/Interpolations.

    Args:
        model: sequential auto-encoder; ``model[0]`` is the encoder and
            ``model[1]`` the decoder.
        dim: latent dimension of the model (used only in the file name).
        pair: the two input images, each with a batch dimension of 1.
        digits: the two digit labels (used in the title and the file name).

    Note:
        With alpha = 0 the decoded image comes from ``pair[1]`` only and with
        alpha = 1 from ``pair[0]`` only, so the strip runs from the second
        digit to the first.
    """
    # Extract the Encoder and the Decoder as separate models
    encoder, decoder = model[0], model[1]

    # Run on whatever device the model lives on, so the function works for
    # both CPU and GPU models.
    model_device = next(model.parameters()).device

    # Start with a one-pixel-wide blank column, then append one decoded image
    # per alpha value.
    frames = [np.zeros((28, 1))]
    with torch.no_grad():
        # The two latent vectors do not depend on alpha: encode them once.
        latent_first = encoder(pair[0].to(model_device))
        latent_second = encoder(pair[1].to(model_device))

        for alpha in np.linspace(0, 1, 20):
            blended = latent_first * alpha + latent_second * (1 - alpha)
            decoded = decoder(blended).cpu().numpy()
            frames.append(decoded.reshape(28, 28))
    result = np.hstack(frames)

    # Plot the data
    plt.cla()
    plt.suptitle("digits: {}".format(digits))
    plt.imshow(result, cmap='gray')
    plt.xticks([])
    plt.yticks([])
    plt.savefig(root_dir + r'/Interpolations/digits_{}_and_{}_d_{}.png'.format(digits[0], digits[1], dim))

In [None]:
def Q2(train_and_save_models=False, latent_dim=None, load=False, pairs=5):
    """Train the Q2 auto-encoders and/or produce interpolation figures.

    Args:
        train_and_save_models: when True, run a W&B grid sweep over
            ``latent_dim`` and save every trained model under
            ``root_dir``/models.
        latent_dim: list of latent dimensions to train and/or load.
        load: when True, load the saved models and save interpolation
            figures for ``pairs`` random pairs of digits.
        pairs: number of random digit pairs to interpolate.

    Raises:
        ValueError: if training or loading is requested without any latent
            dimension.
    """
    if (train_and_save_models or load) and (latent_dim is None or len(latent_dim) == 0):
        raise ValueError("latent_dim must be a non-empty list of latent dimensions")

    # determine configuration
    q2_sweep_parameters = {
        'method': 'grid',
        'name': 'Training_For_Q2_ker_3',
        'metric': {
            'goal': 'minimize',
            'name': 'Average test loss'
            },
        'parameters': {
            "num_of_conv": {'value': 3},
            "c_factor": {'value': 8},
            "latent_dim": {'values': latent_dim},
            "kernel": {'value': 3},
            "stride": {'value': 2},
            "padding": {'value': 1},
            "epochs": {'value': 15},
            "lr": {'value': 0.001},
            "batch_size": {'value': 16}
        }
    }

    # train the models and save them
    dir_path = root_dir + r'/models/model_conv_{}_d_{}.pth'
    if train_and_save_models:
        my_sweep = wandb.sweep(q2_sweep_parameters, project="Ex2_2_FC")
        train = partial(train_loop, q2_sweep_parameters, dir_path)
        wandb.agent(my_sweep, function=train)

    if load:
        # Interpolation runs on the CPU. get_auto_encoder() places the model
        # on the global `device`, and assigning a local `device` here would
        # not change that, so move the model explicitly and map the saved
        # tensors to the CPU as well.
        models = {}
        for dim in latent_dim:
            model = get_auto_encoder([3, 3, 8, dim, 2, 1]).to('cpu')
            model.load_state_dict(torch.load(dir_path.format(3, dim), map_location='cpu'))
            model.eval()
            models[dim] = model

        # Interpolate two images of random pair of digits; the same pair is
        # used for every latent dimension.
        for _ in range(pairs):
            pair, digits = get_digits()
            for dim in latent_dim:
                interpolate(models[dim], dim, pair, digits)

# **Q3: Correlations**
The correlation between every two features in the latent space is given by Pearson's correlation matrix. Here the matrix is calculated using 8000 random images (the training dataloader shuffles the data, so its first batch of 8000 images is a random sample).

The final measure to estimate the total correlations that I chose is the mean of the absolute values in the matrix, since also negative values are indicators for existing correlation.

In [None]:
def calc_corr(AE):
    """Measure how correlated the latent features of an auto-encoder are.

    One shuffled batch of 8000 training images is encoded, the Pearson
    correlation matrix between the latent features is computed, and the mean
    of its absolute values is returned (the diagonal is included).

    Args:
        AE: sequential auto-encoder; ``AE[0]`` is the encoder.

    Returns:
        The mean absolute value of the correlation matrix.
    """
    encoder = AE[0]
    # Feed the images on the encoder's own device so that both CPU and GPU
    # models work.
    encoder_device = next(encoder.parameters()).device

    dataloader = get_dataloader(is_train=True, batch_size=8000)
    images = next(iter(dataloader))[0]

    with torch.no_grad():
        latent_vector = encoder(images.to(encoder_device)).cpu().numpy()

    # np.corrcoef treats rows as variables, hence the transpose: one row per
    # latent feature.
    corr_mat = np.corrcoef(latent_vector.T)
    return np.mean(np.abs(corr_mat))

In [None]:
def Q3(dimensions, path_for_model):
    """Plot the mean absolute latent correlation for every latent dimension.

    Args:
        dimensions: latent dimensions whose saved models are evaluated.
        path_for_model: format string with one placeholder for the latent
            dimension, giving the path of the saved ``state_dict``.
    """
    abs_val_means = []
    for dim in dimensions:
        # get_auto_encoder() places the model on the global `device`; the
        # correlation is computed on CPU data, so move the model to the CPU
        # explicitly.
        AE = get_auto_encoder([3, 3, 8, dim, 2, 1]).to('cpu')
        AE.load_state_dict(torch.load(path_for_model.format(dim), map_location=torch.device('cpu')))
        AE.eval()
        abs_val_means.append(calc_corr(AE))

    plt.plot(dimensions, abs_val_means)
    plt.title("Mean of Abs. values")
    plt.xticks(dimensions)
    plt.yticks(abs_val_means)
    plt.xlabel("d")
    plt.show()

# **Q4: Transfer Learning**
In this section we investigate the performance of a classification task, based on the AutoEncoders we have trained before, in two different ways:


1.   Train only the MLP
2.   Train both the MLP and the pre-trained AutoEncoder

The MLP architecture is:


*   ***Fully Connected*** with output dimension given to the constructor
*   ***ReLU***
*   ***Fully Connected*** with output dimension of 100
*   ***ReLU***
*   ***Fully Connected*** with output dimension of 10





In [None]:
class MLP(nn.Module):
    """Three-layer perceptron producing 10 class scores.

    Layout: Linear(input_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, 100)
    -> ReLU -> Linear(100, 10). The output is the raw scores of the last
    linear layer; no activation is applied to it.

    Args:
        input_dim: size of the input feature vector.
        hidden_dim: width of the first hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 100)
        self.fc3 = nn.Linear(100, 10)

    def forward(self, x):
        """Return the 10 class scores for a batch of feature vectors."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

# **Train Loop for MLP**
**This is a train loop for the classification task, either for the MLP alone or with the Encoder parameters.**

There are several implementation differences that motivated me to write a separate function for this; it exists only for convenience.

In [None]:
def train_loop_Q4(config=None, path_for_model=None):
    """Train a classifier (pre-trained encoder + MLP) inside one W&B run.

    Args:
        config: passed to ``wandb.init``; the hyper-parameters are then read
            from ``wandb.config`` (batch_size, latent_dim, mlp_hidden_layer,
            train_encoder, lr, epochs).
        path_for_model: format string with one placeholder for the latent
            dimension, giving the path of the pre-trained auto-encoder's
            ``state_dict``.

    When ``train_encoder`` is true the encoder is fine-tuned together with
    the MLP; otherwise only the MLP is optimized.
    """
    with wandb.init(config=config):
        config = wandb.config

        # Create data loaders iterators
        train_data_loader = get_dataloader(is_train=True, batch_size=config.batch_size)
        test_data_loader = get_dataloader(is_train=False, batch_size=config.batch_size)

        # Load the pre-trained AE. get_auto_encoder() already places the
        # model on the global `device` (the one evaluate() uses too), so the
        # saved tensors are simply mapped to that device; no local `device`
        # juggling is needed.
        AE = get_auto_encoder([3, 3, 8, config.latent_dim, 2, 1])
        AE.load_state_dict(torch.load(path_for_model.format(config.latent_dim), map_location=device))

        # Extract the encoder and build the MLP on top of it
        encoder = AE[0]
        mlp = MLP(input_dim=config.latent_dim, hidden_dim=config.mlp_hidden_layer).to(device)
        classifier = nn.Sequential(encoder, mlp)

        # define the proper parameters for optimizer according to the task
        if config.train_encoder:
            params = list(encoder.parameters()) + list(mlp.parameters())
        else:
            # The encoder is frozen: without this, gradients would still be
            # computed for it on every step and accumulate without ever
            # being cleared or used.
            encoder.requires_grad_(False)
            params = mlp.parameters()
        optimizer = optim.AdamW(params, lr=config.lr)

        # For classification tasks, CrossEntropyLoss is a good choice
        criterion = nn.CrossEntropyLoss()

        # Train the model and log the averaged losses
        for _ in range(config.epochs):
            epoch_loss = []
            for data in train_data_loader:
                # Train on a small subset only: at most 101 batches per epoch
                if len(epoch_loss) > 100:
                    break

                inputs, labels = data[0].to(device), data[1].to(device)

                # One optimization step on the classification loss
                outputs = classifier(inputs)
                batch_loss = criterion(outputs, labels)
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

                epoch_loss.append(batch_loss.detach().cpu().numpy())

            # Log the epoch's mean train loss
            metrics = {"Average train loss": np.mean(epoch_loss)}
            wandb.log(metrics)

            # Evaluate on the test set and log the result
            evaluate(classifier, test_data_loader, criterion, classification=True)

In [None]:
def Q4(dimensions, path_for_model):
    """Launch the W&B grid sweep for the Q4 classification experiments.

    The sweep varies the latent dimension (``dimensions``), whether the
    encoder is trained together with the MLP, and the MLP's hidden width.
    Every run is executed by ``train_loop_Q4``, which receives the sweep
    configuration dict and ``path_for_model``.

    Args:
        dimensions: list of latent dimensions to sweep over.
        path_for_model: format string for the pre-trained models' paths,
            forwarded to ``train_loop_Q4``.
    """
    swept_parameters = {
        "num_of_conv": {'value': 3},
        "c_factor": {'value': 8},
        "latent_dim": {'values': dimensions},
        "kernel": {'value': 3},
        "stride": {'value': 2},
        "padding": {'value': 1},
        "epochs": {'value': 15},
        "lr": {'value': 0.001},
        "batch_size": {'value': 16},
        "train_encoder": {'values': [True, False]},
        "mlp_hidden_layer": {'values': [200, 650, 1000]},
    }
    q4_sweep_parameters = dict(
        method='grid',
        name='Q4',
        metric=dict(goal='minimize', name='Average test loss'),
        parameters=swept_parameters,
    )

    sweep_id = wandb.sweep(q4_sweep_parameters, project="Ex2_2_Q4")
    run_one = partial(train_loop_Q4, q4_sweep_parameters, path_for_model)
    wandb.agent(sweep_id, function=run_one)

# Just a Runner
Running the separate questions.
Making sure that before every question, the device is allocated to `'cuda'` if possible.
If loading models is required, the device is to be changed to `'cpu'` within the different questions.

In [None]:
# The global `device` is re-set before every question; get_auto_encoder() and
# evaluate() read it.

# Q1: grid sweep over the number of conv layers and the latent dimension.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
Q1()

# Q2: no training here; load the saved models and save interpolation figures.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
Q2(train_and_save_models=False, latent_dim=[20, 15, 10, 5], load=True)

# Q3: plot the mean absolute latent correlation for each latent dimension.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
Q3(dimensions=[20, 15, 10, 5], path_for_model=root_dir + r'/models/model_conv_3_d_{}.pth')

# Q4: grid sweep for the classification task on top of the saved encoders.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
Q4(dimensions=[20, 15, 10, 5], path_for_model=root_dir + r'/models/model_conv_3_d_{}.pth')