In [None]:
!wget https://www.dropbox.com/s/o0e0p2ahkj60bzq/teacher0to6.pt
!wget https://www.dropbox.com/s/kze5cdv7dvas3e4/teacher3to9.pt
!wget https://www.dropbox.com/s/h7l845py2d3t6o5/data.zip
!wget https://www.dropbox.com/s/la55owlh2tnsg0d/u_methods.csv
!unzip data.zip

# LOADING MODEL AND MNIST CLASSES

In [None]:
import os
import time
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn as nn
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Import torchvision functions/classes for MNIST import and data loaders
import torchvision
import torchvision.transforms as transforms


class Model(nn.Module):

    def __init__(self, n_classes, hidden_size=1200, dropout=0.0, hidden_dropout=0.0):
        super(Model, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.hidden1 = nn.Linear(784, hidden_size, bias=True)
        self.hidden1_dropout = nn.Dropout(hidden_dropout)
        self.hidden2 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.hidden2_dropout = nn.Dropout(hidden_dropout)
        self.hidden3 = nn.Linear(hidden_size, n_classes, bias=True)

    def forward(self, x):

        x = self.dropout(x)
        x = F.relu(self.hidden1(x))
        x = self.hidden1_dropout(x)
        x = F.relu(self.hidden2(x))
        x = self.hidden2_dropout(x)
        x = self.hidden3(x)
        return x



class MnistDataset(Dataset):
    def __init__(self, data, target, transformation=None):
        self.images = data
        self.targets = target
        self.transformation = transforms.Compose([
              transforms.RandomAffine(0, (1/14, 1/14)),
              transforms.Normalize((0.5,), (0.5,))
            ])

    def __len__(self):
        return len(self.images)


    def __getitem__(self, idx):
        return self.images[idx], self.targets[idx]


class MnistQs():
      def __init__(self):
          self.df = pd.read_pickle("u_methods.csv")
          self.u_CE = self.df["u_CE"]
          self.u_MFPS = self.df["u_MFPS"]
          self.u_MFLS = self.df["u_MFLS"]

      def __len__(self):
          return self.df.shape[0]

      def __getitem__(self, idx):
          return self.u_CE.iloc[idx], self.u_MFPS.iloc[idx], self.u_MFLS.iloc[idx]

      def get_u_CE(self):
          self.u_CE = torch.zeros(30000, 10)

          for idx, u in enumerate(self.df.u_CE):
              self.u_CE[idx, :] = torch.tensor(u)

          return self.u_CE

      def get_u_MFPS(self):
          self.u_MFPS = torch.zeros(30000, 10)

          for idx, u in enumerate(self.df.u_MFPS):
              self.u_MFPS[idx, :] = torch.tensor(u)
          
          return self.u_MFPS

      def get_u_MFLS(self):
          self.u_MFLS = torch.zeros(30000, 10)

          for idx, u in enumerate(self.df.u_MFLS):
              self.u_MFLS[idx, :] = torch.tensor(u)
          
          return self.u_MFLS


# LOAD THE DATASET AND THE TEACHER FILES

In [None]:
images, targets = torch.load("/content/data/MNIST/processed/test.pt")
images_train, targets_train = torch.load("/content/data/MNIST/processed/training.pt")

In [None]:
teacher1 = torch.load("teacher0to6.pt")
teacher1.to("cuda")
teacher2 = torch.load("teacher3to9.pt")
teacher2.to("cuda")

Model(
  (dropout): Dropout(p=0.2, inplace=False)
  (hidden1): Linear(in_features=784, out_features=1200, bias=True)
  (hidden1_dropout): Dropout(p=0.5, inplace=False)
  (hidden2): Linear(in_features=1200, out_features=1200, bias=True)
  (hidden2_dropout): Dropout(p=0.5, inplace=False)
  (hidden3): Linear(in_features=1200, out_features=7, bias=True)
)

# FIRST METHOD (CROSS ENTROPY)

## Implement the gradient for the cross-entropy

In [None]:
def grad_j(dict_probs_t1, dict_probs_t2, u):
    grad_j = np.random.rand(10)
    for i,u_i in enumerate(u):
        dui = 0
        if i in dict_probs_t1.keys():
            dui = dui - dict_probs_t1[i]
            e = np.exp(u_i)/np.sum(np.exp(list(dict_probs_t1.values())))
            dui = dui + np.sum(np.array(list(dict_probs_t1.values()))*e)
        if i in dict_probs_t2.keys():
            dui = dui - dict_probs_t2[i]
            e = np.exp(u_i)/np.sum(np.exp(list(dict_probs_t2.values())))
            dui = dui + np.sum(np.array(list(dict_probs_t2.values()))*e)
        grad_j[i] = dui
    return grad_j


def ce_method1(image):
    iters = 3000
    m = nn.Softmax(dim=1)
    m2 = nn.Softmax(dim=0)
    # image = image.to("cuda")

    # Obtain logits from teacher
    z1 = teacher1(image.reshape(1, 784).float())
    z2 = teacher2(image.reshape(1, 784).float())

    probs_t1 = m(z1).cpu().data.numpy()[0] 
    probs_t2 = m(z2).cpu().data.numpy()[0]

    dict_probs_t1 = {idx:probs_t1[idx] for idx in range(7)}
    dict_probs_t2 = {idx:probs_t2[idx-3] for idx in range(3, 10)}

    u = np.random.rand(10)
    for it in range(iters):
        u = u - 0.1 * grad_j(dict_probs_t1, dict_probs_t2, u)


    q2 = m2(torch.from_numpy(u))

    return dict_probs_t1, dict_probs_t2, u, q2


def ce_method1_batch(image_batch):
    iters = 3000
    m = nn.Softmax(dim=1)
    m2 = nn.Softmax(dim=0)
    # u_batch = torch.zeros(image_batch.shape[0], 10).to("cuda")
    u_batch = torch.zeros(image_batch.shape[0], 10)

    for idx, image in enumerate(image_batch):
        print(idx)
        # Obtain logits from teacher
        z1 = teacher1(image.float())
        z2 = teacher2(image.float())

        probs_t1 = m2(z1).cpu().data.numpy() 
        probs_t2 = m2(z2).cpu().data.numpy()

        dict_probs_t1 = {idx:probs_t1[idx] for idx in range(7)}
        dict_probs_t2 = {idx:probs_t2[idx-3] for idx in range(3, 10)}

        u = np.random.rand(10)
        for it in range(iters):
            u = u - 0.1 * grad_j(dict_probs_t1, dict_probs_t2, u)
        
        u_batch[idx, :] = torch.tensor(u)

    return u_batch  # batch of logits



def ce_method1_csv(image, p1, p2):
    iters = 3000

    dict_probs_t1 = {idx:p1[idx] for idx in range(7)}
    dict_probs_t2 = {idx:p2[idx-3] for idx in range(3, 10)}

    u = np.random.rand(10)
    for it in range(iters):
        u = u - 0.1 * grad_j(dict_probs_t1, dict_probs_t2, u)

    return u

## Load test images and get logits and probs

In [None]:
img1 = images[2].to("cuda") #1
img2 = images[4].to("cuda") #4
img3 = images[0].to("cuda") #7

In [None]:
z1_1 = teacher1(img1.reshape(1, 784).float()) #logits teacher1 per num 1
z1_2 = teacher2(img1.reshape(1, 784).float()) #logits teacher2 per num 1

z2_1 = teacher1(img2.reshape(1, 784).float()) #logits teacher1 per num 4
z2_2 = teacher2(img2.reshape(1, 784).float()) #logits teacher2 per num 4

z3_1 = teacher1(img3.reshape(1, 784).float()) #logits teacher1 per num 7
z3_2 = teacher2(img3.reshape(1, 784).float()) #logits teacher2 per num 7

In [None]:
m = nn.Softmax(dim=1)

probs_t1 = m(z3_1)
probs_t2 = m(z3_2)

probs_t1 = probs_t1.cpu().data.numpy()
probs_t2 = probs_t2.cpu().data.numpy()

dict_probs_t1 = {}
dict_probs_t2 = {}

for idx in range(7):
    dict_probs_t1[idx] = probs_t1[0][idx]

for idx in range(3, 10):
    dict_probs_t2[idx] = probs_t2[0][idx - 3]

print(dict_probs_t1)
print(dict_probs_t2)

{0: 1.3191016e-15, 1: 1.5482159e-15, 2: 1.6019008e-16, 3: 3.744862e-08, 4: 2.728826e-16, 5: 1.0, 6: 4.1164277e-11}
{3: 6.393586e-05, 4: 9.513434e-16, 5: 0.999936, 6: 8.791197e-11, 7: 2.1728551e-11, 8: 6.504364e-08, 9: 7.016933e-09}


## Get the q for each train image (sampling 10 for testing purpose)

In [None]:
iters = 3000
m = nn.Softmax(dim=1)
m2 = nn.Softmax()
q2 = torch.rand([images_train.shape[0],10])

for i in range(images_train.shape[0])[:10]:
    # Obtain image
    img = images_train[i].to("cuda")
    
    # Obtain logits from teacher
    z1 = teacher1(img.reshape(1, 784).float())
    z2 = teacher2(img.reshape(1, 784).float())
    
    # compute softmax to obtain p_i
    probs_t1 = m(z1).cpu().data.numpy()[0] 
    probs_t2 = m(z2).cpu().data.numpy()[0]
    print(probs_t1)
    print(probs_t2)

    dict_probs_t1 = {idx:probs_t1[idx] for idx in range(7)}
    dict_probs_t2 = {idx:probs_t2[idx-3] for idx in range(3, 10)}

    # compute gradient descent
    u = np.random.rand(10)
    for it in range(iters):
        u = u - 0.1 * grad_j(dict_probs_t1, dict_probs_t2, u)
    
    # compute softmax to obtain q
    q2[i] = m2(torch.from_numpy(u))
    if i%100==0: print(f'Computing q of image: {i}')

AttributeError: ignored

# SECOND METHOD (MATRIX FACTORIZATION)

## INITIALIZE M, P, Z

In [None]:
M = np.array([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
              [0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])
M = M.T

def get_PZ_matrices(image, M):
    # image = image.to("cuda")
    z1 = teacher1(image.reshape(1, 784).float())
    z2 = teacher2(image.reshape(1, 784).float())

    Z = np.zeros(M.shape)
    Z[:7, 0] = z1.cpu().data.numpy()
    Z[3:, 1] = z2.cpu().data.numpy()

    m = nn.Softmax(dim=1)
    prob1 = m(z1)
    prob2 = m(z2)

    P = np.zeros(M.shape)
    P[:7, 0] = prob1.cpu().data.numpy()
    P[3:, 1] = prob2.cpu().data.numpy()

    return P, Z

def get_PZ_matrices_batch(image, M):
    # image = image.to("cuda")
    z1 = teacher1(image.float())
    z2 = teacher2(image.float())

    Z = np.zeros(M.shape)
    Z[:7, 0] = z1.cpu().data.numpy()
    Z[3:, 1] = z2.cpu().data.numpy()

    m = nn.Softmax(dim=0)
    prob1 = m(z1)
    prob2 = m(z2)

    P = np.zeros(M.shape)
    P[:7, 0] = prob1.cpu().data.numpy()
    P[3:, 1] = prob2.cpu().data.numpy()

    return P, Z

## CODE FOR MF IN PROBABILITY AND LOGIT SPACE

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error as RMSE


# MATRIX FACTORIZATION IN PROBABILITY SPACE
def mf_prob_space(M, P):
    # Parameter initialization
    L, N = M.shape
    v = np.ones(N)
    u = np.ones(L)

    u_k = u.copy() * 2
    v_k = v.copy() * 2
    iters = 0

    # Run until convergence
    while RMSE(u, u_k) > 1e-3 and RMSE(v, v_k) > 1e-3 or iters < 3000:
        u_k = u.copy()
        for j in range(L):
            # First for loop
            u[j] = np.sum(M[j, :] * P[j, :] * v) / np.sum(M[j, :] * np.power(v, 2))
            u[j] = max(0, u[j])

            u = u / np.sum(u)

        for i in range(N):
            v[i] = np.sum(M[:, i] * P[:, i] * u) / np.sum(M[:, i] * np.power(u, 2))
            v[i] = max(0, v[i])
        
        iters += 1

    
    # print(f"u converged after {iters} iterations.")
    return u, v


def mf_prob_space_batch(image_batch, M):
    # Parameter initialization
    L, N = M.shape
    u_batch = np.zeros((image_batch.shape[0], 10))

    for idx, image in enumerate(image_batch):
        P, Z = get_PZ_matrices_batch(image, M)
        v = np.ones(N)
        u = np.ones(L)

        u_k = u.copy() * 2
        v_k = v.copy() * 2
        iters = 0

        # Run until convergence
        while RMSE(u, u_k) > 1e-3 and RMSE(v, v_k) > 1e-3 or iters < 3000:
            u_k = u.copy()
            for j in range(L):
                # First for loop
                u[j] = np.sum(M[j, :] * P[j, :] * v) / np.sum(M[j, :] * np.power(v, 2))
                u[j] = max(0, u[j])

                u = u / np.sum(u)

            for i in range(N):
                v[i] = np.sum(M[:, i] * P[:, i] * u) / np.sum(M[:, i] * np.power(u, 2))
                v[i] = max(0, v[i])
            
            iters += 1

        u_batch[idx, :] = u
    
    print("Batch done!")
    return u_batch


# MATRIX FACTORIZATION IN LOGIT SPACE
def mf_logit_space(M, Z):
    lambd = 0.01
    L, N = M.shape

    v = np.ones(N)
    u = np.ones(L)
    c = np.ones(N)

    u_k = u.copy() * 2
    v_k = v.copy() * 2
    iters = 0

    # Initialize c
    for i in range(N):
      c[i] = np.sum(M[:, i] * Z[:, i]) / np.sum(M[:, i])


    # Run until convergence
    while RMSE(u, u_k) > 1e-3 and RMSE(v, v_k) > 1e-3 or iters < 3000:
        u_k = u.copy()
        v_k = v.copy()

        for j in range(L):
            # First for loop
            u[j] = np.sum(M[j, :] * (Z[j, :] - c) * v) / (lambd + np.sum(M[j, :] * np.power(v, 2)))

        for i in range(N):
            v[i] = np.sum(M[:, i] * (Z[:, i]) * u) / (lambd + np.sum(M[:, i] * np.power(u, 2)))
            v[i] = max(0, v[i])

            c[i] = np.sum(M[:, i] * (Z[:, i] - u * v[i])) / np.sum(M[:, i])
      

        iters += 1
    # print(f"u converged after {iters} iterations.")

    return u, v, c

In [None]:
def get_PZ_matrices_csv(image, M, z1, z2, p1, p2):
    Z = np.zeros(M.shape)
    Z[:7, 0] = z1.cpu().data.numpy()
    Z[3:, 1] = z2.cpu().data.numpy()

    P = np.zeros(M.shape)
    P[:7, 0] = p1
    P[3:, 1] = p2

    return P, Z

# Save soft labels for each image in a csv file

In [None]:
m = nn.Softmax(dim=1)
M = np.array([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
              [0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])
M = M.T

u_df = pd.DataFrame()

for i, img in enumerate(images_train[:16384]):
    # img = img.to("cuda")
    z1 = teacher1(img.reshape(1, 784).float())
    z2 = teacher2(img.reshape(1, 784).float())

    probs_t1 = m(z1).cpu().data.numpy()[0] 
    probs_t2 = m(z2).cpu().data.numpy()[0]

    P, Z = get_PZ_matrices_csv(img, M, z1, z2, probs_t1, probs_t2)

    # Get u from Cross-Entropy method 1
    u_CE = ce_method1_csv(img, probs_t1, probs_t2)

    # Get u from MF probability space method 2
    u_MFPS, _ = mf_prob_space(M, P)

    # Get u from MF logit space method 2
    u_MFLS, _, _ = mf_logit_space(M, Z)

    u_df = u_df.append({
        "u_CE": np.array(u_CE),
        "u_MFPS": np.array(u_MFPS),
        "u_MFLS": np.array(u_MFLS)
    }, ignore_index=True)

    if i%100==0: print(i)

# u_df.to_csv("u_methods.csv", index=False)
u_df.to_pickle("u_methods.csv")

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300


## Experiments

In [None]:
img1 = images[2].to("cuda") #1
img2 = images[4].to("cuda") #4
img3 = images[0].to("cuda") #7

In [None]:
p1, p2, q = ce_method1(img3)
p1, p2, q

({0: 0.016459377,
  1: 0.00024381149,
  2: 0.8683283,
  3: 0.10226064,
  4: 0.0058595897,
  5: 0.00679617,
  6: 5.2110398e-05},
 {3: 1.0885118e-06,
  4: 6.21561e-08,
  5: 7.412574e-10,
  6: 6.7759192e-12,
  7: 0.9999881,
  8: 8.296881e-09,
  9: 1.0719809e-05},
 tensor([0.0084, 0.0017, 0.4387, 0.0261, 0.0018, 0.0020, 0.0009, 0.5170, 0.0017,
         0.0017], dtype=torch.float64))

In [None]:
P, Z = get_PZ_matrices(img3, M)
q, v = mf_prob_space(M, P)
q

u converged after 3000 iterations.


array([8.96863583e-16, 3.00439310e-15, 2.66815065e-14, 3.73005951e-04,
       3.96062008e-14, 9.99626953e-01, 4.47262278e-11, 2.10652293e-13,
       1.34281626e-09, 4.01277446e-08])

In [None]:
q, v, c = mf_logit_space(M, Z)
q

u converged after 3000 iterations.




tensor([1.8968e-31, 2.2811e-30, 2.0386e-28, 1.0673e-13, 1.4766e-31, 1.0000e+00,
        9.5436e-24, 1.2873e-31, 8.0476e-23, 2.0744e-19], dtype=torch.float64)

# IMPLEMENTING THE STUDENT

In [None]:
images, targets = torch.load("/content/data/MNIST/processed/training.pt")
images = images[:30000]
targets = targets[:30000]
trainset = MnistDataset(images, targets)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=125, shuffle=False, num_workers=2)
mnistq = MnistQs()

In [None]:
student_CE = Model(n_classes=10, hidden_size=500, dropout=0.1, hidden_dropout=0.1)
student_CE.to("cuda")

criterion = nn.KLDivLoss()
m = nn.Softmax(dim=0)
learning_rate = 0.001
epochs = 50

# Set up loss function and optimizer
optimizer = optim.SGD(student_CE.parameters(), lr=learning_rate, momentum=0.9)

# Training student with method 1

In [None]:
# Run over 1000 epochs (1 epoch = visited all items in dataset)
for epoch in range(epochs):
    running_loss = 0.0
    total = 0

    for image, qs in zip(trainloader, np.array_split(mnistq.get_u_CE(), 240)):
        # Apply the learning rate decay
        if(epoch % 100 == 0 and epoch != 0):
            learning_rate = learning_rate * 0.5
            optimizer = optim.SGD(student_CE.parameters(), lr= learning_rate, momentum=0.9)
        
        # get the inputs; data is a list of [inputs, labels]
        inputs, targets = image
        inputs = torch.flatten(inputs, start_dim=1).to("cuda")
        qs = qs.to("cuda")
        # target = labels.to("cuda").long()
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # Set temperature and the weights for losses linear combination
        w = 0.7
        T = 3

        # Student forward + backward + optimize
        logits_student = student_CE(inputs.float())
        # loss = torch.cdist(m(qs), m(logits_student), p=2)
        loss = criterion(F.log_softmax(qs/T, dim=1), F.softmax(logits_student/T, dim=1))
        loss.backward()
        optimizer.step()

        total += len(image)

        # print statistics
        running_loss += loss.item()
    # print every epoch
    print('[%d] loss: %.3f' % (epoch + 1, running_loss / total))

print('Finished Training')

  "reduction: 'mean' divides the total loss by both the batch size and the support size."


[1] loss: 0.038
[2] loss: 0.020
[3] loss: 0.015
[4] loss: 0.012
[5] loss: 0.011
[6] loss: 0.010
[7] loss: 0.009
[8] loss: 0.009
[9] loss: 0.008
[10] loss: 0.008
[11] loss: 0.008
[12] loss: 0.008
[13] loss: 0.007
[14] loss: 0.007
[15] loss: 0.007
[16] loss: 0.007
[17] loss: 0.007
[18] loss: 0.007
[19] loss: 0.007
[20] loss: 0.007
[21] loss: 0.006
[22] loss: 0.006
[23] loss: 0.006
[24] loss: 0.006
[25] loss: 0.006
[26] loss: 0.006
[27] loss: 0.006
[28] loss: 0.006
[29] loss: 0.006
[30] loss: 0.006
[31] loss: 0.006
[32] loss: 0.006
[33] loss: 0.006
[34] loss: 0.006
[35] loss: 0.006
[36] loss: 0.006
[37] loss: 0.006
[38] loss: 0.005
[39] loss: 0.005
[40] loss: 0.005
[41] loss: 0.005
[42] loss: 0.005
[43] loss: 0.005
[44] loss: 0.005
[45] loss: 0.005
[46] loss: 0.005
[47] loss: 0.005
[48] loss: 0.005
[49] loss: 0.005
[50] loss: 0.005
Finished Training


# Testing student with method 1

In [None]:
images_test, targets_test = torch.load("/content/data/MNIST/processed/test.pt")
testset = MnistDataset(images_test, targets_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=128, shuffle=True, num_workers=2)

In [None]:
# Define support function used to convert label to one-hot encoded tensor
def convert_labels(labels):
    target = torch.zeros([len(labels), 10], dtype=torch.float32)
    for i, l in enumerate(labels):
      target[i][l] = 1.0
    return target

# Run model on test set and determine accuracy
correct = 0
total = 0

with torch.no_grad():
    for data in testloader:
        inputs, labels = data
        inputs = torch.flatten(inputs, start_dim=1).to("cuda")
        target = convert_labels(labels).to("cuda")
        outputs = student_CE(inputs.float())
        _, predicted = torch.max(outputs.data, 1)
        _, target = torch.max(target.data, 1)
        total += target.size(0)
        correct += (predicted == target).sum().item()
        # for i, val in enumerate(predicted):
        #   wrong[target[i]][val] += 1

# Output model accuracy to user
print('Accuracy of the network on test images: %d %% (%d wrong out of %d)' % (
    100 * correct / total, total - correct, total))

Accuracy of the network on test images: 82 % (1755 wrong out of 10000)


# Training student with method 2 (MFPS)

In [None]:
student_MFPS = Model(n_classes=10, hidden_size=500, dropout=0.1, hidden_dropout=0.1)
student_MFPS.to("cuda")

# bce_with_logits = torch.nn.BCEWithLogitsLoss()
learning_rate = 0.001
epochs = 50

# Set up loss function and optimizer
optimizer = optim.SGD(student_MFPS.parameters(), lr=0.001, momentum=0.9)

In [None]:
for epoch in range(epochs):
    running_loss = 0.0
    total = 0

    for image, qs in zip(trainloader, np.array_split(mnistq.get_u_MFPS(), 240)):
        # Apply the learning rate decay
        if(epoch % 100 == 0 and epoch != 0):
            learning_rate = learning_rate * 0.5
            optimizer = optim.SGD(student_MFPS.parameters(), lr= learning_rate, momentum=0.9)
        
        # get the inputs; data is a list of [inputs, labels]
        inputs, targets = image
        inputs = torch.flatten(inputs, start_dim=1).to("cuda")
        
        qs = qs.to("cuda")
        # target = labels.to("cuda").long()
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # Set temperature and the weights for losses linear combination
        w = 0.7
        T = 3

        # Student forward + backward + optimize
        logits_student = student_MFPS(inputs.float())
        qs = torch.log(qs/(1 - qs))
        loss = criterion(F.log_softmax(qs/T, dim=1), F.softmax(logits_student/T, dim=1))
        loss.backward()
        optimizer.step()

        total += len(image)

        # print statistics
        running_loss += loss.item()
    # print every epoch
    print('[%d] loss: %.3f' % (epoch + 1, running_loss / total))

print('Finished Training')

  "reduction: 'mean' divides the total loss by both the batch size and the support size."


[1] loss: nan
[2] loss: 0.000
[3] loss: 0.000
[4] loss: 0.000
[5] loss: 0.000
[6] loss: 0.000
[7] loss: 0.000
[8] loss: 0.000
[9] loss: 0.000


KeyboardInterrupt: ignored

# Training student with method 2 (MFLS)

In [None]:
student_MFLS = Model(n_classes=10, hidden_size=500, dropout=0.1, hidden_dropout=0.1)
student_MFLS.to("cuda")

criterion = nn.KLDivLoss()
m = nn.Softmax(dim=0)
learning_rate = 0.001
epochs = 50

# Set up loss function and optimizer
optimizer = optim.SGD(student_MFLS.parameters(), lr=0.001, momentum=0.9)

In [None]:
for epoch in range(epochs):
    running_loss = 0.0
    total = 0

    for image, qs in zip(trainloader, np.array_split(mnistq.get_u_MFLS(), 240)):
        # Apply the learning rate decay
        if(epoch % 100 == 0 and epoch != 0):
            learning_rate = learning_rate * 0.5
            optimizer = optim.SGD(student_MFLS.parameters(), lr= learning_rate, momentum=0.9)
        
        # get the inputs; data is a list of [inputs, labels]
        inputs, targets = image
        inputs = torch.flatten(inputs, start_dim=1).to("cuda")
        qs = qs.to("cuda")
        # target = labels.to("cuda").long()
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # Set temperature and the weights for losses linear combination
        w = 0.7
        T = 3

        # Student forward + backward + optimize
        logits_student = student_MFLS(inputs.float())
        loss = criterion(F.log_softmax(qs/T, dim=1), F.softmax(logits_student/T, dim=1))
        loss.backward()
        optimizer.step()

        total += len(image)

        # print statistics
        running_loss += loss.item()
    # print every epoch
    print('[%d] loss: %.3f' % (epoch + 1, running_loss / total))

print('Finished Training')

  "reduction: 'mean' divides the total loss by both the batch size and the support size."


[1] loss: 0.188
[2] loss: 0.048
[3] loss: 0.037
[4] loss: 0.032
[5] loss: 0.030
[6] loss: 0.028
[7] loss: 0.026
[8] loss: 0.025
[9] loss: 0.024
[10] loss: 0.023
[11] loss: 0.022
[12] loss: 0.022
[13] loss: 0.021
[14] loss: 0.021
[15] loss: 0.021
[16] loss: 0.020
[17] loss: 0.020
[18] loss: 0.020
[19] loss: 0.019
[20] loss: 0.019
[21] loss: 0.019
[22] loss: 0.018
[23] loss: 0.018
[24] loss: 0.018
[25] loss: 0.018
[26] loss: 0.018
[27] loss: 0.017
[28] loss: 0.017
[29] loss: 0.017
[30] loss: 0.017
[31] loss: 0.017
[32] loss: 0.017
[33] loss: 0.017
[34] loss: 0.017
[35] loss: 0.017
[36] loss: 0.016
[37] loss: 0.016
[38] loss: 0.017
[39] loss: 0.016
[40] loss: 0.016
[41] loss: 0.016
[42] loss: 0.016
[43] loss: 0.016
[44] loss: 0.016
[45] loss: 0.016
[46] loss: 0.016
[47] loss: 0.016
[48] loss: 0.016
[49] loss: 0.016
[50] loss: 0.015
Finished Training


# Testing student with method 2 MFLS

In [None]:
# Define support function used to convert label to one-hot encoded tensor
def convert_labels(labels):
    target = torch.zeros([len(labels), 10], dtype=torch.float32)
    for i, l in enumerate(labels):
      target[i][l] = 1.0
    return target

# Run model on test set and determine accuracy
correct = 0
total = 0

with torch.no_grad():
    for data in testloader:
        inputs, labels = data
        inputs = torch.flatten(inputs, start_dim=1).to("cuda")
        target = convert_labels(labels).to("cuda")
        outputs = student_MFLS(inputs.float())
        _, predicted = torch.max(outputs.data, 1)
        _, target = torch.max(target.data, 1)
        total += target.size(0)
        correct += (predicted == target).sum().item()
        # for i, val in enumerate(predicted):
        #   wrong[target[i]][val] += 1

# Output model accuracy to user
print('Accuracy of the network on test images: %d %% (%d wrong out of %d)' % (
    100 * correct / total, total - correct, total))

Accuracy of the network on test images: 95 % (433 wrong out of 10000)
