In [None]:
!wget https://www.dropbox.com/s/2nyrobwxi99suda/u_methods_CNN.csv
!wget https://www.dropbox.com/s/06s5ivbtnz64ii7/teacher0to4_CNN.pt
!wget https://www.dropbox.com/s/aq196om1h3lvkjs/teacher5to9_CNN.pt
!wget https://www.dropbox.com/s/h7l845py2d3t6o5/data.zip
!unzip data.zip

# LOADING MODEL AND MNIST CLASSES

In [85]:
from __future__ import print_function
import numpy as np
import pandas as pd
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
from torch.optim.lr_scheduler import StepLR


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=5)
        self.conv3 = nn.Conv2d(32,64, kernel_size=5)
        self.fc1 = nn.Linear(3*3*64, 256)
        self.fc2 = nn.Linear(256, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(F.max_pool2d(self.conv3(x),2))
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.view(-1,3*3*64 )
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)



class student(nn.Module):
    def __init__(self):
        super(student, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=5)
        self.conv3 = nn.Conv2d(32,64, kernel_size=5)
        self.fc1 = nn.Linear(3*3*64, 256)
        self.fc2 = nn.Linear(256, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(F.max_pool2d(self.conv3(x),2))
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.view(-1,3*3*64)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


class MnistDataset(Dataset):
    def __init__(self, data, target, transformation=None):
        self.images = data
        self.targets = target
        self.transformation = transforms.Compose(
              [transforms.ToTensor(), 
               transforms.Normalize((0.5, 0.5, 0.5), 
                                    (0.5, 0.5, 0.5))])

    def __len__(self):
        return len(self.images)


    def __getitem__(self, idx):
        return self.images[idx], self.targets[idx]


class MnistQs():
      def __init__(self):
          self.df = pd.read_pickle("u_methods_CNN.csv")
          self.u_CE = self.df["u_CE"]
          self.u_MFPS = self.df["u_MFPS"]
          self.u_MFLS = self.df["u_MFLS"]

      def __len__(self):
          return self.df.shape[0]

      def __getitem__(self, idx):
          return self.u_CE.iloc[idx], self.u_MFPS.iloc[idx], self.u_MFLS.iloc[idx]

      def get_u_CE(self):
          self.u_CE = torch.zeros(60000, 10)

          for idx, u in enumerate(self.df.u_CE):
              self.u_CE[idx, :] = torch.tensor(u)

          return self.u_CE

      def get_u_MFPS(self):
          self.u_MFPS = torch.zeros(60000, 10)

          for idx, u in enumerate(self.df.u_MFPS):
              self.u_MFPS[idx, :] = torch.tensor(u)
          
          return self.u_MFPS

      def get_u_MFLS(self):
          self.u_MFLS = torch.zeros(60000, 10)

          for idx, u in enumerate(self.df.u_MFLS):
              self.u_MFLS[idx, :] = torch.tensor(u)
          
          return self.u_MFLS

# LOAD THE DATASET AND THE TEACHER FILES

In [86]:
teacher1 = torch.load("teacher0to4_CNN.pt")
teacher1.to("cuda")
teacher2 = torch.load("teacher5to9_CNN.pt")
teacher2.to("cuda")

Net(
  (conv1): Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(32, 32, kernel_size=(5, 5), stride=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=5, bias=True)
)

In [87]:
images_train, targets_train = torch.load("/content/data/MNIST/processed/training.pt")
trainset = MnistDataset(images_train, targets_train)
trainset.images = trainset.images.view(-1, 1,28,28).float()
trainloader = torch.utils.data.DataLoader(trainset, batch_size=125, shuffle=False, num_workers=2)

In [97]:
images_test, targets_test = torch.load("/content/data/MNIST/processed/test.pt")
testset = MnistDataset(images_test, targets_test)
testset.images = testset.images.view(-1, 1,28,28).float()
testloader = torch.utils.data.DataLoader(testset, batch_size=125, shuffle=True, num_workers=2)

In [89]:
mnistq = MnistQs()

# FIRST METHOD (CROSS ENTROPY)

# Training student with method 1

In [95]:
device = "cuda"
student_CE = student()
student_CE.to("cuda")

# bce_with_logits = torch.nn.BCEWithLogitsLoss()
learning_rate = 0.001
epochs = 100
m = nn.Softmax(dim=1)
criterion = nn.KLDivLoss()

# Set up loss function and optimizer
optimizer = optim.SGD(student_CE.parameters(), lr=learning_rate, momentum=0.9)

In [96]:
for epoch in range(epochs):
    running_loss = 0.0
    total = 0

    for image, qs in zip(trainloader, np.array_split(mnistq.get_u_CE(), 480)):
        # Apply the learning rate decay
        if(epoch % 100 == 0 and epoch != 0):
            learning_rate = learning_rate * 0.5
            optimizer = optim.SGD(student_CE.parameters(), lr= learning_rate, momentum=0.9)
        
        # get the inputs
        inputs, targets = image
        inputs = inputs.to("cuda")
        qs = qs.to("cuda")
        
        # target = labels.to("cuda").long()
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # Set temperature and the weights for losses linear combination
        w = 0.7
        T = 3

        # Student forward + backward + optimize
        logits_student = student_CE(inputs.float())
        loss = criterion(F.log_softmax(qs/T, dim=1), F.softmax(logits_student/T, dim=1))
        loss.backward()
        optimizer.step()

        total += len(image)

        # print statistics
        running_loss += loss.item()
    # print every epoch
    print('[%d] loss: %.3f' % (epoch + 1, running_loss / total))

print('Finished Training')

  "reduction: 'mean' divides the total loss by both the batch size and the support size."


[1] loss: 0.021
[2] loss: 0.016
[3] loss: 0.016
[4] loss: 0.016
[5] loss: 0.015
[6] loss: 0.015
[7] loss: 0.014
[8] loss: 0.014
[9] loss: 0.013
[10] loss: 0.013
[11] loss: 0.012
[12] loss: 0.012
[13] loss: 0.012
[14] loss: 0.011
[15] loss: 0.011
[16] loss: 0.011
[17] loss: 0.011
[18] loss: 0.010
[19] loss: 0.010
[20] loss: 0.010
[21] loss: 0.010
[22] loss: 0.010
[23] loss: 0.009
[24] loss: 0.009
[25] loss: 0.009
[26] loss: 0.009
[27] loss: 0.009
[28] loss: 0.009
[29] loss: 0.009
[30] loss: 0.009
[31] loss: 0.009
[32] loss: 0.009
[33] loss: 0.008
[34] loss: 0.008
[35] loss: 0.008
[36] loss: 0.008
[37] loss: 0.008
[38] loss: 0.008
[39] loss: 0.008
[40] loss: 0.008
[41] loss: 0.008
[42] loss: 0.008
[43] loss: 0.008
[44] loss: 0.008
[45] loss: 0.008
[46] loss: 0.008
[47] loss: 0.008
[48] loss: 0.008
[49] loss: 0.008
[50] loss: 0.008
[51] loss: 0.007
[52] loss: 0.007
[53] loss: 0.007
[54] loss: 0.007
[55] loss: 0.007
[56] loss: 0.007
[57] loss: 0.007
[58] loss: 0.007
[59] loss: 0.007
[60] l

# Testing student with method 1

In [None]:
test_loss = 0
correct = 0

with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device)
        output = student_CE(data)
        test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(testloader.dataset)

print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(testloader.dataset),
    100. * correct / len(testloader.dataset)))

# Training student with method 2 (MFPS)

In [101]:
device = "cuda"
student_MFPS = student()
student_MFPS.to("cuda")

# bce_with_logits = torch.nn.BCEWithLogitsLoss()
learning_rate = 0.001
epochs = 100
m = nn.Softmax(dim=1)
criterion = nn.KLDivLoss()

# Set up loss function and optimizer
optimizer = optim.SGD(student_MFPS.parameters(), lr=learning_rate, momentum=0.9)

In [102]:
for epoch in range(epochs):
    running_loss = 0.0
    total = 0

    for image, qs in zip(trainloader, np.array_split(mnistq.get_u_CE(), 480)):
        # Apply the learning rate decay
        if(epoch % 100 == 0 and epoch != 0):
            learning_rate = learning_rate * 0.5
            optimizer = optim.SGD(student_MFPS.parameters(), lr= learning_rate, momentum=0.9)
        
        # get the inputs
        inputs, targets = image
        inputs = inputs.to("cuda")
        qs = qs.to("cuda")
        
        # target = labels.to("cuda").long()
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # Set temperature and the weights for losses linear combination
        w = 0.7
        T = 3

        # Student forward + backward + optimize
        logits_student = student_MFPS(inputs.float())
        loss = criterion(qs, m(logits_student))
        loss.backward()
        optimizer.step()

        total += len(image)

        # print statistics
        running_loss += loss.item()
    # print every epoch
    print('[%d] loss: %.3f' % (epoch + 1, running_loss / total))

print('Finished Training')

  "reduction: 'mean' divides the total loss by both the batch size and the support size."


[1] loss: -0.007


KeyboardInterrupt: ignored

# Testing student with method 2 (MFPS)

In [None]:
test_loss = 0
correct = 0

with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device)
        output = student_MFPS(data)
        test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(testloader.dataset)

print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(testloader.dataset),
    100. * correct / len(testloader.dataset)))

Accuracy of the network on test images: 33 % (6675 wrong out of 10000)


# Training student with method 2 (MFLS)

In [106]:
device = "cuda"
student_MFLS = student()
student_MFLS.to("cuda")

# bce_with_logits = torch.nn.BCEWithLogitsLoss()
learning_rate = 0.001
epochs = 100
m = nn.Softmax(dim=1)
criterion = nn.KLDivLoss()

# Set up loss function and optimizer
optimizer = optim.SGD(student_MFLS.parameters(), lr=learning_rate, momentum=0.9)

In [107]:
for epoch in range(epochs):
    running_loss = 0.0
    total = 0

    for image, qs in zip(trainloader, np.array_split(mnistq.get_u_MFLS(), 480)):
        # Apply the learning rate decay
        if(epoch % 100 == 0 and epoch != 0):
            learning_rate = learning_rate * 0.5
            optimizer = optim.SGD(student_MFLS.parameters(), lr= learning_rate, momentum=0.9)
        
        # get the inputs
        inputs, targets = image
        inputs = inputs.to("cuda")
        qs = qs.to("cuda")
        
        # target = labels.to("cuda").long()
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # Set temperature and the weights for losses linear combination
        w = 0.7
        T = 8

        # Student forward + backward + optimize
        logits_student = student_MFLS(inputs.float())
        loss = criterion(F.log_softmax(qs/T, dim=1), F.softmax(logits_student/T, dim=1))
        loss.backward()
        optimizer.step()

        total += len(image)

        # print statistics
        running_loss += loss.item()
    # print every epoch
    print('[%d] loss: %.3f' % (epoch + 1, running_loss / total))

print('Finished Training')

  "reduction: 'mean' divides the total loss by both the batch size and the support size."


[1] loss: 0.021
[2] loss: 0.018
[3] loss: 0.018
[4] loss: 0.018
[5] loss: 0.018
[6] loss: 0.017
[7] loss: 0.017
[8] loss: 0.017
[9] loss: 0.017
[10] loss: 0.017
[11] loss: 0.017
[12] loss: 0.016
[13] loss: 0.016
[14] loss: 0.016
[15] loss: 0.016
[16] loss: 0.016
[17] loss: 0.016
[18] loss: 0.016
[19] loss: 0.016
[20] loss: 0.016
[21] loss: 0.016
[22] loss: 0.016
[23] loss: 0.016
[24] loss: 0.016
[25] loss: 0.016
[26] loss: 0.015
[27] loss: 0.015
[28] loss: 0.015
[29] loss: 0.015
[30] loss: 0.015
[31] loss: 0.015
[32] loss: 0.015
[33] loss: 0.015
[34] loss: 0.015
[35] loss: 0.015
[36] loss: 0.015
[37] loss: 0.015
[38] loss: 0.015
[39] loss: 0.015
[40] loss: 0.015
[41] loss: 0.015
[42] loss: 0.015
[43] loss: 0.015
[44] loss: 0.015
[45] loss: 0.015
[46] loss: 0.015
[47] loss: 0.015
[48] loss: 0.015
[49] loss: 0.015
[50] loss: 0.015
[51] loss: 0.015
[52] loss: 0.015
[53] loss: 0.014
[54] loss: 0.014
[55] loss: 0.014
[56] loss: 0.015
[57] loss: 0.014
[58] loss: 0.014
[59] loss: 0.014
[60] l

# Testing student with method 2 MFLS

In [110]:
test_loss = 0
correct = 0

with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device)
        output = student_MFLS(data)
        test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()

test_loss /= len(testloader.dataset)

print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    test_loss, correct, len(testloader.dataset),
    100. * correct / len(testloader.dataset)))


Test set: Average loss: 1.9971, Accuracy: 3189/10000 (32%)

