In [1]:
%matplotlib inline

In [2]:
# global params
# trained =  True # loads models with random weight and not pretrained weights
case = 'normalized'
train_models = True
epochs = 10 

# num_data

In [3]:
from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import torch.nn.functional as F

cudnn.benchmark = True
plt.ion()   # interactive mode


use_cuda=True
device = torch.device("cuda" if (use_cuda and torch.cuda.is_available()) else "cpu")


## MNIST Experiment **I**

In [4]:
# LeNet Model definition
class NetTest(nn.Module):
    def __init__(self):
        super(NetTest, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        # self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))        
        # x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))    
        x = F.relu(F.max_pool2d(self.conv2(x), 2))    

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))        
        # x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return x #F.log_softmax(x, dim=1)  ### SUPER IMPORTANT ***************************** works with nll loss loss now
#         return x  ### SUPER IMPORTANT ***************************** works with cross entropy loss loss now


# LeNet Model definition
class NetTest2(nn.Module):
    def __init__(self):
        super(NetTest2, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=2)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=2)
        self.conv3 = nn.Conv2d(20, 10, kernel_size=2)
        # self.conv3_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(40, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))   
        x = F.relu(F.max_pool2d(self.conv2(x), 2))        
        x = F.relu(F.max_pool2d(self.conv3(x), 2))    

        x = x.view(-1, 40)
        x = F.relu(self.fc1(x))        
        # x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return x #F.log_softmax(x, dim=1)  ### SUPER IMPORTANT ***************************** works with nll loss loss now
#         return x  ### SUPER IMPORTANT ***************************** works with cross entropy loss loss now



# LeNet Model definition
class NetTest3(nn.Module):
    def __init__(self):
        super(NetTest3, self).__init__()
        self.conv1 = nn.Conv2d(1, 30, kernel_size=3)
        self.conv2 = nn.Conv2d(30, 20, kernel_size=3)

        self.fc1 = nn.Linear(500, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))   
        x = F.relu(F.max_pool2d(self.conv2(x), 2))        

        x = x.view(-1, 500)
        x = F.relu(self.fc1(x))        
        x = self.fc2(x)

        return x #F.log_softmax(x, dim=1)  ### SUPER IMPORTANT ***************************** works with nll loss loss now
#         return x  ### SUPER IMPORTANT ***************************** works with cross entropy loss loss now



models = [NetTest().to(device), NetTest2().to(device), NetTest3().to(device)]  # training with slope 1 for now




train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=transforms.Compose([
            transforms.ToTensor(),
            ])), 
        batch_size=100, shuffle=True)

# # MNIST Test dataset and dataloader declaration
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, download=True, transform=transforms.Compose([
            transforms.ToTensor(),
            ])), 
        batch_size=100, shuffle=False)


dataloaders = {}
dataloaders['train'] = train_loader
dataloaders['val'] = test_loader

dataset_sizes = {x: len(dataloaders[x].dataset) for x in ['train', 'val']}


# # Define what device we are using
# print("CUDA Available: ",torch.cuda.is_available())

num_classes = 10



In [5]:
from torchsummary import summary
summary(models[0], (1, 28, 28))
# summary(models[1], (1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 10, 24, 24]             260
            Conv2d-2             [-1, 20, 8, 8]           5,020
            Linear-3                   [-1, 50]          16,050
            Linear-4                   [-1, 10]             510
Total params: 21,840
Trainable params: 21,840
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.05
Params size (MB): 0.08
Estimated Total Size (MB): 0.14
----------------------------------------------------------------


## MNIST Experiment **II**

In [6]:
# LeNet Model definition
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        # self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))        
        # x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))    
        x = F.relu(F.max_pool2d(self.conv2(x), 2))    

        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))        
        # x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return x #F.log_softmax(x, dim=1)  ### SUPER IMPORTANT ***************************** works with nll loss loss now
#         return x  ### SUPER IMPORTANT ***************************** works with cross entropy loss loss now


# LeNet Model definition
class Net2(nn.Module):
    def __init__(self):
        super(Net2, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        # self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.max_pool2d(self.conv1(x), 2)
        # x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))    
        x = F.max_pool2d(self.conv2(x), 2)    

        x = x.view(-1, 320)
        x = self.fc1(x)        
        # x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return x #F.log_softmax(x, dim=1)  ### SUPER IMPORTANT ***************************** works with nll loss loss now
#         return x  ### SUPER IMPORTANT ***************************** works with cross entropy loss loss now

# LeNet Model definition
class Net3(nn.Module):
    def __init__(self):
        super(Net3, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        # self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = torch.tanh(F.max_pool2d(self.conv1(x), 2))        
        # x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))    
        x = torch.tanh(F.max_pool2d(self.conv2(x), 2))    

        x = x.view(-1, 320)
        x = torch.tanh(self.fc1(x))        
        # x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return x #F.log_softmax(x, dim=1)  ### SUPER IMPORTANT ***************************** works with nll loss loss now
#         return x  ### SUPER IMPORTANT ***************************** works with cross entropy loss loss now





models = [Net().to(device), Net2().to(device), Net3().to(device)]  # training with slope 1 for now




train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=True, download=True, transform=transforms.Compose([
            transforms.ToTensor(),
            ])), 
        batch_size=100, shuffle=True)

# # MNIST Test dataset and dataloader declaration
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, download=True, transform=transforms.Compose([
            transforms.ToTensor(),
            ])), 
        batch_size=100, shuffle=False)


dataloaders = {}
dataloaders['train'] = train_loader
dataloaders['val'] = test_loader

dataset_sizes = {x: len(dataloaders[x].dataset) for x in ['train', 'val']}


# # Define what device we are using
# print("CUDA Available: ",torch.cuda.is_available())

num_classes = 10



## Training the model

Now, let's write a general function to train a model. Here, we will
illustrate:

-  Scheduling the learning rate
-  Saving the best model

In the following, parameter ``scheduler`` is an LR scheduler object from
``torch.optim.lr_scheduler``.



In [7]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # breakpoint()
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [8]:
def eval_model(model):
    phase = 'val'
    model.eval()   # Set model to evaluate mode

    running_corrects = 0

    # Iterate over data.
    for inputs, labels in dataloaders[phase]:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)

        with torch.set_grad_enabled(phase == 'train'):
          running_corrects += torch.sum(preds == labels.data)

    epoch_acc = running_corrects.double() / dataset_sizes[phase]


    print(f'val Acc: {epoch_acc:4f}')


# Train models

It should take around 15-25 min on CPU. On GPU though, it takes less than a
minute.




In [9]:
if train_models: # models should be trained

  criterion = nn.CrossEntropyLoss()

  for model in models:

    # Observe that all parameters are being optimized
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)  

    # print(
    print(model.__class__.__name__)
    model = train_model(model, criterion, optimizer, exp_lr_scheduler,
                        num_epochs=epochs)

Net
Epoch 0/9
----------
train Loss: 1.6867 Acc: 0.4791
val Loss: 0.5391 Acc: 0.8380

Epoch 1/9
----------
train Loss: 0.4138 Acc: 0.8749
val Loss: 0.3212 Acc: 0.9014

Epoch 2/9
----------
train Loss: 0.2945 Acc: 0.9104
val Loss: 0.2367 Acc: 0.9278

Epoch 3/9
----------
train Loss: 0.2238 Acc: 0.9332
val Loss: 0.1899 Acc: 0.9430

Epoch 4/9
----------
train Loss: 0.1809 Acc: 0.9457
val Loss: 0.1535 Acc: 0.9553

Epoch 5/9
----------
train Loss: 0.1517 Acc: 0.9545
val Loss: 0.1239 Acc: 0.9614

Epoch 6/9
----------
train Loss: 0.1318 Acc: 0.9604
val Loss: 0.1056 Acc: 0.9666

Epoch 7/9
----------
train Loss: 0.1146 Acc: 0.9663
val Loss: 0.1009 Acc: 0.9683

Epoch 8/9
----------
train Loss: 0.1129 Acc: 0.9672
val Loss: 0.0993 Acc: 0.9690

Epoch 9/9
----------
train Loss: 0.1117 Acc: 0.9671
val Loss: 0.0985 Acc: 0.9685

Training complete in 1m 15s
Best val Acc: 0.969000
Net2
Epoch 0/9
----------
train Loss: 1.3374 Acc: 0.6376
val Loss: 0.4149 Acc: 0.8802

Epoch 1/9
----------
train Loss: 0.346

# Extract logits

In [10]:
# extract features over training data 
def extract_features(model, num_datapoints=100, case='normalized'):
    X_train, X_val = torch.empty(1,num_classes, dtype=torch.float16), torch.empty(1,num_classes,dtype=torch.float16)
    model.eval()   # Set model to evaluate mode

    with torch.no_grad():
      for phase in ['train', 'val']:

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # forward
                # track history if only in train
                outputs = model(inputs).cpu() # logits which should be saved
                # breakpoint()
                if case == 'normalized':
                    outputs = outputs/outputs.sum(axis=1)[:,None]

                
                # print(outputs.shape)
                # _, preds = torch.max(outputs, 1)
                
                if phase == 'train':
                  X_train = torch.vstack((X_train,outputs))
                  if X_train.shape[0] > num_datapoints: break
                else:
                  X_val = torch.vstack((X_val,outputs))
                  if X_val.shape[0] > num_datapoints: break                  

    return X_train[1:num_datapoints], X_val[1:num_datapoints]


In [11]:
X, X_val = torch.empty(1,num_classes, dtype=torch.float16), torch.empty(1,num_classes,dtype=torch.float16)
y, y_val = [], []

num_datapoints = 3000
case = 'normalized'

for idx, model in enumerate(models):
  print(model.__class__.__name__)
  train_ft, val_ft = extract_features(model, num_datapoints, case)  
  X = torch.vstack((X,train_ft))
  X_val = torch.vstack((X_val,val_ft))

  y.extend([idx]*train_ft.shape[0]) 
  y_val.extend([idx]*val_ft.shape[0]) 


X = X[1:]
X_val = X_val[1:]

y = torch.tensor(y)
y_val = torch.tensor(y_val)

idx_t = torch.randperm(X.shape[0])
X = X[idx_t]
y = y[idx_t]

idx_v = torch.randperm(X_val.shape[0])
X_val = X_val[idx_v]
y_val = y_val[idx_v]


print(X.shape, X_val.shape) #, train_ft2.shape, val_ft2.shape)
print(y.shape, y_val.shape) #, train_ft2.shape, val_ft2.shape)

Net
Net2
Net3
torch.Size([8997, 10]) torch.Size([8997, 10])
torch.Size([8997]) torch.Size([8997])


# Train a classifier on logits 

In [12]:
# multi class
#Importing the necessary packages and libaries
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm
import matplotlib.pyplot as plt
import numpy as np


In [13]:
num_trains = [10, 100, 500, 1000, 2000]

for num_train in num_trains:
  model_svm = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X[:num_train], y[:num_train])
  model_rbf = svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(X[:num_train], y[:num_train])
  # model = svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo').fit(X, y)
  # model = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X, y)

  # print()
  for m in [model_svm, model_rbf]:
    pred = m.predict(X_val)
    accuracy = m.score(X_val, y_val)
    print(f'{num_train}: {accuracy}')
    
    # cm = confusion_matrix(y_val, pred)

    # print(cm)

    # plt.imshow(cm)
    # plt.show()

10: 0.4706013115482939
10: 0.3653440035567411
100: 0.5368456152050683
100: 0.48860731354896075
500: 0.5063910192286317
500: 0.6003112148493942
1000: 0.5139490941424919
1000: 0.582972101811715
2000: 0.5013893520062243
2000: 0.6501055907524731


In [None]:
model_svm = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X, y)
model_rbf = svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(X, y)
# model = svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo').fit(X, y)
# model = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X, y)

# print()
for m in [model_svm, model_rbf]:
  pred = m.predict(X_val)
  accuracy = m.score(X_val, y_val)
  print(f'{X.shape[0]}: {accuracy}')
  
  cm = confusion_matrix(y_val, pred)

  # print(cm)

  plt.imshow(cm)
  plt.show()