### Set GPU

In [1]:
import os
# Expose only GPU index 3 to this process. This runs in the first cell, before
# `torch` is imported further down the notebook.
# NOTE(review): the index is machine-specific — adjust it for your host.
os.environ['CUDA_VISIBLE_DEVICES'] = "3"

### Set Dataset Name

In [2]:
# Select the dataset by leaving exactly one assignment uncommented. A later cell maps
# this name to its results folder and config files.
# dataset_name = 'CIFAR10'
# dataset_name = 'CIFAR100'
# dataset_name = 'MNIST'
# dataset_name = 'TINYIMAGENET'
dataset_name = 'IMBALANCED_CIFAR10'

### Run All Now

In [3]:
# from models.resnet_stl import resnet18
import torch
import numpy as np
from tqdm import tqdm

from models.resnet_cifar import resnet18
from utils.memory import MemoryBank
from utils.train_utils import simclr_train
from utils.utils import fill_memory_bank
from utils.config import create_config
from utils.common_config import get_model, get_train_dataset, get_val_transformations, get_train_dataloader
from utils.evaluate_utils import hungarian_evaluate2, scan_evaluate

In [4]:
# Per-dataset settings, keyed by `dataset_name`:
#   (results sub-folder, SCAN experiment config, pycls config).
# Note that CIFAR100 deliberately maps to the 'cifar-20' folder/config, and that
# IMBALANCED_CIFAR10 reuses the CIFAR10 pycls config.
_DATASET_SETTINGS = {
    'CIFAR10': (
        'cifar-10/',
        './configs/scan/scan_cifar10.yml',
        'configs/CIFAR10_RESNET18.yaml',
    ),
    'CIFAR100': (
        'cifar-20/',
        './configs/scan/scan_cifar20.yml',
        'configs/CIFAR100_RESNET18.yaml',
    ),
    'MNIST': (
        'mnist/',
        './configs/scan/scan_mnist.yml',
        'configs/MNIST_RESNET18.yaml',
    ),
    'TINYIMAGENET': (
        'tinyimagenet/',
        './configs/scan/scan_tinyimagenet.yml',
        'configs/TINYIMAGENET_RESNET18.yaml',
    ),
    'IMBALANCED_CIFAR10': (
        'imbalanced-cifar-10/',
        './configs/scan/scan_cifar10_im.yml',
        'configs/CIFAR10_RESNET18.yaml',
    ),
}

# Fail fast on a typo instead of falling through to torch.load with a bogus path
# and leaving config_exp_path / cfg_path undefined for the cells below.
if dataset_name not in _DATASET_SETTINGS:
    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}; expected one of {sorted(_DATASET_SETTINGS)}"
    )

_results_subfolder, config_exp_path, cfg_path = _DATASET_SETTINGS[dataset_name]
output_folder = '../results/' + _results_subfolder
path_to_model = output_folder + 'scan/model.pth.tar'

# map_location='cpu' keeps the load independent of the GPU index the checkpoint was
# saved from (only one device is visible here); load_state_dict later copies the
# weights into the model, which is then moved to the GPU.
# torch.load unpickles the file, so only load checkpoints you trust.
temp = torch.load(path_to_model, map_location='cpu')

In [5]:
import argparse

# Environment-level config file; the experiment-level path (config_exp_path) was
# chosen per dataset in the previous cell.
config_env_path = './configs/env.yml'
# `p` is the config object built from both files and passed to get_model /
# get_train_dataset / get_predictions below (get_predictions reads p['num_heads']).
p = create_config(config_env_path, config_exp_path)

In [6]:
# Instantiate the model described by config `p`, restore the weights stored under the
# checkpoint's 'model' key, switch to eval mode and move the model to the GPU.
model = get_model(p)
model.load_state_dict(temp['model'])
model.eval()
model.cuda();  # trailing ';' suppresses the module repr in the cell output

# NOTE(review): `train_data` and `train_dataloader` are reassigned by the pycls
# data-loading cell below before anything reads them, so this SCAN-side
# dataset/loader looks unused — confirm and consider removing.
train_data = get_train_dataset(p, get_val_transformations(p),
                                        split='train', to_augmented_dataset=False) 
train_dataloader = get_train_dataloader(p, train_data)

### Reduce `batch_size` in the next cell if you run into an out-of-memory error

In [7]:
from pycls.datasets.data import Data
from pycls.config import cfg
# Load the pycls config for the selected dataset and wrap it in a Data helper.
cfg.merge_from_file(cfg_path)
cfg.DATASET.NAME = dataset_name
data_obj = Data(cfg)

# Training split: build an index array covering every sample (dtype=np.ndarray yields
# an object-dtype array, kept exactly as before) and request a sequential loader.
train_data, train_size = data_obj.getDataset(save_dir='../data', isTrain=True, isDownload=True)
trainSet = np.array(list(range(train_size)), dtype=np.ndarray)
train_dataloader = data_obj.getSequentialDataLoader(
    indexes=trainSet,
    batch_size=256,  # lower this if the GPU runs out of memory
    data=train_data,
)

# Test split, batched with the training batch size and the config's RNG seed.
test_data, test_size = data_obj.getDataset(save_dir='../data', isTrain=False, isDownload=True)
test_dataloader = data_obj.getTestLoader(
    data=test_data,
    test_batch_size=cfg.TRAIN.BATCH_SIZE,
    seed_id=cfg.RNG_SEED,
)

Preprocess Operations Selected ==>  [RandomCrop(size=(32, 32), padding=4), ToTensor(), Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.2435, 0.2616])]
Files already downloaded and verified
Train Mode: Contain 13996 images
Files already downloaded and verified
Test Mode: Contain 10000 images


In [8]:
import torch.nn.functional as F

def _model_device(model):
    """Return the device of the model's first parameter, or None if it exposes none."""
    parameters = getattr(model, 'parameters', None)
    if not callable(parameters):
        return None
    first = next(iter(parameters()), None)
    return None if first is None else first.device


@torch.no_grad()
def get_predictions(p, dataloader, model, return_features=False):
    """Collect per-head argmax predictions (and optionally features) over a dataloader.

    Args:
        p: config mapping; only ``p['num_heads']`` is read.
        dataloader: iterable yielding ``(images, labels)`` batches.
        model: callable model; put into eval mode and invoked as
            ``model(images, forward_pass=...)``. In the default mode it must return
            one logits tensor per head.
        return_features: when True, also return the per-sample features.

    Returns:
        A list with one ``{'predictions', 'targets'}`` dict per head. Predictions stay
        on the device the model produced them on; targets are concatenated as yielded
        by the dataloader. With ``return_features=True`` the return value is
        ``(that_list, features)`` where ``features`` is a CPU tensor.
    """
    model.eval()
    num_heads = p['num_heads']
    # Follow the model's device instead of assuming CUDA; fall back to the original
    # .cuda() call for objects that expose no parameters.
    device = _model_device(model)

    predictions = [[] for _ in range(num_heads)]
    targets = []
    features = []

    for images, lbl in tqdm(dataloader, desc="Extracting Self Label Predictions"):
        images = images.to(device) if device is not None else images.cuda()
        if return_features:
            # NOTE(review): assumes the model supports forward_pass='return_all' and
            # returns a dict with 'output' and 'features' (as SCAN's ClusteringModel
            # does) — confirm for other model types. The previous implementation
            # relied on an un-imported helper and never filled the feature buffer.
            res = model(images, forward_pass='return_all')
            output = res['output']
            features.append(res['features'].cpu())
        else:
            output = model(images, forward_pass='default')
        for i, output_i in enumerate(output):
            predictions[i].append(torch.argmax(output_i, dim=1))
        targets.append(lbl)

    predictions = [torch.cat(pred_, dim=0) for pred_ in predictions]
    targets = torch.cat(targets, dim=0)

    # Every head shares the same targets tensor.
    out = [{'predictions': pred_, 'targets': targets} for pred_ in predictions]

    if return_features:
        return out, torch.cat(features, dim=0)
    return out

In [9]:
# from utils.evaluate_utils import get_predictions

In [10]:
# Run the restored model over the sequential training loader. The result is a list
# with one {'predictions', 'targets'} dict per clustering head.
predictions = get_predictions(p, train_dataloader, model)

Extracting Self Label Predictions: 100%|██████████| 55/55 [00:04<00:00, 13.52it/s]


#### Note: The clustering stats below are not meaningful for CIFAR100

In [11]:
# Evaluate the cluster assignments against the ground-truth labels.
# The leading 0 is presumably the index of the head to evaluate — confirm against
# hungarian_evaluate2. No confusion matrix is computed, so the file name is only a
# placeholder.
clustering_stats = hungarian_evaluate2(
    0,
    predictions,
    class_names=train_data.classes,
    compute_confusion_matrix=False,
    confusion_matrix_file='confusion_matrix.png',
)

In [12]:
# Display the metrics returned by hungarian_evaluate2 (ACC, ARI, NMI and the
# Hungarian cluster-to-class matching).
clustering_stats

{'ACC': 0.29351243212346384,
 'ARI': -0.012018654655578461,
 'NMI': 0.022166544097787316,
 'hungarian_match': [(0, 5),
  (1, 7),
  (2, 2),
  (3, 8),
  (4, 1),
  (5, 4),
  (6, 6),
  (7, 9),
  (8, 3),
  (9, 0)]}

In [13]:
# Inspect the first head's predicted cluster id per training sample (moved to CPU for display).
predictions[0]['predictions'].cpu()

tensor([9, 9, 9,  ..., 9, 8, 9])

In [14]:
# Persist the first head's cluster ids as a .npy file inside the dataset's results folder.
# os.path.join is used because output_folder already ends in '/', so inserting another
# '/' by hand yields a doubled separator; .numpy() makes the tensor -> ndarray
# conversion explicit instead of relying on np.save to coerce the tensor.
cluster_ids = predictions[0]['predictions'].cpu().numpy()
np.save(os.path.join(output_folder, f'{dataset_name}_SCAN_cluster_ids.npy'), cluster_ids)