In [3]:
import multiprocessing
import sys

import numpy as np
import pytorch_lightning as pl
import sklearn
import torch
import torchvision

from data.dataloaders import ImagesDataset
from models.model import SelfSupervisedLearner
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader

BATCH_SIZE = 256
EPOCHS     = 1000
LR         = 3e-4
IMAGE_SIZE = 96 # Change this depending on dataset
NUM_GPUS= 0 # Change this depending on host
NUM_WORKERS = multiprocessing.cpu_count()

In [5]:
resnet = torchvision.models.resnet18(pretrained=False)
model = SelfSupervisedLearner(
    resnet,
    image_size = IMAGE_SIZE,
    hidden_layer = 'avgpool',
    projection_size = 256,
    projection_hidden_size = 4096,
    moving_average_decay = 0.99,
    lr = LR
)

    
argv = ["train.py", "--load", "./ckpt/learner_0510_v100.pt"]
model.load_state_dict(torch.load(argv[2]))
print("Loaded checkpoint from ", argv[2])

#TODO: for some reason labels don't exist in my wget data 
#ds = ImagesDataset("./dataset/test_images", IMAGE_SIZE, train=False)
data_transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
train_dataset = torchvision.datasets.STL10('./dataset/train_split', split='train', download=False,
                   transform=data_transforms)
train_loader = DataLoader(train_dataset, batch_size=5000, num_workers=NUM_WORKERS, shuffle=False)


train_imgs, train_labels = next(iter(train_loader))
print("Train loading done")

test_dataset = torchvision.datasets.STL10('./dataset/test_split', split='test', download=False, transform=data_transforms)
test_loader = DataLoader(test_dataset, batch_size=8000, num_workers=NUM_WORKERS, shuffle=False)
test_imgs, test_labels = next(iter(test_loader))
print("Test loading done")

train_projs, train_embeddings = model.learner.forward(train_imgs, return_embedding=True)
test_projs, test_embeddings = model.learner.forward(test_imgs, return_embedding=True)

print("got embeddings")

train_imgs = torch.flatten(train_imgs, start_dim=1)
test_imgs = torch.flatten(test_imgs, start_dim=1)

scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(train_imgs)
train_imgs = scaler.transform(train_imgs).astype(np.float32)
test_imgs = scaler.transform(test_imgs).astype(np.float32)


pca = PCA(n_components=512)
train_imgs_pca = pca.fit_transform(train_imgs)
test_imgs_pca = pca.transform(test_imgs)

lr_baseline = LogisticRegression(max_iter=100000)
baseline_preds = lr_baseline.fit(train_imgs_pca, train_labels)

baseline_preds = lr_baseline.predict_proba(test_imgs_pca)
baseline_classes = lr_baseline.predict(test_imgs_pca)
baseline_acc = sklearn.metrics.accuracy_score(test_labels, baseline_classes)

lr_byol = LogisticRegression(max_iter=100000)
lr_byol.fit(train_embeddings.detach().numpy(), train_labels)

byol_preds = lr_byol.predict_proba(test_embeddings.detach().numpy())
byol_classes = lr_byol.predict(test_embeddings.detach().numpy())
byol_acc = sklearn.metrics.accuracy_score(test_labels, byol_classes)

Loaded checkpoint from  ./ckpt/learner_0510_v100.pt
Train loading done
Test loading done
got embeddings


In [131]:
# lr_baseline: fitting logistic regression to random images directly
# lr_rand: fitting logistic regression to randomly chosen embeddings from BYOL

for _ in range(10):
    random_idx = np.random.randint(0, high=train_imgs.shape[0], size = 30)

    embeddings_subset = train_embeddings.detach().numpy()[random_idx]
    train_labels_subset = train_labels[random_idx]

    lr_rand = LogisticRegression(max_iter=100000)
    lr_rand.fit(embeddings_subset, train_labels_subset)

    rand_preds =lr_rand.predict(test_embeddings.detach().numpy())
    rand_acc = sklearn.metrics.accuracy_score(test_labels, rand_preds)
    
    lr_baseline = LogisticRegression(max_iter=100000)
    lr_baseline.fit(train_imgs[random_idx], train_labels_subset)
    
    lr_baseline_preds = lr_baseline.predict(test_imgs)
    lr_baseline_acc = sklearn.metrics.accuracy_score(test_labels, lr_baseline_preds)
    
    print("lr baseline: ", lr_baseline_acc)

    print("random embeddings: ", rand_acc)

lr baseline:  0.22225
random embeddings:  0.402625
lr baseline:  0.209375
random embeddings:  0.379375
lr baseline:  0.188625
random embeddings:  0.329
lr baseline:  0.220875
random embeddings:  0.392
lr baseline:  0.1755
random embeddings:  0.318
lr baseline:  0.198375
random embeddings:  0.331
lr baseline:  0.191375
random embeddings:  0.3875
lr baseline:  0.1955
random embeddings:  0.41925
lr baseline:  0.166625
random embeddings:  0.369625
lr baseline:  0.160125
random embeddings:  0.336125


In [125]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=10, max_iter=100000)
km.fit(train_embeddings.detach().numpy())

clusters = km.labels_

In [126]:
from collections import Counter

counts = Counter(clusters)
total = train_embeddings.detach().numpy().shape[0]

weights = {}
uniform_prob = 0.1
for k in counts:
    weights[k] = uniform_prob / (counts[k] / total)
    
print(counts)
print(weights)

weights_full = [weights[k] for k in clusters]

Counter({3: 901, 5: 784, 1: 644, 0: 620, 4: 557, 9: 492, 2: 384, 8: 285, 7: 232, 6: 101})
{1: 0.7763975155279503, 4: 0.8976660682226213, 3: 0.5549389567147615, 5: 0.6377551020408164, 9: 1.016260162601626, 2: 1.3020833333333335, 6: 4.950495049504951, 0: 0.8064516129032259, 8: 1.7543859649122808, 7: 2.1551724137931036}


In [132]:
import random

for _ in range(10):
    kmeans_idx = random.choices(range(train_imgs.shape[0]), weights=weights_full, k=30)

    """
    cluster_subset = clusters[kmeans_idx]
    cluster_counts = Counter(cluster_subset)
    print(cluster_subset)
    print(cluster_counts)
    """

    embeddings_subset = train_embeddings.detach().numpy()[kmeans_idx]
    train_labels_subset = train_labels[kmeans_idx]

    lr_km = LogisticRegression(max_iter=100000)
    lr_km.fit(embeddings_subset, train_labels_subset)

    km_preds =lr_km.predict(test_embeddings.detach().numpy())
    km_acc = sklearn.metrics.accuracy_score(test_labels, km_preds)
    
    lr_baseline = LogisticRegression(max_iter=100000)
    lr_baseline.fit(train_imgs[kmeans_idx], train_labels_subset)
    
    lr_baseline_preds = lr_baseline.predict(test_imgs)
    lr_baseline_acc = sklearn.metrics.accuracy_score(test_labels, lr_baseline_preds)

    print("km: ", km_acc)
    print("lr baseline acc:", lr_baseline_acc)

km:  0.3675
lr baseline acc: 0.18175
km:  0.45175
lr baseline acc: 0.224875
km:  0.403625
lr baseline acc: 0.17425
km:  0.369875
lr baseline acc: 0.187
km:  0.451875
lr baseline acc: 0.222375
km:  0.328
lr baseline acc: 0.193125
km:  0.356
lr baseline acc: 0.192375
km:  0.36025
lr baseline acc: 0.224625
km:  0.417375
lr baseline acc: 0.207375
km:  0.406625
lr baseline acc: 0.190375


In [None]:
lr baseline:  0.30075
random embeddings:  0.655375
lr baseline:  0.29575
random embeddings:  0.6615