In [None]:
def connect_to_gdrive():
    """Mount the user's Google Drive under ``/content/drive``.

    The Colab helper is imported inside the function, so merely defining
    this function does not require the ``google.colab`` package.
    """
    from google.colab import drive as colab_drive

    colab_drive.mount('/content/drive')

In [None]:
!pip install torch torchvision pytorch_lightning lightly torchsummary matplotlib sklearn pillow numpy pandas tensorboard

In [None]:
!pip install plotly==5.5.0 pyyaml==5.1

In [None]:
#!pip freeze

In [None]:
"""
According to a note in a Lightning tutorial, a batch size of 256 with an image
resolution of 64x64px and the resnet-18 model from the Lightning lib requires 16GB of GPU memory.
TODO: is that so? Can we measure that somehow?

"""

# todo: check if all imports are needed/used

import os
import time
from datetime import datetime
import zipfile
import pickle
from collections import Counter

import torch
from torch.utils.data import random_split
import torch.nn as nn

import torchvision
from torchvision.datasets import ImageFolder, DatasetFolder, VisionDataset

import pytorch_lightning as pl

import lightly
from lightly.models.modules.heads import SimCLRProjectionHead
from lightly.loss import NTXentLoss

from pytorch_lightning.loggers import TensorBoardLogger
from torchsummary import summary

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from PIL import Image, ImageOps

import numpy as np
import pandas as pd

In [None]:
# Central configuration shared by the data loading, training and evaluation helpers of this notebook.
global_config = {
    # edge length (px) images are resized / cropped to before entering the network
    'input_size': 128,
    #'input_size': 224,
    # output dimension of the SimCLR projection head
    'projection_head_out': 128,
    'pretraining-batchsize': 1024,
    'finetuning-batchsize': 128, #256
    'pretraining-epochs': 20,
    'finetuning-epochs': 40,
    'pretraining-learningrate': 5e-4,  #6e-2
    'finetuning-learningrate': 5e-5,  #5e-6 1e-4
    # must be 'Adam' or 'SGD', see ResnetSimCLR.configure_optimizers
    'optimizer': 'Adam',  #'SGD'
    # train / test fractions handed to sklearn's train_test_split
    'training_size': 0.7,
    'test_size': 0.3,
    # fraction of the available file names that is randomly selected per data set (1 = all)
    'dew_dataset_size': 1,
    'hzille_dataset_size': 1,
    'random_seed': 2022,
    # worker processes per DataLoader
    'num_workers': 2,
    'path_to_dew_data': 'drive/MyDrive/Data Sets/dew/until_1950/until_1950',
    'path_to_hzille_data': 'drive/MyDrive/Data Sets/Heinrich Zille/ausgeschnitten_klein',
    # True: list the image directory; False: read the file names from a pickled list
    'load_dew_image_list_from_disk' : False,
    'load_hzille_image_list_from_disk' : True,
    'path_to_pickled_dew_file_names' : 'drive/MyDrive/Data Sets/dew/until_1950/until_1950.pickle',  # gets used instead of path to data to avoid reading all images from disk
    # CSV whose 'id' and 'author' columns are merged into the embedding plots
    'hzille_metadata_path': 'drive/MyDrive/Data Sets/Heinrich Zille/parsed_image_meta_data.csv',
    # root folder for TensorBoard logs, checkpoints, embeddings and plots
    'logs_base_folder': 'runs',
    # matplotlib figsize used for the k-nearest-neighbour plots
    'plot_fig_size': (120, 60)
}

In [None]:
def load_image_lists(path_to_data, load_image_list_from_disk, dataset_size):
    """
    Get all image file names in order to split dataset into train and test.

    Parameters
    ----------
    path_to_data : str
        Directory that gets listed when ``load_image_list_from_disk`` is true,
        otherwise the path of a pickle file holding a previously stored list
        of file names.
    load_image_list_from_disk : bool
        Selects between listing the directory and unpickling a stored list.
    dataset_size : float
        Fraction of the file names to draw (without replacement) via
        ``np.random.choice``; ``1`` selects every name in random order.

    Returns
    -------
    tuple
        ``(all_images, selected_image_set)`` - the full list of names and the
        randomly drawn subset as a numpy array.

    Raises
    ------
    OSError
        If the directory cannot be listed or the pickle file cannot be opened.
    """
    print()
    print('Loading images list...')

    print()
    print('Start time: ', datetime.now())
    print()

    start_time = time.time()

    if load_image_list_from_disk:
        print('...reading image list from disk...')

        try:
            all_images = os.listdir(path_to_data)
        except OSError:
            print('Error reading image list from', path_to_data)
            # Re-raise: continuing would only fail later with an unrelated
            # "all_images referenced before assignment" error.
            raise

    else:
        print('...loading image list previously stored...\n')

        # NOTE: pickle.load can execute arbitrary code - only point this at files you created yourself.
        with open(path_to_data, "rb") as opened_pickle_file:
            all_images = pickle.load(opened_pickle_file)

    selected_data_set_size = int(len(all_images) * dataset_size)
    selected_image_set = np.random.choice(all_images, selected_data_set_size, replace=False)

    print('End time: ', datetime.now())

    end_time = time.time()
    total_time = end_time - start_time
    total_mins = total_time / 60
    print(f'Loading image file list took {round(total_mins, 1)} mins')

    print('Number of all images:', len(all_images), all_images[:3])
    print('Number of selected images:', len(selected_image_set), selected_image_set[:3])

    return all_images, selected_image_set

In [None]:
def get_training_and_test_filenames(training_size, test_size, _selected_image_set, seed):
    """
    Shuffle the given file names and divide them into a training and a test part.

    ``training_size`` and ``test_size`` are forwarded to sklearn's
    ``train_test_split`` together with ``seed`` as its random state.
    Returns ``(train_filenames, test_filenames)``.
    """

    print()
    print('Splitting images into train and test...')

    split_options = dict(
        shuffle=True,
        train_size=training_size,
        test_size=test_size,
        random_state=seed,
    )
    train_filenames, test_filenames = train_test_split(_selected_image_set, **split_options)

    n_train, n_test = len(train_filenames), len(test_filenames)
    print('Training images:', n_train, 'Test image:', n_test)

    return train_filenames, test_filenames

In [None]:
def get_train_dataloader(input_size, collate_fn, path_to_data, file_names, batch_size, num_workers):
    """
    Build the shuffled training DataLoader on top of a ``LightlyDataset``.

    Parameters
    ----------
    input_size : int
        Currently unused here; image sizing is done by ``collate_fn``.
    collate_fn : callable
        Collate function that applies the training augmentations. Required.
    path_to_data : str
        Directory containing the images.
    file_names : sequence of str
        Names of the images (inside ``path_to_data``) that form the training set.
    batch_size : int
        Samples per batch; incomplete last batches are dropped.
    num_workers : int
        Number of DataLoader worker processes.

    Returns
    -------
    torch.utils.data.DataLoader

    Raises
    ------
    ValueError
        If ``collate_fn`` is ``None``.
    """
    # Fail fast and with a message that names the problem.
    if collate_fn is None:
        raise ValueError(
            'get_train_dataloader requires a collate_fn (e.g. from get_training_collate_fnc()), got None'
        )

    print()
    print('...creating train dataloader...')

    print()
    print('Start time: ', datetime.now())
    print()

    start_time = time.time()

    dataset_train = lightly.data.LightlyDataset(
        input_dir=path_to_data,
        filenames=file_names
    )

    _dataloader_train = torch.utils.data.DataLoader(
        dataset_train,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        drop_last=True,
        num_workers=num_workers
    )

    print('End time: ', datetime.now())

    end_time = time.time()
    total_time = end_time - start_time
    total_mins = total_time / 60
    print(f'Creating dataset took {round(total_mins, 1)} mins')

    return _dataloader_train

In [None]:
def get_test_dataloader(input_size, path_to_data, file_names, batch_size, num_workers):
    """
    Build the deterministic (unshuffled, nothing dropped) evaluation DataLoader.

    Images are resized to ``input_size`` x ``input_size``, converted to a
    single gray channel, turned into tensors and normalized with the mean of
    the ImageNet channel statistics provided by lightly.
    """

    print()
    print('...creating test dataloader...')

    print()
    print('Start time: ', datetime.now())
    print()

    started_at = time.time()

    tv_transforms = torchvision.transforms
    imagenet_stats = lightly.data.collate.imagenet_normalize

    # Single-channel images: collapse the per-channel ImageNet statistics into one value each.
    normalization = tv_transforms.Normalize(
        mean=np.mean(imagenet_stats['mean']),
        std=np.mean(imagenet_stats['std']),
    )
    test_transforms = tv_transforms.Compose([
        tv_transforms.Resize((input_size, input_size)),
        tv_transforms.Grayscale(num_output_channels=1),
        tv_transforms.ToTensor(),
        normalization,
    ])

    dataset_test = lightly.data.LightlyDataset(
        input_dir=path_to_data,
        filenames=file_names,
        transform=test_transforms,
    )

    loader_options = dict(
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers,
    )
    dataloader_test = torch.utils.data.DataLoader(dataset_test, **loader_options)

    print('End time: ', datetime.now())

    elapsed_mins = (time.time() - started_at) / 60
    print(f'Creating dataset took {round(elapsed_mins, 1)} mins')

    return dataloader_test

In [None]:
class RandomGamma(torch.nn.Module):
    """Transform that applies torchvision's ``adjust_gamma`` with probability ``p``.

    When applied, the gamma value is drawn uniformly from the ``gamma``
    interval ``(low, high)`` and ``gain`` is passed through unchanged.
    """

    def __init__(self, p=0.5, gamma=(0.5, 1.5), gain=1):
        super().__init__()
        self.p, self.gamma, self.gain = p, gamma, gain

    def forward(self, img):
        # One uniform draw decides whether the image is left untouched.
        if torch.rand(1) > self.p:
            return img

        low, high = self.gamma[0], self.gamma[1]
        sampled_gamma = torch.empty(1).uniform_(low, high).item()

        adjust_gamma = torchvision.transforms.functional.adjust_gamma
        return adjust_gamma(img, sampled_gamma, self.gain)

    def __repr__(self):
        return f'{type(self).__name__}(p={self.p})'

def get_training_collate_fnc():
    """
    Assemble the augmentation pipeline used for training and wrap it in a
    lightly ``BaseCollateFunction``.

    The crop size is read from ``global_config['input_size']``; normalization
    uses the mean of the ImageNet channel statistics because the images are
    reduced to a single gray channel.
    """
    tv_transforms = torchvision.transforms
    imagenet_stats = lightly.data.collate.imagenet_normalize

    colour_jitter = tv_transforms.ColorJitter(
        brightness=(0.2, 1.8), contrast=(0.2, 2.2), saturation=1, hue=0
    )

    augmentation_steps = [
        tv_transforms.RandomResizedCrop(
            size=global_config['input_size'], scale=(0.5, 1.0), ratio=(0.5, 2)
        ),
        tv_transforms.Grayscale(num_output_channels=1),
        tv_transforms.GaussianBlur(kernel_size=21, sigma=(0.1, 1.0)),
        tv_transforms.RandomInvert(p=0.2),
        tv_transforms.RandomSolarize(threshold=128, p=0.2),
        tv_transforms.RandomApply(torch.nn.ModuleList([colour_jitter]), p=0.5),
        RandomGamma(p=0.5, gamma=(0.025, 2), gain=1.05),
        tv_transforms.ToTensor(),
        tv_transforms.Normalize(
            mean=np.mean(imagenet_stats['mean']),
            std=np.mean(imagenet_stats['std']),
        ),
    ]

    return lightly.data.BaseCollateFunction(tv_transforms.Compose(augmentation_steps))

In [None]:
def load_dew_data(batch_size):
    """
    Create the training and test DataLoaders for the dew data set.

    File names are obtained through ``load_image_lists`` using the pickled
    name list configured in ``global_config``, split into train and test
    parts, and then turned into loaders reading from ``path_to_dew_data``.
    Returns ``(train_loader, test_loader)``.
    """
    cfg = global_config

    _, selected_images_list = load_image_lists(
        path_to_data=cfg['path_to_pickled_dew_file_names'],
        load_image_list_from_disk=cfg['load_dew_image_list_from_disk'],
        dataset_size=cfg['dew_dataset_size'],
    )

    training_file_names, test_file_names = get_training_and_test_filenames(
        training_size=cfg['training_size'],
        test_size=cfg['test_size'],
        _selected_image_set=selected_images_list,
        seed=cfg['random_seed'],
    )

    _dataloader_train = get_train_dataloader(
        input_size=cfg['input_size'],
        collate_fn=get_training_collate_fnc(),
        path_to_data=cfg['path_to_dew_data'],
        file_names=training_file_names,
        batch_size=batch_size,
        num_workers=cfg['num_workers'],
    )

    _dataloader_test = get_test_dataloader(
        input_size=cfg['input_size'],
        path_to_data=cfg['path_to_dew_data'],
        file_names=test_file_names,
        batch_size=batch_size,
        num_workers=cfg['num_workers'],
    )

    return _dataloader_train, _dataloader_test

In [None]:
def load_hzille_data(batch_size, get_full_dataset=False):
    """
    Create DataLoaders for the Heinrich Zille data set.

    With ``get_full_dataset=True`` a single evaluation loader over all file
    names is returned. Otherwise the selected file names are split and the
    pair ``(train_loader, test_loader)`` is returned.
    """
    cfg = global_config
    data_dir = cfg['path_to_hzille_data']

    all_images_list, selected_images_list = load_image_lists(
        path_to_data=data_dir,
        load_image_list_from_disk=cfg['load_hzille_image_list_from_disk'],
        dataset_size=cfg['hzille_dataset_size'],
    )

    # Shared by every evaluation-style loader built below.
    eval_loader_options = dict(
        input_size=cfg['input_size'],
        path_to_data=data_dir,
        batch_size=batch_size,
        num_workers=cfg['num_workers'],
    )

    if get_full_dataset:
        return get_test_dataloader(file_names=all_images_list, **eval_loader_options)

    training_file_names, test_file_names = get_training_and_test_filenames(
        training_size=cfg['training_size'],
        test_size=cfg['test_size'],
        _selected_image_set=selected_images_list,
        seed=cfg['random_seed'],
    )

    _dataloader_train = get_train_dataloader(
        input_size=cfg['input_size'],
        collate_fn=get_training_collate_fnc(),
        path_to_data=data_dir,
        file_names=training_file_names,
        batch_size=batch_size,
        num_workers=cfg['num_workers'],
    )

    _dataloader_test = get_test_dataloader(file_names=test_file_names, **eval_loader_options)

    return _dataloader_train, _dataloader_test

In [None]:
class ResnetSimCLR(pl.LightningModule):
    """
    SimCLR model: a torchvision ResNet-18 backbone adapted to 1-channel input,
    followed by a lightly ``SimCLRProjectionHead`` and trained with ``NTXentLoss``.

    Parameters
    ----------
    max_epochs : number of epochs; used as ``T_max`` of the cosine LR schedule.
    batchsize : batch size reported to ``self.log`` in ``training_step``.
    projection_head_out : output dimension of the projection head.
    learning_rate : learning rate handed to the optimizer.
    optimizer : ``'SGD'`` or ``'Adam'``; anything else raises in ``configure_optimizers``.
    momentum : momentum, only used by SGD.
    weight_decay : weight decay for both optimizers.
    pretrained : forwarded to ``torchvision.models.resnet18(pretrained=...)``.
    """

    def __init__(
        self,
        max_epochs,
        batchsize,
        projection_head_out,
        learning_rate,
        optimizer,
        momentum=0.9,
        weight_decay=1e-6,  #5e-4,
        pretrained=False
    ):
        super().__init__()

        # configuration:
        self.max_epochs = max_epochs
        self.batchsize = batchsize
        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.pretrained = pretrained

        # load resnet
        resnet = torchvision.models.resnet18(pretrained=self.pretrained)

        # convert model to 1-channel without losing pretrained weights, like the fast.ai implementation
        # from https://datascience.stackexchange.com/questions/65783/pytorch-how-to-use-pytorch-pretrained-for-single-channel-image
        
        # unpack model architecture / all layers
        model_architecture = list(resnet.children())
        
        # store weights of the first convolution
        first_conv_weight = model_architecture[0].weight

        # replace original layer with new 1-channel convolutional layer
        # original: (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        new_1_channel_conv = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        model_architecture[0] = new_1_channel_conv
        
        # collapse the original 3 input channels into 1 by averaging the kernels over the input-channel dimension
        model_architecture[0].weight = nn.Parameter(torch.mean(first_conv_weight, dim=1, keepdim=True))
        
        # pack architecture / layers again into sequential
        model_architecture = nn.Sequential(*model_architecture)

        # create backbone without original classification head (last child is dropped)
        self.backbone = nn.Sequential(
            *list(model_architecture.children())[:-1]
        )

        # create projection head; its input/hidden width equals the input width of the removed fc layer
        hidden_dim = resnet.fc.in_features

        self.projection_head = SimCLRProjectionHead(
            hidden_dim, hidden_dim, projection_head_out
        )

        # define loss function
        self.criterion = NTXentLoss()

    def forward(self, x):
        """Return the projection-head output for a batch: backbone -> flatten -> projection head."""
        
        # todo: for actual prediction the head should not be used!
        h = self.backbone(x).flatten(start_dim=1)
        z = self.projection_head(h)
        
        return z


    def training_step(self, batch, batch_idx):
        """Compute and log the NT-Xent loss between the projections of the two views in ``batch``."""
        
        # batch is unpacked as ((view_0, view_1), <ignored>, <ignored>)
        (x0, x1), _, _ = batch
        z0 = self.forward(x0)
        z1 = self.forward(x1)
        loss = self.criterion(z0, z1)
        
        self.log(
            'train_loss', 
            loss, 
            on_step=True, 
            on_epoch=True, 
            batch_size=self.batchsize
        )
        
        return loss

    def configure_optimizers(self):
        """
        Build the optimizer selected by ``self.optimizer`` plus a cosine annealing
        schedule over ``self.max_epochs``.

        Raises ValueError for an optimizer name other than 'SGD' or 'Adam'.
        """
        
        if self.optimizer == 'SGD':
            optim = torch.optim.SGD(
                self.parameters(),
                lr=self.learning_rate,  # 6e-2
                momentum=self.momentum,
                weight_decay=self.weight_decay
            )

        elif self.optimizer == 'Adam':
            optim = torch.optim.Adam(
                self.parameters(), 
                lr=self.learning_rate, # heuristically: 5e-4, initially: 1e-3
                weight_decay=self.weight_decay  # initially: 1e-6
            )
        else:
            raise ValueError('Unknown optimizer: ', self.optimizer)    


        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, self.max_epochs)
        
        return [optim], [scheduler]
        #return optim


    # The setters below only store the new value; it is picked up the next time
    # configure_optimizers / training_step read the attribute.

    def set_learningrate(self, new_learningrate):
        """Replace the learning rate used when optimizers are (re)configured."""
        self.learning_rate = new_learningrate


    def set_max_epochs(self, new_max_epochs):
        """Replace the epoch count used as horizon of the cosine schedule."""
        self.max_epochs = new_max_epochs


    def set_batchsize(self, new_batchsize):
        """Replace the batch size reported when logging the training loss."""
        self.batchsize = new_batchsize


In [None]:
def get_model(_max_epochs, batchsize, projection_head_out, learning_rate, optimizer, pretrained=False, verbose=True):
    """
    Instantiate a ``ResnetSimCLR`` model and determine how many GPUs to train on.

    Parameters
    ----------
    _max_epochs, batchsize, projection_head_out, learning_rate, optimizer, pretrained
        Forwarded unchanged to ``ResnetSimCLR``.
    verbose : bool
        When true, print device information and parameter counts.

    Returns
    -------
    tuple
        ``(model, _gpus)`` where ``_gpus`` is 1 if CUDA is available, else 0.
    """
    print()

    cuda_available = torch.cuda.is_available()
    _gpus = 1 if cuda_available else 0

    model = ResnetSimCLR(
        max_epochs=_max_epochs,
        batchsize=batchsize, 
        projection_head_out=projection_head_out,
        learning_rate=learning_rate,
        optimizer=optimizer,
        pretrained=pretrained
    )
    
    if verbose:
        print('gpus available:', cuda_available)
        print('gpus:', torch.cuda.device_count())

        # current_device() / get_device_name() raise on machines without a
        # usable CUDA device, so only query them when CUDA is available.
        if cuda_available:
            print('current device:', torch.cuda.current_device())
            print('amount of gpus:', torch.cuda.device_count())
            print('gpu name:', torch.cuda.get_device_name(0))

        print('model total param:', sum(p.numel() for p in model.parameters()))
        print('model trainable param:', sum(p.numel() for p in model.parameters() if p.requires_grad))

    return model, _gpus


In [None]:
def train_model(_model, _max_epochs, _gpus, dataloader, log_every_n_steps=10, _logger=True):
    """
    Fit ``_model`` on ``dataloader`` with a freshly created Lightning trainer.

    Prints start/end timestamps and the elapsed time (total and per epoch)
    and returns the trainer so that callers can save checkpoints from it.
    """
    print()
    print('Training model...')

    print()
    print('Epochs:', _max_epochs)

    print()
    print('Start time: ', datetime.now())
    print()

    started_at = time.time()

    trainer_options = dict(
        logger=_logger,
        max_epochs=_max_epochs,
        gpus=_gpus,
        log_every_n_steps=log_every_n_steps,
        callbacks=pl.callbacks.progress.TQDMProgressBar(),
    )
    _trainer = pl.Trainer(**trainer_options)
    _trainer.fit(_model, dataloader)

    print('End time: ', datetime.now())

    elapsed_mins = (time.time() - started_at) / 60
    mins_per_epoch = elapsed_mins / _max_epochs
    print(f'Training took {round(elapsed_mins, 1)} mins, ~{round(mins_per_epoch, 1)} mins per epoch')

    return _trainer

In [None]:
def generate_embeddings(_model, dataloader):
    """
    Use the backbone of the model (without projection head), as in the SimCLR
    paper, to create L2-normalized embeddings.

    Parameters
    ----------
    _model : model exposing ``.device`` and ``.backbone``.
    dataloader : iterable yielding ``(images, labels, filenames)`` batches.

    Returns
    -------
    tuple
        ``(embeddings, filenames)`` - a numpy array with one normalized row per
        image and the list of file names in the same order.

    Raises
    ------
    ValueError
        If ``dataloader`` yields no batches.
    """

    print()
    print('Generating embeddings...')

    print()
    print('Start time: ', datetime.now())
    print()

    start_time = time.time()

    embeddings = []
    filenames = []
    with torch.no_grad():
        for img, _label, fnames in dataloader:
            img = img.to(_model.device)
            emb = _model.backbone(img).flatten(start_dim=1) # todo put into model as ".inference()" or so
            # Move every batch to host memory right away: keeps GPU memory flat
            # and allows the numpy conversion below when running on CUDA.
            embeddings.append(emb.cpu())
            filenames.extend(fnames)

    if not embeddings:
        raise ValueError('Cannot generate embeddings: the dataloader is empty')

    # sklearn works on numpy arrays; a CUDA tensor cannot be converted implicitly.
    embeddings = torch.cat(embeddings, 0).numpy()
    embeddings = normalize(embeddings)
    
    print('End time: ', datetime.now())

    end_time = time.time()
    total_time = end_time - start_time
    total_mins = total_time / 60
    print(f'Getting embeddings took {round(total_mins, 1)} mins')
    
    return embeddings, filenames

In [None]:
def get_knn(_embeddings, n_neighbours=3):
    """Fit and return a sklearn ``NearestNeighbors`` index on the given embeddings."""

    print()
    print(f'Creating knn with {n_neighbours}...')

    neighbour_index = NearestNeighbors(n_neighbors=n_neighbours)
    return neighbour_index.fit(_embeddings)


def get_image_as_np_array(filename: str):
    """
    Load an image file, convert it to grayscale and return it as a numpy array.

    The file is opened in a ``with`` block so its handle is released as soon
    as the pixel data has been converted (this function is called once per
    plotted neighbour, i.e. many times per run).
    """
    with Image.open(filename) as img:
        grayscale_img = ImageOps.grayscale(img)

    return np.asarray(grayscale_img)


def plot_knn_examples(_knn, _embeddings, _filenames, path_to_data, num_examples=6, fig_size=(16,8), plot_saving_folder=None, seed=2022):
    """
    Plots multiple rows of random images with their nearest neighbors

    Most left image is query image, others are k-nearest neighbours.
    Distance is displayed above images.

    Parameters
    ----------
    _knn : fitted nearest-neighbour index providing ``kneighbors``.
    _embeddings : embeddings that are queried against ``_knn``.
    _filenames : file names, index-aligned with ``_embeddings``.
    path_to_data : directory the file names are relative to.
    num_examples : number of randomly drawn query images (one figure each).
    fig_size : matplotlib figure size per row.
    plot_saving_folder : if given, every figure is additionally saved there as JPG.
    seed : seed for numpy's global RNG, used to draw the query images.
    """

    print()
    print(f'Plotting knn, {num_examples} examples...')

    distances, indices = _knn.kneighbors(_embeddings)
    
    np.random.seed(seed)
    samples_idx = np.random.choice(len(indices), size=num_examples, replace=False)

    if plot_saving_folder is not None:
        os.makedirs(plot_saving_folder, exist_ok=True)

    for idx in samples_idx:
        fig = plt.figure(figsize=fig_size)
        neighbor_indices = indices[idx]

        for plot_x_offset, neighbor_idx in enumerate(neighbor_indices):

            ax = fig.add_subplot(1, len(neighbor_indices), plot_x_offset + 1)
            fname = os.path.join(path_to_data, _filenames[neighbor_idx])
            # Fixed 0..255 range (8-bit grayscale) so brightness is comparable across images.
            ax.imshow(get_image_as_np_array(fname), cmap='gray', vmin=0, vmax=255)
            ax.set_title(f'd={distances[idx][plot_x_offset]:.3f} ({os.path.basename(fname)})')
            ax.axis('off')
        
        if plot_saving_folder is not None:
            fig.savefig(f'{plot_saving_folder}/{idx}_{_filenames[idx].split(".")[0]}.jpg')

In [None]:
def visualize_embeddings(
    embeddings, 
    filenames, 
    path_to_store_data, 
    hzille_metadata_path
):
    """
    Reduce the embeddings to 2 and 3 dimensions with t-SNE and PCA, store the
    projections as CSV and interactive HTML plots, and show the 2D plots.

    Parameters
    ----------
    embeddings : array-like with one row per image.
    filenames : file names, index-aligned with ``embeddings``.
    path_to_store_data : folder in which the ``dim_reduction`` sub-folder is created.
    hzille_metadata_path : path of a CSV with ``id`` and ``author`` columns, or
        ``None``. When given, the integer stem of every file name is used as
        ``id``, the author is merged in (inner join, so images without
        metadata are dropped) and used to colour the points.
    """
    seed = global_config['random_seed']
    tsne_options = dict(
        random_state=seed,
        perplexity=15,
        n_iter=5000,
        n_iter_without_progress=500,
        learning_rate=15,
    )

    # (file stem, plot title, axis noun, number of dimensions, reducer)
    reductions = [
        ('tsne_3d', 't-SNE 3D', 'projection', 3, TSNE(n_components=3, **tsne_options)),
        ('tsne_2d', 't-SNE 2D', 'projection', 2, TSNE(n_components=2, **tsne_options)),
        ('pca_3d', 'PCA 3D', 'component', 3, PCA(n_components=3, random_state=seed)),
        ('pca_2d', 'PCA 2D', 'component', 2, PCA(n_components=2, random_state=seed)),
    ]

    # Create the output folder before the expensive reductions so that a bad
    # path fails early; exist_ok keeps re-runs into the same folder working.
    saving_folder = f'{path_to_store_data}/dim_reduction'
    os.makedirs(saving_folder, exist_ok=True)

    print('Reducing embeddings dimensions with...')
    frames = {
        key: pd.DataFrame(reducer.fit_transform(embeddings))
        for key, _title, _noun, _n_dims, reducer in reductions
    }

    print('...creating plots...')

    color_attribute = None
    hover_data_attribute = None

    if hzille_metadata_path is not None:

        authors = pd.read_csv(hzille_metadata_path)[['id', 'author']]
        # assumes file names look like "<integer id>.<ext>"
        image_ids = [int(filename.split('.')[0]) for filename in filenames]

        # add authorship data to every data frame
        for key in list(frames):
            frame = frames[key]
            frame['id'] = image_ids
            frames[key] = pd.merge(left=frame, right=authors, on='id', how='inner')

        color_attribute = 'author'
        hover_data_attribute = ['id']

    ordinals = ('1st', '2nd', '3rd')
    figures = {}
    for key, title, noun, n_dims, _reducer in reductions:
        axis_labels = {axis: f'{ordinals[axis]} {noun}' for axis in range(n_dims)}
        shared_options = dict(
            color=color_attribute,
            hover_data=hover_data_attribute,
            labels=axis_labels,
            title=title,
        )
        if n_dims == 3:
            figures[key] = px.scatter_3d(frames[key], x=0, y=1, z=2, **shared_options)
        else:
            figures[key] = px.scatter(frames[key], x=0, y=1, **shared_options)

    for key, frame in frames.items():
        frame.to_csv(f'{saving_folder}/{key}.csv')

    for key, figure in figures.items():
        figure.write_html(f'{saving_folder}/{key}_plot.html')

    figures['tsne_2d'].show()
    figures['pca_2d'].show()

In [None]:
def plot_data_augmentations(data_loader, rows=6, columns=12, figsize=(40, 20)):
    """
    Sample from data augmentations

    Takes the first batch of ``data_loader`` and plots, side by side, the two
    augmented views of as many images as fit into a ``rows`` x ``columns``
    grid (two cells per image). If the batch holds fewer images than the grid
    could show, only the available images are plotted.
    """
    batch_img_tuples, batch_labels, batch_filenames = next(iter(data_loader))
    
    fig = plt.figure(figsize=figsize)
    
    img_channel = 0 # doesn't matter as all channels are gray here

    first_views, second_views = batch_img_tuples[0], batch_img_tuples[1]

    # Every image needs two grid cells; never index past the end of the batch.
    n_pairs = min((rows * columns) // 2, len(batch_filenames))

    for image_idx in range(n_pairs):
        first_position = 2 * image_idx + 1

        for offset, views in enumerate((first_views, second_views)):
            ax = fig.add_subplot(rows, columns, first_position + offset)
            ax.imshow(views[image_idx][img_channel].squeeze(), cmap='gray')
            ax.axis('off')
            ax.set_title(batch_filenames[image_idx])

In [None]:
def run_experiment(
    _experiment_name, 
    config, 
    pretrained, 
    test_data_loader, 
    pretraining_dataset_name=None, 
    pretraining_data_loader=None, 
    finetuning_dataset_name=None, 
    finetuning_data_loader=None,
    pretraining_log_every_n_steps=10,
    finetuning_log_every_n_steps=3,
    model_from_checkpoint=None,
    forced_batch_size_in_pretraining=None,
    do_not_use_hzille_metadata=False
):
    """
    Run one experiment: optional pretraining, optional finetuning, then
    evaluation (embeddings, k-nearest-neighbour plots, t-SNE/PCA plots).

    Parameters
    ----------
    _experiment_name : sub-folder of ``config['logs_base_folder']`` for all outputs.
    config : configuration dict (see ``global_config``).
    pretrained : whether a newly created model starts from torchvision weights.
    test_data_loader : loader whose images are embedded and visualised.
    pretraining_dataset_name, finetuning_dataset_name : ``'dew'``, ``'hzille'`` or
        ``None``; only used to name the run.
    pretraining_data_loader, finetuning_data_loader : training loaders; a stage
        is skipped when its loader is ``None``.
    pretraining_log_every_n_steps, finetuning_log_every_n_steps : logging interval per stage.
    model_from_checkpoint : already loaded model to finetune / evaluate instead
        of a newly created one.
    forced_batch_size_in_pretraining : overrides ``config['pretraining-batchsize']``.
    do_not_use_hzille_metadata : if true, images are read from the dew folder
        and the plots are not coloured by author.
    """

    training_options = ['dew', 'hzille', None]

    assert pretraining_dataset_name in training_options
    assert finetuning_dataset_name in training_options

    # Explicit None checks: truth-testing a DataLoader goes through len(),
    # which is not what "was a loader passed?" is supposed to mean.
    run_pretraining = pretraining_data_loader is not None
    run_finetuning = finetuning_data_loader is not None
    has_checkpoint_model = model_from_checkpoint is not None

    experiment_folder = f'{config["logs_base_folder"]}/{_experiment_name}'

    if run_pretraining:
        
        print('pretraining')
        version_name = f'{datetime.now().strftime("%y-%m-%d-%H%M%S")}_pretraining_{pretraining_dataset_name}'

        logger = TensorBoardLogger(
            save_dir=config['logs_base_folder'], 
            name=_experiment_name, 
            version=version_name
        )

        logger.log_hyperparams(config)

        batch_size = forced_batch_size_in_pretraining or config['pretraining-batchsize']
        print('batch size:', batch_size)
        pl.seed_everything(config['random_seed'])

        model, gpus = get_model(
            _max_epochs=config['pretraining-epochs'],
            batchsize=batch_size,
            projection_head_out=config['projection_head_out'],
            learning_rate=config['pretraining-learningrate'],
            optimizer=config['optimizer'],
            pretrained=pretrained
        )

        trainer = train_model(
            _model=model,
            _max_epochs=config['pretraining-epochs'],
            _gpus=gpus,
            dataloader=pretraining_data_loader,
            _logger=logger,
            log_every_n_steps=pretraining_log_every_n_steps
        )

        trainer.save_checkpoint(f'{experiment_folder}/{version_name}/final_model_checkpoints/{version_name}.ckpt')


    if run_finetuning:

        print('finetuning')
        version_name = f'{datetime.now().strftime("%y-%m-%d-%H%M%S")}_finetuning_{finetuning_dataset_name}'

        logger = TensorBoardLogger(
            save_dir=config['logs_base_folder'], 
            name=_experiment_name, 
            version=version_name
        )

        logger.log_hyperparams(config)

        pl.seed_everything(config['random_seed'])

        if run_pretraining or has_checkpoint_model:

            # A passed-in checkpoint model takes precedence over the model
            # that was just pretrained above.
            if has_checkpoint_model:
                print('Using model from checkpoint')
                
                gpus = 1 if torch.cuda.is_available() else 0
                model = model_from_checkpoint

            model.set_max_epochs(config['finetuning-epochs'])
            model.set_learningrate(config['finetuning-learningrate'])
            model.set_batchsize(config['finetuning-batchsize'])

        else:

            model, gpus = get_model(
                _max_epochs=config['finetuning-epochs'],
                batchsize=config['finetuning-batchsize'],
                projection_head_out=config['projection_head_out'],
                learning_rate=config['finetuning-learningrate'],
                optimizer=config['optimizer'],
                pretrained=pretrained
            )

        trainer = train_model(
            _model=model,
            _max_epochs=config['finetuning-epochs'],
            _gpus=gpus,
            dataloader=finetuning_data_loader,
            _logger=logger,
            log_every_n_steps=finetuning_log_every_n_steps
        )

        trainer.save_checkpoint(f'{experiment_folder}/{version_name}/final_model_checkpoints/{version_name}.ckpt')


    no_training_at_all = not run_pretraining and not run_finetuning
    
    if no_training_at_all:
        print('No training at all...')
        version_name = f'{datetime.now().strftime("%y-%m-%d-%H%M%S")}'
        
        if has_checkpoint_model:
            print('Using model from checkpoint')
            model = model_from_checkpoint
        else:
            print('Initializing new model. Pretrained: ', pretrained)
            model, gpus = get_model(
                _max_epochs=config['pretraining-epochs'],
                batchsize=config['pretraining-batchsize'],
                projection_head_out=config['projection_head_out'],
                learning_rate=config['pretraining-learningrate'],
                optimizer=config['optimizer'],
                pretrained=pretrained
            )


    run_folder = f'{experiment_folder}/{version_name}'

    # make embeddings
    model.eval()
    embeddings, filenames = generate_embeddings(model, test_data_loader)
    print(embeddings)

    # store embeddings and filenames
    embbedings_saving_folder = f'{run_folder}/embeddings'

    os.makedirs(embbedings_saving_folder, exist_ok=True)

    np.save(f'{embbedings_saving_folder}/embeddings', embeddings)
    with open(f'{embbedings_saving_folder}/embeddings_filenames.pickle', 'wb') as filenames_file:
        pickle.dump(filenames, filenames_file)
    

    # make knn and visualize neighbours of embeddings
    knn = get_knn(_embeddings=embeddings, n_neighbours=30)
    
    path_to_data = config['path_to_dew_data'] if do_not_use_hzille_metadata else config['path_to_hzille_data']
    print(path_to_data)

    plot_knn_examples(
        _knn=knn, 
        _embeddings=embeddings,
        _filenames=filenames,
        path_to_data=path_to_data,
        num_examples=30,
        plot_saving_folder=f'{run_folder}/knn_plots/',
        fig_size=config['plot_fig_size'],
        seed=config['random_seed']
    )


    # plot embeddings via pca/tsne
    hzille_metadata_path = None if do_not_use_hzille_metadata else config['hzille_metadata_path']

    visualize_embeddings(
        embeddings=embeddings,
        filenames=filenames,
        hzille_metadata_path=hzille_metadata_path,
        path_to_store_data=run_folder,
    )


In [None]:
def init_experiments():
    """Echo the active configuration, mount Google Drive and seed the RNGs.

    Prints ``global_config`` followed by an empty line, calls
    ``connect_to_gdrive()`` and finally hands ``global_config['random_seed']``
    to ``pl.seed_everything``.
    """
    # A single print emits the config line plus the trailing blank line.
    print(global_config, end='\n\n')

    connect_to_gdrive()

    seed = global_config['random_seed']
    pl.seed_everything(seed)

In [None]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
"""
Print the config, connect to Google Drive and set the global random seed (pl.seed_everything)
"""
# Run this before the data-loading and experiment cells: it mounts Drive
# (the data and checkpoint paths below live there) and seeds the RNGs.
init_experiments()

In [None]:
%load_ext tensorboard
%tensorboard --logdir runs/

In [None]:
#!kill 3385

In [None]:
"""
Create the Heinrich Zille train and test dataloaders.
The finetuning batch size is used instead of the larger dew pretraining batch size
because this dataset is much smaller (675 images).
"""
# Both loaders are reused by the experiment cells below: the train loader as
# pretraining/finetuning loader, the test loader to generate the embeddings.
dataloader_hzille_train, dataloader_hzille_test = load_hzille_data(
    batch_size=global_config['finetuning-batchsize']
)

In [None]:
"""
Create the dew train and test dataloaders (pretraining batch size)
"""
# Both loaders are reused by the experiment cells below: the train loader for
# pretraining on dew, the test loader for the dew-only experiments (9-11).
dataloader_dew_train, dataloader_dew_test = load_dew_data(
    batch_size=global_config['pretraining-batchsize']
)

In [None]:
#!rm -r runs/9_dew_only_pretrained/

In [None]:
"""
Experiment 0:

Use UNTRAINED resnet on Heinrich Zille images 
and test on Heinrich Zille images
"""

experiment_name = '0_hzille_untrained'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=False,
    test_data_loader=dataloader_hzille_test
)

!cp -r /content/runs/0_hzille_untrained /content/drive/MyDrive/ML_logs/HZille/0_hzille_untrained

In [None]:
"""
Experiment 1:

Use PRETRAINED (ImageNet, torchvision) resnet on Heinrich Zille images
and test on Heinrich Zille images
"""
experiment_name = '1_hzille_only_pretrained'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=True,
    test_data_loader=dataloader_hzille_test,
)

!cp -r /content/runs/1_hzille_only_pretrained /content/drive/MyDrive/ML_logs/HZille/1_hzille_only_pretrained

In [None]:
"""
Experiment 2:

Use PRETRAINED (ImageNet, torchvision) resnet,
FINETUNE with SimCLR on Heinrich Zille images,
and test on Heinrich Zille images
"""
experiment_name = '2_hzille_pretrained_finetuned'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=True,
    finetuning_dataset_name='hzille',
    finetuning_data_loader=dataloader_hzille_train,
    test_data_loader=dataloader_hzille_test
)

!cp -r /content/runs/2_hzille_pretrained_finetuned /content/drive/MyDrive/ML_logs/HZille/2_hzille_pretrained_finetuned

In [None]:
"""
Experiment 3:

Use PRETRAINED (ImageNet, torchvision) resnet,
OVERTRAIN with SimCLR and dew dataset, 
and test on Heinrich Zille images 
"""
experiment_name = '3_hzille_overtrain'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=True,
    pretraining_dataset_name='dew',
    pretraining_data_loader=dataloader_dew_train,
    test_data_loader=dataloader_hzille_test
)

!cp -r /content/runs/3_hzille_overtrain /content/drive/MyDrive/ML_logs/HZille/3_hzille_overtrain

In [None]:
"""
Experiment 4:

Use PRETRAINED (ImageNet, torchvision) resnet,
OVERTRAINED with SimCLR and dew dataset,
FINETUNED on Heinrich Zille images
and test on Heinrich Zille images
"""

#load model checkpoints from experiment 3, as experiment 4 is building up on it
checkpoint_path = 'runs/3_hzille_overtrain/22-02-21-120301_pretraining_dew/final_model_checkpoints/22-02-21-120301_pretraining_dew.ckpt'
loaded_model = ResnetSimCLR.load_from_checkpoint(
    checkpoint_path,
    max_epochs=global_config['pretraining-epochs'],
    batchsize=global_config['pretraining-batchsize'],
    projection_head_out=global_config['projection_head_out'],
    learning_rate=global_config['pretraining-learningrate'],
    optimizer=global_config['optimizer']
)

experiment_name = '4_hzille_overtrain_finetune'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=True,
    #pretraining_dataset_name='dew',
    #pretraining_data_loader=dataloader_dew_train,
    finetuning_dataset_name='hzille',
    finetuning_data_loader=dataloader_hzille_train,
    test_data_loader=dataloader_hzille_test,
    model_from_checkpoint=loaded_model
)

!cp -r /content/runs/4_hzille_overtrain_finetune /content/drive/MyDrive/ML_logs/HZille/4_hzille_overtrain_finetune

In [None]:
"""
Experiment 5:

Use UNTRAINED (torchvision, no ImageNet weights) resnet,
TRAIN with SimCLR and dew dataset,
and test on Heinrich Zille images
"""
experiment_name = '5_hzille_train'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=False,
    pretraining_dataset_name='dew',
    pretraining_data_loader=dataloader_dew_train,
    test_data_loader=dataloader_hzille_test
)

!cp -r /content/runs/5_hzille_train /content/drive/MyDrive/ML_logs/HZille/5_hzille_train

In [None]:
"""
Experiment 6:

Use UNTRAINED (torchvision, no ImageNet weights) resnet,
TRAIN with SimCLR and dew dataset,
FINETUNE with SimCLR on Heinrich Zille images 
and test on Heinrich Zille images
"""

#load model checkpoints from experiment 5, as experiment 6 is building up on it
checkpoint_path = '/content/drive/MyDrive/ML_logs/HZille/5_hzille_train/22-02-21-204646_pretraining_dew/final_model_checkpoints/22-02-21-204646_pretraining_dew.ckpt'

loaded_model = ResnetSimCLR.load_from_checkpoint(
    checkpoint_path,
    max_epochs=global_config['pretraining-epochs'],
    batchsize=global_config['pretraining-batchsize'],
    projection_head_out=global_config['projection_head_out'],
    learning_rate=global_config['pretraining-learningrate'],
    optimizer=global_config['optimizer']
)

experiment_name = '6_hzille_train_finetune'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=False,
    #pretraining_dataset_name='dew',
    #pretraining_data_loader=dataloader_dew_train,
    finetuning_dataset_name='hzille',
    finetuning_data_loader=dataloader_hzille_train,
    test_data_loader=dataloader_hzille_test,
    model_from_checkpoint=loaded_model
)

!cp -r /content/runs/6_hzille_train_finetune /content/drive/MyDrive/ML_logs/HZille/6_hzille_train_finetune

In [None]:
"""
Experiment 7:

Use UNTRAINED (torchvision, no ImageNet weights) resnet,
TRAIN with Heinrich Zille dataset,
and test on Heinrich Zille images
"""
experiment_name = '7_hzille_train'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=False,
    pretraining_dataset_name='hzille',
    pretraining_data_loader=dataloader_hzille_train,
    test_data_loader=dataloader_hzille_test,
    pretraining_log_every_n_steps=3,
    forced_batch_size_in_pretraining=global_config['finetuning-batchsize']
)

!cp -r /content/runs/7_hzille_train /content/drive/MyDrive/ML_logs/HZille/7_hzille_train

In [None]:
"""
Experiment 8:

Use PRETRAINED (ImageNet, torchvision) resnet,
TRAIN with Heinrich Zille dataset,
and test on Heinrich Zille images
"""
experiment_name = '8_hzille_overtrain'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=True,
    pretraining_dataset_name='hzille',
    pretraining_data_loader=dataloader_hzille_train,
    test_data_loader=dataloader_hzille_test,
    pretraining_log_every_n_steps=3,
    forced_batch_size_in_pretraining=global_config['finetuning-batchsize']
)

!cp -r /content/runs/8_hzille_overtrain /content/drive/MyDrive/ML_logs/HZille/8_hzille_overtrain

In [None]:
"""
Experiment 9 (only dew):

Use PRETRAINED (ImageNet, torchvision) resnet,
and test on dew
"""

experiment_name = '9_dew_only_pretrained'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=True,
    test_data_loader=dataloader_dew_test,
    do_not_use_hzille_metadata=True
)

!cp -r /content/runs/9_dew_only_pretrained /content/drive/MyDrive/ML_logs/HZille/9_dew_only_pretrained

In [None]:
"""
Experiment 10 (only dew):

Use PRETRAINED (ImageNet, torchvision) resnet,
OVERTRAIN with SimCLR and dew images,
and test on dew images
"""

#load model checkpoints from experiment 3, as experiment 10 is building up on it
checkpoint_path = '/content/drive/MyDrive/ML_logs/HZille/3_hzille_overtrain/22-02-21-120301_pretraining_dew/final_model_checkpoints/22-02-21-120301_pretraining_dew.ckpt'
loaded_model = ResnetSimCLR.load_from_checkpoint(
    checkpoint_path,
    max_epochs=global_config['pretraining-epochs'],
    batchsize=global_config['pretraining-batchsize'],
    projection_head_out=global_config['projection_head_out'],
    learning_rate=global_config['pretraining-learningrate'],
    optimizer=global_config['optimizer']
)

experiment_name = '10_dew_overtrain'

run_experiment(
    _experiment_name=experiment_name,
    config=global_config,
    pretrained=True,
    #pretraining_dataset_name='dew',
    #pretraining_data_loader=dataloader_dew_train,
    test_data_loader=dataloader_dew_test,
    model_from_checkpoint=loaded_model
)

!cp -r /content/runs/10_dew_overtrain /content/drive/MyDrive/ML_logs/HZille/10_dew_overtrain

In [None]:
"""
Experiment 11 (only dew):

Use UNTRAINED (torchvision, no ImageNet weights) resnet,
TRAIN with SimCLR and dew images,
and visualize dew images
"""
# Load the model checkpoint from experiment 5, as experiment 11 builds on it.
# NOTE(review): unlike the other experiment cells, this one does not copy its
# run folder to Drive afterwards - confirm that is intended.
experiment_name = '11_dew_train'

checkpoint_path = '/content/drive/MyDrive/ML_logs/HZille/5_hzille_train/22-02-21-204646_pretraining_dew/final_model_checkpoints/22-02-21-204646_pretraining_dew.ckpt'

# Keyword arguments handed to load_from_checkpoint alongside the checkpoint file.
checkpoint_hparams = {
    'max_epochs': global_config['pretraining-epochs'],
    'batchsize': global_config['pretraining-batchsize'],
    'projection_head_out': global_config['projection_head_out'],
    'learning_rate': global_config['pretraining-learningrate'],
    'optimizer': global_config['optimizer'],
}
loaded_model = ResnetSimCLR.load_from_checkpoint(checkpoint_path, **checkpoint_hparams)

# No pretraining loader is passed (the dew training comes from the checkpoint);
# the loaded model is evaluated on the dew test split.
run_experiment(
    model_from_checkpoint=loaded_model,
    test_data_loader=dataloader_dew_test,
    pretrained=False,
    config=global_config,
    _experiment_name=experiment_name,
)

In [None]:
# Augmentation preview for the Heinrich Zille training loader.
plot_data_augmentations(dataloader_hzille_train)

In [None]:
# NOTE(review): identical to the call in the previous cell - presumably kept to
# show a second random draw of augmentations; confirm or remove.
plot_data_augmentations(dataloader_hzille_train)

In [None]:
# Augmentation preview for the dew training loader.
plot_data_augmentations(dataloader_dew_train)

In [None]:
def plot_all_hzille_test_images():
    """Plot every Heinrich Zille test image in one large grid figure.

    The list of test filenames is read from the ``embeddings_filenames.pickle``
    stored with experiment 0. Each file is loaded from the Zille image folder
    through ``get_image_as_np_array`` and drawn in grayscale, titled with its
    position in the list and its filename.

    The grid is 20 columns wide and at least 10 rows tall; further rows are
    added when the list holds more than 200 filenames, so ``add_subplot`` is
    never asked for a position outside the grid.
    """
    filenames_filename = 'embeddings_filenames.pickle'
    embeddings_path = '/content/drive/MyDrive/ML_logs/HZille/0_hzille_untrained/22-02-20-211630/embeddings/'
    path_to_data = 'drive/MyDrive/Data Sets/Heinrich Zille/ausgeschnitten_klein/'

    min_rows = 10
    columns = 20

    # NOTE(review): pickle.load can execute code embedded in the file; only
    # load pickles that were written by this notebook.
    # The context manager closes the file even if unpickling fails.
    with open(f'{embeddings_path}{filenames_filename}', 'rb') as opened_pickle_file:
        filenames = pickle.load(opened_pickle_file)

    print(len(filenames), 'images')

    # Ceiling division: enough rows for all images, never fewer than min_rows
    # (so the layout is unchanged for up to min_rows * columns images).
    rows = max(min_rows, -(-len(filenames) // columns))

    fig = plt.figure(figsize=(160, 50))

    for i, filename in enumerate(filenames):
        fig.add_subplot(rows, columns, i + 1)
        img = get_image_as_np_array(f'{path_to_data}{filename}')
        plt.imshow(img, cmap='gray')
        plt.axis('off')
        plt.title(f'{i} - {filename}')




def compare_nearest_neighbours_of_all_experiments(image_index, num_nearest_neighbours=10):
    """Plot the nearest neighbours of one test image for experiments 0-8.

    For each stored experiment the saved embeddings are loaded, a
    ``NearestNeighbors`` index is fitted on them and the neighbours of the
    embedding at ``image_index`` are drawn as one row of the figure (one row
    per experiment, one column per neighbour). Every image is titled with its
    distance to the selected embedding and its filename.

    Args:
        image_index: position of the query image in the filename list stored
            with experiment 0 (the index shown by
            ``plot_all_hzille_test_images``).
        num_nearest_neighbours: number of neighbours (columns) per experiment;
            defaults to the previously hard-coded 10.

    Raises:
        IndexError: if ``image_index`` is outside the stored embeddings.
    """
    filenames_filename = 'embeddings_filenames.pickle'
    embeddings_file_name = 'embeddings.npy'

    embeddings_pathes = [
        '/content/drive/MyDrive/ML_logs/HZille/0_hzille_untrained/22-02-20-211630/embeddings/',
        '/content/drive/MyDrive/ML_logs/HZille/1_hzille_only_pretrained/22-02-20-211922/embeddings/',
        '/content/drive/MyDrive/ML_logs/HZille/2_hzille_pretrained_finetuned/22-02-21-114830_finetuning_hzille/embeddings/',
        '/content/drive/MyDrive/ML_logs/HZille/3_hzille_overtrain/22-02-21-120301_pretraining_dew/embeddings/',
        '/content/drive/MyDrive/ML_logs/HZille/4_hzille_overtrain_finetune/22-02-21-201901_finetuning_hzille/embeddings/',
        '/content/drive/MyDrive/ML_logs/HZille/5_hzille_train/22-02-21-204646_pretraining_dew/embeddings/',
        '/content/drive/MyDrive/ML_logs/HZille/6_hzille_train_finetune/22-02-22-081633_finetuning_hzille/embeddings/',
        '/content/drive/MyDrive/ML_logs/HZille/7_hzille_train/22-02-22-084656_pretraining_hzille/embeddings/',
        '/content/drive/MyDrive/ML_logs/HZille/8_hzille_overtrain/22-02-22-085652_pretraining_hzille/embeddings/'
    ]

    path_to_data = 'drive/MyDrive/Data Sets/Heinrich Zille/ausgeschnitten_klein/'

    # NOTE(review): only experiment 0's filename list is read and then applied
    # to every experiment's embeddings - this assumes all experiments stored
    # their embeddings in the same image order; confirm.
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # load pickles that were written by this notebook.
    with open(f'{embeddings_pathes[0]}{filenames_filename}', 'rb') as opened_pickle_file:
        filenames = pickle.load(opened_pickle_file)

    rows = len(embeddings_pathes)
    columns = num_nearest_neighbours

    fig = plt.figure(figsize=(50, 40))

    # Subplot positions are 1-based and run row by row across experiments.
    img_position_in_plot = 1

    for embeddings_path in embeddings_pathes:

        embeddings = np.load(f'{embeddings_path}{embeddings_file_name}')
        knn = NearestNeighbors(n_neighbors=num_nearest_neighbours).fit(embeddings)

        # The fitted set is also the query set, so the selected image is
        # expected to show up among its own neighbours.
        distances, indices = knn.kneighbors(embeddings)

        selected_img_and_neighbours = indices[image_index]

        for i, neighbour_i in enumerate(selected_img_and_neighbours):

            ax = fig.add_subplot(rows, columns, img_position_in_plot)

            img = get_image_as_np_array(f'{path_to_data}{filenames[neighbour_i]}')

            # Draw on the axes object itself rather than pyplot's implicit
            # "current axes".
            ax.set_title(f'd={distances[image_index][i]:.3f} ({filenames[neighbour_i]})')
            ax.axis('off')
            ax.imshow(img, cmap='gray')

            img_position_in_plot += 1




In [None]:
# Neighbours of test image 100 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=100)

In [None]:
# Neighbours of test image 107 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=107)

In [None]:
# Neighbours of test image 103 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=103)

In [None]:
# Neighbours of test image 46 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=46)

In [None]:
# Neighbours of test image 27 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=27)

In [None]:
# Neighbours of test image 15 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=15)

In [None]:
# Neighbours of test image 14 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=14)

In [None]:
# Neighbours of test image 8 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=8)

In [None]:
# Neighbours of test image 0 (index in experiment 0's filename list), one row per experiment.
compare_nearest_neighbours_of_all_experiments(image_index=0)

In [None]:
# Overview grid of all Zille test images; each title starts with the list index
# that compare_nearest_neighbours_of_all_experiments takes as image_index.
plot_all_hzille_test_images()

In [None]:
"""
Plot the k-means clusters of each experiment in a 3D t-SNE scatter plot
and, per cluster, a grid of all images assigned to it
"""

# One dim_reduction folder per experiment 0-8; the list position is used as the
# experiment number in titles and output file names.
dim_reduction_pathes = [
    '/content/drive/MyDrive/ML_logs/HZille/0_hzille_untrained/22-02-20-211630/dim_reduction/',
    '/content/drive/MyDrive/ML_logs/HZille/1_hzille_only_pretrained/22-02-20-211922/dim_reduction/',
    '/content/drive/MyDrive/ML_logs/HZille/2_hzille_pretrained_finetuned/22-02-21-114830_finetuning_hzille/dim_reduction/',
    '/content/drive/MyDrive/ML_logs/HZille/3_hzille_overtrain/22-02-21-120301_pretraining_dew/dim_reduction/',
    '/content/drive/MyDrive/ML_logs/HZille/4_hzille_overtrain_finetune/22-02-21-201901_finetuning_hzille/dim_reduction/',
    '/content/drive/MyDrive/ML_logs/HZille/5_hzille_train/22-02-21-204646_pretraining_dew/dim_reduction/',
    '/content/drive/MyDrive/ML_logs/HZille/6_hzille_train_finetune/22-02-22-081633_finetuning_hzille/dim_reduction/',
    '/content/drive/MyDrive/ML_logs/HZille/7_hzille_train/22-02-22-084656_pretraining_hzille/dim_reduction/',
    '/content/drive/MyDrive/ML_logs/HZille/8_hzille_overtrain/22-02-22-085652_pretraining_hzille/dim_reduction/'
]

path_to_data = 'drive/MyDrive/Data Sets/Heinrich Zille/ausgeschnitten_klein/'

# Output folder for the interactive scatter plots and the per-cluster image
# grids; created up front because write_html/savefig do not create folders.
cluster_plots_folder = '/content/drive/MyDrive/ML_logs/HZille/cluster_plots'
os.makedirs(cluster_plots_folder, exist_ok=True)

dim_reduction_technique = 'tsne'
no_dimensions = 3

n_clusters = 9

# Image grid per cluster: fixed width, at least `min_rows` rows (the former
# fixed 4x10 grid failed for clusters with more than 40 images).
min_rows = 4
columns = 10


for experiment_no, path in enumerate(dim_reduction_pathes):

    print('Processing experiment', experiment_no)

    df = pd.read_csv(f'{path}{dim_reduction_technique}_{no_dimensions}d.csv', index_col=0)

    # Cluster on the three projection columns of the stored CSV.
    kmeans = KMeans(n_clusters=n_clusters, random_state=global_config['random_seed'], max_iter=300, n_init=10, tol=1e-4).fit(df[['0', '1', '2']])
    # centroids = kmeans.cluster_centers_
    # String labels so plotly treats the clusters as discrete categories.
    cluster_labels = [str(label) for label in kmeans.labels_]

    #total_within_cluster_distances = []
    #for i in range(1, 20):
    #    kmeans = KMeans(n_clusters=i, random_state=global_config['random_seed'], max_iter=500, n_init=20, tol=1e-4).fit(df[['0', '1', '2']])
    #    total_within_cluster_distances.append(kmeans.inertia_)
    #    
    #fig = go.Figure(data = go.Scatter(x = list(range(1,21)), y = total_within_cluster_distances))
    #
    #
    #fig.update_layout(title='Total Within Cluster Distances per Cluster',
    #                   xaxis_title='Cluster no.',
    #                   yaxis_title='Total Within Cluster Distance')
    #fig.show()


    scatter_fig = px.scatter_3d(
        df,
        x='0',
        y='1',
        z='2',
        color=cluster_labels,
        symbol='author',
        hover_data=['id'],
        labels={
            '0': '1st projection',
            '1': '2nd projection',
            '2': '3rd projection',
            'color' : 'cluster'
        },
        title=f'Experiment {experiment_no} - t-SNE 3D - {n_clusters} clusters (kmeans)'
    )
    scatter_fig.update_traces(marker_size=6)

    scatter_fig.show()
    scatter_fig.write_html(f'{cluster_plots_folder}/experiment_{experiment_no}.html')

    df['cluster'] = cluster_labels

    for cluster in df['cluster'].unique():
        print('Plotting cluster', cluster)

        selected_images = df[df['cluster'] == cluster]['id']

        # Ceiling division: enough rows for every image of this cluster.
        rows = max(min_rows, -(-len(selected_images) // columns))

        fig = plt.figure(figsize=(60, 20))
        fig.suptitle(f'Experiment {experiment_no} cluster {cluster}', fontsize=16)

        for i, img_id in enumerate(selected_images):
            #print(i, img_id)

            fig.add_subplot(rows, columns, i + 1)
            img = get_image_as_np_array(f'{path_to_data}{img_id}.jpg')
            plt.imshow(img, cmap='gray')
            plt.axis('off')
            plt.title(f'{img_id}.jpg')

        fig.savefig(f'{cluster_plots_folder}/experiment_{experiment_no}_cluster_{cluster}.jpg')

        # Render the figure now and release it; otherwise one large figure per
        # cluster and experiment stays open until the cell has finished.
        plt.show()
        plt.close(fig)

In [None]:
"""
Plot random samples of dew dataset
"""


filenames_file = 'drive/MyDrive/Data Sets/dew/until_1950/until_1950.pickle'
path_to_data = 'drive/MyDrive/Data Sets/dew/until_1950/until_1950/'

# Grid size; rows * columns images are sampled.
rows = 6
columns = 8

# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles that were written by this project.
# The context manager closes the file even if unpickling fails.
with open(filenames_file, 'rb') as opened_pickle_file:
    filenames = pickle.load(opened_pickle_file)

# Draws from numpy's global RNG, so the sample depends on the seed set by
# init_experiments() and on what ran before this cell.
samples_indices = np.random.choice(len(filenames), size=rows*columns, replace=False)

print('sample indices', samples_indices)

# Set lookup instead of scanning the index array once per filename; the
# selected files keep their order from the filename list.
wanted_indices = set(samples_indices.tolist())

selected_filenames = [
    filename for i, filename in enumerate(filenames) if i in wanted_indices
]

print('selected images', len(selected_filenames), selected_filenames)

fig = plt.figure(figsize=(60, 80))

for i, filename in enumerate(selected_filenames):
    print(i, filename)
    fig.add_subplot(rows, columns, i + 1)
    img = get_image_as_np_array(f'{path_to_data}{filename}')
    plt.imshow(img, cmap='gray')
    plt.axis('off')
    plt.title(f'{i} - {filename}')